firecrawl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d79423650fbec60124bab7710e372f16774987f437e8bb52cb13e72a434bdec6
4
+ data.tar.gz: bfb0ec5c302e4e646812855d5c954e2fc6c075e4a03af495a6a09815ce588e44
5
+ SHA512:
6
+ metadata.gz: 46c819135a19388beb434c797e16a3a73f52b1d85cc37f317e03dc159de8ed1a56a6a5170f2e2457e2798fbff25d93cfbf11f7be12682fcb6ef13534aa347f62
7
+ data.tar.gz: ffe9b239c29138617a1902f8502099b6f0b2019eeed8b266cd8f77e94eee868e8e876ee9f8545cd584d701a8bfffa75f6552508b2949463f2cc187580eeb35fe
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Endless International
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/firecrawl.gemspec ADDED
@@ -0,0 +1,37 @@
1
Gem::Specification.new do | spec |

  spec.name     = 'firecrawl'
  spec.version  = '0.0.1'
  spec.authors  = [ 'Kristoph Cichocki-Romanov' ]
  spec.email    = [ 'rubygems.org@kristoph.net' ]

  # fix: the second continuation previously ended in "value" with no trailing space, producing
  # "particular valuewhen" in the published summary
  spec.summary =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API which takes " \
    "a URL, crawls it and returns html, markdown, or structured data. It is of particular " \
    "value when used with LLM's for grounding."
  spec.description =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl " \
    "can take a URL, scrape the page contents and return the whole page or principal content " \
    "as html, markdown, or structured data.\n" \
    "\n" \
    "In addition, Firecrawl can crawl an entire site returning the pages it encounters or just " \
    "the map of the pages, which can be used for subsequent scraping."
  spec.license = 'MIT'
  spec.homepage = 'https://github.com/EndlessInternational/firecrawl'
  spec.metadata = {
    'source_code_uri' => 'https://github.com/EndlessInternational/firecrawl',
    'bug_tracker_uri' => 'https://github.com/EndlessInternational/firecrawl/issues',
    # 'documentation_uri' => 'https://github.com/EndlessInternational/firecrawl'
  }

  spec.required_ruby_version = '>= 3.0'
  spec.files = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "firecrawl.gemspec" ]
  spec.require_paths = [ "lib" ]

  spec.add_runtime_dependency 'faraday', '~> 2.7'
  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta03'

  spec.add_development_dependency 'rspec', '~> 3.13'
  spec.add_development_dependency 'debug', '~> 1.9'

end
@@ -0,0 +1,106 @@
1
module Firecrawl

  ##
  # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
  # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
  # the +start_scraping+ method and then subsequently evaluate the results by calling the
  # +retrieve_scraping+ method.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   urls = [ 'https://example.com', 'https://icann.org' ]
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   batch_response = request.start_scraping( urls, options )
  #   while batch_response.success?
  #     batch_result = batch_response.result
  #     if batch_result.success?
  #       batch_result.scrape_results.each do | result |
  #         puts result.metadata[ 'title' ]
  #         puts '---'
  #         puts result.markdown
  #         puts "\n\n"
  #       end
  #     end
  #     break unless batch_result.status?( :scraping )
  #     batch_response = request.retrieve_scraping( batch_result )
  #   end
  #
  #   unless batch_response.success?
  #     puts batch_response.result.error_description
  #   end
  #
  class BatchScrapeRequest < Request

    ##
    # The +start_scraping+ method makes a Firecrawl '/batch/scrape' POST request which will
    # initiate batch scraping of the given urls.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def start_scraping( urls, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :urls ] = [ urls ].flatten

      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      result = nil
      if response.success?
        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
        attributes ||= { success: false, status: :failed }
        result = BatchScrapeResult.new( attributes[ :success ], attributes )
      else
        # fix: previously referenced the never-assigned 'attributes' local ( nil ); pass nil
        # explicitly — no payload is parsed for a failed response
        result = ErrorResult.new( response.status, nil )
      end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve_scraping+ method makes a Firecrawl '/batch/scrape' GET request which will
    # retrieve batch scraping results. Note that there is no guarantee that there are any batch
    # scraping results at the time of the call and you may need to call this method multiple
    # times.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # +true+, then +response.result+ will be an instance of +BatchScrapeResult+. If the request
    # is not successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def retrieve_scraping( batch_result, &block )
      raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
        unless batch_result.is_a?( BatchScrapeResult )
      response = get( batch_result.next_url, &block )
      result = nil
      if response.success?
        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
        attributes ||= { success: false, status: :failed }
        result = batch_result.merge( attributes )
      else
        # fix: as above, pass nil explicitly rather than the unassigned 'attributes' local
        result = ErrorResult.new( response.status, nil )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,63 @@
1
module Firecrawl
  ##
  # +BatchScrapeResult+ wraps the payload returned by the Firecrawl batch scrape endpoints,
  # exposing the operation status, progress counters and the individual +ScrapeResult+
  # entries collected so far.
  #
  class BatchScrapeResult

    def initialize( success, attributes )
      @success    = success
      @attributes = attributes || {}
    end

    # True when the batch scrape operation itself reported success.
    def success?
      @success || false
    end

    # The operation status ( eg +:scraping+, +:completed+, +:failed+ ). The initial Firecrawl
    # response carries no status, so a successful start is synthesized as +:scraping+ and a
    # failed one as +:failed+.
    def status
      explicit = @attributes[ :status ]
      return explicit.to_sym if explicit
      @success ? :scraping : :failed
    end

    # True when the current status equals the given +candidate+ symbol.
    def status?( candidate )
      status == candidate
    end

    # The Firecrawl batch job identifier.
    def id
      @attributes[ :id ]
    end

    # The total number of pages in the batch.
    def total
      @attributes[ :total ] || 0
    end

    # The number of pages scraped so far.
    def completed
      @attributes[ :completed ] || 0
    end

    # The number of API credits consumed by the batch.
    def credits_used
      @attributes[ :creditsUsed ] || 0
    end

    # The expiry of the batch results as a +Date+, or +nil+ when absent or unparsable.
    def expires_at
      Date.parse( @attributes[ :expiresAt ] ) rescue nil
    end

    # The url of the batch job.
    def url
      @attributes[ :url ]
    end

    # The url to poll for further results; falls back to the job url when no pagination
    # cursor is present.
    def next_url
      @attributes[ :next ] || @attributes[ :url ]
    end

    # The individual page results collected so far, as +ScrapeResult+ instances. Null data
    # entries ( observed in live API payloads ) are discarded.
    def scrape_results
      entries = @attributes[ :data ]&.compact || []
      entries.map { | entry | ScrapeResult.new( @attributes[ :success ], entry ) }
    end

    # Returns a new +BatchScrapeResult+ whose attributes are overlaid with +attributes+.
    def merge( attributes )
      self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
    end
  end
end
@@ -0,0 +1,47 @@
1
module Firecrawl
  ##
  # +CrawlOptions+ builds the API options hash for Firecrawl '/crawl' requests. Options may
  # be given as a plain hash or constructed through the DynamicSchema block syntax.
  #
  class CrawlOptions
    include DynamicSchema::Definable
    include DynamicSchema::Buildable
    # fix: Helpers was not included, so the string_camelize call below raised NameError
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      exclude_paths         String, as: :excludePaths, array: true
      include_paths         String, as: :includePaths, array: true
      maximum_depth         Integer, as: :maxDepth
      ignore_sitemap        [ TrueClass, FalseClass ], as: :ignoreSitemap
      limit                 Integer
      allow_backward_links  [ TrueClass, FalseClass ], as: :allowBackwardLinks
      allow_external_links  [ TrueClass, FalseClass ], as: :allowExternalLinks
      webhook               String
      scrape_options        as: :scrapeOptions, &ScrapeOptions.schema
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    # fix: 'options' previously had no default, so the build/build! class methods ( which pass
    # only api_options: ) raised ArgumentError; it now defaults to {}, matching MapOptions and
    # ScrapeOptions.
    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options

      scrape_options = @options[ :scrapeOptions ]
      if scrape_options
        # camelize the string form of each format for the API ( formats may be symbols, eg
        # :raw_html → 'rawHtml' ), consistent with ScrapeOptions#initialize
        scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
      end
    end

    def to_h
      @options.to_h
    end

  end
end
46
+
47
+
@@ -0,0 +1,45 @@
1
module Firecrawl
  ##
  # +ErrorResult+ describes a failed Firecrawl API interaction. It maps the HTTP status code
  # to a symbolic +error_type+ and a human readable +error_description+, preferring the error
  # message included in the response payload when one is present.
  #
  class ErrorResult

    attr_reader :error_type, :error_description

    # +status_code+ is the HTTP status of the response; +attributes+ is the parsed response
    # payload ( may be nil ).
    def initialize( status_code, attributes = nil )
      # fix: was assigned to @error_code, which left the :error_type reader permanently nil
      @error_type, @error_description = status_code_to_error( status_code )
      # fix: previously guarded on @attributes ( never assigned ), so the API supplied error
      # message was never used; guard on the attributes argument instead
      @error_description = attributes[ :error ] \
        if attributes.respond_to?( :[] ) && attributes[ :error ]
    end

    private

    # Maps an HTTP status code to a [ type, description ] pair.
    def status_code_to_error( status_code )
      case status_code
      # this is here because I've noted invalid payloads being returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Firecrawl server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Firecrawl service is overloaded." ]
      else
        [ :unknown_error,
          "The Firecrawl service returned an unexpected status code: '#{status_code}'." ]
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Firecrawl
  ##
  # Internal string helpers shared by the option builder classes.
  #
  module Helpers
    # Converts a snake/kebab/space separated string into lowerCamelCase, the key style the
    # Firecrawl API expects ( eg 'only_main_content' → 'onlyMainContent' ).
    def string_camelize( string )
      head, *rest = string.split( /[\s_\-]/ )
      [ head.to_s.downcase, *rest.map( &:capitalize ) ].join
    end
  end
end
@@ -0,0 +1,32 @@
1
module Firecrawl
  ##
  # +MapOptions+ builds the API options hash for Firecrawl '/map' requests. Options may be
  # given as a plain hash or constructed through the DynamicSchema block syntax.
  #
  class MapOptions
    include DynamicSchema::Definable

    schema do
      search             String
      # fix: the multi-word keys had no 'as:' aliases and so were emitted snake_case, unlike
      # every other option class here ( CrawlOptions / ScrapeOptions alias to camelCase, the
      # Firecrawl API key style ) — TODO confirm exact key names against the /map API docs
      ignore_sitemap     [ TrueClass, FalseClass ], as: :ignoreSitemap
      ignore_subdomains  [ TrueClass, FalseClass ], as: :ignoreSubdomains
      limit              Integer
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
    end

    def to_h
      @options.to_h
    end

  end
end
31
+
32
+
@@ -0,0 +1,58 @@
1
module Firecrawl
  ##
  # The +MapRequest+ class encapsulates a '/map' POST request to the Firecrawl API. After
  # creating a new +MapRequest+ instance you can make the request by calling the +map+ method
  # to crawl the site and retrieve +links+.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   response = request.map( 'https://example.com', { limit: 100 } )
  #   if response.success?
  #     result = response.result
  #     if result.success?
  #       result.links.each do | link |
  #         puts link
  #       end
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MapRequest < Request

    ##
    # The +map+ method makes a Firecrawl '/map' POST request which will map the site with the
    # given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # true, then +response.result+ will be an instance of +MapResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def map( url, options = nil, &block )
      if options
        options = options.is_a?( MapOptions ) ? options : MapOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/map", options, &block )
      result = nil
      if response.success?
        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
        attributes ||= { success: false }
        result = MapResult.new( attributes[ :success ], attributes )
      else
        # fix: previously referenced the never-assigned 'attributes' local ( nil ); pass nil
        # explicitly — no payload is parsed for a failed response
        result = ErrorResult.new( response.status, nil )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,29 @@
1
module Firecrawl
  ##
  # +MapResult+ wraps the payload of a Firecrawl '/map' response.
  #
  class MapResult

    def initialize( success, attributes )
      @success = success
      # fix: default nil attributes to an empty hash ( consistent with ScrapeResult and
      # BatchScrapeResult ) so accessors do not raise on a missing payload
      @attributes = attributes || {}
    end

    ##
    # The +success?+ method returns +true+ if the mapping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual map operation was
    # successful.
    #
    def success?
      @success || false
    end

    ##
    # The +links+ method returns an array of the links that were discovered when mapping the
    # site; empty when the payload carries none.
    #
    def links
      @attributes[ :links ] || []
    end

  end
end
29
+
@@ -0,0 +1,75 @@
1
module Firecrawl

  ##
  # The +Request+ class is the base class encapsulating HTTP interaction with the Firecrawl
  # API. The endpoint specific subclasses ( +ScrapeRequest+, +BatchScrapeRequest+,
  # +MapRequest+ ) use the protected +post+ / +get+ helpers defined here.
  #
  # === example
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, :screenshot ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://cnn.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class Request

    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }

    BASE_URI = 'https://api.firecrawl.dev/v1'

    ##
    # The +initialize+ method initializes the +Request+ instance. You MUST pass an +api_key+
    # ( unless one was configured globally through +Firecrawl.api_key+ ) and may optionally
    # pass a ( Faraday ) +connection+.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || DEFAULT_CONNECTION
      @api_key = api_key || Firecrawl.api_key
      raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
        unless @api_key
    end

    protected

    # POSTs +body+ to +uri+ with the authorization headers; +body+ is JSON encoded unless it
    # is already a String. An optional block may further customize the Faraday request.
    def post( uri, body, &block )
      @connection.post( uri ) do | request |
        headers.each { | key, value | request.headers[ key ] = value }
        request.body = body.is_a?( String ) ? body : JSON.generate( body )
        block.call( request ) if block
      end
    end

    # GETs +uri+ with the authorization headers. An optional block may further customize the
    # Faraday request.
    def get( uri, &block )
      @connection.get( uri ) do | request |
        headers.each { | key, value | request.headers[ key ] = value }
        block.call( request ) if block
      end
    end

    private

    # The common authorization / content type headers ( previously duplicated verbatim in
    # both post and get ).
    def headers
      {
        'Authorization' => "Bearer #{@api_key}",
        'Content-Type'  => 'application/json'
      }
    end

  end

end
@@ -0,0 +1,15 @@
1
module Firecrawl
  #
  # The +ResponseMethods+ module decorates a Faraday response with a +result+ accessor that
  # returns the Firecrawl result object attached by the request classes.
  #
  module ResponseMethods
    # Attaches +result+ to +response+, mixes this module into the response, and returns the
    # response.
    def self.install( response, result )
      response.instance_variable_set( :@_firecrawl_result, result )
      response.extend( self )
    end

    # The attached Firecrawl result ( eg a ScrapeResult, MapResult or ErrorResult ).
    def result
      @_firecrawl_result
    end
  end
end
@@ -0,0 +1,58 @@
1
module Firecrawl
  ##
  # +ScrapeOptions+ builds the API options hash for Firecrawl '/scrape' ( and batch scrape )
  # requests. Options may be given as a plain hash or constructed through the DynamicSchema
  # block syntax.
  #
  class ScrapeOptions
    include DynamicSchema::Definable
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot, :"screenshot@full_page" ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      # note: both format and formats are defined as a semantic convenience
      format            String, as: :formats, array: true, in: FORMATS
      formats           String, array: true, in: FORMATS
      only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
      include_tags      String, as: :includeTags, array: true
      exclude_tags      String, as: :excludeTags, array: true
      # fix: was a bare 'wait_for', emitting a snake_case key; aliased to camelCase like every
      # other multi-word key in this schema — TODO confirm 'waitFor' against the /scrape docs
      wait_for          Integer, as: :waitFor
      timeout           Integer
      extract do
        #schema Hash
        system_prompt   String, as: :systemPrompt
        prompt          String
      end
      action as: :actions, arguments: :type, array: true do
        type            Symbol, required: true, in: ACTIONS
        # wait
        milliseconds    Integer
        # click
        selector        String
        # write
        text            String
        # press
        key             String
      end
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
      # camelize format names for the API ( eg :raw_html → 'rawHtml' )
      @options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
    end

    def to_h
      @options.to_h
    end

  end
end
+ end
57
+
58
+
@@ -0,0 +1,60 @@
1
module Firecrawl
  ##
  # The +ScrapeRequest+ class encapsulates a '/scrape' POST request to the Firecrawl API.
  # After creating a new +ScrapeRequest+ instance you can initiate the request by calling the
  # +scrape+ method to perform synchronous scraping.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://example.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class ScrapeRequest < Request

    ##
    # The +scrape+ method makes a Firecrawl '/scrape' POST request which will scrape the given
    # url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # true, then +response.result+ will be an instance of +ScrapeResult+. If the request is
    # not successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def scrape( url, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/scrape", options, &block )
      result = nil
      if response.success?
        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
        attributes ||= { success: false }
        result = ScrapeResult.new( attributes[ :success ], attributes[ :data ] )
      else
        # fix: previously referenced the never-assigned 'attributes' local ( nil ); pass nil
        # explicitly — no payload is parsed for a failed response
        result = ErrorResult.new( response.status, nil )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,92 @@
1
module Firecrawl
  ##
  # +ScrapeResult+ wraps the data of a single scraped page, exposing the requested formats
  # ( markdown, html, raw html, screenshot, links ), action results, and normalized metadata.
  #
  class ScrapeResult

    def initialize( success, attributes )
      @success    = success
      @attributes = attributes || {}
    end

    ##
    # The +success?+ method returns +true+ if the scraping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual scraping operation was
    # successful.
    #
    def success?
      @success || false
    end

    ##
    # The scraped content converted to markdown; present only if the request options
    # +formats+ included +markdown+.
    #
    def markdown
      @attributes[ :markdown ]
    end

    ##
    # The scraped html content; present only if the request options +formats+ included
    # +html+.
    #
    def html
      @attributes[ :html ]
    end

    ##
    # The full raw html of the page; present only if the request options +formats+ included
    # +raw_html+.
    #
    def raw_html
      @attributes[ :rawHtml ]
    end

    ##
    # The url of the screenshot of the requested page; present only if the request options
    # +formats+ included +screenshot+ or +screenshot@full_page+.
    #
    def screenshot_url
      @attributes[ :screenshot ]
    end

    ##
    # The links scraped from the page; empty unless the request options +formats+ included
    # +links+.
    #
    def links
      @attributes[ :links ] || []
    end

    ##
    # The action results ( +scrapes+ or +screenshots+ ); empty unless the request options
    # included +scrape+ or +screenshot+ actions.
    #
    def actions
      @attributes[ :actions ] || {}
    end

    ##
    # The page metadata with keys normalized to snake_case strings. When Firecrawl supplies
    # both a camelCase duplicate ( eg 'ogTitle' ) and the original 'og:' form, the duplicate
    # is dropped. The normalized hash is memoized.
    #
    def metadata
      @metadata ||= begin
        normalized = ( @attributes[ :metadata ] || {} ).transform_keys do | key |
          key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
        end
        normalized.delete_if do | key, _ |
          key.start_with?( 'og_' ) && normalized.key?( key.sub( 'og_', 'og:' ) )
        end
        normalized
      end
    end

    # The structured extraction payload, when extraction was requested; empty otherwise.
    def llm_extraction
      @attributes[ :llm_extraction ] || {}
    end

    # Any warning message Firecrawl returned for this page.
    def warning
      @attributes[ :warning ]
    end

  end
end
data/lib/firecrawl.rb ADDED
@@ -0,0 +1,27 @@
1
require 'json'
require 'base64'
require 'uri'

require 'faraday'
require 'dynamic_schema'

require_relative 'firecrawl/helpers'
require_relative 'firecrawl/error_result'
require_relative 'firecrawl/request'
require_relative 'firecrawl/response_methods'

require_relative 'firecrawl/scrape_options'
require_relative 'firecrawl/scrape_result'
require_relative 'firecrawl/scrape_request'
require_relative 'firecrawl/batch_scrape_result'
require_relative 'firecrawl/batch_scrape_request'
# fix: crawl_options.rb is shipped with the gem ( see the gemspec files list ) but was never
# required, leaving Firecrawl::CrawlOptions unloadable; it must load after scrape_options
# since it references ScrapeOptions.schema at class body evaluation time
require_relative 'firecrawl/crawl_options'
require_relative 'firecrawl/map_options'
require_relative 'firecrawl/map_result'
require_relative 'firecrawl/map_request'

module Firecrawl
  class << self
    # The globally configured API key, used by Request when no per-request key is given.
    attr_accessor :api_key
  end
end
27
+
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: firecrawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kristoph Cichocki-Romanov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-11-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: dynamicschema
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0.beta03
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.0.0.beta03
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.13'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.13'
55
+ - !ruby/object:Gem::Dependency
56
+ name: debug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.9'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.9'
69
+ description: |-
70
+ The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
71
+
72
+ In addition, Firecrawl can crawl an entire site returning the pages it encounters or just the map of the pages, which can be used for subsequent scraping.
73
+ email:
74
+ - rubygems.org@kristoph.net
75
+ executables: []
76
+ extensions: []
77
+ extra_rdoc_files: []
78
+ files:
79
+ - LICENSE
80
+ - firecrawl.gemspec
81
+ - lib/firecrawl.rb
82
+ - lib/firecrawl/batch_scrape_request.rb
83
+ - lib/firecrawl/batch_scrape_result.rb
84
+ - lib/firecrawl/crawl_options.rb
85
+ - lib/firecrawl/error_result.rb
86
+ - lib/firecrawl/helpers.rb
87
+ - lib/firecrawl/map_options.rb
88
+ - lib/firecrawl/map_request.rb
89
+ - lib/firecrawl/map_result.rb
90
+ - lib/firecrawl/request.rb
91
+ - lib/firecrawl/response_methods.rb
92
+ - lib/firecrawl/scrape_options.rb
93
+ - lib/firecrawl/scrape_request.rb
94
+ - lib/firecrawl/scrape_result.rb
95
+ homepage: https://github.com/EndlessInternational/firecrawl
96
+ licenses:
97
+ - MIT
98
+ metadata:
99
+ source_code_uri: https://github.com/EndlessInternational/firecrawl
100
+ bug_tracker_uri: https://github.com/EndlessInternational/firecrawl/issues
101
+ post_install_message:
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '3.0'
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ requirements: []
116
+ rubygems_version: 3.5.19
117
+ signing_key:
118
+ specification_version: 4
119
+ summary: The Firecrawl gem implements a lightweight interface to the Firecrawl.dev
120
+ API which takes a URL, crawls it and returns html, markdown, or structured data.
121
+ It is of particular value when used with LLM's for grounding.
122
+ test_files: []