firecrawl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d79423650fbec60124bab7710e372f16774987f437e8bb52cb13e72a434bdec6
4
+ data.tar.gz: bfb0ec5c302e4e646812855d5c954e2fc6c075e4a03af495a6a09815ce588e44
5
+ SHA512:
6
+ metadata.gz: 46c819135a19388beb434c797e16a3a73f52b1d85cc37f317e03dc159de8ed1a56a6a5170f2e2457e2798fbff25d93cfbf11f7be12682fcb6ef13534aa347f62
7
+ data.tar.gz: ffe9b239c29138617a1902f8502099b6f0b2019eeed8b266cd8f77e94eee868e8e876ee9f8545cd584d701a8bfffa75f6552508b2949463f2cc187580eeb35fe
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Endless International
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/firecrawl.gemspec ADDED
@@ -0,0 +1,37 @@
1
Gem::Specification.new do | spec |

  spec.name = 'firecrawl'
  spec.version = '0.0.1'
  spec.authors = [ 'Kristoph Cichocki-Romanov' ]
  spec.email = [ 'rubygems.org@kristoph.net' ]

  # fix: the second fragment previously ended in "value" with no trailing space, producing
  # the word "valuewhen" in the published summary
  spec.summary =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API which takes " \
    "a URL, crawls it and returns html, markdown, or structured data. It is of particular value " \
    "when used with LLM's for grounding."
  spec.description =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl " \
    "can take a URL, scrape the page contents and return the whole page or principal content " \
    "as html, markdown, or structured data.\n" \
    "\n" \
    "In addition, Firecrawl can crawl an entire site returning the pages it encounters or just " \
    "the map of the pages, which can be used for subsequent scraping."
  spec.license = 'MIT'
  spec.homepage = 'https://github.com/EndlessInternational/firecrawl'
  spec.metadata = {
    'source_code_uri' => 'https://github.com/EndlessInternational/firecrawl',
    'bug_tracker_uri' => 'https://github.com/EndlessInternational/firecrawl/issues',
    # 'documentation_uri' => 'https://github.com/EndlessInternational/firecrawl'
  }

  spec.required_ruby_version = '>= 3.0'
  spec.files = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "firecrawl.gemspec" ]
  spec.require_paths = [ "lib" ]

  spec.add_runtime_dependency 'faraday', '~> 2.7'
  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta03'

  spec.add_development_dependency 'rspec', '~> 3.13'
  spec.add_development_dependency 'debug', '~> 1.9'

end
@@ -0,0 +1,106 @@
1
module Firecrawl

  ##
  # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
  # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
  # the +start_scraping+ method and then subsequently evaluate the results by calling the
  # +retrieve_scraping+ method.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   urls = [ 'https://example.com', 'https://icann.org' ]
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   batch_response = request.start_scraping( urls, options )
  #   while batch_response.success?
  #     batch_result = batch_response.result
  #     if batch_result.success?
  #       batch_result.scrape_results.each do | result |
  #         puts result.metadata[ 'title' ]
  #         puts '---'
  #         puts result.markdown
  #         puts "\n\n"
  #       end
  #     end
  #     break unless batch_result.status?( :scraping )
  #     batch_response = request.retrieve_scraping( batch_result )
  #   end
  #
  #   unless batch_response.success?
  #     puts batch_response.result.error_description
  #   end
  #
  class BatchScrapeRequest < Request

    ##
    # The +start_scraping+ method makes a Firecrawl '/batch/scrape' POST request which will
    # initiate batch scraping of the given urls.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def start_scraping( urls, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :urls ] = [ urls ].flatten

      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      # fix: parse the body before branching; the failure branch previously referenced an
      # unassigned local, silently dropping the API supplied error message
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      if response.success?
        attributes ||= { success: false, status: :failed }
        result = BatchScrapeResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve_scraping+ method makes a Firecrawl '/batch/scrape' GET request which will
    # retrieve batch scraping results. Note that there is no guarantee that there are any batch
    # scraping results at the time of the call and you may need to call this method multiple
    # times.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def retrieve_scraping( batch_result, &block )
      raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
        unless batch_result.is_a?( BatchScrapeResult )
      response = get( batch_result.next_url, &block )
      # fix: parse the body before branching (see start_scraping)
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      if response.success?
        attributes ||= { success: false, status: :failed }
        result = batch_result.merge( attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,63 @@
1
module Firecrawl
  ##
  # The +BatchScrapeResult+ class wraps the payload of a Firecrawl batch scrape response and
  # exposes its fields through convenience readers.
  class BatchScrapeResult

    def initialize( success, attributes )
      @success = success
      @attributes = attributes || {}
    end

    # True when the batch scrape operation itself succeeded.
    def success?
      @success || false
    end

    # The batch status as a Symbol. The initial Firecrawl response carries no status, so a
    # successful request is reported as :scraping and a failed one as :failed.
    def status
      explicit = @attributes[ :status ]
      return explicit.to_sym if explicit
      @success ? :scraping : :failed
    end

    # True when the current status equals the given status.
    def status?( status )
      status == self.status
    end

    # The identifier assigned to this batch by Firecrawl.
    def id
      @attributes[ :id ]
    end

    # Total number of pages in the batch.
    def total
      @attributes[ :total ] || 0
    end

    # Number of pages scraped so far.
    def completed
      @attributes[ :completed ] || 0
    end

    # Credits consumed by the batch.
    def credits_used
      @attributes[ :creditsUsed ] || 0
    end

    # Expiry of the batch results as a Date, or nil when absent or unparsable.
    def expires_at
      Date.parse( @attributes[ :expiresAt ] ) rescue nil
    end

    def url
      @attributes[ :url ]
    end

    # The URL to poll for the next page of results; falls back to the batch url.
    def next_url
      @attributes[ :next ] || @attributes[ :url ]
    end

    # The individual page results as ScrapeResult instances.
    def scrape_results
      success = @attributes[ :success ]
      # note the &.compact is here because I've noted null entries in the data
      entries = @attributes[ :data ]&.compact || []
      entries.map { | entry | ScrapeResult.new( success, entry ) }
    end

    # Returns a new result whose attributes are this result's attributes overlaid with the
    # given attributes.
    def merge( attributes )
      self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
    end
  end
end
@@ -0,0 +1,47 @@
1
module Firecrawl
  ##
  # The +CrawlOptions+ class builds the options hash for Firecrawl crawl requests, translating
  # snake_case Ruby option names into the camelCase keys the Firecrawl API expects.
  class CrawlOptions
    include DynamicSchema::Definable
    include DynamicSchema::Buildable
    # fix: string_camelize (used in initialize) lives in Helpers, which was never mixed in,
    # so camelizing scrape formats raised NoMethodError
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      exclude_paths String, as: :excludePaths, array: true
      include_paths String, as: :includePaths, array: true
      maximum_depth Integer, as: :maxDepth
      ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
      limit Integer
      allow_backward_links [ TrueClass, FalseClass ], as: :allowBackwardLinks
      allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
      webhook String
      scrape_options as: :scrapeOptions, &ScrapeOptions.schema
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    # fix: options now defaults to {}; build/build! call new with only api_options, which
    # previously raised ArgumentError because options was a required positional argument
    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options

      scrape_options = @options[ :scrapeOptions ]
      if scrape_options
        # fix: formats may be symbols; convert to strings before camelizing, matching the
        # equivalent code in ScrapeOptions#initialize
        scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
      end
    end

    # The API-ready options hash.
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,45 @@
1
module Firecrawl
  ##
  # The +ErrorResult+ class encapsulates an error returned by the Firecrawl API, exposing a
  # symbolic +error_type+ and a human readable +error_description+.
  class ErrorResult

    attr_reader :error_type, :error_description

    ##
    # +status_code+ is the HTTP status of the response; +attributes+ is the (optional) parsed
    # response payload whose +:error+ value, when present, overrides the default description.
    def initialize( status_code, attributes = nil )
      # fix: previously assigned @error_code, leaving the advertised error_type reader
      # permanently nil
      @error_type, @error_description = status_code_to_error( status_code )
      # fix: previously guarded on the unset @attributes ivar, so the API supplied error
      # message was never used
      if attributes.respond_to?( :[] ) && attributes[ :error ]
        @error_description = attributes[ :error ]
      end
    end

    private

    # Maps an HTTP status code to an [ error_type, error_description ] pair.
    def status_code_to_error( status_code )
      case status_code
      # this is here because I've noted invalid payloads being returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Firecrawl server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Firecrawl service is overloaded." ]
      else
        [ :unknown_error,
          "The Firecrawl service returned an unexpected status code: '#{status_code}'." ]
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Firecrawl
  ##
  # The +Helpers+ module provides small string utilities shared by the option classes.
  module Helpers
    ##
    # Converts a string delimited by whitespace, underscores, or hyphens into camelCase:
    # the first word is downcased and every following word is capitalized.
    def string_camelize( string )
      head, *tail = string.split( /[\s_\-]/ )
      ( [ head.to_s.downcase ] + tail.map( &:capitalize ) ).join
    end
  end
end
@@ -0,0 +1,32 @@
1
module Firecrawl
  ##
  # The +MapOptions+ class builds the options hash for Firecrawl '/map' requests.
  class MapOptions
    include DynamicSchema::Definable

    schema do
      search String
      ignore_sitemap [ TrueClass, FalseClass ]
      ignore_subdomains [ TrueClass, FalseClass ]
      limit Integer
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      built = self.class.builder.build( options || {} )
      @options = api_options ? api_options.merge( built ) : built
    end

    # The API-ready options hash.
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,58 @@
1
module Firecrawl
  ##
  # The +MapRequest+ class encapsulates a '/map' POST request to the Firecrawl API. After creating
  # a new +MapRequest+ instance you can make the request by calling the +map+ method to crawl the
  # site and retrieve +links+.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   response = request.map( 'https://example.com', { limit: 100 } )
  #   if response.success?
  #     result = response.result
  #     if result.success?
  #       result.links.each do | link |
  #         puts link
  #       end
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MapRequest < Request

    ##
    # The +map+ method makes a Firecrawl '/map' POST request which will scrape the site with
    # given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +MapResult+. If the request is not successful
    # then +response.result+ will be an instance of +ErrorResult+.
    #
    def map( url, options = nil, &block )
      if options
        options = options.is_a?( MapOptions ) ? options : MapOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/map", options, &block )
      # fix: parse the body before branching; the failure branch previously referenced an
      # unassigned local, silently dropping the API supplied error message
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      if response.success?
        attributes ||= { success: false }
        result = MapResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,29 @@
1
module Firecrawl
  ##
  # The +MapResult+ class wraps the payload of a Firecrawl '/map' response.
  class MapResult

    def initialize( success, attributes )
      @success = success
      @attributes = attributes
    end

    ##
    # The +success?+ method returns +true+ if the scraping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was successful
    # while this +success?+ method tells you if the actual scraping operation was successful.
    #
    def success?
      @success ? true : false
    end

    ##
    # The +links+ method returns an array of the links that were scraped from the the page.
    # The +links+ are empty unless the request options +formats+ included +links+.
    #
    def links
      @attributes[ :links ] || []
    end

  end
end
@@ -0,0 +1,75 @@
1
module Firecrawl

  ##
  # The +Request+ class encapsulates a request to the Firecrawl API. After creating a new
  # +Request+ instance you can make the actual request by calling the +scrape+, +begin_crawl+
  # +crawl+, +end_crawl+ or +map+ methods.
  #
  # === example
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::Request.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, :screenshot ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://cnn.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ :title ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class Request

    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }

    BASE_URI = 'https://api.firecrawl.dev/v1'

    ##
    # The +initialize+ method initializes the +Request+ instance. You MUST pass an +api_key+ and
    # and optionally a (Faraday) +connection+.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || DEFAULT_CONNECTION
      @api_key = api_key || Firecrawl.api_key
      raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
        unless @api_key
    end

    protected

    # The authorization and content type headers sent with every request; previously this hash
    # was duplicated in both post and get.
    def default_headers
      {
        'Authorization' => "Bearer #{@api_key}",
        'Content-Type' => 'application/json'
      }
    end

    # Issues a POST request with a JSON body; the optional block may further customize the
    # Faraday request.
    def post( uri, body, &block )
      @connection.post( uri ) do | request |
        default_headers.each { | key, value | request.headers[ key ] = value }
        request.body = body.is_a?( String ) ? body : JSON.generate( body )
        block.call( request ) if block
      end
    end

    # Issues a GET request; the optional block may further customize the Faraday request.
    def get( uri, &block )
      @connection.get( uri ) do | request |
        default_headers.each { | key, value | request.headers[ key ] = value }
        block.call( request ) if block
      end
    end

  end

end
@@ -0,0 +1,15 @@
1
module Firecrawl
  #
  # The ResponseMethods module extends a Faraday response, adding the +result+ method.
  #
  module ResponseMethods
    # Attaches +result+ to the given +response+ and returns the extended response.
    def self.install( response, result )
      response.instance_variable_set( :@_firecrawl_result, result )
      response.extend( self )
    end

    # The result object attached by +install+.
    def result
      @_firecrawl_result
    end
  end
end
@@ -0,0 +1,58 @@
1
module Firecrawl
  ##
  # The +ScrapeOptions+ class builds the options hash for Firecrawl scrape requests, translating
  # snake_case Ruby option names into the camelCase keys the Firecrawl API expects.
  class ScrapeOptions
    include DynamicSchema::Definable
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot, :"screenshot@full_page" ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      # note: both format and formats are defined as a semantic convenience
      format String, as: :formats, array: true, in: FORMATS
      formats String, array: true, in: FORMATS
      only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
      include_tags String, as: :includeTags, array: true
      exclude_tags String, as: :excludeTags, array: true
      # fix: the API expects the camelCase key waitFor; every other multi-word option is
      # mapped with as: but this one was sent as wait_for
      wait_for Integer, as: :waitFor
      timeout Integer
      extract do
        #schema Hash
        system_prompt String, as: :systemPrompt
        prompt String
      end
      action as: :actions, arguments: :type, array: true do
        type Symbol, required: true, in: ACTIONS
        # wait
        milliseconds Integer
        # click
        selector String
        # write
        text String
        # press
        key String
      end
    end

    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
      # formats such as :raw_html / 'screenshot@full_page' must be camelized for the API
      @options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
    end

    # The API-ready options hash.
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,60 @@
1
module Firecrawl
  ##
  # The +ScrapeRequest+ class encapsulates a '/scrape' POST request to the Firecrawl API. After
  # creating a new +ScrapeRequest+ instance you can initiate the request by calling the +scrape+
  # method to perform synchronous scraping.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://example.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class ScrapeRequest < Request

    ##
    # The +scrape+ method makes a Firecrawl '/scrape' POST request which will scrape the given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +ScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def scrape( url, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/scrape", options, &block )
      # fix: parse the body before branching; the failure branch previously referenced an
      # unassigned local, silently dropping the API supplied error message
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      if response.success?
        attributes ||= { success: false }
        result = ScrapeResult.new( attributes[ :success ], attributes[ :data ] )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
1
+ module Firecrawl
2
+ class ScrapeResult
3
+
4
+ def initialize( success, attributes )
5
+ @success = success
6
+ @attributes = attributes || {}
7
+ end
8
+
9
+ ##
10
+ # The +success?+ method returns +true+ if the scraping was successful.
11
+ #
12
+ # Note that the response +success?+ tells you if the call to the Firecrawl api was successful
13
+ # while this +success?+ method tells you if the actual scraping operation was successful.
14
+ #
15
+ def success?
16
+ @success || false
17
+ end
18
+
19
+ ##
20
+ # The +markdown+ method returns scraped content that has been converted to markdown. The
21
+ # markdown content is present only if the request options +formats+ included +markdown+.
22
+ #
23
+ def markdown
24
+ @attributes[ :markdown ]
25
+ end
26
+
27
+ ##
28
+ # The +html+ method returns scraped html content. The html content is present only if the
29
+ # request options +formats+ included +html+.
30
+ #
31
+ def html
32
+ @attributes[ :html ]
33
+ end
34
+
35
+ ##
36
+ # The +raw_html+ method returns the full scraped html content of the page. The raw html
37
+ # content is present only if the request options +formats+ included +raw_html+.
38
+ #
39
+ def raw_html
40
+ @attributes[ :rawHtml ]
41
+ end
42
+
43
+ ##
44
+ # The +screenshot_url+ method returns the url of the screenshot of the requested page. The
45
+ # screenshot url is present only if the request options +formats+ included +screenshot+ or
46
+ # +screenshot@full_page+.
47
+ #
48
+ def screenshot_url
49
+ @attributes[ :screenshot ]
50
+ end
51
+
52
+ ##
53
+ # The +links+ method returns an array of the links that were scraped from the the page.
54
+ # The +links+ are empty unless the request options +formats+ included +links+.
55
+ #
56
+ def links
57
+ @attributes[ :links ] || []
58
+ end
59
+
60
+ ##
61
+ # The +actions+ method returns an object of action results ( +scrapes+ or +screenshots+ ).
62
+ # The +actions+ are empty unless the request options included +scrape+ or +scresshot+
63
+ # actions.
64
+ #
65
+ def actions
66
+ @attributes[ :actions ] || {}
67
+ end
68
+
69
+ def metadata
70
+ unless @metadata
71
+ metadata = @attributes[ :metadata ] || {}
72
+ @metadata = metadata.transform_keys do | key |
73
+ key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
74
+ end
75
+ # remove the camelCase forms injected by Firecrawl
76
+ @metadata.delete_if do | key, _ |
77
+ key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
78
+ end
79
+ end
80
+ @metadata
81
+ end
82
+
83
+ def llm_extraction
84
+ @attributes[ :llm_extraction ] || {}
85
+ end
86
+
87
+ def warning
88
+ @attributes[ :warning ]
89
+ end
90
+
91
+ end
92
+ end
data/lib/firecrawl.rb ADDED
@@ -0,0 +1,27 @@
1
require 'json'
require 'base64'
require 'uri'

require 'faraday'
require 'dynamic_schema'

require_relative 'firecrawl/helpers'
require_relative 'firecrawl/error_result'
require_relative 'firecrawl/request'
require_relative 'firecrawl/response_methods'

require_relative 'firecrawl/scrape_options'
require_relative 'firecrawl/scrape_result'
require_relative 'firecrawl/scrape_request'
require_relative 'firecrawl/batch_scrape_result'
require_relative 'firecrawl/batch_scrape_request'
# fix: crawl_options.rb ships with the gem but was never required, leaving
# Firecrawl::CrawlOptions undefined at runtime
require_relative 'firecrawl/crawl_options'
require_relative 'firecrawl/map_options'
require_relative 'firecrawl/map_result'
require_relative 'firecrawl/map_request'

module Firecrawl
  class << self
    # Global default API key used by Request when none is passed explicitly.
    attr_accessor :api_key
  end
end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: firecrawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kristoph Cichocki-Romanov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-11-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: dynamicschema
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0.beta03
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.0.0.beta03
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.13'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.13'
55
+ - !ruby/object:Gem::Dependency
56
+ name: debug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.9'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.9'
69
+ description: |-
70
+ The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
71
+
72
+ In addition, Firecrawl can crawl an entire site returning the pages it encounters or just the map of the pages, which can be used for subsequent scraping.
73
+ email:
74
+ - rubygems.org@kristoph.net
75
+ executables: []
76
+ extensions: []
77
+ extra_rdoc_files: []
78
+ files:
79
+ - LICENSE
80
+ - firecrawl.gemspec
81
+ - lib/firecrawl.rb
82
+ - lib/firecrawl/batch_scrape_request.rb
83
+ - lib/firecrawl/batch_scrape_result.rb
84
+ - lib/firecrawl/crawl_options.rb
85
+ - lib/firecrawl/error_result.rb
86
+ - lib/firecrawl/helpers.rb
87
+ - lib/firecrawl/map_options.rb
88
+ - lib/firecrawl/map_request.rb
89
+ - lib/firecrawl/map_result.rb
90
+ - lib/firecrawl/request.rb
91
+ - lib/firecrawl/response_methods.rb
92
+ - lib/firecrawl/scrape_options.rb
93
+ - lib/firecrawl/scrape_request.rb
94
+ - lib/firecrawl/scrape_result.rb
95
+ homepage: https://github.com/EndlessInternational/firecrawl
96
+ licenses:
97
+ - MIT
98
+ metadata:
99
+ source_code_uri: https://github.com/EndlessInternational/firecrawl
100
+ bug_tracker_uri: https://github.com/EndlessInternational/firecrawl/issues
101
+ post_install_message:
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '3.0'
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ requirements: []
116
+ rubygems_version: 3.5.19
117
+ signing_key:
118
+ specification_version: 4
119
+ summary: The Firecrawl gem implements a lightweight interface to the Firecrawl.dev
120
+ API which takes a URL, crawls it and returns html, markdown, or structured data.
121
+ It is of particular value when used with LLM's for grounding.
122
+ test_files: []