firecrawl 0.0.1 → 0.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: d79423650fbec60124bab7710e372f16774987f437e8bb52cb13e72a434bdec6
-   data.tar.gz: bfb0ec5c302e4e646812855d5c954e2fc6c075e4a03af495a6a09815ce588e44
+   metadata.gz: 3ba3c8f728651fdc912a44ad7aee4fcdae849cbac7f351b62dc8cec9fdb29973
+   data.tar.gz: dfd0382fb5cafdd471572cb1e3c470b7c28548d20a466c469bbec53ea5c41209
  SHA512:
-   metadata.gz: 46c819135a19388beb434c797e16a3a73f52b1d85cc37f317e03dc159de8ed1a56a6a5170f2e2457e2798fbff25d93cfbf11f7be12682fcb6ef13534aa347f62
-   data.tar.gz: ffe9b239c29138617a1902f8502099b6f0b2019eeed8b266cd8f77e94eee868e8e876ee9f8545cd584d701a8bfffa75f6552508b2949463f2cc187580eeb35fe
+   metadata.gz: 03a76beec314251ed927c8abc6a2fc2493c464dff7907699d484c752b3a153e25a9df184f03f98d7849bf944051010807b7dbb549a1b466710a3b22e83cdd96c
+   data.tar.gz: ca04e151e6ffc27e38325fe1a4ae701fc007cee62659b77ceb079191b4c07667b91accd38dcec5531a194a5c7b9f0d685b4a26e0ee8d9679792040ac07745f6b
data/README.md ADDED
@@ -0,0 +1,198 @@
+ # Firecrawl
+
+ Firecrawl is a lightweight Ruby gem that provides a semantically straightforward interface to
+ the Firecrawl.dev API, allowing you to easily scrape web content, take screenshots, and crawl
+ entire web domains.
+
+ The gem is particularly useful when working with Large Language Models (LLMs), as it can
+ provide markdown content for real-time information lookup as well as grounding.
+
+ ```ruby
+ require 'firecrawl'
+
+ Firecrawl.api_key ENV[ 'FIRECRAWL_API_KEY' ]
+ response = Firecrawl.scrape( 'https://example.com', formats: [ :markdown, :screenshot ] )
+ if response.success?
+   result = response.result
+   puts result.metadata[ 'title' ]
+   puts '---'
+   puts result.markdown
+   puts "Screenshot URL: #{ result.screenshot_url }"
+ else
+   puts response.result.error_description
+ end
+ ```
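+
+ A custom Faraday connection can also be set module-wide with `Firecrawl.connection`. A
+ minimal sketch, using only the default adapter ( middleware could be added in the block ):
+
+ ```ruby
+ connection = Faraday.new do |builder|
+   builder.adapter Faraday.default_adapter
+ end
+ Firecrawl.connection( connection )
+ ```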
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'firecrawl'
+ ```
+
+ Then execute:
+
+ ```bash
+ $ bundle install
+ ```
+
+ Or install it directly:
+
+ ```bash
+ $ gem install firecrawl
+ ```
+
+ ## Usage
+
+ ### Basic Scraping
+
+ The simplest way to use Firecrawl is to scrape a single page:
+
+ ```ruby
+ Firecrawl.api_key ENV[ 'FIRECRAWL_API_KEY' ]
+ response = Firecrawl.scrape( 'https://example.com', format: :markdown )
+
+ if response.success?
+   result = response.result
+   if result.success?
+     puts result.metadata[ 'title' ]
+     puts result.markdown
+   end
+ else
+   puts response.result.error_description
+ end
+ ```
+
+ ### Scrape Options
+
+ You can customize scraping behavior using `ScrapeOptions`:
+
+ ```ruby
+ options = Firecrawl::ScrapeOptions.build do
+   formats [ :html, :markdown, :screenshot ]
+   only_main_content true
+   include_tags [ 'article', 'main' ]
+   exclude_tags [ 'nav', 'footer' ]
+   wait_for 5000 # milliseconds
+ end
+
+ request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+ response = request.scrape( 'https://example.com', options )
+ ```
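+
+ Anywhere options are accepted, a plain `Hash` with the same keys may be passed instead; the
+ request converts it with `ScrapeOptions.build` internally. A sketch equivalent to the builder
+ form above:
+
+ ```ruby
+ response = request.scrape( 'https://example.com', {
+   formats: [ :html, :markdown, :screenshot ],
+   only_main_content: true,
+   wait_for: 5000
+ } )
+ ```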
+
+ ### Batch Scraping
+
+ For scraping multiple URLs efficiently:
+
+ ```ruby
+ request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+ urls = [ 'https://example.com', 'https://example.org' ]
+ options = Firecrawl::ScrapeOptions.build do
+   format :markdown
+   only_main_content true
+ end
+
+ response = request.scrape( urls, options )
+ while response.success?
+   batch_result = response.result
+   batch_result.scrape_results.each do |result|
+     puts result.metadata['title']
+     puts result.markdown
+     puts "\n---\n"
+   end
+   break unless batch_result.status?( :scraping )
+   sleep 0.5
+   response = request.retrieve_scrape_results( batch_result )
+ end
+ ```
+
+ Note that each call to `retrieve_scrape_results` returns only the results completed since the
+ previous call, so keep polling ( as above ) until the batch leaves the `:scraping` status.
+
+ ### Site Mapping
+
+ To retrieve a site's structure:
+
+ ```ruby
+ request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+ options = Firecrawl::MapOptions.build do
+   limit 100
+   ignore_subdomains true
+ end
+
+ response = request.map( 'https://example.com', options )
+ if response.success?
+   result = response.result
+   result.links.each do |link|
+     puts link
+   end
+ end
+ ```
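+
+ Mapping pairs naturally with batch scraping. The sketch below ( illustrative, not from the
+ gem's own examples ) feeds the discovered links into a `BatchScrapeRequest`:
+
+ ```ruby
+ map_response = request.map( 'https://example.com', options )
+ if map_response.success?
+   batch_request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+   batch_response = batch_request.scrape( map_response.result.links )
+   # poll batch_response as shown in Batch Scraping above
+ end
+ ```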
+
+ ### Site Crawling
+
+ For comprehensive site crawling:
+
+ ```ruby
+ request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+ options = Firecrawl::CrawlOptions.build do
+   maximum_depth 2
+   limit 10
+   scrape_options do
+     format :markdown
+     only_main_content true
+   end
+ end
+
+ response = request.crawl( 'https://example.com', options )
+ while response.success?
+   crawl_result = response.result
+   crawl_result.scrape_results.each do |result|
+     puts result.metadata['title']
+     puts result.markdown
+   end
+   break unless crawl_result.status?( :scraping )
+   sleep 0.5
+   response = request.retrieve_crawl_results( crawl_result )
+ end
+ ```
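+
+ A crawl that is still in progress can be cancelled by passing its `CrawlResult` to
+ `cancel_crawl` ( a sketch of `CrawlRequest#cancel_crawl` ):
+
+ ```ruby
+ response = request.crawl( 'https://example.com', options )
+ if response.success?
+   crawl_result = response.result
+   # stop the crawl before it runs to completion
+   request.cancel_crawl( crawl_result )
+ end
+ ```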
+
+ ## Response Structure
+
+ All Firecrawl requests return a Faraday response with an added `result` method. The result
+ will be one of:
+
+ - `ScrapeResult`: Contains the scraped content and metadata
+ - `BatchScrapeResult`: Contains multiple scrape results
+ - `MapResult`: Contains discovered links
+ - `CrawlResult`: Contains scrape results from crawled pages
+ - `ErrorResult`: Contains error information if the request failed
+
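+ When a single code path handles several request kinds, you can branch on the result's class
+ ( a minimal sketch ):
+
+ ```ruby
+ case response.result
+ when Firecrawl::ErrorResult
+   puts response.result.error_description
+ when Firecrawl::MapResult
+   puts response.result.links
+ when Firecrawl::ScrapeResult
+   puts response.result.markdown
+ end
+ ```
+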
+ ### Working with Results
+
+ ```ruby
+ response = request.scrape( url, options )
+ if response.success?
+   result = response.result
+   if result.success?
+     # Access scraped content
+     puts result.metadata['title']
+     puts result.markdown
+     puts result.html
+     puts result.raw_html
+     puts result.screenshot_url
+     puts result.links
+
+     # Check for warnings
+     puts result.warning if result.warning
+   end
+ else
+   error = response.result
+   puts "#{error.error_type}: #{error.error_description}"
+ end
+ ```
+
+ ## License
+
+ The gem is available as open source under the terms of the
+ [MIT License](https://opensource.org/licenses/MIT).
data/firecrawl.gemspec CHANGED
@@ -1,7 +1,7 @@
  Gem::Specification.new do | spec |
 
    spec.name = 'firecrawl'
-   spec.version = '0.0.1'
+   spec.version = '0.1.0'
    spec.authors = [ 'Kristoph Cichocki-Romanov' ]
    spec.email = [ 'rubygems.org@kristoph.net' ]
 
@@ -29,9 +29,10 @@ Gem::Specification.new do | spec |
    spec.require_paths = [ "lib" ]
 
    spec.add_runtime_dependency 'faraday', '~> 2.7'
-   spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta03'
+   spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta04'
 
    spec.add_development_dependency 'rspec', '~> 3.13'
    spec.add_development_dependency 'debug', '~> 1.9'
+   spec.add_development_dependency 'vcr', '~> 6.3'
 
  end
data/lib/firecrawl/batch_scrape_request.rb CHANGED
@@ -3,8 +3,8 @@ module Firecrawl
  ##
  # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
  # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
- # the +begin_scraping+ method and then subsequently evaluate the results by calling the
- # +continue_scraping' method.
+ # the +scrape+ method and then subsequently retrieve the results by calling the
+ # +retrieve_scrape_results+ method.
  #
  # === examples
  #
@@ -18,7 +18,7 @@ module Firecrawl
  #     only_main_content true
  #   end
  #
- #   batch_response = request.beging_scraping( urls, options )
+ #   batch_response = request.scrape( urls, options )
  #   while batch_response.success?
  #     batch_result = batch_response.result
  #     if batch_result.success?
@@ -30,17 +30,18 @@ module Firecrawl
  #       end
  #     end
  #     break unless batch_result.status?( :scraping )
+ #     batch_response = request.retrieve_scrape_results( batch_result )
  #   end
  #
- #   unless response.success?
- #     puts response.result.error_description
+ #   unless batch_response.success?
+ #     puts batch_response.result.error_description
  #   end
  #
  class BatchScrapeRequest < Request
 
    ##
-   # The +start_scraping+ method makes a Firecrawl '/batch/scrape' POST request which will
-   # initiate batch scraping of the given urls.
+   # The +scrape+ method makes a Firecrawl '/batch/scrape' POST request which will initiate
+   # batch scraping of the given urls.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
@@ -50,7 +51,7 @@ module Firecrawl
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
-   def start_scraping( urls, options = nil, &block )
+   def scrape( urls, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
@@ -58,7 +59,6 @@ module Firecrawl
      else
        options = {}
      end
      options[ :urls ] = [ urls ].flatten
-
      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      result = nil
      if response.success?
@@ -73,10 +73,11 @@ module Firecrawl
    end
 
    ##
-   # The +retrieve_scraping+ method makes a Firecrawl '/batch/scrape' GET request which will
-   # retrieve batch scraping results. Note that there is no guarantee that there are any batch
-   # scraping results at the time of the call and you may need to call this method multiple
-   # times.
+   # The +retrieve_scrape_results+ method makes a Firecrawl '/batch/scrape/{id}' GET request
+   # which will return the scrape results that were completed since the previous call to this
+   # method ( or, if this is the first call, since the batch scrape was started ). Note that
+   # there is no guarantee that there are any new batch scrape results at the time you make
+   # this call ( +scrape_results+ may be empty ).
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
@@ -86,7 +87,7 @@ module Firecrawl
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
-   def retrieve_scraping( batch_result, &block )
+   def retrieve_scrape_results( batch_result, &block )
      raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
        unless batch_result.is_a?( BatchScrapeResult )
      response = get( batch_result.next_url, &block )
data/lib/firecrawl/crawl_options.rb CHANGED
@@ -1,21 +1,17 @@
  module Firecrawl
    class CrawlOptions
      include DynamicSchema::Definable
-     include DynamicSchema::Buildable
-
-     FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]
-
-     ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]
+     include Helpers
 
      schema do
        exclude_paths String, as: :excludePaths, array: true
        include_paths String, as: :includePaths, array: true
        maximum_depth Integer, as: :maxDepth
        ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
-       limit Integer
+       limit Integer, in: (0..)
        allow_backward_links [ TrueClass, FalseClass ], as: :allowBackwardLinks
        allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
-       webhook String
+       webhook_uri URI, as: :webhook
        scrape_options as: :scrapeOptions, &ScrapeOptions.schema
      end
 
@@ -27,13 +23,13 @@ module Firecrawl
        new( api_options: builder.build!( options, &block ) )
      end
 
-     def initialize( options, api_options: nil )
+     def initialize( options = nil, api_options: nil )
        @options = self.class.builder.build( options || {} )
        @options = api_options.merge( @options ) if api_options
 
        scrape_options = @options[ :scrapeOptions ]
        if scrape_options
-         scrape_options[ :formats ]&.map!( &method( :string_camelize ) )
+         scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
        end
      end
data/lib/firecrawl/crawl_request.rb ADDED
@@ -0,0 +1,136 @@
+ module Firecrawl
+
+   ##
+   # The +CrawlRequest+ class encapsulates a crawl request to the Firecrawl API. After creating
+   # a new +CrawlRequest+ instance you can begin crawling by calling the +crawl+ method and
+   # then subsequently retrieve the results by calling the +retrieve_crawl_results+ method.
+   # You can also optionally cancel the crawling operation by calling +cancel_crawl+.
+   #
+   # === examples
+   #
+   #   require 'firecrawl'
+   #
+   #   request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+   #
+   #   url = 'https://icann.org'
+   #   options = Firecrawl::CrawlOptions.build do
+   #     scrape_options do
+   #       only_main_content true
+   #     end
+   #   end
+   #
+   #   crawl_response = request.crawl( url, options )
+   #   while crawl_response.success?
+   #     crawl_result = crawl_response.result
+   #     if crawl_result.success?
+   #       crawl_result.scrape_results.each do | result |
+   #         puts result.metadata[ 'title' ]
+   #         puts '---'
+   #         puts result.markdown
+   #         puts "\n\n"
+   #       end
+   #     end
+   #     break unless crawl_result.status?( :scraping )
+   #     crawl_response = request.retrieve_crawl_results( crawl_result )
+   #   end
+   #
+   #   unless crawl_response.success?
+   #     puts crawl_response.result.error_description
+   #   end
+   #
+   class CrawlRequest < Request
+
+     ##
+     # The +crawl+ method makes a Firecrawl '/crawl' POST request which will initiate crawling
+     # of the given url.
+     #
+     # The response is always an instance of +Faraday::Response+. If +response.success?+ is
+     # true, then +response.result+ will be an instance of +CrawlResult+. If the request is
+     # not successful then +response.result+ will be an instance of +ErrorResult+.
+     #
+     # Remember that you should call +response.success?+ to validate that the call to the API
+     # was successful and then +response.result.success?+ to validate that the API processed
+     # the request successfully.
+     #
+     def crawl( url, options = nil, &block )
+       if options
+         options = options.is_a?( CrawlOptions ) ? options : CrawlOptions.build( options.to_h )
+         options = options.to_h
+       else
+         options = {}
+       end
+       options[ :url ] = url
+       response = post( "#{BASE_URI}/crawl", options, &block )
+       result = nil
+       if response.success?
+         attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+         attributes ||= { success: false, status: :failed }
+         result = CrawlResult.new( attributes[ :success ], attributes )
+       else
+         result = ErrorResult.new( response.status, attributes )
+       end
+
+       ResponseMethods.install( response, result )
+     end
+
+     ##
+     # The +retrieve_crawl_results+ method makes a Firecrawl '/crawl/{id}' GET request which
+     # will return the crawl results that were completed since the previous call to this
+     # method ( or, if this is the first call, since the crawl was started ). Note that there
+     # is no guarantee that there are any new crawl results at the time you make this call
+     # ( +scrape_results+ may be empty ).
+     #
+     # The response is always an instance of +Faraday::Response+. If +response.success?+ is
+     # +true+, then +response.result+ will be an instance of +CrawlResult+. If the request is
+     # not successful then +response.result+ will be an instance of +ErrorResult+.
+     #
+     # Remember that you should call +response.success?+ to validate that the call to the API
+     # was successful and then +response.result.success?+ to validate that the API processed
+     # the request successfully.
+     #
+     def retrieve_crawl_results( crawl_result, &block )
+       raise ArgumentError, "The first argument must be an instance of CrawlResult." \
+         unless crawl_result.is_a?( CrawlResult )
+       response = get( crawl_result.next_url, &block )
+       result = nil
+       if response.success?
+         attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+         attributes ||= { success: false, status: :failed }
+         result = crawl_result.merge( attributes )
+       else
+         result = ErrorResult.new( response.status, attributes )
+       end
+
+       ResponseMethods.install( response, result )
+     end
+
+     ##
+     # The +cancel_crawl+ method makes a Firecrawl '/crawl/{id}' DELETE request which will
+     # cancel a previously started crawl.
+     #
+     # The response is always an instance of +Faraday::Response+. If +response.success?+ is
+     # +true+, then +response.result+ will be an instance of +CrawlResult+. If the request is
+     # not successful then +response.result+ will be an instance of +ErrorResult+.
+     #
+     # Remember that you should call +response.success?+ to validate that the call to the API
+     # was successful and then +response.result.success?+ to validate that the API processed
+     # the request successfully.
+     #
+     def cancel_crawl( crawl_result, &block )
+       raise ArgumentError, "The first argument must be an instance of CrawlResult." \
+         unless crawl_result.is_a?( CrawlResult )
+       response = delete( crawl_result.url, &block )
+       result = nil
+       if response.success?
+         attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+         attributes ||= { success: false, status: :failed }
+         result = crawl_result.merge( attributes )
+       else
+         result = ErrorResult.new( response.status, attributes )
+       end
+
+       ResponseMethods.install( response, result )
+     end
+
+   end
+ end
data/lib/firecrawl/crawl_result.rb ADDED
@@ -0,0 +1,63 @@
+ module Firecrawl
+   class CrawlResult
+
+     def initialize( success, attributes )
+       @success = success
+       @attributes = attributes || {}
+     end
+
+     def success?
+       @success || false
+     end
+
+     def status
+       # the initial Firecrawl response does not include a status, so we synthesize a
+       # :scraping status if the operation was otherwise successful
+       @attributes[ :status ]&.to_sym || ( @success ? :scraping : :failed )
+     end
+
+     def status?( status )
+       self.status == status
+     end
+
+     def id
+       @attributes[ :id ]
+     end
+
+     def total
+       @attributes[ :total ] || 0
+     end
+
+     def completed
+       @attributes[ :completed ] || 0
+     end
+
+     def credits_used
+       @attributes[ :creditsUsed ] || 0
+     end
+
+     def expires_at
+       Date.parse( @attributes[ :expiresAt ] ) rescue nil
+     end
+
+     def url
+       @attributes[ :url ]
+     end
+
+     def next_url
+       @attributes[ :next ] || @attributes[ :url ]
+     end
+
+     def scrape_results
+       success = @attributes[ :success ]
+       # the &.compact guards against null entries that have been observed in the data
+       ( @attributes[ :data ]&.compact || [] ).map do | attr |
+         ScrapeResult.new( success, attr )
+       end
+     end
+
+     def merge( attributes )
+       self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
+     end
+
+   end
+ end
data/lib/firecrawl/module_methods.rb ADDED
@@ -0,0 +1,18 @@
+ module Firecrawl
+   module ModuleMethods
+     DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
+
+     def connection( connection = nil )
+       @connection = connection || @connection || DEFAULT_CONNECTION
+     end
+
+     def api_key( api_key = nil )
+       @api_key = api_key || @api_key
+       @api_key
+     end
+
+     def scrape( url, options = nil, &block )
+       Firecrawl::ScrapeRequest.new.scrape( url, options, &block )
+     end
+   end
+ end
data/lib/firecrawl/request.rb CHANGED
@@ -28,8 +28,6 @@ module Firecrawl
  #
  class Request
 
-   DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
-
    BASE_URI = 'https://api.firecrawl.dev/v1'
 
    ##
@@ -37,7 +35,7 @@ module Firecrawl
    # and optionally a (Faraday) +connection+.
    #
    def initialize( connection: nil, api_key: nil )
-     @connection = connection || DEFAULT_CONNECTION
+     @connection = connection || Firecrawl.connection
      @api_key = api_key || Firecrawl.api_key
      raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
        unless @api_key
@@ -70,6 +68,18 @@ module Firecrawl
      end
    end
 
+   def delete( uri, &block )
+     headers = {
+       'Authorization' => "Bearer #{@api_key}",
+       'Content-Type' => 'application/json'
+     }
+
+     @connection.delete( uri ) do | request |
+       headers.each { | key, value | request.headers[ key ] = value }
+       block.call( request ) if block
+     end
+   end
+
  end
 
end
data/lib/firecrawl/scrape_options.rb CHANGED
@@ -9,7 +9,7 @@ module Firecrawl
 
    schema do
      # note: both format and formats are defined as a semantic convenience
-     format String, as: :formats, array: true, in: FORMATS
+     format String, as: :formats, array: true, in: FORMATS
      formats String, array: true, in: FORMATS
      only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
      include_tags String, as: :includeTags, array: true
@@ -17,7 +17,7 @@ module Firecrawl
      wait_for Integer
      timeout Integer
      extract do
-       #schema Hash
+       schema Hash
        system_prompt String, as: :systemPrompt
        prompt String
      end
data/lib/firecrawl/scrape_result.rb CHANGED
@@ -16,6 +16,20 @@ module Firecrawl
      @success || false
    end
 
+   def metadata
+     unless @metadata
+       metadata = @attributes[ :metadata ] || {}
+       @metadata = metadata.transform_keys do | key |
+         key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
+       end
+       # remove the camelCase forms injected by Firecrawl
+       @metadata.delete_if do | key, _ |
+         key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
+       end
+     end
+     @metadata
+   end
+
    ##
    # The +markdown+ method returns scraped content that has been converted to markdown. The
    # markdown content is present only if the request options +formats+ included +markdown+.
@@ -66,20 +80,6 @@ module Firecrawl
      @attributes[ :actions ] || {}
    end
 
-   def metadata
-     unless @metadata
-       metadata = @attributes[ :metadata ] || {}
-       @metadata = metadata.transform_keys do | key |
-         key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
-       end
-       # remove the camelCase forms injected by Firecrawl
-       @metadata.delete_if do | key, _ |
-         key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
-       end
-     end
-     @metadata
-   end
-
    def llm_extraction
      @attributes[ :llm_extraction ] || {}
    end
data/lib/firecrawl.rb CHANGED
@@ -18,10 +18,14 @@ require_relative 'firecrawl/batch_scrape_request'
  require_relative 'firecrawl/map_options'
  require_relative 'firecrawl/map_result'
  require_relative 'firecrawl/map_request'
+ require_relative 'firecrawl/crawl_options'
+ require_relative 'firecrawl/crawl_result'
+ require_relative 'firecrawl/crawl_request'
+
+ require_relative 'firecrawl/module_methods'
 
  module Firecrawl
-   class << self
-     attr_accessor :api_key
-   end
+   extend ModuleMethods
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: firecrawl
  version: !ruby/object:Gem::Version
-   version: 0.0.1
+   version: 0.1.0
  platform: ruby
  authors:
  - Kristoph Cichocki-Romanov
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-11-04 00:00:00.000000000 Z
+ date: 2024-11-06 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: faraday
@@ -30,14 +30,14 @@ dependencies:
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-       version: 1.0.0.beta03
+       version: 1.0.0.beta04
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.0.0.beta03
+         version: 1.0.0.beta04
  - !ruby/object:Gem::Dependency
    name: rspec
    requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '1.9'
+ - !ruby/object:Gem::Dependency
+   name: vcr
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '6.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '6.3'
  description: |-
    The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
 
@@ -77,16 +91,20 @@ extensions: []
  extra_rdoc_files: []
  files:
  - LICENSE
+ - README.md
  - firecrawl.gemspec
  - lib/firecrawl.rb
  - lib/firecrawl/batch_scrape_request.rb
  - lib/firecrawl/batch_scrape_result.rb
  - lib/firecrawl/crawl_options.rb
+ - lib/firecrawl/crawl_request.rb
+ - lib/firecrawl/crawl_result.rb
  - lib/firecrawl/error_result.rb
  - lib/firecrawl/helpers.rb
  - lib/firecrawl/map_options.rb
  - lib/firecrawl/map_request.rb
  - lib/firecrawl/map_result.rb
+ - lib/firecrawl/module_methods.rb
  - lib/firecrawl/request.rb
  - lib/firecrawl/response_methods.rb
  - lib/firecrawl/scrape_options.rb