firecrawl 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +198 -0
- data/firecrawl.gemspec +3 -2
- data/lib/firecrawl/batch_scrape_request.rb +15 -14
- data/lib/firecrawl/crawl_options.rb +5 -9
- data/lib/firecrawl/crawl_request.rb +136 -0
- data/lib/firecrawl/crawl_result.rb +63 -0
- data/lib/firecrawl/module_methods.rb +18 -0
- data/lib/firecrawl/request.rb +13 -3
- data/lib/firecrawl/scrape_options.rb +2 -2
- data/lib/firecrawl/scrape_result.rb +14 -14
- data/lib/firecrawl.rb +7 -3
- metadata +22 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3ba3c8f728651fdc912a44ad7aee4fcdae849cbac7f351b62dc8cec9fdb29973
+  data.tar.gz: dfd0382fb5cafdd471572cb1e3c470b7c28548d20a466c469bbec53ea5c41209
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 03a76beec314251ed927c8abc6a2fc2493c464dff7907699d484c752b3a153e25a9df184f03f98d7849bf944051010807b7dbb549a1b466710a3b22e83cdd96c
+  data.tar.gz: ca04e151e6ffc27e38325fe1a4ae701fc007cee62659b77ceb079191b4c07667b91accd38dcec5531a194a5c7b9f0d685b4a26e0ee8d9679792040ac07745f6b
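The new checksums can be used to verify a downloaded artifact before installing it. A minimal sketch, assuming `metadata.gz` has been extracted from the downloaded `firecrawl-0.1.0.gem` into the working directory (the file layout here is an assumption, not part of the diff):

```ruby
require 'digest'

# Compare the extracted file's SHA256 digest with the published value above.
expected = '3ba3c8f728651fdc912a44ad7aee4fcdae849cbac7f351b62dc8cec9fdb29973'
actual   = Digest::SHA256.file( 'metadata.gz' ).hexdigest
puts( actual == expected ? 'metadata.gz: checksum OK' : 'metadata.gz: checksum MISMATCH' )
```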
data/README.md
ADDED
@@ -0,0 +1,198 @@
+# Firecrawl
+
+Firecrawl is a lightweight Ruby gem that provides a semantically straightforward interface to
+the Firecrawl.dev API, allowing you to easily scrape web content, take screenshots, as well as
+crawl entire web domains.
+
+The gem is particularly useful when working with Large Language Models (LLMs) as it can
+provide markdown information for real-time information lookup as well as grounding.
+
+```ruby
+require 'firecrawl'
+
+Firecrawl.api_key ENV[ 'FIRECRAWL_API_KEY' ]
+response = Firecrawl.scrape( 'https://example.com', options )
+if response.success?
+  result = response.result
+  puts result.metadata[ 'title' ]
+  puts '---'
+  puts result.markdown
+  puts "Screenshot URL: #{ result.screenshot_url }"
+else
+  puts response.result.error_description
+end
+```
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'firecrawl'
+```
+
+Then execute:
+
+```bash
+$ bundle install
+```
+
+Or install it directly:
+
+```bash
+$ gem install firecrawl
+```
+
+## Usage
+
+### Basic Scraping
+
+The simplest way to use Firecrawl is to scrape a single page:
+
+```ruby
+Firecrawl.api_key ENV['FIRECRAWL_API_KEY']
+response = Firecrawl.scrape('https://example.com', format: :markdown )
+
+if response.success?
+  result = response.result
+  if result.success?
+    puts result.metadata['title']
+    puts result.markdown
+  end
+else
+  puts response.result.error_description
+end
+```
+
+### Scrape Options
+
+You can customize scraping behavior using `ScrapeOptions`:
+
+```ruby
+options = Firecrawl::ScrapeOptions.build do
+  formats [ :html, :markdown, :screenshot ]
+  only_main_content true
+  include_tags [ 'article', 'main' ]
+  exclude_tags [ 'nav', 'footer' ]
+  wait_for 5000 # milliseconds
+end
+
+request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+response = request.scrape('https://example.com', options)
+```
+
+### Batch Scraping
+
+For scraping multiple URLs efficiently:
+
+```ruby
+request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+urls = [ 'https://example.com', 'https://example.org' ]
+options = Firecrawl::ScrapeOptions.build do
+  format :markdown
+  only_main_content true
+end
+
+response = request.scrape( urls, options )
+while response.success?
+  batch_result = response.result
+  batch_result.scrape_results.each do |result|
+    puts result.metadata['title']
+    puts result.markdown
+    puts "\n---\n"
+  end
+  break unless batch_result.status?( :scraping )
+  sleep 0.5
+  response = request.retrieve_scrape_results( batch_result )
+end
+```
+
+### Site Mapping
+
+To retrieve a site's structure:
+
+```ruby
+request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+options = Firecrawl::MapOptions.build do
+  limit 100
+  ignore_subdomains true
+end
+
+response = request.map( 'https://example.com', options )
+if response.success?
+  result = response.result
+  result.links.each do |link|
+    puts link
+  end
+end
+```
+
+### Site Crawling
+
+For comprehensive site crawling:
+
+```ruby
+request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+
+options = Firecrawl::CrawlOptions.build do
+  maximum_depth 2
+  limit 10
+  scrape_options do
+    format :markdown
+    only_main_content true
+  end
+end
+
+response = request.crawl( 'https://example.com', options )
+while response.success?
+  crawl_result = response.result
+  crawl_result.scrape_results.each do |result|
+    puts result.metadata['title']
+    puts result.markdown
+  end
+  break unless crawl_result.status?(:scraping)
+  sleep 0.5
+  response = request.retrieve_crawl_results(crawl_result)
+end
+```
+
+## Response Structure
+
+All Firecrawl requests return a Faraday response with an added `result` method. The result will
+be one of:
+
+- `ScrapeResult`: Contains the scraped content and metadata
+- `BatchScrapeResult`: Contains multiple scrape results
+- `MapResult`: Contains discovered links
+- `CrawlResult`: Contains scrape results from crawled pages
+- `ErrorResult`: Contains error information if the request failed
+
+### Working with Results
+
+```ruby
+response = request.scrape(url, options)
+if response.success?
+  result = response.result
+  if result.success?
+    # Access scraped content
+    puts result.metadata['title']
+    puts result.markdown
+    puts result.html
+    puts result.raw_html
+    puts result.screenshot_url
+    puts result.links
+
+    # Check for warnings
+    puts result.warning if result.warning
+  end
+else
+  error = response.result
+  puts "#{error.error_type}: #{error.error_description}"
+end
+```
+
+## License
+
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/firecrawl.gemspec
CHANGED
@@ -1,7 +1,7 @@
 Gem::Specification.new do | spec |
 
   spec.name = 'firecrawl'
-  spec.version = '0.0.1'
+  spec.version = '0.1.0'
   spec.authors = [ 'Kristoph Cichocki-Romanov' ]
   spec.email = [ 'rubygems.org@kristoph.net' ]
 
@@ -29,9 +29,10 @@ Gem::Specification.new do | spec |
   spec.require_paths = [ "lib" ]
 
   spec.add_runtime_dependency 'faraday', '~> 2.7'
-  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.
+  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta04'
 
   spec.add_development_dependency 'rspec', '~> 3.13'
   spec.add_development_dependency 'debug', '~> 1.9'
+  spec.add_development_dependency 'vcr', '~> 6.3'
 
 end
data/lib/firecrawl/batch_scrape_request.rb
CHANGED
@@ -3,8 +3,8 @@ module Firecrawl
   ##
   # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
   # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
-  # the +
-  # +
+  # the +scrape+ method and then subsequently retrieve the results by calling the
+  # +retrieve_scrape_results+ method.
   #
   # === examples
   #
@@ -18,7 +18,7 @@ module Firecrawl
   #     only_main_content true
   #   end
   #
-  #   batch_response = request.
+  #   batch_response = request.scrape( urls, options )
   #   while response.success?
   #     batch_result = batch_response.result
   #     if batch_result.success?
@@ -30,17 +30,18 @@ module Firecrawl
   #       end
   #     end
   #     break unless batch_result.status?( :scraping )
+  #     batch_response = request.retrieve_scrape_results( batch_result )
   #   end
   #
-  #   unless
-  #     puts
+  #   unless batch_response.success?
+  #     puts batch_response.result.error_description
   #   end
   #
   class BatchScrapeRequest < Request
 
     ##
-    # The +
-    #
+    # The +scrape+ method makes a Firecrawl '/batch/scrape/{id}' POST request which will initiate
+    # batch scraping of the given urls.
     #
     # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
     # then +response.result+ will be an instance +BatchScrapeResult+. If the request is not
@@ -50,7 +51,7 @@ module Firecrawl
     # successful and then +response.result.success?+ to validate that the API processed the
     # request successfully.
     #
-    def
+    def scrape( urls, options = nil, &block )
       if options
         options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
         options = options.to_h
@@ -58,7 +59,6 @@ module Firecrawl
         options = {}
       end
       options[ :urls ] = [ urls ].flatten
-
      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      result = nil
      if response.success?
@@ -73,10 +73,11 @@ module Firecrawl
     end
 
     ##
-    # The +
-    #
-    #
-    #
+    # The +retrieve_scrape_results+ method makes a Firecrawl '/batch/scrape' GET request which
+    # will return the scrape results that were completed since the previous call to this method
+    # ( or, if this is the first call to this method, since the batch scrape was started ). Note
+    # that there is no guarantee that there are any new batch scrape results at the time you make
+    # this call ( scrape_results may be empty ).
     #
     # The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
     # then +response.result+ will be an instance +BatchScrapeResult+. If the request is not
@@ -86,7 +87,7 @@ module Firecrawl
     # successful and then +response.result.success?+ to validate that the API processed the
     # request successfully.
     #
-    def
+    def retrieve_scrape_results( batch_result, &block )
       raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
         unless batch_result.is_a?( BatchScrapeResult )
       response = get( batch_result.next_url, &block )
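The reconstructed comments above describe a poll-style contract: `scrape` starts the batch and `retrieve_scrape_results` pages through whatever has completed since the last call, until the status leaves `:scraping`. A minimal sketch of that loop, assuming `FIRECRAWL_API_KEY` is set (the URLs and the 0.5-second interval are illustrative):

```ruby
require 'firecrawl'

request  = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
response = request.scrape( [ 'https://example.com', 'https://example.org' ] )
while response.success?
  batch_result = response.result
  batch_result.scrape_results.each { | result | puts result.markdown }
  break unless batch_result.status?( :scraping )   # finished once no longer scraping
  sleep 0.5                                        # illustrative polling interval
  response = request.retrieve_scrape_results( batch_result )
end
puts response.result.error_description unless response.success?
```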
data/lib/firecrawl/crawl_options.rb
CHANGED
@@ -1,21 +1,17 @@
 module Firecrawl
   class CrawlOptions
     include DynamicSchema::Definable
-    include
-
-    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]
-
-    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]
+    include Helpers
 
     schema do
       exclude_paths String, as: :excludePaths, array: true
       include_paths String, as: :includePaths, array: true
       maximum_depth Integer, as: :maxDepth
       ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
-      limit Integer
+      limit Integer, in: (0..)
       allow_backward_links [ TrueClass, FalseClass ], as: :allowBackwardLinks
       allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
-      webhook
+      webhook_uri URI, as: :webhook
       scrape_options as: :scrapeOptions, &ScrapeOptions.schema
     end
 
@@ -27,13 +23,13 @@ module Firecrawl
       new( api_options: builder.build!( options, &block ) )
     end
 
-    def initialize( options, api_options: nil )
+    def initialize( options = nil, api_options: nil )
       @options = self.class.builder.build( options || {} )
       @options = api_options.merge( @options ) if api_options
 
       scrape_options = @options[ :scrapeOptions ]
       if scrape_options
-        scrape_options[ :formats ]&.map!
+        scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
       end
     end
 
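The reworked schema maps snake_case builder names onto the camelCase fields the API expects (`maximum_depth` → `maxDepth`, `webhook_uri` → `webhook`) and nests the full `ScrapeOptions` schema under `scrape_options`. A sketch of a build using the changed fields; the specific values are illustrative:

```ruby
options = Firecrawl::CrawlOptions.build do
  maximum_depth 3                                            # serialized as maxDepth
  limit 25                                                   # now validated as non-negative
  webhook_uri URI( 'https://example.com/hooks/firecrawl' )   # serialized as webhook
  scrape_options do
    format :markdown
    only_main_content true
  end
end
```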
data/lib/firecrawl/crawl_request.rb
ADDED
@@ -0,0 +1,136 @@
+module Firecrawl
+
+  ##
+  # The +CrawlRequest+ class encapsulates a crawl request to the Firecrawl API. After creating
+  # a new +CrawlRequest+ instance you can begin crawling by calling the +crawl+ method and
+  # then subsequently retrieving the results by calling the +retrieve_crawl_results+ method.
+  # You can also optionally cancel the crawling operation by calling +cancel_crawl+.
+  #
+  # === examples
+  #
+  #   require 'firecrawl'
+  #
+  #   request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
+  #
+  #   urls = 'https://icann.org'
+  #   options = Firecrawl::CrawlOptions.build do
+  #     scrape_options do
+  #       only_main_content true
+  #     end
+  #   end
+  #
+  #   crawl_response = request.crawl( urls, options )
+  #   while crawl_response.success?
+  #     crawl_result = crawl_response.result
+  #     if crawl_result.success?
+  #       crawl_result.scrape_results.each do | result |
+  #         puts result.metadata[ 'title' ]
+  #         puts '---'
+  #         puts result.markdown
+  #         puts "\n\n"
+  #       end
+  #     end
+  #     break unless crawl_result.status?( :scraping )
+  #     crawl_response = request.retrieve_crawl_results( crawl_result )
+  #   end
+  #
+  #   unless crawl_response.success?
+  #     puts crawl_response.result.error_description
+  #   end
+  #
+  class CrawlRequest < Request
+
+    ##
+    # The +crawl+ method makes a Firecrawl '/crawl' POST request which will initiate crawling
+    # of the given url.
+    #
+    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
+    # then +response.result+ will be an instance +CrawlResult+. If the request is not successful
+    # then +response.result+ will be an instance of +ErrorResult+.
+    #
+    # Remember that you should call +response.success?+ to validate that the call to the API was
+    # successful and then +response.result.success?+ to validate that the API processed the
+    # request successfully.
+    #
+    def crawl( url, options = nil, &block )
+      if options
+        options = options.is_a?( CrawlOptions ) ? options : CrawlOptions.build( options.to_h )
+        options = options.to_h
+      else
+        options = {}
+      end
+      options[ :url ] = url
+      response = post( "#{BASE_URI}/crawl", options, &block )
+      result = nil
+      if response.success?
+        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+        attributes ||= { success: false, status: :failed }
+        result = CrawlResult.new( attributes[ :success ], attributes )
+      else
+        result = ErrorResult.new( response.status, attributes )
+      end
+
+      ResponseMethods.install( response, result )
+    end
+
+    ##
+    # The +retrieve_crawl_results+ method makes a Firecrawl '/crawl/{id}' GET request which
+    # will return the crawl results that were completed since the previous call to this method
+    # ( or, if this is the first call to this method, since the crawl was started ). Note that
+    # there is no guarantee that there are any new crawl results at the time you make this call
+    # ( scrape_results may be empty ).
+    #
+    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
+    # +true+, then +response.result+ will be an instance +CrawlResult+. If the request is not
+    # successful then +response.result+ will be an instance of +ErrorResult+.
+    #
+    # Remember that you should call +response.success?+ to validate that the call to the API was
+    # successful and then +response.result.success?+ to validate that the API processed the
+    # request successfully.
+    #
+    def retrieve_crawl_results( crawl_result, &block )
+      raise ArgumentError, "The first argument must be an instance of CrawlResult." \
+        unless crawl_result.is_a?( CrawlResult )
+      response = get( crawl_result.next_url, &block )
+      result = nil
+      if response.success?
+        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+        attributes ||= { success: false, status: :failed }
+        result = crawl_result.merge( attributes )
+      else
+        result = ErrorResult.new( response.status, attributes )
+      end
+
+      ResponseMethods.install( response, result )
+    end
+
+    ##
+    # The +cancel_crawl+ method makes a Firecrawl '/crawl/{id}' DELETE request which will cancel
+    # a previously started crawl.
+    #
+    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
+    # +true+, then +response.result+ will be an instance +CrawlResult+. If the request is not
+    # successful then +response.result+ will be an instance of +ErrorResult+.
+    #
+    # Remember that you should call +response.success?+ to validate that the call to the API was
+    # successful and then +response.result.success?+ to validate that the API processed the
+    # request successfully.
+    #
+    def cancel_crawl( crawl_result, &block )
+      raise ArgumentError, "The first argument must be an instance of CrawlResult." \
+        unless crawl_result.is_a?( CrawlResult )
+      response = delete( crawl_result.url, &block )
+      result = nil
+      if response.success?
+        attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
+        attributes ||= { success: false, status: :failed }
+        result = crawl_result.merge( attributes )
+      else
+        result = ErrorResult.new( response.status, attributes )
+      end
+
+      ResponseMethods.install( response, result )
+    end
+
+  end
+end
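In addition to the polling loop documented in the class comment, the new `cancel_crawl` method ends a job early using the same `CrawlResult` handle. A brief sketch, assuming a crawl was just started:

```ruby
request  = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
response = request.crawl( 'https://example.com' )
if response.success?
  crawl_result = response.result
  # ... later, if the remaining results are not needed, cancel the running crawl
  cancel_response = request.cancel_crawl( crawl_result )
  puts cancel_response.result.error_description unless cancel_response.success?
end
```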
data/lib/firecrawl/crawl_result.rb
ADDED
@@ -0,0 +1,63 @@
+module Firecrawl
+  class CrawlResult
+
+    def initialize( success, attributes )
+      @success = success
+      @attributes = attributes || {}
+    end
+
+    def success?
+      @success || false
+    end
+
+    def status
+      # the initial Firecrawl response does not have a status so we synthesize a 'crawling'
+      # status if the operation was otherwise successful
+      @attributes[ :status ]&.to_sym || ( @success ? :scraping : :failed )
+    end
+
+    def status?( status )
+      self.status == status
+    end
+
+    def id
+      @attributes[ :id ]
+    end
+
+    def total
+      @attributes[ :total ] || 0
+    end
+
+    def completed
+      @attributes[ :completed ] || 0
+    end
+
+    def credits_used
+      @attributes[ :creditsUsed ] || 0
+    end
+
+    def expires_at
+      Date.parse( @attributes[ :expiresAt ] ) rescue nil
+    end
+
+    def url
+      @attributes[ :url ]
+    end
+
+    def next_url
+      @attributes[ :next ] || @attributes[ :url ]
+    end
+
+    def scrape_results
+      success = @attributes[ :success ]
+      # note the &.compact is here because I've noted null entries in the data
+      ( @attributes[ :data ]&.compact || [] ).map do | attr |
+        ScrapeResult.new( success, attr )
+      end
+    end
+
+    def merge( attributes )
+      self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
+    end
+  end
+end
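The accessors above let callers report crawl progress without touching the raw attribute hash. A brief sketch, assuming `crawl_result` came from a prior `crawl` or `retrieve_crawl_results` call:

```ruby
puts "status:   #{ crawl_result.status }"      # :scraping until the crawl finishes
puts "progress: #{ crawl_result.completed }/#{ crawl_result.total } pages"
puts "credits:  #{ crawl_result.credits_used }"
puts "expires:  #{ crawl_result.expires_at }"  # parsed Date, or nil if absent
```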
data/lib/firecrawl/module_methods.rb
ADDED
@@ -0,0 +1,18 @@
+module Firecrawl
+  module ModuleMethods
+    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
+
+    def connection( connection = nil )
+      @connection = connection || @connection || DEFAULT_CONNECTION
+    end
+
+    def api_key( api_key = nil )
+      @api_key = api_key || @api_key
+      @api_key
+    end
+
+    def scrape( url, options = nil, &block )
+      Firecrawl::ScrapeRequest.new.scrape( url, options, &block )
+    end
+  end
+end
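Because the top-level module extends `ModuleMethods` (see `lib/firecrawl.rb` below), `api_key` and `connection` become process-wide defaults that `Request` instances fall back to, and `Firecrawl.scrape` becomes a one-line entry point. A configuration sketch; the Faraday timeout is illustrative:

```ruby
require 'firecrawl'

# Configure once at startup; Request subclasses fall back to these values.
Firecrawl.api_key ENV[ 'FIRECRAWL_API_KEY' ]

custom = Faraday.new( request: { timeout: 30 } ) do | builder |
  builder.adapter Faraday.default_adapter
end
Firecrawl.connection custom

response = Firecrawl.scrape( 'https://example.com' )
puts response.result.markdown if response.success?
```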
data/lib/firecrawl/request.rb
CHANGED
@@ -28,8 +28,6 @@ module Firecrawl
   #
   class Request
 
-    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
-
     BASE_URI = 'https://api.firecrawl.dev/v1'
 
     ##
@@ -37,7 +35,7 @@ module Firecrawl
     # and optionally a (Faraday) +connection+.
     #
     def initialize( connection: nil, api_key: nil )
-      @connection = connection ||
+      @connection = connection || Firecrawl.connection
       @api_key = api_key || Firecrawl.api_key
       raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
         unless @api_key
@@ -70,6 +68,18 @@ module Firecrawl
       end
     end
 
+    def delete( uri, &block )
+      headers = {
+        'Authorization' => "Bearer #{@api_key}",
+        'Content-Type' => 'application/json'
+      }
+
+      @connection.delete( uri ) do | request |
+        headers.each { | key, value | request.headers[ key ] = value }
+        block.call( request ) if block
+      end
+    end
+
   end
 
 end
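The new `delete` helper rounds out the existing `get` and `post` helpers: it stamps the Authorization and Content-Type headers onto the request and yields it to an optional block, which is what a DELETE endpoint such as crawl cancellation needs. A sketch of how a subclass might call it; `ExampleRequest` and the '/example/{id}' path are hypothetical, for illustration only:

```ruby
module Firecrawl
  class ExampleRequest < Request
    # Hypothetical method; the '/example/{id}' endpoint is not part of the
    # Firecrawl API and exists only to show the delete helper in use.
    def cancel_example( id, &block )
      delete( "#{BASE_URI}/example/#{id}", &block )
    end
  end
end
```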
data/lib/firecrawl/scrape_options.rb
CHANGED
@@ -9,7 +9,7 @@ module Firecrawl
 
     schema do
       # note: both format and formats are defined as a semantic convenience
-      format String, as: :formats, array: true, in: FORMATS
+      format String, as: :formats, array: true, in: FORMATS
       formats String, array: true, in: FORMATS
       only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
       include_tags String, as: :includeTags, array: true
@@ -17,7 +17,7 @@ module Firecrawl
       wait_for Integer
       timeout Integer
       extract do
-
+        schema Hash
         system_prompt String, as: :systemPrompt
         prompt String
       end
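With `schema Hash` in place, the `extract` block can carry a JSON-schema-style Hash alongside its prompts. A sketch of a build under that assumption; the schema contents and prompts are illustrative, and the exact extraction semantics are defined by the Firecrawl API:

```ruby
options = Firecrawl::ScrapeOptions.build do
  extract do
    schema( { type: 'object', properties: { title: { type: 'string' } } } )
    system_prompt 'You are a precise extraction assistant.'   # serialized as systemPrompt
    prompt 'Extract the page title.'
  end
end
```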
data/lib/firecrawl/scrape_result.rb
CHANGED
@@ -16,6 +16,20 @@ module Firecrawl
       @success || false
     end
 
+    def metadata
+      unless @metadata
+        metadata = @attributes[ :metadata ] || {}
+        @metadata = metadata.transform_keys do | key |
+          key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
+        end
+        # remove the camelCase forms injected by Firecrawl
+        @metadata.delete_if do | key, _ |
+          key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
+        end
+      end
+      @metadata
+    end
+
     ##
     # The +markdown+ method returns scraped content that has been converted to markdown. The
     # markdown content is present only if the request options +formats+ included +markdown+.
@@ -66,20 +80,6 @@ module Firecrawl
       @attributes[ :actions ] || {}
     end
 
-    def metadata
-      unless @metadata
-        metadata = @attributes[ :metadata ] || {}
-        @metadata = metadata.transform_keys do | key |
-          key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
-        end
-        # remove the camelCase forms injected by Firecrawl
-        @metadata.delete_if do | key, _ |
-          key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
-        end
-      end
-      @metadata
-    end
-
     def llm_extraction
       @attributes[ :llm_extraction ] || {}
     end
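Relocating `metadata` does not change its behavior: camelCase keys from Firecrawl are normalized to snake_case strings, and a duplicate `og_*` key is dropped whenever the matching `og:*` key survives. A brief access sketch; the key names shown are illustrative of the normalization, with `result` assumed to be a `ScrapeResult`:

```ruby
puts result.metadata[ 'title' ]
puts result.metadata[ 'source_url' ]   # normalized from Firecrawl's sourceURL
puts result.metadata[ 'og:title' ]     # kept; the duplicate og_title form is removed
```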
data/lib/firecrawl.rb
CHANGED
@@ -18,10 +18,14 @@ require_relative 'firecrawl/batch_scrape_request'
 require_relative 'firecrawl/map_options'
 require_relative 'firecrawl/map_result'
 require_relative 'firecrawl/map_request'
+require_relative 'firecrawl/crawl_options'
+require_relative 'firecrawl/crawl_result'
+require_relative 'firecrawl/crawl_request'
+
+require_relative 'firecrawl/module_methods'
 
 module Firecrawl
-  class << self
-    attr_accessor :api_key
-  end
+  extend ModuleMethods
 end
 
+
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: firecrawl
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - Kristoph Cichocki-Romanov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-
+date: 2024-11-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: faraday
@@ -30,14 +30,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.0.0.
+        version: 1.0.0.beta04
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: 1.0.0.
+        version: 1.0.0.beta04
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.9'
+- !ruby/object:Gem::Dependency
+  name: vcr
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6.3'
 description: |-
   The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
 
@@ -77,16 +91,20 @@ extensions: []
 extra_rdoc_files: []
 files:
 - LICENSE
+- README.md
 - firecrawl.gemspec
 - lib/firecrawl.rb
 - lib/firecrawl/batch_scrape_request.rb
 - lib/firecrawl/batch_scrape_result.rb
 - lib/firecrawl/crawl_options.rb
+- lib/firecrawl/crawl_request.rb
+- lib/firecrawl/crawl_result.rb
 - lib/firecrawl/error_result.rb
 - lib/firecrawl/helpers.rb
 - lib/firecrawl/map_options.rb
 - lib/firecrawl/map_request.rb
 - lib/firecrawl/map_result.rb
+- lib/firecrawl/module_methods.rb
 - lib/firecrawl/request.rb
 - lib/firecrawl/response_methods.rb
 - lib/firecrawl/scrape_options.rb