firecrawl 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +198 -0
- data/firecrawl.gemspec +3 -2
- data/lib/firecrawl/batch_scrape_request.rb +15 -14
- data/lib/firecrawl/crawl_options.rb +5 -9
- data/lib/firecrawl/crawl_request.rb +136 -0
- data/lib/firecrawl/crawl_result.rb +63 -0
- data/lib/firecrawl/module_methods.rb +18 -0
- data/lib/firecrawl/request.rb +13 -3
- data/lib/firecrawl/scrape_options.rb +2 -2
- data/lib/firecrawl/scrape_result.rb +14 -14
- data/lib/firecrawl.rb +7 -3
- metadata +22 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ba3c8f728651fdc912a44ad7aee4fcdae849cbac7f351b62dc8cec9fdb29973
|
4
|
+
data.tar.gz: dfd0382fb5cafdd471572cb1e3c470b7c28548d20a466c469bbec53ea5c41209
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03a76beec314251ed927c8abc6a2fc2493c464dff7907699d484c752b3a153e25a9df184f03f98d7849bf944051010807b7dbb549a1b466710a3b22e83cdd96c
|
7
|
+
data.tar.gz: ca04e151e6ffc27e38325fe1a4ae701fc007cee62659b77ceb079191b4c07667b91accd38dcec5531a194a5c7b9f0d685b4a26e0ee8d9679792040ac07745f6b
|
data/README.md
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# Firecrawl
|
2
|
+
|
3
|
+
Firecrawl is a lightweight Ruby gem that provides a semantically straightforward interface to
|
4
|
+
the Firecrawl.dev API, allowing you to easily scrape web content, take screenshots, as well as
|
5
|
+
crawl entire web domains.
|
6
|
+
|
7
|
+
The gem is particularly useful when working with Large Language Models (LLMs) as it can
|
8
|
+
provide markdown information for real time information lookup as well as grounding.
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
require 'firecrawl'
|
12
|
+
|
13
|
+
Firecrawl.api_key ENV[ 'FIRECRAWL_API_KEY' ]
|
14
|
+
response = Firecrawl.scrape( 'https://example.com', options )
|
15
|
+
if response.success?
|
16
|
+
result = response.result
|
17
|
+
puts result.metadata[ 'title' ]
|
18
|
+
puts '---'
|
19
|
+
puts result.markdown
|
20
|
+
puts "Screenshot URL: #{ result.screenshot_url }"
|
21
|
+
else
|
22
|
+
puts response.result.error_description
|
23
|
+
end
|
24
|
+
```
|
25
|
+
|
26
|
+
## Installation
|
27
|
+
|
28
|
+
Add this line to your application's Gemfile:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
gem 'firecrawl'
|
32
|
+
```
|
33
|
+
|
34
|
+
Then execute:
|
35
|
+
|
36
|
+
```bash
|
37
|
+
$ bundle install
|
38
|
+
```
|
39
|
+
|
40
|
+
Or install it directly:
|
41
|
+
|
42
|
+
```bash
|
43
|
+
$ gem install firecrawl
|
44
|
+
```
|
45
|
+
|
46
|
+
## Usage
|
47
|
+
|
48
|
+
### Basic Scraping
|
49
|
+
|
50
|
+
The simplest way to use Firecrawl is to scrape a single page:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
Firecrawl.api_key ENV['FIRECRAWL_API_KEY']
|
54
|
+
response = Firecrawl.scrape('https://example.com', format: :markdown )
|
55
|
+
|
56
|
+
if response.success?
|
57
|
+
result = response.result
|
58
|
+
if result.success?
|
59
|
+
puts result.metadata['title']
|
60
|
+
puts result.markdown
|
61
|
+
end
|
62
|
+
else
|
63
|
+
puts response.result.error_description
|
64
|
+
end
|
65
|
+
```
|
66
|
+
|
67
|
+
### Scrape Options
|
68
|
+
|
69
|
+
You can customize scraping behavior using `ScrapeOptions`:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
options = Firecrawl::ScrapeOptions.build do
|
73
|
+
formats [ :html, :markdown, :screenshot ]
|
74
|
+
only_main_content true
|
75
|
+
include_tags [ 'article', 'main' ]
|
76
|
+
exclude_tags [ 'nav', 'footer' ]
|
77
|
+
wait_for 5000 # milliseconds
|
78
|
+
end
|
79
|
+
|
80
|
+
request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
|
81
|
+
response = request.scrape('https://example.com', options)
|
82
|
+
```
|
83
|
+
|
84
|
+
### Batch Scraping
|
85
|
+
|
86
|
+
For scraping multiple URLs efficiently:
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
|
90
|
+
|
91
|
+
urls = [ 'https://example.com', 'https://example.org' ]
|
92
|
+
options = Firecrawl::ScrapeOptions.build do
|
93
|
+
format :markdown
|
94
|
+
only_main_content true
|
95
|
+
end
|
96
|
+
|
97
|
+
response = request.scrape( urls, options )
|
98
|
+
while response.success?
|
99
|
+
batch_result = response.result
|
100
|
+
batch_result.scrape_results.each do |result|
|
101
|
+
puts result.metadata['title']
|
102
|
+
puts result.markdown
|
103
|
+
puts "\n---\n"
|
104
|
+
end
|
105
|
+
break unless batch_result.status?( :scraping )
|
106
|
+
sleep 0.5
|
107
|
+
response = request.retrieve_scrape_results( batch_result )
|
108
|
+
end
|
109
|
+
```
|
110
|
+
|
111
|
+
### Site Mapping
|
112
|
+
|
113
|
+
To retrieve a site's structure:
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
|
117
|
+
|
118
|
+
options = Firecrawl::MapOptions.build do
|
119
|
+
limit 100
|
120
|
+
ignore_subdomains true
|
121
|
+
end
|
122
|
+
|
123
|
+
response = request.map( 'https://example.com', options )
|
124
|
+
if response.success?
|
125
|
+
result = response.result
|
126
|
+
result.links.each do |link|
|
127
|
+
puts link
|
128
|
+
end
|
129
|
+
end
|
130
|
+
```
|
131
|
+
|
132
|
+
### Site Crawling
|
133
|
+
|
134
|
+
For comprehensive site crawling:
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
|
138
|
+
|
139
|
+
options = Firecrawl::CrawlOptions.build do
|
140
|
+
maximum_depth 2
|
141
|
+
limit 10
|
142
|
+
scrape_options do
|
143
|
+
format :markdown
|
144
|
+
only_main_content true
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
response = request.crawl( 'https://example.com', options )
|
149
|
+
while response.success?
|
150
|
+
crawl_result = response.result
|
151
|
+
crawl_result.scrape_results.each do |result|
|
152
|
+
puts result.metadata['title']
|
153
|
+
puts result.markdown
|
154
|
+
end
|
155
|
+
break unless crawl_result.status?(:scraping)
|
156
|
+
sleep 0.5
|
157
|
+
response = request.retrieve_crawl_results(crawl_result)
|
158
|
+
end
|
159
|
+
```
|
160
|
+
|
161
|
+
## Response Structure
|
162
|
+
|
163
|
+
All Firecrawl requests return a Faraday response with an added `result` method. The result will
|
164
|
+
be one of:
|
165
|
+
|
166
|
+
- `ScrapeResult`: Contains the scraped content and metadata
|
167
|
+
- `BatchScrapeResult`: Contains multiple scrape results
|
168
|
+
- `MapResult`: Contains discovered links
|
169
|
+
- `CrawlResult`: Contains scrape results from crawled pages
|
170
|
+
- `ErrorResult`: Contains error information if the request failed
|
171
|
+
|
172
|
+
### Working with Results
|
173
|
+
|
174
|
+
```ruby
|
175
|
+
response = request.scrape(url, options)
|
176
|
+
if response.success?
|
177
|
+
result = response.result
|
178
|
+
if result.success?
|
179
|
+
# Access scraped content
|
180
|
+
puts result.metadata['title']
|
181
|
+
puts result.markdown
|
182
|
+
puts result.html
|
183
|
+
puts result.raw_html
|
184
|
+
puts result.screenshot_url
|
185
|
+
puts result.links
|
186
|
+
|
187
|
+
# Check for warnings
|
188
|
+
puts result.warning if result.warning
|
189
|
+
end
|
190
|
+
else
|
191
|
+
error = response.result
|
192
|
+
puts "#{error.error_type}: #{error.error_description}"
|
193
|
+
end
|
194
|
+
```
|
195
|
+
|
196
|
+
## License
|
197
|
+
|
198
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/firecrawl.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do | spec |
|
2
2
|
|
3
3
|
spec.name = 'firecrawl'
|
4
|
-
spec.version = '0.0
|
4
|
+
spec.version = '0.1.0'
|
5
5
|
spec.authors = [ 'Kristoph Cichocki-Romanov' ]
|
6
6
|
spec.email = [ 'rubygems.org@kristoph.net' ]
|
7
7
|
|
@@ -29,9 +29,10 @@ Gem::Specification.new do | spec |
|
|
29
29
|
spec.require_paths = [ "lib" ]
|
30
30
|
|
31
31
|
spec.add_runtime_dependency 'faraday', '~> 2.7'
|
32
|
-
spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.
|
32
|
+
spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta04'
|
33
33
|
|
34
34
|
spec.add_development_dependency 'rspec', '~> 3.13'
|
35
35
|
spec.add_development_dependency 'debug', '~> 1.9'
|
36
|
+
spec.add_development_dependency 'vcr', '~> 6.3'
|
36
37
|
|
37
38
|
end
|
@@ -3,8 +3,8 @@ module Firecrawl
|
|
3
3
|
##
|
4
4
|
# The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
|
5
5
|
# After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
|
6
|
-
# the +
|
7
|
-
# +
|
6
|
+
# the +scrape+ method and then subsequently retrieve the results by calling the
|
7
|
+
# +retrieve_scrape_results+ method.
|
8
8
|
#
|
9
9
|
# === examples
|
10
10
|
#
|
@@ -18,7 +18,7 @@ module Firecrawl
|
|
18
18
|
# only_main_content true
|
19
19
|
# end
|
20
20
|
#
|
21
|
-
# batch_response = request.
|
21
|
+
# batch_response = request.scrape( urls, options )
|
22
22
|
# while batch_response.success?
|
23
23
|
# batch_result = batch_response.result
|
24
24
|
# if batch_result.success?
|
@@ -30,17 +30,18 @@ module Firecrawl
|
|
30
30
|
# end
|
31
31
|
# end
|
32
32
|
# break unless batch_result.status?( :scraping )
|
33
|
+
# batch_response = request.retrieve_scrape_results( batch_result )
|
33
34
|
# end
|
34
35
|
#
|
35
|
-
# unless
|
36
|
-
# puts
|
36
|
+
# unless batch_response.success?
|
37
|
+
# puts batch_response.result.error_description
|
37
38
|
# end
|
38
39
|
#
|
39
40
|
class BatchScrapeRequest < Request
|
40
41
|
|
41
42
|
##
|
42
|
-
# The +
|
43
|
-
#
|
43
|
+
# The +scrape+ method makes a Firecrawl '/batch/scrape/{id}' POST request which will initiate
|
44
|
+
# batch scraping of the given urls.
|
44
45
|
#
|
45
46
|
# The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
|
46
47
|
# then +response.result+ will be an instance +BatchScrapeResult+. If the request is not
|
@@ -50,7 +51,7 @@ module Firecrawl
|
|
50
51
|
# successful and then +response.result.success?+ to validate that the API processed the
|
51
52
|
# request successfully.
|
52
53
|
#
|
53
|
-
def
|
54
|
+
def scrape( urls, options = nil, &block )
|
54
55
|
if options
|
55
56
|
options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
|
56
57
|
options = options.to_h
|
@@ -58,7 +59,6 @@ module Firecrawl
|
|
58
59
|
options = {}
|
59
60
|
end
|
60
61
|
options[ :urls ] = [ urls ].flatten
|
61
|
-
|
62
62
|
response = post( "#{BASE_URI}/batch/scrape", options, &block )
|
63
63
|
result = nil
|
64
64
|
if response.success?
|
@@ -73,10 +73,11 @@ module Firecrawl
|
|
73
73
|
end
|
74
74
|
|
75
75
|
##
|
76
|
-
# The +
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
76
|
+
# The +retrieve_scrape_results+ method makes a Firecrawl '/batch/scrape' GET request which
|
77
|
+
# will return the scrape results that were completed since the previous call to this method
|
78
|
+
# ( or, if this is the first call to this method, since the batch scrape was started ). Note
|
79
|
+
# that there is no guarantee that there are any new batch scrape results at the time you make
|
80
|
+
# this call ( scrape_results may be empty ).
|
80
81
|
#
|
81
82
|
# The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
|
82
83
|
# then +response.result+ will be an instance +BatchScrapeResult+. If the request is not
|
@@ -86,7 +87,7 @@ module Firecrawl
|
|
86
87
|
# successful and then +response.result.success?+ to validate that the API processed the
|
87
88
|
# request successfully.
|
88
89
|
#
|
89
|
-
def
|
90
|
+
def retrieve_scrape_results( batch_result, &block )
|
90
91
|
raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
|
91
92
|
unless batch_result.is_a?( BatchScrapeResult )
|
92
93
|
response = get( batch_result.next_url, &block )
|
@@ -1,21 +1,17 @@
|
|
1
1
|
module Firecrawl
|
2
2
|
class CrawlOptions
|
3
3
|
include DynamicSchema::Definable
|
4
|
-
include
|
5
|
-
|
6
|
-
FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]
|
7
|
-
|
8
|
-
ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]
|
4
|
+
include Helpers
|
9
5
|
|
10
6
|
schema do
|
11
7
|
exclude_paths String, as: :excludePaths, array: true
|
12
8
|
include_paths String, as: :includePaths, array: true
|
13
9
|
maximum_depth Integer, as: :maxDepth
|
14
10
|
ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
|
15
|
-
limit Integer
|
11
|
+
limit Integer, in: (0..)
|
16
12
|
allow_backward_links [ TrueClass, FalseClass ], as: :allowBackwardLinks
|
17
13
|
allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
|
18
|
-
webhook
|
14
|
+
webhook_uri URI, as: :webhook
|
19
15
|
scrape_options as: :scrapeOptions, &ScrapeOptions.schema
|
20
16
|
end
|
21
17
|
|
@@ -27,13 +23,13 @@ module Firecrawl
|
|
27
23
|
new( api_options: builder.build!( options, &block ) )
|
28
24
|
end
|
29
25
|
|
30
|
-
def initialize( options, api_options: nil )
|
26
|
+
def initialize( options = nil, api_options: nil )
|
31
27
|
@options = self.class.builder.build( options || {} )
|
32
28
|
@options = api_options.merge( @options ) if api_options
|
33
29
|
|
34
30
|
scrape_options = @options[ :scrapeOptions ]
|
35
31
|
if scrape_options
|
36
|
-
scrape_options[ :formats ]&.map!
|
32
|
+
scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
|
37
33
|
end
|
38
34
|
end
|
39
35
|
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module Firecrawl
|
2
|
+
|
3
|
+
##
|
4
|
+
# The +CrawlRequest+ class encapsulates a crawl request to the Firecrawl API. After creating
|
5
|
+
# a new +CrawlRequest+ instance you can begin crawling by calling the +crawl+ method and
|
6
|
+
# then subsequently retrieving the results by calling the +retrieve_crawl_results+ method.
|
7
|
+
# You can also optionally cancel the crawling operation by calling +cancel_crawl+.
|
8
|
+
#
|
9
|
+
# === examples
|
10
|
+
#
|
11
|
+
# require 'firecrawl'
|
12
|
+
#
|
13
|
+
# request = Firecrawl::CrawlRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
|
14
|
+
#
|
15
|
+
# urls = 'https://icann.org'
|
16
|
+
# options = Firecrawl::CrawlOptions.build do
|
17
|
+
# scrape_options do
|
18
|
+
# main_content_only true
|
19
|
+
# end
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# crawl_response = request.crawl( urls, options )
|
23
|
+
# while crawl_response.success?
|
24
|
+
# crawl_result = crawl_response.result
|
25
|
+
# if crawl_result.success?
|
26
|
+
# crawl_result.scrape_results.each do | result |
|
27
|
+
# puts result.metadata[ 'title' ]
|
28
|
+
# puts '---'
|
29
|
+
# puts result.markdown
|
30
|
+
# puts "\n\n"
|
31
|
+
# end
|
32
|
+
# end
|
33
|
+
# break unless crawl_result.status?( :scraping )
|
34
|
+
# crawl_response = request.retrieve_crawl_results( crawl_result )
|
35
|
+
# end
|
36
|
+
#
|
37
|
+
# unless crawl_response.success?
|
38
|
+
# puts crawl_response.result.error_description
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
class CrawlRequest < Request
|
42
|
+
|
43
|
+
##
|
44
|
+
# The +crawl+ method makes a Firecrawl '/crawl' POST request which will initiate crawling
|
45
|
+
# of the given url.
|
46
|
+
#
|
47
|
+
# The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
|
48
|
+
# then +response.result+ will be an instance +CrawlResult+. If the request is not successful
|
49
|
+
# then +response.result+ will be an instance of +ErrorResult+.
|
50
|
+
#
|
51
|
+
# Remember that you should call +response.success?+ to validate that the call to the API was
|
52
|
+
# successful and then +response.result.success?+ to validate that the API processed the
|
53
|
+
# request successfully.
|
54
|
+
#
|
55
|
+
def crawl( url, options = nil, &block )
|
56
|
+
if options
|
57
|
+
options = options.is_a?( CrawlOptions ) ? options : CrawlOptions.build( options.to_h )
|
58
|
+
options = options.to_h
|
59
|
+
else
|
60
|
+
options = {}
|
61
|
+
end
|
62
|
+
options[ url ] = url
|
63
|
+
response = post( "#{BASE_URI}/crawl", options, &block )
|
64
|
+
result = nil
|
65
|
+
if response.success?
|
66
|
+
attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
|
67
|
+
attributes ||= { success: false, status: :failed }
|
68
|
+
result = CrawlResult.new( attributes[ :success ], attributes )
|
69
|
+
else
|
70
|
+
result = ErrorResult.new( response.status, attributes )
|
71
|
+
end
|
72
|
+
|
73
|
+
ResponseMethods.install( response, result )
|
74
|
+
end
|
75
|
+
|
76
|
+
##
|
77
|
+
# The +retrieve_crawl_results+ method makes a Firecrawl '/crawl/{id}' GET request which
|
78
|
+
# will return the crawl results that were completed since the previous call to this method
|
79
|
+
# ( or, if this is the first call to this method, since the crawl was started ). Note that
|
80
|
+
# there is no guarantee that there are any new crawl results at the time you make this call
|
81
|
+
# ( scrape_results may be empty ).
|
82
|
+
#
|
83
|
+
# The response is always an instance of +Faraday::Response+. If +response.success?+ is
|
84
|
+
# +true+, then +response.result+ will be an instance +CrawlResult+. If the request is not
|
85
|
+
# successful then +response.result+ will be an instance of +ErrorResult+.
|
86
|
+
#
|
87
|
+
# Remember that you should call +response.success?+ to validate that the call to the API was
|
88
|
+
# successful and then +response.result.success?+ to validate that the API processed the
|
89
|
+
# request successfully.
|
90
|
+
#
|
91
|
+
def retrieve_crawl_results( crawl_result, &block )
|
92
|
+
raise ArgumentError, "The first argument must be an instance of CrawlResult." \
|
93
|
+
unless crawl_result.is_a?( CrawlResult )
|
94
|
+
response = get( crawl_result.next_url, &block )
|
95
|
+
result = nil
|
96
|
+
if response.success?
|
97
|
+
attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
|
98
|
+
attributes ||= { success: false, status: :failed }
|
99
|
+
result = crawl_result.merge( attributes )
|
100
|
+
else
|
101
|
+
result = ErrorResult.new( response.status, attributes )
|
102
|
+
end
|
103
|
+
|
104
|
+
ResponseMethods.install( response, result )
|
105
|
+
end
|
106
|
+
|
107
|
+
##
|
108
|
+
# The +cancel_crawl+ method makes a Firecrawl '/crawl/{id}' DELETE request which will cancel
|
109
|
+
# a previously started crawl.
|
110
|
+
#
|
111
|
+
# The response is always an instance of +Faraday::Response+. If +response.success?+ is
|
112
|
+
# +true+, then +response.result+ will be an instance +CrawlResult+. If the request is not
|
113
|
+
# successful then +response.result+ will be an instance of +ErrorResult+.
|
114
|
+
#
|
115
|
+
# Remember that you should call +response.success?+ to validate that the call to the API was
|
116
|
+
# successful and then +response.result.success?+ to validate that the API processed the
|
117
|
+
# request successfully.
|
118
|
+
#
|
119
|
+
def cancel_crawl( crawl_result, &block )
|
120
|
+
raise ArgumentError, "The first argument must be an instance of CrawlResult." \
|
121
|
+
unless crawl_result.is_a?( CrawlResult )
|
122
|
+
response = get( crawl_result.url, &block )
|
123
|
+
result = nil
|
124
|
+
if response.success?
|
125
|
+
attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
|
126
|
+
attributes ||= { success: false, status: :failed }
|
127
|
+
result = crawl_result.merge( attributes )
|
128
|
+
else
|
129
|
+
result = ErrorResult.new( response.status, attributes )
|
130
|
+
end
|
131
|
+
|
132
|
+
ResponseMethods.install( response, result )
|
133
|
+
end
|
134
|
+
|
135
|
+
end
|
136
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Firecrawl
|
2
|
+
class CrawlResult
|
3
|
+
|
4
|
+
def initialize( success, attributes )
|
5
|
+
@success = success
|
6
|
+
@attributes = attributes || {}
|
7
|
+
end
|
8
|
+
|
9
|
+
def success?
|
10
|
+
@success || false
|
11
|
+
end
|
12
|
+
|
13
|
+
def status
|
14
|
+
# the initial Firecrawl response does not have a status so we synthesize a 'crawling'
|
15
|
+
# status if the operation was otherwise successful
|
16
|
+
@attributes[ :status ]&.to_sym || ( @success ? :scraping : :failed )
|
17
|
+
end
|
18
|
+
|
19
|
+
def status?( status )
|
20
|
+
self.status == status
|
21
|
+
end
|
22
|
+
|
23
|
+
def id
|
24
|
+
@attributes[ :id ]
|
25
|
+
end
|
26
|
+
|
27
|
+
def total
|
28
|
+
@attributes[ :total ] || 0
|
29
|
+
end
|
30
|
+
|
31
|
+
def completed
|
32
|
+
@attributes[ :completed ] || 0
|
33
|
+
end
|
34
|
+
|
35
|
+
def credits_used
|
36
|
+
@attributes[ :creditsUsed ] || 0
|
37
|
+
end
|
38
|
+
|
39
|
+
def expires_at
|
40
|
+
Date.parse( @attributes[ :expiresAt ] ) rescue nil
|
41
|
+
end
|
42
|
+
|
43
|
+
def url
|
44
|
+
@attributes[ :url ]
|
45
|
+
end
|
46
|
+
|
47
|
+
def next_url
|
48
|
+
@attributes[ :next ] || @attributes[ :url ]
|
49
|
+
end
|
50
|
+
|
51
|
+
def scrape_results
|
52
|
+
success = @attributes[ :success ]
|
53
|
+
# note the &.compact is here because I've noted null entries in the data
|
54
|
+
( @attributes[ :data ]&.compact || [] ).map do | attr |
|
55
|
+
ScrapeResult.new( success, attr )
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def merge( attributes )
|
60
|
+
self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Firecrawl
|
2
|
+
module ModuleMethods
|
3
|
+
DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
|
4
|
+
|
5
|
+
def connection( connection = nil )
|
6
|
+
@connection = connection || @connection || DEFAULT_CONNECTION
|
7
|
+
end
|
8
|
+
|
9
|
+
def api_key( api_key = nil )
|
10
|
+
@api_key = api_key || @api_key
|
11
|
+
@api_key
|
12
|
+
end
|
13
|
+
|
14
|
+
def scrape( url, options = nil, &block )
|
15
|
+
Firecrawl::ScrapeRequest.new.scrape( url, options, &block )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/firecrawl/request.rb
CHANGED
@@ -28,8 +28,6 @@ module Firecrawl
|
|
28
28
|
#
|
29
29
|
class Request
|
30
30
|
|
31
|
-
DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }
|
32
|
-
|
33
31
|
BASE_URI = 'https://api.firecrawl.dev/v1'
|
34
32
|
|
35
33
|
##
|
@@ -37,7 +35,7 @@ module Firecrawl
|
|
37
35
|
# and optionally a (Faraday) +connection+.
|
38
36
|
#
|
39
37
|
def initialize( connection: nil, api_key: nil )
|
40
|
-
@connection = connection ||
|
38
|
+
@connection = connection || Firecrawl.connection
|
41
39
|
@api_key = api_key || Firecrawl.api_key
|
42
40
|
raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
|
43
41
|
unless @api_key
|
@@ -70,6 +68,18 @@ module Firecrawl
|
|
70
68
|
end
|
71
69
|
end
|
72
70
|
|
71
|
+
def delete( uri, &block )
|
72
|
+
headers = {
|
73
|
+
'Authorization' => "Bearer #{@api_key}",
|
74
|
+
'Content-Type' => 'application/json'
|
75
|
+
}
|
76
|
+
|
77
|
+
@connection.delete( uri ) do | request |
|
78
|
+
headers.each { | key, value | request.headers[ key ] = value }
|
79
|
+
block.call( request ) if block
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
73
83
|
end
|
74
84
|
|
75
85
|
end
|
@@ -9,7 +9,7 @@ module Firecrawl
|
|
9
9
|
|
10
10
|
schema do
|
11
11
|
# note: both format and formats are defined as a semantic convenience
|
12
|
-
format String, as: :formats, array: true, in: FORMATS
|
12
|
+
format String, as: :formats, array: true, in: FORMATS
|
13
13
|
formats String, array: true, in: FORMATS
|
14
14
|
only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
|
15
15
|
include_tags String, as: :includeTags, array: true
|
@@ -17,7 +17,7 @@ module Firecrawl
|
|
17
17
|
wait_for Integer
|
18
18
|
timeout Integer
|
19
19
|
extract do
|
20
|
-
|
20
|
+
schema Hash
|
21
21
|
system_prompt String, as: :systemPrompt
|
22
22
|
prompt String
|
23
23
|
end
|
@@ -16,6 +16,20 @@ module Firecrawl
|
|
16
16
|
@success || false
|
17
17
|
end
|
18
18
|
|
19
|
+
def metadata
|
20
|
+
unless @metadata
|
21
|
+
metadata = @attributes[ :metadata ] || {}
|
22
|
+
@metadata = metadata.transform_keys do | key |
|
23
|
+
key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
|
24
|
+
end
|
25
|
+
# remove the camelCase forms injected by Firecrawl
|
26
|
+
@metadata.delete_if do | key, _ |
|
27
|
+
key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
|
28
|
+
end
|
29
|
+
end
|
30
|
+
@metadata
|
31
|
+
end
|
32
|
+
|
19
33
|
##
|
20
34
|
# The +markdown+ method returns scraped content that has been converted to markdown. The
|
21
35
|
# markdown content is present only if the request options +formats+ included +markdown+.
|
@@ -66,20 +80,6 @@ module Firecrawl
|
|
66
80
|
@attributes[ :actions ] || {}
|
67
81
|
end
|
68
82
|
|
69
|
-
def metadata
|
70
|
-
unless @metadata
|
71
|
-
metadata = @attributes[ :metadata ] || {}
|
72
|
-
@metadata = metadata.transform_keys do | key |
|
73
|
-
key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
|
74
|
-
end
|
75
|
-
# remove the camelCase forms injected by Firecrawl
|
76
|
-
@metadata.delete_if do | key, _ |
|
77
|
-
key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
|
78
|
-
end
|
79
|
-
end
|
80
|
-
@metadata
|
81
|
-
end
|
82
|
-
|
83
83
|
def llm_extraction
|
84
84
|
@attributes[ :llm_extraction ] || {}
|
85
85
|
end
|
data/lib/firecrawl.rb
CHANGED
@@ -18,10 +18,14 @@ require_relative 'firecrawl/batch_scrape_request'
|
|
18
18
|
require_relative 'firecrawl/map_options'
|
19
19
|
require_relative 'firecrawl/map_result'
|
20
20
|
require_relative 'firecrawl/map_request'
|
21
|
+
require_relative 'firecrawl/crawl_options'
|
22
|
+
require_relative 'firecrawl/crawl_result'
|
23
|
+
require_relative 'firecrawl/crawl_request'
|
24
|
+
|
25
|
+
require_relative 'firecrawl/module_methods'
|
21
26
|
|
22
27
|
module Firecrawl
|
23
|
-
|
24
|
-
attr_accessor :api_key
|
25
|
-
end
|
28
|
+
extend ModuleMethods
|
26
29
|
end
|
27
30
|
|
31
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: firecrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kristoph Cichocki-Romanov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-11-
|
11
|
+
date: 2024-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.0.0.
|
33
|
+
version: 1.0.0.beta04
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.0.0.
|
40
|
+
version: 1.0.0.beta04
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.9'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: vcr
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '6.3'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '6.3'
|
69
83
|
description: |-
|
70
84
|
The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
|
71
85
|
|
@@ -77,16 +91,20 @@ extensions: []
|
|
77
91
|
extra_rdoc_files: []
|
78
92
|
files:
|
79
93
|
- LICENSE
|
94
|
+
- README.md
|
80
95
|
- firecrawl.gemspec
|
81
96
|
- lib/firecrawl.rb
|
82
97
|
- lib/firecrawl/batch_scrape_request.rb
|
83
98
|
- lib/firecrawl/batch_scrape_result.rb
|
84
99
|
- lib/firecrawl/crawl_options.rb
|
100
|
+
- lib/firecrawl/crawl_request.rb
|
101
|
+
- lib/firecrawl/crawl_result.rb
|
85
102
|
- lib/firecrawl/error_result.rb
|
86
103
|
- lib/firecrawl/helpers.rb
|
87
104
|
- lib/firecrawl/map_options.rb
|
88
105
|
- lib/firecrawl/map_request.rb
|
89
106
|
- lib/firecrawl/map_result.rb
|
107
|
+
- lib/firecrawl/module_methods.rb
|
90
108
|
- lib/firecrawl/request.rb
|
91
109
|
- lib/firecrawl/response_methods.rb
|
92
110
|
- lib/firecrawl/scrape_options.rb
|