firecrawl 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/firecrawl.gemspec +37 -0
- data/lib/firecrawl/batch_scrape_request.rb +106 -0
- data/lib/firecrawl/batch_scrape_result.rb +63 -0
- data/lib/firecrawl/crawl_options.rb +47 -0
- data/lib/firecrawl/error_result.rb +45 -0
- data/lib/firecrawl/helpers.rb +8 -0
- data/lib/firecrawl/map_options.rb +32 -0
- data/lib/firecrawl/map_request.rb +58 -0
- data/lib/firecrawl/map_result.rb +29 -0
- data/lib/firecrawl/request.rb +75 -0
- data/lib/firecrawl/response_methods.rb +15 -0
- data/lib/firecrawl/scrape_options.rb +58 -0
- data/lib/firecrawl/scrape_request.rb +60 -0
- data/lib/firecrawl/scrape_result.rb +92 -0
- data/lib/firecrawl.rb +27 -0
- metadata +122 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d79423650fbec60124bab7710e372f16774987f437e8bb52cb13e72a434bdec6
|
4
|
+
data.tar.gz: bfb0ec5c302e4e646812855d5c954e2fc6c075e4a03af495a6a09815ce588e44
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 46c819135a19388beb434c797e16a3a73f52b1d85cc37f317e03dc159de8ed1a56a6a5170f2e2457e2798fbff25d93cfbf11f7be12682fcb6ef13534aa347f62
|
7
|
+
data.tar.gz: ffe9b239c29138617a1902f8502099b6f0b2019eeed8b266cd8f77e94eee868e8e876ee9f8545cd584d701a8bfffa75f6552508b2949463f2cc187580eeb35fe
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Endless International
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/firecrawl.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Gem::Specification.new do | spec |
|
2
|
+
|
3
|
+
spec.name = 'firecrawl'
|
4
|
+
spec.version = '0.0.1'
|
5
|
+
spec.authors = [ 'Kristoph Cichocki-Romanov' ]
|
6
|
+
spec.email = [ 'rubygems.org@kristoph.net' ]
|
7
|
+
|
8
|
+
spec.summary =
|
9
|
+
"The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API which takes " \
|
10
|
+
"a URL, crawls it and returns html, markdown, or structured data. It is of particular value" \
|
11
|
+
"when used with LLM's for grounding."
|
12
|
+
spec.description =
|
13
|
+
"The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl " \
|
14
|
+
"can take a URL, scrape the page contents and return the whole page or principal content " \
|
15
|
+
"as html, markdown, or structured data.\n" \
|
16
|
+
"\n" \
|
17
|
+
"In addition, Firecrawl can crawl an entire site returning the pages it encounters or just " \
|
18
|
+
"the map of the pages, which can be used for subsequent scraping."
|
19
|
+
spec.license = 'MIT'
|
20
|
+
spec.homepage = 'https://github.com/EndlessInternational/firecrawl'
|
21
|
+
spec.metadata = {
|
22
|
+
'source_code_uri' => 'https://github.com/EndlessInternational/firecrawl',
|
23
|
+
'bug_tracker_uri' => 'https://github.com/EndlessInternational/firecrawl/issues',
|
24
|
+
# 'documentation_uri' => 'https://github.com/EndlessInternational/firecrawl'
|
25
|
+
}
|
26
|
+
|
27
|
+
spec.required_ruby_version = '>= 3.0'
|
28
|
+
spec.files = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "firecrawl.gemspec" ]
|
29
|
+
spec.require_paths = [ "lib" ]
|
30
|
+
|
31
|
+
spec.add_runtime_dependency 'faraday', '~> 2.7'
|
32
|
+
spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta03'
|
33
|
+
|
34
|
+
spec.add_development_dependency 'rspec', '~> 3.13'
|
35
|
+
spec.add_development_dependency 'debug', '~> 1.9'
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,106 @@
module Firecrawl

  ##
  # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
  # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
  # the +start_scraping+ method and then subsequently poll for results by calling the
  # +retrieve_scraping+ method.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   urls = [ 'https://example.com', 'https://icann.org' ]
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   batch_response = request.start_scraping( urls, options )
  #   while batch_response.success?
  #     batch_result = batch_response.result
  #     if batch_result.success?
  #       batch_result.scrape_results.each do | result |
  #         puts result.metadata[ 'title' ]
  #         puts '---'
  #         puts result.markdown
  #         puts "\n\n"
  #       end
  #     end
  #     break unless batch_result.status?( :scraping )
  #     batch_response = request.retrieve_scraping( batch_result )
  #   end
  #
  #   unless batch_response.success?
  #     puts batch_response.result.error_description
  #   end
  #
  class BatchScrapeRequest < Request

    ##
    # The +start_scraping+ method makes a Firecrawl '/batch/scrape' POST request which will
    # initiate batch scraping of the given urls.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def start_scraping( urls, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :urls ] = [ urls ].flatten

      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      # parse the payload for both branches; previously 'attributes' was undefined ( nil ) in
      # the failure branch so the API supplied error message was always lost
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false, status: :failed }
        result = BatchScrapeResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes || {} )
      end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve_scraping+ method makes a Firecrawl '/batch/scrape' GET request which will
    # retrieve batch scraping results. Note that there is no guarantee that there are any batch
    # scraping results at the time of the call and you may need to call this method multiple
    # times.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # +true+, then +response.result+ will be an instance of +BatchScrapeResult+. If the request
    # is not successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def retrieve_scraping( batch_result, &block )
      raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
        unless batch_result.is_a?( BatchScrapeResult )
      response = get( batch_result.next_url, &block )
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false, status: :failed }
        # merge into the prior result so earlier attributes ( id, url, ... ) are retained
        result = batch_result.merge( attributes )
      else
        result = ErrorResult.new( response.status, attributes || {} )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,63 @@
module Firecrawl

  ##
  # +BatchScrapeResult+ wraps the payload returned by the Firecrawl batch scrape endpoints and
  # exposes convenience readers over it.
  #
  class BatchScrapeResult

    # +success+ is the operation-level success flag; +attributes+ is the parsed API payload.
    def initialize( success, attributes )
      @success = success
      @attributes = attributes || {}
    end

    # true when the batch scrape operation itself succeeded
    def success?
      @success ? true : false
    end

    # The batch status as a Symbol. The initial Firecrawl response carries no status, so a
    # successful operation without one is reported as :scraping; otherwise :failed.
    def status
      explicit = @attributes[ :status ]
      return explicit.to_sym if explicit
      @success ? :scraping : :failed
    end

    # true when the current status equals the given status
    def status?( status )
      status == self.status
    end

    # the batch identifier assigned by Firecrawl
    def id
      @attributes[ :id ]
    end

    # total number of urls in the batch ( 0 when absent )
    def total
      @attributes[ :total ] || 0
    end

    # number of urls scraped so far ( 0 when absent )
    def completed
      @attributes[ :completed ] || 0
    end

    # credits consumed by the batch ( 0 when absent )
    def credits_used
      @attributes[ :creditsUsed ] || 0
    end

    # expiry of the batch results as a Date, or nil when absent or unparseable
    def expires_at
      begin
        Date.parse( @attributes[ :expiresAt ] )
      rescue StandardError
        nil
      end
    end

    # the url of the batch resource
    def url
      @attributes[ :url ]
    end

    # the url to poll next; falls back to the batch url when no :next is present
    def next_url
      @attributes[ :next ] || @attributes[ :url ]
    end

    # Builds a ScrapeResult for every entry in the payload data.
    def scrape_results
      payload_success = @attributes[ :success ]
      # compact because null entries have been observed in the data array
      entries = @attributes[ :data ]&.compact || []
      entries.map { | entry | ScrapeResult.new( payload_success, entry ) }
    end

    # Returns a new result whose attributes are this result's merged with +attributes+.
    def merge( attributes )
      self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
    end

  end
end
@@ -0,0 +1,47 @@
module Firecrawl

  ##
  # +CrawlOptions+ builds the options hash for Firecrawl crawl requests, mapping the Ruby
  # style snake_case names to the camelCase names the API expects.
  #
  class CrawlOptions
    include DynamicSchema::Definable
    include DynamicSchema::Buildable
    # Helpers provides string_camelize, used to normalize scrape formats in initialize;
    # it was previously not included, so initialize raised NameError whenever formats
    # were present in scrape_options ( ScrapeOptions includes it for the same purpose ).
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      exclude_paths String, as: :excludePaths, array: true
      include_paths String, as: :includePaths, array: true
      maximum_depth Integer, as: :maxDepth
      ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
      limit Integer
      allow_backward_links [ TrueClass, FalseClass ], as: :allowBackwardLinks
      allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
      webhook String
      scrape_options as: :scrapeOptions, &ScrapeOptions.schema
    end

    # Builds options from a Hash and/or schema block.
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # Builds options, raising on schema violations.
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    # +options+ now defaults to nil: build/build! pass only api_options:, which previously
    # raised ArgumentError because the positional argument was required.
    def initialize( options = nil, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options

      scrape_options = @options[ :scrapeOptions ]
      if scrape_options
        # normalize formats ( e.g. :raw_html -> 'rawHtml' ) for the API
        scrape_options[ :formats ]&.map!( &method( :string_camelize ) )
      end
    end

    # the options as a plain Hash suitable for the request body
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,45 @@
module Firecrawl

  ##
  # +ErrorResult+ describes a failed Firecrawl API interaction. It exposes a symbolic
  # +error_type+ derived from the HTTP status code and a human readable +error_description+,
  # preferring the 'error' message included in the API response payload when present.
  #
  class ErrorResult

    attr_reader :error_type, :error_description

    # +status_code+ is the HTTP status; +attributes+ is the ( optional ) parsed payload.
    def initialize( status_code, attributes = nil )
      # previously assigned to @error_code, leaving the error_type reader always nil
      @error_type, @error_description = status_code_to_error( status_code )
      # previously guarded on the never-assigned @attributes ivar, so the API supplied
      # error message was never used; guard on the argument instead
      @error_description = attributes[ :error ] \
        if attributes.respond_to?( :[] ) && attributes[ :error ]
    end

    private

    # maps an HTTP status code to [ error_type, default_description ]
    def status_code_to_error( status_code )
      case status_code
      # this is here because I've noted invalid payloads being returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Firecrawl server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Firecrawl service is overloaded." ]
      else
        [ :unknown_error,
          "The Firecrawl service returned an unexpected status code: '#{status_code}'." ]
      end
    end

  end
end
@@ -0,0 +1,32 @@
module Firecrawl

  ##
  # +MapOptions+ builds the options hash for Firecrawl '/map' requests.
  #
  class MapOptions
    include DynamicSchema::Definable

    schema do
      search String
      # camelCase mappings added for consistency with CrawlOptions ( the API expects
      # camelCase option names )
      ignore_sitemap [ TrueClass, FalseClass ], as: :ignoreSitemap
      ignore_subdomains [ TrueClass, FalseClass ], as: :ignoreSubdomains
      limit Integer
    end

    # Builds options from a Hash and/or schema block.
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # Builds options, raising on schema violations.
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
    end

    # the options as a plain Hash suitable for the request body
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,58 @@
module Firecrawl
  ##
  # The +MapRequest+ class encapsulates a '/map' POST request to the Firecrawl API. After
  # creating a new +MapRequest+ instance you can make the request by calling the +map+ method
  # to crawl the site and retrieve +links+.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   response = request.map( 'https://example.com', { limit: 100 } )
  #   if response.success?
  #     result = response.result
  #     if result.success?
  #       result.links.each do | link |
  #         puts link
  #       end
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MapRequest < Request

    ##
    # The +map+ method makes a Firecrawl '/map' POST request which will map the site with the
    # given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +MapResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def map( url, options = nil, &block )
      if options
        options = options.is_a?( MapOptions ) ? options : MapOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/map", options, &block )
      # parse the payload for both branches; previously 'attributes' was undefined ( nil ) in
      # the failure branch so the API supplied error message was always lost
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false }
        result = MapResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes || {} )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,29 @@
module Firecrawl
  class MapResult

    # +success+ is the operation-level success flag; +attributes+ is the parsed API payload.
    def initialize( success, attributes )
      @success = success
      # default to an empty hash ( consistent with ScrapeResult / BatchScrapeResult ) so
      # accessor methods do not raise NoMethodError when the payload carried no attributes
      @attributes = attributes || {}
    end

    ##
    # The +success?+ method returns +true+ if the mapping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual map operation was
    # successful.
    #
    def success?
      @success || false
    end

    ##
    # The +links+ method returns an array of the links that were discovered when mapping
    # the site. The array is empty when the payload included no links.
    #
    def links
      @attributes[ :links ] || []
    end

  end
end
@@ -0,0 +1,75 @@
module Firecrawl

  ##
  # The +Request+ class is the base class for Firecrawl API requests. It holds the ( Faraday )
  # connection and api key and provides the authenticated +post+ and +get+ helpers used by
  # +ScrapeRequest+, +BatchScrapeRequest+ and +MapRequest+.
  #
  # === example
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, :screenshot ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://cnn.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class Request

    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }

    BASE_URI = 'https://api.firecrawl.dev/v1'

    ##
    # The +initialize+ method initializes the +Request+ instance. You MUST pass an +api_key+
    # ( or have configured one via +Firecrawl.api_key+ ) and optionally a ( Faraday )
    # +connection+.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || DEFAULT_CONNECTION
      @api_key = api_key || Firecrawl.api_key
      raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
        unless @api_key
    end

    protected

    # POST +body+ ( a String, or any JSON-encodable object ) to +uri+ with authentication
    # headers; an optional block may further customize the Faraday request.
    def post( uri, body, &block )
      @connection.post( uri ) do | request |
        request_headers.each { | key, value | request.headers[ key ] = value }
        request.body = body.is_a?( String ) ? body : JSON.generate( body )
        block.call( request ) if block
      end
    end

    # GET +uri+ with authentication headers; an optional block may further customize the
    # Faraday request.
    def get( uri, &block )
      @connection.get( uri ) do | request |
        request_headers.each { | key, value | request.headers[ key ] = value }
        block.call( request ) if block
      end
    end

    private

    # authentication / content type headers, previously duplicated inline in post and get
    def request_headers
      {
        'Authorization' => "Bearer #{@api_key}",
        'Content-Type' => 'application/json'
      }
    end

  end

end
@@ -0,0 +1,15 @@
module Firecrawl
  #
  # The +ResponseMethods+ module extends a Faraday response, adding the +result+ method.
  #
  module ResponseMethods

    # Attaches +result+ to +response+ and returns the ( extended ) response.
    def self.install( response, result )
      response.instance_variable_set( :@_firecrawl_result, result )
      response.extend( self )
    end

    # the result object attached by +install+
    def result
      @_firecrawl_result
    end

  end
end
@@ -0,0 +1,58 @@
module Firecrawl

  ##
  # +ScrapeOptions+ builds the options hash for Firecrawl scrape requests using the
  # DynamicSchema DSL, mapping Ruby style snake_case names to the camelCase names the API
  # expects. Use +build+ / +build!+ with a Hash and/or a block.
  #
  class ScrapeOptions
    include DynamicSchema::Definable
    include Helpers

    # formats the API can return; :"screenshot@full_page" requests a full-page screenshot
    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot, :"screenshot@full_page" ]

    # browser actions the API can perform before scraping
    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      # note: both format and formats are defined as a semantic convenience
      format String, as: :formats, array: true, in: FORMATS
      formats String, array: true, in: FORMATS
      only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
      include_tags String, as: :includeTags, array: true
      exclude_tags String, as: :excludeTags, array: true
      wait_for Integer
      timeout Integer
      extract do
        #schema Hash
        system_prompt String, as: :systemPrompt
        prompt String
      end
      action as: :actions, arguments: :type, array: true do
        type Symbol, required: true, in: ACTIONS
        # wait
        milliseconds Integer
        # click
        selector String
        # write
        text String
        # press
        key String
      end
    end

    # Builds options from a Hash and/or schema block.
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # Builds options, raising on schema violations.
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
      # normalize formats ( e.g. :raw_html -> 'rawHtml' ) for the API
      @options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
    end

    # the options as a plain Hash suitable for the request body
    def to_h
      @options.to_h
    end

  end
end
@@ -0,0 +1,60 @@
module Firecrawl
  ##
  # The +ScrapeRequest+ class encapsulates a '/scrape' POST request to the Firecrawl API. After
  # creating a new +ScrapeRequest+ instance you can initiate the request by calling the +scrape+
  # method to perform synchronous scraping.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://example.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class ScrapeRequest < Request

    ##
    # The +scrape+ method makes a Firecrawl '/scrape' POST request which will scrape the given
    # url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +ScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def scrape( url, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/scrape", options, &block )
      # parse the payload for both branches; previously 'attributes' was undefined ( nil ) in
      # the failure branch so the API supplied error message was always lost
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false }
        result = ScrapeResult.new( attributes[ :success ], attributes[ :data ] )
      else
        result = ErrorResult.new( response.status, attributes || {} )
      end

      ResponseMethods.install( response, result )
    end

  end
end
@@ -0,0 +1,92 @@
module Firecrawl

  ##
  # +ScrapeResult+ wraps the data payload of a single scraped page and exposes convenience
  # readers for each requested format.
  #
  class ScrapeResult

    # +success+ is the operation-level success flag; +attributes+ is the page's data payload.
    def initialize( success, attributes )
      @success = success
      @attributes = attributes || {}
    end

    ##
    # The +success?+ method returns +true+ if the scraping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual scraping operation was
    # successful.
    #
    def success?
      @success ? true : false
    end

    ##
    # The +markdown+ method returns scraped content that has been converted to markdown.
    # Present only if the request options +formats+ included +markdown+.
    #
    def markdown
      @attributes[ :markdown ]
    end

    ##
    # The +html+ method returns scraped html content. Present only if the request options
    # +formats+ included +html+.
    #
    def html
      @attributes[ :html ]
    end

    ##
    # The +raw_html+ method returns the full scraped html content of the page. Present only
    # if the request options +formats+ included +raw_html+.
    #
    def raw_html
      @attributes[ :rawHtml ]
    end

    ##
    # The +screenshot_url+ method returns the url of the screenshot of the requested page.
    # Present only if the request options +formats+ included +screenshot+ or
    # +screenshot@full_page+.
    #
    def screenshot_url
      @attributes[ :screenshot ]
    end

    ##
    # The +links+ method returns an array of the links that were scraped from the page.
    # Empty unless the request options +formats+ included +links+.
    #
    def links
      @attributes[ :links ] || []
    end

    ##
    # The +actions+ method returns an object of action results ( +scrapes+ or +screenshots+ ).
    # Empty unless the request options included +scrape+ or +screenshot+ actions.
    #
    def actions
      @attributes[ :actions ] || {}
    end

    ##
    # The +metadata+ method returns the page metadata with keys normalized to snake_case
    # strings; camelCase duplicates of 'og:' keys injected by Firecrawl are removed. The
    # normalized hash is memoized.
    #
    def metadata
      @metadata ||= begin
        raw = @attributes[ :metadata ] || {}
        normalized = raw.transform_keys do | key |
          key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
        end
        # drop the camelCase forms ( now og_* ) when the og:* original is also present
        normalized.delete_if do | key, _ |
          key.start_with?( 'og_' ) && normalized.key?( key.sub( 'og_', 'og:' ) )
        end
        normalized
      end
    end

    # the LLM extraction payload, when requested ( empty hash otherwise )
    def llm_extraction
      @attributes[ :llm_extraction ] || {}
    end

    # any warning message the API attached to this page
    def warning
      @attributes[ :warning ]
    end

  end
end
data/lib/firecrawl.rb
ADDED
@@ -0,0 +1,27 @@
require 'json'
require 'base64'
require 'uri'

require 'faraday'
require 'dynamic_schema'

require_relative 'firecrawl/helpers'
require_relative 'firecrawl/error_result'
require_relative 'firecrawl/request'
require_relative 'firecrawl/response_methods'

require_relative 'firecrawl/scrape_options'
require_relative 'firecrawl/scrape_result'
require_relative 'firecrawl/scrape_request'
require_relative 'firecrawl/batch_scrape_result'
require_relative 'firecrawl/batch_scrape_request'
# crawl_options.rb ships with the gem ( see spec.files / metadata ) but was never required,
# leaving Firecrawl::CrawlOptions undefined at runtime; it must load after scrape_options
# because its schema references ScrapeOptions.schema
require_relative 'firecrawl/crawl_options'
require_relative 'firecrawl/map_options'
require_relative 'firecrawl/map_result'
require_relative 'firecrawl/map_request'

module Firecrawl
  class << self
    # module-level API key used by Request when none is passed explicitly
    attr_accessor :api_key
  end
end
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: firecrawl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kristoph Cichocki-Romanov
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: faraday
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: dynamicschema
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0.beta03
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.0.0.beta03
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.13'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.13'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: debug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.9'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.9'
|
69
|
+
description: |-
|
70
|
+
The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
|
71
|
+
|
72
|
+
In addition, Firecrawl can crawl an entire site returning the pages it encounters or just the map of the pages, which can be used for subsequent scraping.
|
73
|
+
email:
|
74
|
+
- rubygems.org@kristoph.net
|
75
|
+
executables: []
|
76
|
+
extensions: []
|
77
|
+
extra_rdoc_files: []
|
78
|
+
files:
|
79
|
+
- LICENSE
|
80
|
+
- firecrawl.gemspec
|
81
|
+
- lib/firecrawl.rb
|
82
|
+
- lib/firecrawl/batch_scrape_request.rb
|
83
|
+
- lib/firecrawl/batch_scrape_result.rb
|
84
|
+
- lib/firecrawl/crawl_options.rb
|
85
|
+
- lib/firecrawl/error_result.rb
|
86
|
+
- lib/firecrawl/helpers.rb
|
87
|
+
- lib/firecrawl/map_options.rb
|
88
|
+
- lib/firecrawl/map_request.rb
|
89
|
+
- lib/firecrawl/map_result.rb
|
90
|
+
- lib/firecrawl/request.rb
|
91
|
+
- lib/firecrawl/response_methods.rb
|
92
|
+
- lib/firecrawl/scrape_options.rb
|
93
|
+
- lib/firecrawl/scrape_request.rb
|
94
|
+
- lib/firecrawl/scrape_result.rb
|
95
|
+
homepage: https://github.com/EndlessInternational/firecrawl
|
96
|
+
licenses:
|
97
|
+
- MIT
|
98
|
+
metadata:
|
99
|
+
source_code_uri: https://github.com/EndlessInternational/firecrawl
|
100
|
+
bug_tracker_uri: https://github.com/EndlessInternational/firecrawl/issues
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
require_paths:
|
104
|
+
- lib
|
105
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '3.0'
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '0'
|
115
|
+
requirements: []
|
116
|
+
rubygems_version: 3.5.19
|
117
|
+
signing_key:
|
118
|
+
specification_version: 4
|
119
|
+
summary: The Firecrawl gem implements a lightweight interface to the Firecrawl.dev
|
120
|
+
API which takes a URL, crawls it and returns html, markdown, or structured data.
|
121
|
+
It is of particular value when used with LLM's for grounding.
|
122
|
+
test_files: []
|