firecrawl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/firecrawl.gemspec +37 -0
- data/lib/firecrawl/batch_scrape_request.rb +106 -0
- data/lib/firecrawl/batch_scrape_result.rb +63 -0
- data/lib/firecrawl/crawl_options.rb +47 -0
- data/lib/firecrawl/error_result.rb +45 -0
- data/lib/firecrawl/helpers.rb +8 -0
- data/lib/firecrawl/map_options.rb +32 -0
- data/lib/firecrawl/map_request.rb +58 -0
- data/lib/firecrawl/map_result.rb +29 -0
- data/lib/firecrawl/request.rb +75 -0
- data/lib/firecrawl/response_methods.rb +15 -0
- data/lib/firecrawl/scrape_options.rb +58 -0
- data/lib/firecrawl/scrape_request.rb +60 -0
- data/lib/firecrawl/scrape_result.rb +92 -0
- data/lib/firecrawl.rb +27 -0
- metadata +122 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d79423650fbec60124bab7710e372f16774987f437e8bb52cb13e72a434bdec6
|
4
|
+
data.tar.gz: bfb0ec5c302e4e646812855d5c954e2fc6c075e4a03af495a6a09815ce588e44
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 46c819135a19388beb434c797e16a3a73f52b1d85cc37f317e03dc159de8ed1a56a6a5170f2e2457e2798fbff25d93cfbf11f7be12682fcb6ef13534aa347f62
|
7
|
+
data.tar.gz: ffe9b239c29138617a1902f8502099b6f0b2019eeed8b266cd8f77e94eee868e8e876ee9f8545cd584d701a8bfffa75f6552508b2949463f2cc187580eeb35fe
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Endless International
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/firecrawl.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Gem::Specification.new do | spec |

  spec.name     = 'firecrawl'
  spec.version  = '0.0.1'
  spec.authors  = [ 'Kristoph Cichocki-Romanov' ]
  spec.email    = [ 'rubygems.org@kristoph.net' ]

  # note: each continued string must end with a space so that the concatenated
  # summary reads "... of particular value when used ..." rather than "valuewhen"
  spec.summary =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API which takes " \
    "a URL, crawls it and returns html, markdown, or structured data. It is of particular value " \
    "when used with LLM's for grounding."
  spec.description =
    "The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl " \
    "can take a URL, scrape the page contents and return the whole page or principal content " \
    "as html, markdown, or structured data.\n" \
    "\n" \
    "In addition, Firecrawl can crawl an entire site returning the pages it encounters or just " \
    "the map of the pages, which can be used for subsequent scraping."
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/EndlessInternational/firecrawl'
  spec.metadata = {
    'source_code_uri'   => 'https://github.com/EndlessInternational/firecrawl',
    'bug_tracker_uri'   => 'https://github.com/EndlessInternational/firecrawl/issues',
    # 'documentation_uri' => 'https://github.com/EndlessInternational/firecrawl'
  }

  spec.required_ruby_version = '>= 3.0'
  spec.files = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "firecrawl.gemspec" ]
  spec.require_paths = [ "lib" ]

  spec.add_runtime_dependency 'faraday', '~> 2.7'
  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta03'

  spec.add_development_dependency 'rspec', '~> 3.13'
  spec.add_development_dependency 'debug', '~> 1.9'

end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module Firecrawl

  ##
  # The +BatchScrapeRequest+ class encapsulates a batch scrape request to the Firecrawl API.
  # After creating a new +BatchScrapeRequest+ instance you can begin batch scraping by calling
  # the +start_scraping+ method and then subsequently retrieve the results by calling the
  # +retrieve_scraping+ method.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::BatchScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   urls = [ 'https://example.com', 'https://icann.org' ]
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   batch_response = request.start_scraping( urls, options )
  #   while batch_response.success?
  #     batch_result = batch_response.result
  #     if batch_result.success?
  #       batch_result.scrape_results.each do | result |
  #         puts result.metadata[ 'title' ]
  #         puts '---'
  #         puts result.markdown
  #         puts "\n\n"
  #       end
  #     end
  #     break unless batch_result.status?( :scraping )
  #     batch_response = request.retrieve_scraping( batch_result )
  #   end
  #
  #   unless batch_response.success?
  #     puts batch_response.result.error_description
  #   end
  #
  class BatchScrapeRequest < Request

    ##
    # The +start_scraping+ method makes a Firecrawl '/batch/scrape' POST request which will
    # initiate batch scraping of the given urls.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +BatchScrapeResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def start_scraping( urls, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :urls ] = [ urls ].flatten

      response = post( "#{BASE_URI}/batch/scrape", options, &block )
      # the body is parsed for failed responses too so that ErrorResult can surface the
      # 'error' text Firecrawl returns in its json payload
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false, status: :failed }
        result = BatchScrapeResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve_scraping+ method makes a Firecrawl '/batch/scrape' GET request which will
    # retrieve batch scraping results. Note that there is no guarantee that there are any batch
    # scraping results at the time of the call and you may need to call this method multiple
    # times.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # +true+, then +response.result+ will be an instance of +BatchScrapeResult+. If the request
    # is not successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API
    # was successful and then +response.result.success?+ to validate that the API processed
    # the request successfully.
    #
    def retrieve_scraping( batch_result, &block )
      raise ArgumentError, "The first argument must be an instance of BatchScrapeResult." \
        unless batch_result.is_a?( BatchScrapeResult )
      response = get( batch_result.next_url, &block )
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false, status: :failed }
        # merging preserves attributes, such as the batch id and urls, from prior responses
        result = batch_result.merge( attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +BatchScrapeResult+ class wraps the payload returned by the Firecrawl batch scrape
  # endpoints, exposing the batch status, progress counters, polling urls and the individual
  # +ScrapeResult+ instances collected so far.
  #
  class BatchScrapeResult

    def initialize( success, attributes )
      @success = success
      @attributes = attributes || {}
    end

    # true when the batch scrape operation itself was successful
    def success?
      @success ? true : false
    end

    # the batch status as a symbol; the initial Firecrawl response carries no status so a
    # synthetic :scraping ( or :failed ) status is returned in that case
    def status
      explicit = @attributes[ :status ]
      return explicit.to_sym if explicit
      @success ? :scraping : :failed
    end

    # true when the batch status equals the given status
    def status?( status )
      status == self.status
    end

    # the batch identifier assigned by Firecrawl
    def id
      @attributes[ :id ]
    end

    # the total number of urls in the batch
    def total
      @attributes[ :total ] || 0
    end

    # the number of urls scraped so far
    def completed
      @attributes[ :completed ] || 0
    end

    # the number of Firecrawl credits consumed by the batch
    def credits_used
      @attributes[ :creditsUsed ] || 0
    end

    # the date after which the results are no longer retrievable, or nil when absent/unparsable
    def expires_at
      Date.parse( @attributes[ :expiresAt ] ) rescue nil
    end

    # the url of the batch
    def url
      @attributes[ :url ]
    end

    # the url to poll for further results, falling back to the batch url
    def next_url
      @attributes[ :next ] || @attributes[ :url ]
    end

    # the individual page results collected so far, as +ScrapeResult+ instances
    def scrape_results
      # nil entries are compacted because null entries have been observed in the data
      entries = @attributes[ :data ]&.compact || []
      entries.map { | entry | ScrapeResult.new( @attributes[ :success ], entry ) }
    end

    # a new result combining these attributes with the given ( newer ) attributes
    def merge( attributes )
      self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +CrawlOptions+ class builds the options hash for a Firecrawl crawl request, mapping
  # ruby style snake_case names to the camelCase keys the API expects.
  #
  class CrawlOptions
    include DynamicSchema::Definable
    include DynamicSchema::Buildable
    # Helpers provides +string_camelize+, which is used to normalize scrape formats
    include Helpers

    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot ]

    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      exclude_paths         String, as: :excludePaths, array: true
      include_paths         String, as: :includePaths, array: true
      maximum_depth         Integer, as: :maxDepth
      ignore_sitemap        [ TrueClass, FalseClass ], as: :ignoreSitemap
      limit                 Integer
      allow_backward_links  [ TrueClass, FalseClass ], as: :allowBackwardLinks
      allow_external_links  [ TrueClass, FalseClass ], as: :allowExternalLinks
      webhook               String
      scrape_options        as: :scrapeOptions, &ScrapeOptions.schema
    end

    # builds options from a hash and / or schema block
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # builds options from a hash and / or schema block, raising on invalid input
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    ##
    # note: +options+ defaults to nil so that the class level +build+ helpers, which pass
    # only +api_options:+, do not raise an ArgumentError for the missing positional
    #
    def initialize( options = nil, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options

      # nested scrape formats are given in snake_case and must be camelized for the API
      scrape_options = @options[ :scrapeOptions ]
      if scrape_options
        scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
      end
    end

    def to_h
      @options.to_h
    end

  end
end
|
46
|
+
|
47
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +ErrorResult+ class represents a failed Firecrawl API interaction. It converts the
  # http status code of the response into a symbolic +error_type+ and a human readable
  # +error_description+, preferring the 'error' text returned by Firecrawl when present.
  #
  class ErrorResult

    attr_reader :error_type, :error_description

    ##
    # +status_code+ is the http status code of the response; +attributes+ is the parsed
    # response payload, whose +:error+ entry ( when present ) overrides the generic
    # description derived from the status code.
    #
    def initialize( status_code, attributes = nil )
      # note: the ivar is @error_type ( not @error_code ) so that the attr_reader works
      @error_type, @error_description = status_code_to_error( status_code )
      # prefer the service supplied error text when the payload includes one
      @error_description = attributes[ :error ] \
        if attributes.respond_to?( :[] ) && attributes[ :error ]
    end

    private

    # maps an http status code to a [ error_type, default_description ] pair
    def status_code_to_error( status_code )
      case status_code
      # this is here because invalid payloads have been observed returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Firecrawl server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Firecrawl service is overloaded." ]
      else
        [ :unknown_error,
          "The Firecrawl service returned an unexpected status code: '#{status_code}'." ]
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +MapOptions+ class builds the options hash for a Firecrawl '/map' request.
  #
  class MapOptions
    include DynamicSchema::Definable

    schema do
      search             String
      ignore_sitemap     [ TrueClass, FalseClass ]
      ignore_subdomains  [ TrueClass, FalseClass ]
      limit              Integer
    end

    # builds options from a hash and / or schema block
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # builds options from a hash and / or schema block, raising on invalid input
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      built = self.class.builder.build( options || {} )
      @options = api_options ? api_options.merge( built ) : built
    end

    def to_h
      @options.to_h
    end

  end
end
|
31
|
+
|
32
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +MapRequest+ class encapsulates a '/map' POST request to the Firecrawl API. After
  # creating a new +MapRequest+ instance you can make the request by calling the +map+ method
  # to crawl the site and retrieve +links+.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::MapRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   response = request.map( 'https://example.com', { limit: 100 } )
  #   if response.success?
  #     result = response.result
  #     if result.success?
  #       result.links.each do | link |
  #         puts link
  #       end
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MapRequest < Request

    ##
    # The +map+ method makes a Firecrawl '/map' POST request which will scrape the site with
    # the given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # true, then +response.result+ will be an instance of +MapResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def map( url, options = nil, &block )
      if options
        options = options.is_a?( MapOptions ) ? options : MapOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/map", options, &block )
      # the body is parsed for failed responses too so that ErrorResult can surface the
      # 'error' text Firecrawl returns in its json payload
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false }
        result = MapResult.new( attributes[ :success ], attributes )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +MapResult+ class wraps the payload of a Firecrawl '/map' response.
  #
  class MapResult

    def initialize( success, attributes )
      @success = success
      @attributes = attributes
    end

    ##
    # The +success?+ method returns +true+ if the mapping operation was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual operation succeeded.
    #
    def success?
      @success ? true : false
    end

    ##
    # The +links+ method returns the array of links discovered by the map operation, or an
    # empty array when the payload carried none.
    #
    def links
      @attributes[ :links ] || []
    end

  end
end
|
29
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Firecrawl

  ##
  # The +Request+ class is the base class for Firecrawl API requests. It holds the Faraday
  # connection and api key and provides authenticated +post+ / +get+ helpers to subclasses
  # ( +ScrapeRequest+, +BatchScrapeRequest+, +MapRequest+ ).
  #
  # === example
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, :screenshot ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://cnn.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class Request

    DEFAULT_CONNECTION = Faraday.new { | builder | builder.adapter Faraday.default_adapter }

    BASE_URI = 'https://api.firecrawl.dev/v1'

    ##
    # The +initialize+ method initializes the +Request+ instance. You MUST pass an +api_key+
    # ( or configure one globally through +Firecrawl.api_key+ ) and may optionally pass a
    # ( Faraday ) +connection+.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || DEFAULT_CONNECTION
      @api_key = api_key || Firecrawl.api_key
      raise ArgumentError, "An 'api_key' is required unless configured using 'Firecrawl.api_key'." \
        unless @api_key
    end

    protected

    # issues an authenticated json POST request to the given uri; the optional block may
    # further customize the Faraday request before it is sent
    def post( uri, body, &block )
      @connection.post( uri ) do | request |
        headers.each { | key, value | request.headers[ key ] = value }
        # a String body is assumed to already be json encoded
        request.body = body.is_a?( String ) ? body : JSON.generate( body )
        block.call( request ) if block
      end
    end

    # issues an authenticated GET request to the given uri; the optional block may further
    # customize the Faraday request before it is sent
    def get( uri, &block )
      @connection.get( uri ) do | request |
        headers.each { | key, value | request.headers[ key ] = value }
        block.call( request ) if block
      end
    end

    private

    # the authentication and content type headers shared by every Firecrawl request
    def headers
      {
        'Authorization' => "Bearer #{@api_key}",
        'Content-Type'  => 'application/json'
      }
    end

  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Firecrawl
  #
  # The +ResponseMethods+ module decorates a Faraday response with a +result+ accessor that
  # returns the Firecrawl result object constructed from the response body.
  #
  module ResponseMethods
    # attaches the given result to the response and returns the ( extended ) response
    def self.install( response, result )
      response.instance_variable_set( :@_firecrawl_result, result )
      response.extend( self )
    end

    # the Firecrawl result object attached by +install+
    def result
      @_firecrawl_result
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +ScrapeOptions+ class builds the options hash for a Firecrawl scrape ( or batch
  # scrape ) request, mapping ruby style snake_case names to the camelCase keys the API
  # expects.
  #
  class ScrapeOptions
    include DynamicSchema::Definable
    # Helpers provides +string_camelize+, used to normalize format names below
    include Helpers

    # the output formats Firecrawl can return for a scraped page
    FORMATS = [ :markdown, :links, :html, :raw_html, :screenshot, :"screenshot@full_page" ]

    # the browser actions Firecrawl can perform before scraping
    ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]

    schema do
      # note: both format and formats are defined as a semantic convenience
      format String, as: :formats, array: true, in: FORMATS
      formats String, array: true, in: FORMATS
      only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
      include_tags String, as: :includeTags, array: true
      exclude_tags String, as: :excludeTags, array: true
      wait_for Integer
      timeout Integer
      # llm extraction instructions
      extract do
        #schema Hash
        system_prompt String, as: :systemPrompt
        prompt String
      end
      # browser actions performed in order before the page is scraped; each entry is
      # discriminated by its :type ( see ACTIONS ) and uses the matching field below
      action as: :actions, arguments: :type, array: true do
        type Symbol, required: true, in: ACTIONS
        # wait
        milliseconds Integer
        # click
        selector String
        # write
        text String
        # press
        key String
      end
    end

    # builds options from a hash and / or schema block
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # builds options from a hash and / or schema block, raising on invalid input
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
      # formats are given in snake_case ( :raw_html ) and camelized ( rawHtml ) for the API
      @options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
    end

    def to_h
      @options.to_h
    end

  end
end
|
57
|
+
|
58
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +ScrapeRequest+ class encapsulates a '/scrape' POST request to the Firecrawl API.
  # After creating a new +ScrapeRequest+ instance you can initiate the request by calling the
  # +scrape+ method to perform synchronous scraping.
  #
  # === examples
  #
  #   require 'firecrawl'
  #
  #   request = Firecrawl::ScrapeRequest.new( api_key: ENV[ 'FIRECRAWL_API_KEY' ] )
  #
  #   options = Firecrawl::ScrapeOptions.build do
  #     format [ :markdown, 'screenshot@full_page' ]
  #     only_main_content true
  #   end
  #
  #   response = request.scrape( 'https://example.com', options )
  #   if response.success?
  #     result = response.result
  #     puts result.metadata[ 'title' ]
  #     puts '---'
  #     puts result.markdown
  #   else
  #     puts response.result.error_description
  #   end
  #
  class ScrapeRequest < Request

    ##
    # The +scrape+ method makes a Firecrawl '/scrape' POST request which will scrape the
    # given url.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is
    # true, then +response.result+ will be an instance of +ScrapeResult+. If the request is
    # not successful then +response.result+ will be an instance of +ErrorResult+.
    #
    def scrape( url, options = nil, &block )
      if options
        options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      options[ :url ] = url.to_s

      response = post( "#{BASE_URI}/scrape", options, &block )
      # the body is parsed for failed responses too so that ErrorResult can surface the
      # 'error' text Firecrawl returns in its json payload
      attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
      result = nil
      if response.success?
        attributes ||= { success: false }
        result = ScrapeResult.new( attributes[ :success ], attributes[ :data ] )
      else
        result = ErrorResult.new( response.status, attributes )
      end

      ResponseMethods.install( response, result )
    end

  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Firecrawl
  ##
  # The +ScrapeResult+ class wraps the payload of a single scraped page, exposing the
  # requested formats ( markdown, html, raw html, screenshot, links ), action results and
  # normalized page metadata.
  #
  class ScrapeResult

    def initialize( success, attributes )
      @success = success
      @attributes = attributes || {}
    end

    ##
    # The +success?+ method returns +true+ if the scraping was successful.
    #
    # Note that the response +success?+ tells you if the call to the Firecrawl api was
    # successful while this +success?+ method tells you if the actual scraping operation
    # was successful.
    #
    def success?
      @success ? true : false
    end

    ##
    # The +markdown+ method returns scraped content that has been converted to markdown. The
    # markdown content is present only if the request options +formats+ included +markdown+.
    #
    def markdown
      @attributes[ :markdown ]
    end

    ##
    # The +html+ method returns scraped html content. The html content is present only if
    # the request options +formats+ included +html+.
    #
    def html
      @attributes[ :html ]
    end

    ##
    # The +raw_html+ method returns the full scraped html content of the page. The raw html
    # content is present only if the request options +formats+ included +raw_html+.
    #
    def raw_html
      @attributes[ :rawHtml ]
    end

    ##
    # The +screenshot_url+ method returns the url of the screenshot of the requested page.
    # The screenshot url is present only if the request options +formats+ included
    # +screenshot+ or +screenshot@full_page+.
    #
    def screenshot_url
      @attributes[ :screenshot ]
    end

    ##
    # The +links+ method returns an array of the links that were scraped from the page. The
    # +links+ are empty unless the request options +formats+ included +links+.
    #
    def links
      @attributes[ :links ] || []
    end

    ##
    # The +actions+ method returns an object of action results ( +scrapes+ or
    # +screenshots+ ). The +actions+ are empty unless the request options included
    # +scrape+ or +screenshot+ actions.
    #
    def actions
      @attributes[ :actions ] || {}
    end

    ##
    # The +metadata+ method returns the page metadata with keys normalized to snake_case
    # strings; duplicate underscore forms of the opengraph ( 'og:' ) keys are discarded.
    #
    def metadata
      @metadata ||= begin
        normalized = ( @attributes[ :metadata ] || {} ).transform_keys do | key |
          key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
        end
        # remove the camelCase forms injected by Firecrawl
        normalized.delete_if do | key, _ |
          key.start_with?( 'og_' ) && normalized.key?( key.sub( 'og_', 'og:' ) )
        end
        normalized
      end
    end

    # the llm extraction payload, when extraction was requested
    def llm_extraction
      @attributes[ :llm_extraction ] || {}
    end

    # the warning text Firecrawl attached to the scrape, if any
    def warning
      @attributes[ :warning ]
    end

  end
end
|
data/lib/firecrawl.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# standard library
require 'json'
require 'base64'
require 'uri'

# runtime dependencies ( see firecrawl.gemspec )
require 'faraday'
require 'dynamic_schema'

# shared helpers and the building blocks used by every request type
require_relative 'firecrawl/helpers'
require_relative 'firecrawl/error_result'
require_relative 'firecrawl/request'
require_relative 'firecrawl/response_methods'

# scrape, batch scrape and map options / result / request classes
# NOTE(review): lib/firecrawl/crawl_options.rb ships in the gem but is not required
# here — confirm whether it should be loaded as well.
require_relative 'firecrawl/scrape_options'
require_relative 'firecrawl/scrape_result'
require_relative 'firecrawl/scrape_request'
require_relative 'firecrawl/batch_scrape_result'
require_relative 'firecrawl/batch_scrape_request'
require_relative 'firecrawl/map_options'
require_relative 'firecrawl/map_result'
require_relative 'firecrawl/map_request'

module Firecrawl
  class << self
    # a module level api key used by Request instances when none is passed explicitly
    attr_accessor :api_key
  end
end
|
27
|
+
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: firecrawl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kristoph Cichocki-Romanov
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: faraday
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: dynamicschema
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0.beta03
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.0.0.beta03
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.13'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.13'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: debug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.9'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.9'
|
69
|
+
description: |-
|
70
|
+
The Firecrawl gem implements a lightweight interface to the Firecrawl.dev API. Firecrawl can take a URL, scrape the page contents and return the whole page or principal content as html, markdown, or structured data.
|
71
|
+
|
72
|
+
In addition, Firecrawl can crawl an entire site returning the pages it encounters or just the map of the pages, which can be used for subsequent scraping.
|
73
|
+
email:
|
74
|
+
- rubygems.org@kristoph.net
|
75
|
+
executables: []
|
76
|
+
extensions: []
|
77
|
+
extra_rdoc_files: []
|
78
|
+
files:
|
79
|
+
- LICENSE
|
80
|
+
- firecrawl.gemspec
|
81
|
+
- lib/firecrawl.rb
|
82
|
+
- lib/firecrawl/batch_scrape_request.rb
|
83
|
+
- lib/firecrawl/batch_scrape_result.rb
|
84
|
+
- lib/firecrawl/crawl_options.rb
|
85
|
+
- lib/firecrawl/error_result.rb
|
86
|
+
- lib/firecrawl/helpers.rb
|
87
|
+
- lib/firecrawl/map_options.rb
|
88
|
+
- lib/firecrawl/map_request.rb
|
89
|
+
- lib/firecrawl/map_result.rb
|
90
|
+
- lib/firecrawl/request.rb
|
91
|
+
- lib/firecrawl/response_methods.rb
|
92
|
+
- lib/firecrawl/scrape_options.rb
|
93
|
+
- lib/firecrawl/scrape_request.rb
|
94
|
+
- lib/firecrawl/scrape_result.rb
|
95
|
+
homepage: https://github.com/EndlessInternational/firecrawl
|
96
|
+
licenses:
|
97
|
+
- MIT
|
98
|
+
metadata:
|
99
|
+
source_code_uri: https://github.com/EndlessInternational/firecrawl
|
100
|
+
bug_tracker_uri: https://github.com/EndlessInternational/firecrawl/issues
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
require_paths:
|
104
|
+
- lib
|
105
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '3.0'
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '0'
|
115
|
+
requirements: []
|
116
|
+
rubygems_version: 3.5.19
|
117
|
+
signing_key:
|
118
|
+
specification_version: 4
|
119
|
+
summary: The Firecrawl gem implements a lightweight interface to the Firecrawl.dev
|
120
|
+
API which takes a URL, crawls it and returns html, markdown, or structured data.
|
121
|
+
It is of particular value when used with LLM's for grounding.
|
122
|
+
test_files: []
|