firecrawl 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/firecrawl.gemspec +5 -3
- data/lib/firecrawl/batch_scrape_options.rb +67 -0
- data/lib/firecrawl/batch_scrape_request.rb +4 -5
- data/lib/firecrawl/batch_scrape_result.rb +27 -52
- data/lib/firecrawl/crawl_options.rb +15 -9
- data/lib/firecrawl/crawl_request.rb +8 -4
- data/lib/firecrawl/crawl_result.rb +24 -54
- data/lib/firecrawl/map_options.rb +12 -15
- data/lib/firecrawl/map_request.rb +1 -2
- data/lib/firecrawl/map_result.rb +15 -23
- data/lib/firecrawl/request.rb +1 -1
- data/lib/firecrawl/scrape_options.rb +43 -11
- data/lib/firecrawl/scrape_request.rb +5 -4
- data/lib/firecrawl/scrape_result.rb +35 -80
- data/lib/firecrawl/version.rb +3 -0
- data/lib/firecrawl.rb +3 -1
- metadata +9 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f895548e284fae55f2284c335f3e36d3d7dfb2c0c81fea525235968c90a2c8d
|
4
|
+
data.tar.gz: 91a8f19d1281a37c87b12cd37f1e312e1044808e3c7b14a976d250aea51fb989
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21420a482873a02c56a1caa7e7e149acd3f00543d6dc9e969f817a0abd42ff7ec442ea1cb0ebb18020f21c72af0da66927085c2595915388e5e169a3bea8a93f
|
7
|
+
data.tar.gz: 17b793284e7480b693f980d0b5f03e8ec1e4c606cabef3b8bd9728ad91029f7279b430bb4f6ea820347f45a7eb8cac02f82f9704fdde793ccd1028b7b86e3d18
|
data/firecrawl.gemspec
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require_relative 'lib/firecrawl/version'
|
2
|
+
|
1
3
|
Gem::Specification.new do | spec |
|
2
4
|
|
3
5
|
spec.name = 'firecrawl'
|
4
|
-
spec.version =
|
6
|
+
spec.version = Firecrawl::VERSION
|
5
7
|
spec.authors = [ 'Kristoph Cichocki-Romanov' ]
|
6
8
|
spec.email = [ 'rubygems.org@kristoph.net' ]
|
7
9
|
|
@@ -28,8 +30,8 @@ Gem::Specification.new do | spec |
|
|
28
30
|
spec.files = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "firecrawl.gemspec" ]
|
29
31
|
spec.require_paths = [ "lib" ]
|
30
32
|
|
31
|
-
spec.add_runtime_dependency 'faraday', '~> 2
|
32
|
-
spec.add_runtime_dependency 'dynamicschema', '~>
|
33
|
+
spec.add_runtime_dependency 'faraday', '~> 2'
|
34
|
+
spec.add_runtime_dependency 'dynamicschema', '~> 2'
|
33
35
|
|
34
36
|
spec.add_development_dependency 'rspec', '~> 3.13'
|
35
37
|
spec.add_development_dependency 'debug', '~> 1.9'
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Firecrawl
|
2
|
+
class BatchScrapeOptions
|
3
|
+
include DynamicSchema::Definable
|
4
|
+
include Helpers
|
5
|
+
|
6
|
+
schema do
|
7
|
+
proxy Symbol, in: [ :basic, :stealth, :auto ]
|
8
|
+
skip_tls_verification [ TrueClass, FalseClass ], as: :skipTlsVerification
|
9
|
+
mobile [ TrueClass, FalseClass ]
|
10
|
+
location arguments: :country do
|
11
|
+
country String, required: true # two digit country code
|
12
|
+
languages String, array: true #en-US jp etc
|
13
|
+
end
|
14
|
+
max_age Integer, as: :maxAge
|
15
|
+
headers Hash
|
16
|
+
|
17
|
+
only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
|
18
|
+
include_tags String, as: :includeTags, array: true
|
19
|
+
exclude_tags String, as: :excludeTags, array: true
|
20
|
+
remove_base64_images [ TrueClass, FalseClass ], as: :removeBase64Images
|
21
|
+
block_ads [ TrueClass, FalseClass ], as: :blockAds
|
22
|
+
|
23
|
+
wait_for Integer, as: :waitFor
|
24
|
+
timeout Integer
|
25
|
+
parsers Symbol, array: true, in: [ :pdf ]
|
26
|
+
|
27
|
+
formats Symbol, in: ScrapeOptions::FORMATS, array: true
|
28
|
+
screenshot do
|
29
|
+
type String, default: 'screenshot'
|
30
|
+
full_page [ TrueClass, FalseClass ], as: :fullPage
|
31
|
+
quality Integer, in: 0..100
|
32
|
+
viewport do
|
33
|
+
height Integer, required: true
|
34
|
+
width Integer, required: true
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
webhook do
|
39
|
+
url URI, required: true
|
40
|
+
headers Hash
|
41
|
+
metadata Hash
|
42
|
+
events Symbol, array: true, in: [ :completed, :page, :failed, :started ]
|
43
|
+
end
|
44
|
+
|
45
|
+
cache [ TrueClass, FalseClass ], as: :storeInCache
|
46
|
+
zero_data_retention [ TrueClass, FalseClass ], as: :zeroDataRetention
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.build( options = nil, &block )
|
50
|
+
new( api_options: builder.build( options, &block ) )
|
51
|
+
end
|
52
|
+
|
53
|
+
def self.build!( options = nil, &block )
|
54
|
+
new( api_options: builder.build!( options, &block ) )
|
55
|
+
end
|
56
|
+
|
57
|
+
def initialize( options = nil, api_options: nil )
|
58
|
+
@options = self.class.builder.build( options || {} )
|
59
|
+
@options = api_options.merge( @options ) if api_options
|
60
|
+
end
|
61
|
+
|
62
|
+
def to_h
|
63
|
+
@options.to_h
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
67
|
+
end
|
@@ -54,7 +54,7 @@ module Firecrawl
|
|
54
54
|
def submit( urls, options = nil, &block )
|
55
55
|
if options
|
56
56
|
options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
|
57
|
-
options = options.to_h
|
57
|
+
options = ScrapeOptions.normalize_options( options.to_h )
|
58
58
|
else
|
59
59
|
options = {}
|
60
60
|
end
|
@@ -63,8 +63,7 @@ module Firecrawl
|
|
63
63
|
result = nil
|
64
64
|
attributes = JSON.parse( response.body, symbolize_names: true ) rescue nil
|
65
65
|
if response.success?
|
66
|
-
|
67
|
-
result = BatchScrapeResult.new( attributes[ :success ], attributes )
|
66
|
+
result = BatchScrapeResult.new( attributes )
|
68
67
|
else
|
69
68
|
result = ErrorResult.new( response.status, attributes || {} )
|
70
69
|
end
|
@@ -95,7 +94,7 @@ module Firecrawl
|
|
95
94
|
attributes = JSON.parse( response.body, symbolize_names: true ) rescue nil
|
96
95
|
if response.success?
|
97
96
|
attributes ||= { success: false, status: :failed }
|
98
|
-
result = batch_result.
|
97
|
+
result = batch_result.merge_attributes( attributes )
|
99
98
|
else
|
100
99
|
result = ErrorResult.new( response.status, attributes || {} )
|
101
100
|
end
|
@@ -131,7 +130,7 @@ module Firecrawl
|
|
131
130
|
# the next url should not be set by this method so that retrieve and retrieve_all do
|
132
131
|
# not impact each other
|
133
132
|
attributes.delete( :next )
|
134
|
-
result = batch_result.
|
133
|
+
result = batch_result.merge_attributes( attributes )
|
135
134
|
else
|
136
135
|
result = ErrorResult.new( response.status, attributes || {} )
|
137
136
|
end
|
@@ -1,63 +1,38 @@
|
|
1
1
|
module Firecrawl
|
2
|
-
class BatchScrapeResult
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
def success?
|
10
|
-
@success || false
|
11
|
-
end
|
3
|
+
BatchScrapeResultSchema = DynamicSchema::Struct.define do
|
4
|
+
success [ TrueClass, FalseClass ]
|
5
|
+
id String
|
6
|
+
invalid_urls String, array: true
|
12
7
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
@attributes[ :status ]&.to_sym || ( @success ? :scraping : :failed )
|
17
|
-
end
|
8
|
+
total Integer
|
9
|
+
completed Integer
|
10
|
+
credits_used Integer, as: :creditsUsed
|
18
11
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
end
|
12
|
+
url String
|
13
|
+
_value :next, type: String
|
14
|
+
|
15
|
+
expires_at Date, as: :expiresAt
|
16
|
+
data ScrapeResultData, array: true, default: []
|
17
|
+
end
|
26
18
|
|
27
|
-
|
28
|
-
|
29
|
-
end
|
19
|
+
class BatchScrapeResult < BatchScrapeResultSchema
|
20
|
+
extend Forwardable
|
30
21
|
|
31
|
-
|
32
|
-
|
33
|
-
|
22
|
+
def_delegators :data,
|
23
|
+
:[], :[]=, :<<, :push, :pop, :shift, :unshift,
|
24
|
+
:length, :size, :empty?, :each
|
34
25
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
26
|
+
def scraping?() = !!self.next_url
|
27
|
+
|
28
|
+
def next_url() = self.next || self.url
|
38
29
|
|
39
|
-
def
|
40
|
-
|
30
|
+
def merge_attributes( attributes )
|
31
|
+
new_attributes = self.to_h.merge( attributes )
|
32
|
+
data = attributes[ :data ]
|
33
|
+
new_attributes[ :data ] = self.data.concat( data ) if data
|
34
|
+
self.class.new( attributes )
|
41
35
|
end
|
42
|
-
|
43
|
-
def url
|
44
|
-
@attributes[ :url ]
|
45
|
-
end
|
46
|
-
|
47
|
-
def next_url
|
48
|
-
@attributes[ :next ] || @attributes[ :url ]
|
49
|
-
end
|
50
|
-
|
51
|
-
def scrape_results
|
52
|
-
success = @attributes[ :success ]
|
53
|
-
# note the &.compact is here because I've noted null entries in the data
|
54
|
-
( @attributes[ :data ]&.compact || [] ).map do | attr |
|
55
|
-
ScrapeResult.new( success, attr )
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def merge( attributes )
|
60
|
-
self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
|
61
|
-
end
|
62
36
|
end
|
37
|
+
|
63
38
|
end
|
@@ -6,12 +6,23 @@ module Firecrawl
|
|
6
6
|
schema do
|
7
7
|
exclude_paths String, as: :excludePaths, array: true
|
8
8
|
include_paths String, as: :includePaths, array: true
|
9
|
-
maximum_depth Integer, as: :
|
10
|
-
|
9
|
+
maximum_depth Integer, as: :maxDiscoveryDepth
|
10
|
+
sitemap Symbol, in: [ :skip, :include ]
|
11
11
|
limit Integer, in: (0..)
|
12
|
-
|
12
|
+
ignore_query_parameters [ TrueClass, FalseClass ], as: :ignoreQueryParameters
|
13
|
+
crawl_entire_domain [ TrueClass, FalseClass ], as: :crawlEntireDomain
|
13
14
|
allow_external_links [ TrueClass, FalseClass ], as: :allowExternalLinks
|
14
|
-
|
15
|
+
allow_subdomains String, as: :allowSubdomains
|
16
|
+
delay Integer, in: (0..)
|
17
|
+
max_concurency Integer, in: (0..), as: :maxConcurrency
|
18
|
+
|
19
|
+
webhook do
|
20
|
+
url URI, required: true
|
21
|
+
headers Hash
|
22
|
+
metadata Hash
|
23
|
+
events Symbol, array: true, in: [ :completed, :page, :failed, :started ]
|
24
|
+
end
|
25
|
+
|
15
26
|
scrape_options as: :scrapeOptions, &ScrapeOptions.schema
|
16
27
|
end
|
17
28
|
|
@@ -26,11 +37,6 @@ module Firecrawl
|
|
26
37
|
def initialize( options = nil, api_options: nil )
|
27
38
|
@options = self.class.builder.build( options || {} )
|
28
39
|
@options = api_options.merge( @options ) if api_options
|
29
|
-
|
30
|
-
scrape_options = @options[ :scrapeOptions ]
|
31
|
-
if scrape_options
|
32
|
-
scrape_options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
|
33
|
-
end
|
34
40
|
end
|
35
41
|
|
36
42
|
def to_h
|
@@ -57,6 +57,11 @@ module Firecrawl
|
|
57
57
|
if options
|
58
58
|
options = options.is_a?( CrawlOptions ) ? options : CrawlOptions.build( options.to_h )
|
59
59
|
options = options.to_h
|
60
|
+
|
61
|
+
scrape_options = options[ :scrapeOptions ]
|
62
|
+
if scrape_options
|
63
|
+
options[ :scrapeOptions ] = ScrapeOptions.normalize_options( scrape_options )
|
64
|
+
end
|
60
65
|
else
|
61
66
|
options = {}
|
62
67
|
end
|
@@ -65,8 +70,7 @@ module Firecrawl
|
|
65
70
|
result = nil
|
66
71
|
attributes = JSON.parse( response.body, symbolize_names: true ) rescue nil
|
67
72
|
if response.success?
|
68
|
-
|
69
|
-
result = CrawlResult.new( attributes[ :success ], attributes )
|
73
|
+
result = CrawlResult.new( attributes )
|
70
74
|
else
|
71
75
|
result = ErrorResult.new( response.status, attributes )
|
72
76
|
end
|
@@ -96,7 +100,7 @@ module Firecrawl
|
|
96
100
|
result = nil
|
97
101
|
attributes = JSON.parse( response.body, symbolize_names: true ) rescue nil
|
98
102
|
if response.success?
|
99
|
-
result = crawl_result.
|
103
|
+
result = crawl_result.merge_attributes( attributes || { success: false, status: :failed } )
|
100
104
|
else
|
101
105
|
result = ErrorResult.new( response.status, attributes || {} )
|
102
106
|
end
|
@@ -123,7 +127,7 @@ module Firecrawl
|
|
123
127
|
result = nil
|
124
128
|
attributes = JSON.parse( response.body, symbolize_names: true ) rescue nil
|
125
129
|
if response.success?
|
126
|
-
result = crawl_result.
|
130
|
+
result = crawl_result.merge_attributes( attributes || { success: false, status: :failed } )
|
127
131
|
else
|
128
132
|
result = ErrorResult.new( response.status, attributes || {} )
|
129
133
|
end
|
@@ -1,63 +1,33 @@
|
|
1
1
|
module Firecrawl
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
def status
|
14
|
-
# the initial Firecrawl response does not have a status so we synthesize a 'crawling'
|
15
|
-
# status if the operation was otherwise successful
|
16
|
-
@attributes[ :status ]&.to_sym || ( @success ? :scraping : :failed )
|
17
|
-
end
|
18
|
-
|
19
|
-
def status?( status )
|
20
|
-
self.status == status
|
21
|
-
end
|
2
|
+
CrawlResultSchema = DynamicSchema::Struct.define do
|
3
|
+
success [ TrueClass, FalseClass ]
|
4
|
+
id String
|
5
|
+
total Integer
|
6
|
+
completed Integer
|
7
|
+
credits_used Integer, as: :creditsUsed
|
8
|
+
url String
|
9
|
+
_value :next, type: String
|
10
|
+
expires_at Date, as: :expiresAt
|
11
|
+
data ScrapeResultData, array: true, default: []
|
12
|
+
end
|
22
13
|
|
23
|
-
|
24
|
-
|
25
|
-
end
|
14
|
+
class CrawlResult < CrawlResultSchema
|
15
|
+
extend Forwardable
|
26
16
|
|
27
|
-
|
28
|
-
|
29
|
-
|
17
|
+
def_delegators :data,
|
18
|
+
:[], :[]=, :<<, :push, :pop, :shift, :unshift,
|
19
|
+
:length, :size, :empty?, :each
|
30
20
|
|
31
|
-
def
|
32
|
-
|
33
|
-
end
|
21
|
+
def success?() = self.success
|
22
|
+
def crawling?() = !!self.next_url
|
34
23
|
|
35
|
-
def
|
36
|
-
@attributes[ :creditsUsed ] || 0
|
37
|
-
end
|
24
|
+
def next_url() = self.next || self.url
|
38
25
|
|
39
|
-
def
|
40
|
-
|
26
|
+
def merge_attributes( attributes )
|
27
|
+
new_attributes = self.to_h.merge( attributes )
|
28
|
+
data = attributes[ :data ]
|
29
|
+
new_attributes[ :data ] = self.data.concat( data ) if data
|
30
|
+
self.class.new( attributes )
|
41
31
|
end
|
42
|
-
|
43
|
-
def url
|
44
|
-
@attributes[ :url ]
|
45
|
-
end
|
46
|
-
|
47
|
-
def next_url
|
48
|
-
@attributes[ :next ] || @attributes[ :url ]
|
49
|
-
end
|
50
|
-
|
51
|
-
def scrape_results
|
52
|
-
success = @attributes[ :success ]
|
53
|
-
# note the &.compact is here because I've noted null entries in the data
|
54
|
-
( @attributes[ :data ]&.compact || [] ).map do | attr |
|
55
|
-
ScrapeResult.new( success, attr )
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def merge( attributes )
|
60
|
-
self.class.new( attributes[ :success ], @attributes.merge( attributes ) )
|
61
|
-
end
|
62
32
|
end
|
63
33
|
end
|
@@ -1,30 +1,27 @@
|
|
1
1
|
module Firecrawl
|
2
2
|
class MapOptions
|
3
3
|
include DynamicSchema::Definable
|
4
|
+
include DynamicSchema::Buildable
|
4
5
|
|
5
6
|
schema do
|
6
|
-
search
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
search String
|
8
|
+
sitemap Symbol, in: [ :skip, :include ]
|
9
|
+
include_subdomains [ TrueClass, FalseClass ], as: :includeSubdomains
|
10
|
+
ignore_query_parameters [ TrueClass, FalseClass ], as: :ignoreQueryParameters
|
11
|
+
limit Integer, in: (0..)
|
12
|
+
timeout Integer, in: (0..)
|
13
|
+
location arguments: :country do
|
14
|
+
country String, required: true # two digit country code
|
15
|
+
languages String, array: true #en-US jp etc
|
16
|
+
end
|
10
17
|
end
|
11
18
|
|
12
|
-
def self.build( options = nil, &block )
|
13
|
-
new( api_options: builder.build( options, &block ) )
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.build!( options = nil, &block )
|
17
|
-
new( api_options: builder.build!( options, &block ) )
|
18
|
-
end
|
19
|
-
|
20
19
|
def initialize( options = {}, api_options: nil )
|
21
20
|
@options = self.class.builder.build( options || {} )
|
22
21
|
@options = api_options.merge( @options ) if api_options
|
23
22
|
end
|
24
23
|
|
25
|
-
def to_h
|
26
|
-
@options.to_h
|
27
|
-
end
|
24
|
+
def to_h() = @options.to_h
|
28
25
|
|
29
26
|
end
|
30
27
|
end
|
@@ -45,8 +45,7 @@ module Firecrawl
|
|
45
45
|
result = nil
|
46
46
|
if response.success?
|
47
47
|
attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
|
48
|
-
|
49
|
-
result = MapResult.new( attributes[ :success ], attributes )
|
48
|
+
result = MapResult.new( attributes )
|
50
49
|
else
|
51
50
|
result = ErrorResult.new( response.status, attributes )
|
52
51
|
end
|
data/lib/firecrawl/map_result.rb
CHANGED
@@ -1,29 +1,21 @@
|
|
1
1
|
module Firecrawl
|
2
|
-
class MapResult
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
# Note that the response +success?+ tells you if the call to the Firecrawl api was successful
|
13
|
-
# while this +success?+ method tells you if the actual scraping operation was successful.
|
14
|
-
#
|
15
|
-
def success?
|
16
|
-
@success || false
|
17
|
-
end
|
18
|
-
|
19
|
-
##
|
20
|
-
# The +links+ method returns an array of the links that were scraped from the the page.
|
21
|
-
# The +links+ are empty unless the request options +formats+ included +links+.
|
22
|
-
#
|
23
|
-
def links
|
24
|
-
@attributes[ :links ] || []
|
25
|
-
end
|
3
|
+
MapResultSchema = DynamicSchema::Struct.define do
|
4
|
+
success [ TrueClass, FalseClass ]
|
5
|
+
links array: true do
|
6
|
+
url String
|
7
|
+
title String
|
8
|
+
description String
|
9
|
+
end
|
10
|
+
end
|
26
11
|
|
12
|
+
class MapResult < MapResultSchema
|
13
|
+
extend Forwardable
|
14
|
+
def_delegators :links,
|
15
|
+
:[], :[]=, :<<, :push, :pop, :shift, :unshift,
|
16
|
+
:length, :size, :empty?, :each
|
17
|
+
def success?() = success
|
27
18
|
end
|
19
|
+
|
28
20
|
end
|
29
21
|
|
data/lib/firecrawl/request.rb
CHANGED
@@ -3,24 +3,42 @@ module Firecrawl
|
|
3
3
|
include DynamicSchema::Definable
|
4
4
|
include Helpers
|
5
5
|
|
6
|
-
FORMATS = [ :
|
6
|
+
FORMATS = [ :summary, :markdown, :html, :raw_html, :links, :screenshot ]
|
7
7
|
|
8
8
|
ACTIONS = [ :wait, :click, :write, :press, :screenshot, :scrape ]
|
9
9
|
|
10
10
|
schema do
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
|
12
|
+
proxy Symbol, in: [ :basic, :stealth, :auto ]
|
13
|
+
skip_tls_verification [ TrueClass, FalseClass ], as: :skipTlsVerification
|
14
|
+
mobile [ TrueClass, FalseClass ]
|
15
|
+
location arguments: :country do
|
16
|
+
country String, required: true # two digit country code
|
17
|
+
languages String, array: true #en-US jp etc
|
18
|
+
end
|
19
|
+
max_age Integer, as: :maxAge
|
20
|
+
|
21
|
+
headers Hash
|
22
|
+
|
14
23
|
only_main_content [ TrueClass, FalseClass ], as: :onlyMainContent
|
15
24
|
include_tags String, as: :includeTags, array: true
|
16
25
|
exclude_tags String, as: :excludeTags, array: true
|
17
|
-
wait_for Integer
|
26
|
+
wait_for Integer, as: :waitFor
|
18
27
|
timeout Integer
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
28
|
+
parsers Symbol, array: true, in: [ :pdf ]
|
29
|
+
|
30
|
+
formats Symbol, in: FORMATS, array: true
|
31
|
+
screenshot do
|
32
|
+
type String, default: 'screenshot'
|
33
|
+
full_page [ TrueClass, FalseClass ], as: :fullPage
|
34
|
+
quality Integer, in: 0..100
|
35
|
+
viewport do
|
36
|
+
height Integer, required: true
|
37
|
+
width Integer, required: true
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
|
24
42
|
action as: :actions, arguments: :type, array: true do
|
25
43
|
type Symbol, required: true, in: ACTIONS
|
26
44
|
# wait
|
@@ -32,6 +50,9 @@ module Firecrawl
|
|
32
50
|
# press
|
33
51
|
key String
|
34
52
|
end
|
53
|
+
|
54
|
+
cache [ TrueClass, FalseClass ], as: :storeInCache
|
55
|
+
zero_retention [ TrueClass, FalseClass ], as: :zeroDataRetention
|
35
56
|
end
|
36
57
|
|
37
58
|
def self.build( options = nil, &block )
|
@@ -42,10 +63,21 @@ module Firecrawl
|
|
42
63
|
new( api_options: builder.build!( options, &block ) )
|
43
64
|
end
|
44
65
|
|
66
|
+
def self.normalize_options( options )
|
67
|
+
options = options&.dup || {}
|
68
|
+
screenshot = options.delete( :screenshot )
|
69
|
+
if screenshot
|
70
|
+
formats = options[ :formats ] || []
|
71
|
+
formats.delete( :screenshot )
|
72
|
+
formats << screenshot
|
73
|
+
options[ :formats ] = formats
|
74
|
+
end
|
75
|
+
options
|
76
|
+
end
|
77
|
+
|
45
78
|
def initialize( options = {}, api_options: nil )
|
46
79
|
@options = self.class.builder.build( options || {} )
|
47
80
|
@options = api_options.merge( @options ) if api_options
|
48
|
-
@options[ :formats ]&.map! { | format | string_camelize( format.to_s ) }
|
49
81
|
end
|
50
82
|
|
51
83
|
def to_h
|
@@ -36,8 +36,8 @@ module Firecrawl
|
|
36
36
|
#
|
37
37
|
def submit( url, options = nil, &block )
|
38
38
|
if options
|
39
|
-
options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build( options.to_h )
|
40
|
-
options = options.to_h
|
39
|
+
options = options.is_a?( ScrapeOptions ) ? options : ScrapeOptions.build!( options.to_h )
|
40
|
+
options = ScrapeOptions.normalize_options( options.to_h )
|
41
41
|
else
|
42
42
|
options = {}
|
43
43
|
end
|
@@ -47,8 +47,7 @@ module Firecrawl
|
|
47
47
|
result = nil
|
48
48
|
if response.success?
|
49
49
|
attributes = ( JSON.parse( response.body, symbolize_names: true ) rescue nil )
|
50
|
-
|
51
|
-
result = ScrapeResult.new( attributes[ :success ], attributes[ :data ] )
|
50
|
+
result = ScrapeResult.new( attributes )
|
52
51
|
else
|
53
52
|
result = ErrorResult.new( response.status, attributes )
|
54
53
|
end
|
@@ -56,5 +55,7 @@ module Firecrawl
|
|
56
55
|
ResponseMethods.install( response, result )
|
57
56
|
end
|
58
57
|
|
58
|
+
private
|
59
|
+
|
59
60
|
end
|
60
61
|
end
|
@@ -1,92 +1,47 @@
|
|
1
1
|
module Firecrawl
|
2
|
-
class ScrapeResult
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
@attributes = attributes || {}
|
7
|
-
end
|
8
|
-
|
9
|
-
##
|
10
|
-
# The +success?+ method returns +true+ if the scraping was successful.
|
11
|
-
#
|
12
|
-
# Note that the response +success?+ tells you if the call to the Firecrawl api was successful
|
13
|
-
# while this +success?+ method tells you if the actual scraping operation was successful.
|
14
|
-
#
|
15
|
-
def success?
|
16
|
-
@success || false
|
17
|
-
end
|
18
|
-
|
19
|
-
def metadata
|
20
|
-
unless @metadata
|
21
|
-
metadata = @attributes[ :metadata ] || {}
|
22
|
-
@metadata = metadata.transform_keys do | key |
|
23
|
-
key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
|
24
|
-
end
|
25
|
-
# remove the camelCase forms injected by Firecrawl
|
26
|
-
@metadata.delete_if do | key, _ |
|
27
|
-
key.start_with?( 'og_' ) && @metadata.key?( key.sub( 'og_', 'og:' ) )
|
28
|
-
end
|
29
|
-
end
|
30
|
-
@metadata
|
31
|
-
end
|
32
|
-
|
33
|
-
##
|
34
|
-
# The +markdown+ method returns scraped content that has been converted to markdown. The
|
35
|
-
# markdown content is present only if the request options +formats+ included +markdown+.
|
36
|
-
#
|
37
|
-
def markdown
|
38
|
-
@attributes[ :markdown ]
|
39
|
-
end
|
3
|
+
ScrapeResultDataSchema = DynamicSchema::Struct.define do
|
4
|
+
metadata Hash
|
40
5
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
#
|
45
|
-
def html
|
46
|
-
@attributes[ :html ]
|
47
|
-
end
|
6
|
+
markdown String
|
7
|
+
html String
|
8
|
+
raw_html String, as: :rawHtml
|
48
9
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
#
|
53
|
-
def raw_html
|
54
|
-
@attributes[ :rawHtml ]
|
55
|
-
end
|
10
|
+
screenshot_url String, as: :screenshot
|
11
|
+
links String, array: true
|
12
|
+
actions Hash, default: {}
|
56
13
|
|
57
|
-
|
58
|
-
|
59
|
-
# screenshot url is present only if the request options +formats+ included +screenshot+ or
|
60
|
-
# +screenshot@full_page+.
|
61
|
-
#
|
62
|
-
def screenshot_url
|
63
|
-
@attributes[ :screenshot ]
|
64
|
-
end
|
65
|
-
|
66
|
-
##
|
67
|
-
# The +links+ method returns an array of the links that were scraped from the the page.
|
68
|
-
# The +links+ are empty unless the request options +formats+ included +links+.
|
69
|
-
#
|
70
|
-
def links
|
71
|
-
@attributes[ :links ] || []
|
72
|
-
end
|
14
|
+
warning String
|
15
|
+
end
|
73
16
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
17
|
+
class ScrapeResultData < ScrapeResultDataSchema
|
18
|
+
def metadata() = @metadata ||= normalize_metadata( super )
|
19
|
+
private
|
20
|
+
def normalize_metadata( metadata )
|
21
|
+
return nil unless metadata
|
22
|
+
metadata = metadata.transform_keys do | key |
|
23
|
+
key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
|
24
|
+
end
|
25
|
+
# remove the camelCase forms injected by Firecrawl
|
26
|
+
metadata.delete_if do | key, _ |
|
27
|
+
key.start_with?( 'og_' ) && metadata.key?( key.sub( 'og_', 'og:' ) )
|
28
|
+
end
|
29
|
+
metadata
|
81
30
|
end
|
31
|
+
end
|
82
32
|
|
83
|
-
|
84
|
-
|
85
|
-
|
33
|
+
ScrapeResultSchema = DynamicSchema::Struct.define do
|
34
|
+
success [ TrueClass, FalseClass ]
|
35
|
+
data ScrapeResultData
|
36
|
+
end
|
86
37
|
|
87
|
-
|
88
|
-
|
89
|
-
|
38
|
+
class ScrapeResult < ScrapeResultSchema
|
39
|
+
extend Forwardable
|
40
|
+
def success?() = self.success
|
41
|
+
def_delegators :data,
|
42
|
+
:metadata, :warning,
|
43
|
+
:markdown, :html, :raw_html,
|
44
|
+
:screenshot_url, :links, :actions
|
90
45
|
|
91
46
|
end
|
92
47
|
end
|
data/lib/firecrawl.rb
CHANGED
@@ -5,6 +5,8 @@ require 'uri'
|
|
5
5
|
require 'faraday'
|
6
6
|
require 'dynamic_schema'
|
7
7
|
|
8
|
+
require_relative 'firecrawl/version'
|
9
|
+
|
8
10
|
require_relative 'firecrawl/helpers'
|
9
11
|
require_relative 'firecrawl/error_result'
|
10
12
|
require_relative 'firecrawl/request'
|
@@ -14,6 +16,7 @@ require_relative 'firecrawl/scrape_options'
|
|
14
16
|
require_relative 'firecrawl/scrape_result'
|
15
17
|
require_relative 'firecrawl/scrape_request'
|
16
18
|
require_relative 'firecrawl/batch_scrape_result'
|
19
|
+
require_relative 'firecrawl/batch_scrape_options'
|
17
20
|
require_relative 'firecrawl/batch_scrape_request'
|
18
21
|
require_relative 'firecrawl/map_options'
|
19
22
|
require_relative 'firecrawl/map_result'
|
@@ -28,4 +31,3 @@ module Firecrawl
|
|
28
31
|
extend ModuleMethods
|
29
32
|
end
|
30
33
|
|
31
|
-
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: firecrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kristoph Cichocki-Romanov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2
|
19
|
+
version: '2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2
|
26
|
+
version: '2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: dynamicschema
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,6 +94,7 @@ files:
|
|
94
94
|
- README.md
|
95
95
|
- firecrawl.gemspec
|
96
96
|
- lib/firecrawl.rb
|
97
|
+
- lib/firecrawl/batch_scrape_options.rb
|
97
98
|
- lib/firecrawl/batch_scrape_request.rb
|
98
99
|
- lib/firecrawl/batch_scrape_result.rb
|
99
100
|
- lib/firecrawl/crawl_options.rb
|
@@ -110,6 +111,7 @@ files:
|
|
110
111
|
- lib/firecrawl/scrape_options.rb
|
111
112
|
- lib/firecrawl/scrape_request.rb
|
112
113
|
- lib/firecrawl/scrape_result.rb
|
114
|
+
- lib/firecrawl/version.rb
|
113
115
|
homepage: https://github.com/EndlessInternational/firecrawl
|
114
116
|
licenses:
|
115
117
|
- MIT
|
@@ -131,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
131
133
|
- !ruby/object:Gem::Version
|
132
134
|
version: '0'
|
133
135
|
requirements: []
|
134
|
-
rubygems_version: 3.5.
|
136
|
+
rubygems_version: 3.5.22
|
135
137
|
signing_key:
|
136
138
|
specification_version: 4
|
137
139
|
summary: The Firecrawl gem implements a lightweight interface to the Firecrawl.dev
|