spidercloud 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/spider_cloud/costs.rb +15 -0
- data/lib/spider_cloud/crawl_options.rb +154 -0
- data/lib/spider_cloud/crawl_request.rb +28 -0
- data/lib/spider_cloud/crawl_result.rb +62 -0
- data/lib/spider_cloud/error_result.rb +52 -0
- data/lib/spider_cloud/helpers.rb +33 -0
- data/lib/spider_cloud/links_options.rb +52 -0
- data/lib/spider_cloud/links_request.rb +29 -0
- data/lib/spider_cloud/links_result.rb +55 -0
- data/lib/spider_cloud/module_methods.rb +31 -0
- data/lib/spider_cloud/request.rb +41 -0
- data/lib/spider_cloud/response_methods.rb +15 -0
- data/lib/spider_cloud/scrape_options.rb +164 -0
- data/lib/spider_cloud/scrape_request.rb +29 -0
- data/lib/spider_cloud/scrape_result.rb +62 -0
- data/lib/spider_cloud/screenshot_options.rb +84 -0
- data/lib/spider_cloud/screenshot_request.rb +29 -0
- data/lib/spider_cloud/screenshot_result.rb +69 -0
- data/lib/spider_cloud/shared_schemas.rb +80 -0
- data/lib/spider_cloud/version.rb +3 -0
- data/lib/spider_cloud.rb +37 -0
- data/lib/spidercloud.rb +1 -0
- data/readme/crawl.md +218 -0
- data/readme/links.md +198 -0
- data/readme/scrape.md +248 -0
- data/readme/screenshot.md +240 -0
- data/spidercloud.gemspec +40 -0
- metadata +159 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
|
|
3
|
+
# Attributes carried by one entry of a links response.
LinksResultItemSchema = DynamicSchema::Struct.define do
  url                 String
  status              Integer
  duration_elapsed_ms Integer, as: :duration_elapsed_ms
  error               String
end

# One entry of a links response, with a success predicate on top of the
# schema attributes.
class LinksResultItem < LinksResultItemSchema
  # True when no error is recorded and the status is either missing or in
  # the 200...300 range.
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end
end
|
|
15
|
+
|
|
16
|
+
# Wrapper schema: a links result is a list of LinksResultItem entries.
LinksResultSchema = DynamicSchema::Struct.define do
  items LinksResultItem, array: true
end

# Collection of link entries. Iteration and the usual collection accessors
# are delegated to +items+.
class LinksResult < LinksResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, :each, :[], :count, :size, :length, :first, :last, :empty?

  class << self
    # Builds a result from an array of item attributes.
    def from_array( array )
      new( items: array )
    end
  end

  # True when every item reports success; false when +items+ is nil.
  def success?
    items ? items.all?( &:success? ) : false
  end

  # get all discovered URLs
  def urls
    ( items || [] ).map( &:url )
  end

  # get failed URLs
  def failed
    ( items || [] ).reject( &:success? )
  end

  # get successful URLs
  def succeeded
    ( items || [] ).select( &:success? )
  end

  # get URLs by status code
  def with_status( code )
    ( items || [] ).select { | entry | entry.status == code }
  end
end
|
|
54
|
+
|
|
55
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
# Configuration accessors and request shortcuts.
# NOTE(review): presumably extended onto SpiderCloud itself (Request reads
# SpiderCloud.connection / SpiderCloud.api_key) - confirm in spider_cloud.rb.
module ModuleMethods

  # With an argument, stores and returns that connection. Without one,
  # returns the stored connection, lazily building a Faraday connection on
  # the default adapter the first time.
  def connection( connection = nil )
    return @connection = connection if connection

    @connection ||= Faraday.new do | builder |
      builder.adapter Faraday.default_adapter
    end
  end

  # With an argument, stores the key; always returns the stored key.
  def api_key( api_key = nil )
    @api_key = api_key if api_key
    @api_key
  end

  # Submits a scrape through a new ScrapeRequest.
  def scrape( url, options = nil, &block )
    request = SpiderCloud::ScrapeRequest.new
    request.submit( url, options, &block )
  end

  # Submits a crawl through a new CrawlRequest.
  def crawl( url, options = nil, &block )
    request = SpiderCloud::CrawlRequest.new
    request.submit( url, options, &block )
  end

  # Submits a screenshot through a new ScreenshotRequest.
  def screenshot( url, options = nil, &block )
    request = SpiderCloud::ScreenshotRequest.new
    request.submit( url, options, &block )
  end

  # Submits a links request through a new LinksRequest.
  def links( url, options = nil, &block )
    request = SpiderCloud::LinksRequest.new
    request.submit( url, options, &block )
  end

end
|
|
31
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
# Base class for API requests: holds the connection and API key and provides
# authenticated +post+ / +get+ helpers to subclasses.
class Request

  BASE_URI = 'https://api.spider.cloud'.freeze

  # connection - object used to perform HTTP calls; defaults to
  #              SpiderCloud.connection.
  # api_key    - key sent as a Bearer token; defaults to SpiderCloud.api_key.
  #
  # Raises ArgumentError when no usable (non-blank) key is available.
  def initialize( connection: nil, api_key: nil )
    @connection = connection || SpiderCloud.connection
    @api_key = api_key || SpiderCloud.api_key
    # a blank key would only produce an empty "Bearer " header, so it is
    # rejected the same way as a missing one
    if !@api_key || @api_key.to_s.strip.empty?
      raise ArgumentError, "An 'api_key' is required unless configured using 'SpiderCloud.api_key'."
    end
  end

  protected

  # Issues a POST to +uri+. A String +body+ is sent as-is, anything else is
  # serialized with JSON.generate. The optional block receives the request
  # after headers and body are set, so it can adjust them.
  def post( uri, body, &block )
    @connection.post( uri ) do | request |
      apply_default_headers( request )
      request.body = body.is_a?( String ) ? body : JSON.generate( body )
      block.call( request ) if block
    end
  end

  # Issues a GET to +uri+. The optional block receives the request after the
  # headers are set.
  def get( uri, &block )
    @connection.get( uri ) do | request |
      apply_default_headers( request )
      block.call( request ) if block
    end
  end

  private

  # Headers sent with every request.
  def default_headers
    {
      'Authorization' => "Bearer #{ @api_key }",
      'Content-Type' => 'application/json'
    }
  end

  # Copies the default headers onto +request+.
  def apply_default_headers( request )
    default_headers.each { | key, value | request.headers[ key ] = value }
  end

end
|
|
41
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
#
# Mixin applied to an individual response object (a Faraday response, per the
# gem's own description) so that it exposes the parsed +result+.
#
module ResponseMethods
  class << self
    # Stores +result+ on +response+, mixes this module into that one object
    # and returns the value of +response.extend+.
    def install( response, result )
      response.instance_variable_set( :@_spider_cloud_result, result )
      response.extend( self )
    end
  end

  # The result stored by +install+.
  def result
    @_spider_cloud_result
  end
end
|
|
15
|
+
end
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
# Option set accepted by scrape requests, declared with DynamicSchema.
class ScrapeOptions
  include DynamicSchema::Definable
  include Helpers

  schema do
    # request type, output format and timeout
    request                 Symbol, in: REQUEST_TYPES
    return_format           Symbol, in: RETURN_FORMATS
    request_timeout         Integer, in: 5..255
    lite_mode               [ TrueClass, FalseClass ]

    # network filtering and interception
    network_blacklist       String, array: true
    network_whitelist       String, array: true
    disable_hints           [ TrueClass, FalseClass ]
    disable_intercept       [ TrueClass, FalseClass ]
    preserve_host           [ TrueClass, FalseClass ]
    anti_bot                [ TrueClass, FalseClass ]

    # session, cookies and headers
    session                 [ TrueClass, FalseClass ]
    cookies                 String
    headers                 Hash
    user_agent              String

    # proxy and locale
    proxy                   Symbol, in: PROXY_TYPES
    proxy_enabled           [ TrueClass, FalseClass ]
    remote_proxy            String
    country_code            String
    locale                  String

    # content selection and filtering
    readability             [ TrueClass, FalseClass ]
    root_selector           String
    exclude_selector        String
    css_extraction_map      Hash
    clean_html              [ TrueClass, FalseClass ]
    filter_svg              [ TrueClass, FalseClass ]
    filter_images           [ TrueClass, FalseClass ]
    filter_main_only        [ TrueClass, FalseClass ]
    filter_output_svg       [ TrueClass, FalseClass ]
    filter_output_images    [ TrueClass, FalseClass ]
    filter_output_main_only [ TrueClass, FalseClass ]

    # what the response should include
    return_json_data        [ TrueClass, FalseClass ]
    return_headers          [ TrueClass, FalseClass ]
    return_cookies          [ TrueClass, FalseClass ]
    return_page_links       [ TrueClass, FalseClass ]
    return_embeddings       [ TrueClass, FalseClass ]
    metadata                [ TrueClass, FalseClass ]
    encoding                String

    # ai integration
    gpt_config do
      prompt        String
      model         String
      max_tokens    Integer
      temperature   Float
      top_p         Float
      api_key       String
      extra_ai_data [ TrueClass, FalseClass ]
      screenshot    [ TrueClass, FalseClass ]
    end
    custom_prompt           String
    custom_function         String
    model                   String

    # declared as chunking_algorithm, emitted under the :chunking_alg name
    chunking_algorithm as: :chunking_alg do
      type  Symbol, in: CHUNKING_TYPES
      value Integer
    end

    # wait conditions; each timeout maps seconds/nanoseconds to secs/nanos
    wait_for do
      idle_network do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
      selector String
      dom do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
      delay do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
      page_navigations do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
    end

    # browser behaviour
    fingerprint             [ TrueClass, FalseClass ]
    stealth                 [ TrueClass, FalseClass ]
    viewport do
      width  Integer
      height Integer
    end
    device                  Symbol, in: DEVICE_TYPES
    scroll                  Integer
    block_ads               [ TrueClass, FalseClass ]
    block_analytics         [ TrueClass, FalseClass ]
    block_stylesheets       [ TrueClass, FalseClass ]
    block_images            [ TrueClass, FalseClass ]
    omit_background         [ TrueClass, FalseClass ]
    service_worker_enabled  [ TrueClass, FalseClass ]
    virtual_display         [ TrueClass, FalseClass ]

    # scripts
    automation_scripts      Hash
    execution_scripts       Hash
    evaluate_on_new_document String

    # link handling
    blacklist               String, array: true
    whitelist               String, array: true
    external_domains        String, array: true
    redirect_policy         Symbol, in: REDIRECT_POLICIES
    link_rewrite            Hash

    # sitemap and domain scope
    sitemap                 [ TrueClass, FalseClass ]
    sitemap_only            [ TrueClass, FalseClass ]
    sitemap_path            String
    subdomains              [ TrueClass, FalseClass ]
    tld                     [ TrueClass, FalseClass ]

    # caching, storage and crawl etiquette
    cache                   [ TrueClass, FalseClass ]
    skip_config_checks      [ TrueClass, FalseClass ]
    storageless             [ TrueClass, FalseClass ]
    store_data              [ TrueClass, FalseClass ]
    respect_robots          [ TrueClass, FalseClass ]
    concurrency_limit       Integer
    delay                   Integer

    # credit limits
    max_credits_per_page    Integer
    max_credits_allowed     Integer
    budget                  Hash

    # notifications and background execution
    webhooks                Hash
    run_in_background       [ TrueClass, FalseClass ]
    event_tracker           Hash
  end

  class << self
    # Builds an instance from a hash and/or builder block via +builder.build+.
    def build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # Same as +build+ but goes through +builder.build!+.
    def build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end
  end

  # options     - raw options, run through the class builder.
  # api_options - already-built options; when given, the built +options+ are
  #               merged on top of them.
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # Hash form of the stored options.
  def to_h
    @options.to_h
  end

end
|
|
164
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class ScrapeRequest < Request
|
|
3
|
+
|
|
4
|
+
# Posts a scrape request for +url+ and returns the response with a +result+
# attached through ResponseMethods.
#
# url     - target url; normalized with Helpers.normalize_url.
# options - nil, a ScrapeOptions, or anything responding to +to_h+ (which is
#           run through ScrapeOptions.build!).
# block   - forwarded to +post+.
#
# The result is a ScrapeResult when the response is successful and its body
# parses to an Array; otherwise it is an ErrorResult built from the status
# and whatever was parsed (nil when the body is not valid JSON).
def submit( url, options = nil, &block )
  payload =
    case options
    when nil, false then {}
    when ScrapeOptions then options.to_h
    else ScrapeOptions.build!( options.to_h ).to_h
    end
  # merge into a new hash: +to_h+ may hand back the options object's own
  # hash, which must not be modified (and may be frozen)
  payload = payload.merge( url: Helpers.normalize_url( url ) )

  response = post( "#{ BASE_URI }/scrape", payload, &block )
  attributes = begin
    JSON.parse( response.body, symbolize_names: true )
  rescue StandardError
    # a body that cannot be parsed is reported as nil attributes
    nil
  end

  result = if response.success? && attributes.is_a?( Array )
    ScrapeResult.from_array( attributes )
  else
    ErrorResult.new( response.status, attributes )
  end

  ResponseMethods.install( response, result )
end
|
|
27
|
+
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
|
|
3
|
+
# Attributes carried by one entry of a scrape response.
ScrapeResultItemSchema = DynamicSchema::Struct.define do
  content             String
  error               String
  status              Integer
  duration_elapsed_ms Integer, as: :duration_elapsed_ms
  costs               Costs
  url                 String
end

# One scraped page, with a success predicate on top of the schema attributes.
class ScrapeResultItem < ScrapeResultItemSchema
  # True when no error is recorded and the status is either missing or in
  # the 200...300 range.
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end
end
|
|
17
|
+
|
|
18
|
+
# Wrapper schema: a scrape result is a list of ScrapeResultItem entries.
ScrapeResultSchema = DynamicSchema::Struct.define do
  items ScrapeResultItem, array: true
end

# Collection of scraped pages. Iteration and the usual collection accessors
# are delegated to +items+; the single-page convenience readers below read
# from the first item.
class ScrapeResult < ScrapeResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, :each, :[], :count, :size, :length, :first, :last, :empty?

  # Builds a result from an array of item attributes.
  def self.from_array( array )
    new( items: array )
  end

  # True when every item reports success; false when +items+ is nil.
  def success?
    items&.all?( &:success? ) || false
  end

  # convenience methods for single URL scrapes; each returns nil when there
  # is no first item
  def content
    first_item&.content
  end

  def error
    first_item&.error
  end

  def status
    first_item&.status
  end

  def url
    first_item&.url
  end

  def costs
    first_item&.costs
  end

  def duration_elapsed_ms
    first_item&.duration_elapsed_ms
  end

  private

  # First item, or nil when +items+ is nil or empty. Reads +items+ directly
  # because the delegated +first+ raises NoMethodError when +items+ is nil,
  # whereas +success?+ already tolerates that case.
  def first_item
    items&.first
  end
end
|
|
61
|
+
|
|
62
|
+
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
# Option set accepted by screenshot requests, declared with DynamicSchema.
class ScreenshotOptions
  include DynamicSchema::Definable
  include Helpers

  schema do
    # capture flags
    full_page       [ TrueClass, FalseClass ]
    binary          [ TrueClass, FalseClass ]
    omit_background [ TrueClass, FalseClass ]
    block_images    [ TrueClass, FalseClass ]

    # image format, quality and clip region
    cdp_params do
      format  Symbol, in: IMAGE_FORMATS
      quality Integer, in: 0..100
      clip do
        x      Integer
        y      Integer
        width  Integer
        height Integer
        scale  [ Integer, Float ]
      end
      from_surface            [ TrueClass, FalseClass ]
      capture_beyond_viewport [ TrueClass, FalseClass ]
    end

    limit   Integer
    request Symbol, in: REQUEST_TYPES

    viewport do
      width  Integer
      height Integer
    end
    device Symbol, in: DEVICE_TYPES

    # wait conditions; each timeout maps seconds/nanoseconds to secs/nanos
    wait_for do
      idle_network do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
      selector String
      delay do
        timeout do
          seconds     Integer, as: :secs
          nanoseconds Integer, as: :nanos
        end
      end
    end

    # proxy
    proxy         Symbol, in: PROXY_TYPES
    proxy_enabled [ TrueClass, FalseClass ]
    country_code  String

    # browser behaviour
    stealth         [ TrueClass, FalseClass ]
    fingerprint     [ TrueClass, FalseClass ]
    scroll          Integer
    block_ads       [ TrueClass, FalseClass ]
    virtual_display [ TrueClass, FalseClass ]

    # cookies, headers and scripts
    cookies            String
    headers            Hash
    automation_scripts Hash
  end

  class << self
    # Builds an instance from a hash and/or builder block via +builder.build+.
    def build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # Same as +build+ but goes through +builder.build!+.
    def build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end
  end

  # options     - raw options, run through the class builder.
  # api_options - already-built options; when given, the built +options+ are
  #               merged on top of them.
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # Hash form of the stored options.
  def to_h
    @options.to_h
  end

end
|
|
84
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class ScreenshotRequest < Request
|
|
3
|
+
|
|
4
|
+
# Posts a screenshot request for +url+ and returns the response with a
# +result+ attached through ResponseMethods.
#
# url     - target url; normalized with Helpers.normalize_url.
# options - nil, a ScreenshotOptions, or anything responding to +to_h+
#           (which is run through ScreenshotOptions.build!).
# block   - forwarded to +post+.
#
# The result is a ScreenshotResult when the response is successful and its
# body parses to an Array; otherwise it is an ErrorResult built from the
# status and whatever was parsed (nil when the body is not valid JSON).
def submit( url, options = nil, &block )
  payload =
    case options
    when nil, false then {}
    when ScreenshotOptions then options.to_h
    else ScreenshotOptions.build!( options.to_h ).to_h
    end
  # merge into a new hash: +to_h+ may hand back the options object's own
  # hash, which must not be modified (and may be frozen)
  payload = payload.merge( url: Helpers.normalize_url( url ) )

  response = post( "#{ BASE_URI }/screenshot", payload, &block )
  attributes = begin
    JSON.parse( response.body, symbolize_names: true )
  rescue StandardError
    # a body that cannot be parsed is reported as nil attributes
    nil
  end

  result = if response.success? && attributes.is_a?( Array )
    ScreenshotResult.from_array( attributes )
  else
    ErrorResult.new( response.status, attributes )
  end

  ResponseMethods.install( response, result )
end
|
|
27
|
+
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
|
|
3
|
+
# Attributes carried by one entry of a screenshot response.
ScreenshotResultItemSchema = DynamicSchema::Struct.define do
  content String
  error   String
  status  Integer
  url     String
end

# One screenshot entry, with a success predicate and a Base64 decoder on top
# of the schema attributes.
class ScreenshotResultItem < ScreenshotResultItemSchema
  # True when no error is recorded and the status is either missing or in
  # the 200...300 range.
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end

  # decode base64 content to binary; nil when there is no content
  def image_data
    content ? Base64.decode64( content ) : nil
  end
end
|
|
21
|
+
|
|
22
|
+
# Wrapper schema: a screenshot result is a list of ScreenshotResultItem
# entries.
ScreenshotResultSchema = DynamicSchema::Struct.define do
  items ScreenshotResultItem, array: true
end

# Collection of screenshots. Iteration and the usual collection accessors are
# delegated to +items+; the single-screenshot convenience readers below read
# from the first item.
class ScreenshotResult < ScreenshotResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, :each, :[], :count, :size, :length, :first, :last, :empty?

  # Builds a result from an array of item attributes.
  def self.from_array( array )
    new( items: array )
  end

  # True when every item reports success; false when +items+ is nil.
  def success?
    items&.all?( &:success? ) || false
  end

  # convenience methods for single URL screenshots; each returns nil when
  # there is no first item
  def content
    first_item&.content
  end

  def image_data
    first_item&.image_data
  end

  def error
    first_item&.error
  end

  def status
    first_item&.status
  end

  def url
    first_item&.url
  end

  # save screenshot to file
  #
  # Writes the decoded image of the first item to +path+ in binary mode.
  # Returns false without writing when there is no image data, true otherwise.
  def save_to( path )
    data = image_data # decode once and reuse for both the check and the write
    return false unless data

    File.binwrite( path, data )
    true
  end

  private

  # First item, or nil when +items+ is nil or empty. Reads +items+ directly
  # because the delegated +first+ raises NoMethodError when +items+ is nil,
  # whereas +success?+ already tolerates that case.
  def first_item
    items&.first
  end
end
|
|
68
|
+
|
|
69
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
|
|
3
|
+
# Allowed symbol values referenced by the option schemas through +in:+.
# Frozen so the shared lists cannot be modified at runtime.
RETURN_FORMATS    = %i[ markdown commonmark raw text html2text xml bytes empty ].freeze
REQUEST_TYPES     = %i[ http chrome smart ].freeze
PROXY_TYPES       = %i[ residential mobile isp ].freeze
DEVICE_TYPES      = %i[ mobile tablet desktop ].freeze
REDIRECT_POLICIES = %i[ loose strict none ].freeze
CHUNKING_TYPES    = %i[ no by_words by_lines by_character_length by_sentence ].freeze
IMAGE_FORMATS     = %i[ png jpeg ].freeze
|
|
12
|
+
|
|
13
|
+
# reusable schema fragments for dynamic_schema definable classes; every
# method returns a proc meant to be evaluated inside a schema definition
module SharedSchemas

  # timeout schema fragment: seconds and nanoseconds, mapped to :secs / :nanos
  def timeout_schema
    proc {
      timeout {
        seconds Integer, as: :secs
        nanoseconds Integer, as: :nanos
      }
    }
  end

  # viewport schema fragment: width and height
  def viewport_schema
    proc {
      viewport {
        width Integer
        height Integer
      }
    }
  end

  # wait_for schema fragment with all wait conditions; the dom and
  # page_navigations conditions can be left out through the keyword flags
  def wait_for_schema( include_dom: true, include_page_navigations: true )
    timeout_fragment = timeout_schema
    proc {
      wait_for {
        idle_network { instance_eval( &timeout_fragment ) }
        selector String
        dom { instance_eval( &timeout_fragment ) } if include_dom
        delay { instance_eval( &timeout_fragment ) }
        page_navigations { instance_eval( &timeout_fragment ) } if include_page_navigations
      }
    }
  end

  # gpt_config schema fragment for ai integration
  def gpt_config_schema
    proc {
      gpt_config {
        prompt String
        model String
        max_tokens Integer
        temperature Float
        top_p Float
        api_key String
        extra_ai_data [ TrueClass, FalseClass ]
        screenshot [ TrueClass, FalseClass ]
      }
    }
  end

end
|
|
79
|
+
|
|
80
|
+
end
|