spidercloud 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/spider_cloud/costs.rb +15 -0
- data/lib/spider_cloud/crawl_options.rb +154 -0
- data/lib/spider_cloud/crawl_request.rb +28 -0
- data/lib/spider_cloud/crawl_result.rb +62 -0
- data/lib/spider_cloud/error_result.rb +52 -0
- data/lib/spider_cloud/helpers.rb +33 -0
- data/lib/spider_cloud/links_options.rb +52 -0
- data/lib/spider_cloud/links_request.rb +29 -0
- data/lib/spider_cloud/links_result.rb +55 -0
- data/lib/spider_cloud/module_methods.rb +31 -0
- data/lib/spider_cloud/request.rb +41 -0
- data/lib/spider_cloud/response_methods.rb +15 -0
- data/lib/spider_cloud/scrape_options.rb +164 -0
- data/lib/spider_cloud/scrape_request.rb +29 -0
- data/lib/spider_cloud/scrape_result.rb +62 -0
- data/lib/spider_cloud/screenshot_options.rb +84 -0
- data/lib/spider_cloud/screenshot_request.rb +29 -0
- data/lib/spider_cloud/screenshot_result.rb +69 -0
- data/lib/spider_cloud/shared_schemas.rb +80 -0
- data/lib/spider_cloud/version.rb +3 -0
- data/lib/spider_cloud.rb +37 -0
- data/lib/spidercloud.rb +1 -0
- data/readme/crawl.md +218 -0
- data/readme/links.md +198 -0
- data/readme/scrape.md +248 -0
- data/readme/screenshot.md +240 -0
- data/spidercloud.gemspec +40 -0
- metadata +159 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# Screenshot Endpoint
|
|
2
|
+
|
|
3
|
+
The Screenshot endpoint captures screenshots of web pages with full JavaScript
|
|
4
|
+
rendering support.
|
|
5
|
+
|
|
6
|
+
**API Reference:** https://spider.cloud/docs/api#screenshot
|
|
7
|
+
|
|
8
|
+
## Basic Usage
|
|
9
|
+
|
|
10
|
+
```ruby
|
|
11
|
+
response = SpiderCloud.screenshot( 'https://example.com' )
|
|
12
|
+
|
|
13
|
+
# Save to file
|
|
14
|
+
response.result.save_to( 'screenshot.png' )
|
|
15
|
+
|
|
16
|
+
# Or access raw image data
|
|
17
|
+
image_data = response.result.image_data
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## With Options
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
24
|
+
full_page true
|
|
25
|
+
viewport do
|
|
26
|
+
width 1920
|
|
27
|
+
height 1080
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Options Reference
|
|
35
|
+
|
|
36
|
+
### Screenshot Options
|
|
37
|
+
|
|
38
|
+
| Option | Type | Default | Description |
|
|
39
|
+
|--------|------|---------|-------------|
|
|
40
|
+
| `full_page` | Boolean | true | Capture full scrollable page |
|
|
41
|
+
| `binary` | Boolean | false | Return binary instead of base64 |
|
|
42
|
+
| `omit_background` | Boolean | false | Transparent background (PNG only) |
|
|
43
|
+
| `block_images` | Boolean | false | Block images for faster capture |
|
|
44
|
+
|
|
45
|
+
### CDP Parameters
|
|
46
|
+
|
|
47
|
+
Chrome DevTools Protocol parameters for advanced control:
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
cdp_params do
|
|
51
|
+
format :png # :png or :jpeg
|
|
52
|
+
  quality 80                    # JPEG quality (0-100); only applies when format is :jpeg
|
|
53
|
+
from_surface true
|
|
54
|
+
capture_beyond_viewport true
|
|
55
|
+
clip do
|
|
56
|
+
x 0
|
|
57
|
+
y 0
|
|
58
|
+
width 800
|
|
59
|
+
height 600
|
|
60
|
+
scale 1
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Viewport & Device
|
|
66
|
+
|
|
67
|
+
| Option | Type | Description |
|
|
68
|
+
|--------|------|-------------|
|
|
69
|
+
| `viewport` | Hash | Browser viewport `{width:, height:}` |
|
|
70
|
+
| `device` | Symbol | Device: `:mobile`, `:tablet`, `:desktop` |
|
|
71
|
+
|
|
72
|
+
### Wait Conditions
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
wait_for do
|
|
76
|
+
# Wait for CSS selector
|
|
77
|
+
selector '#loaded'
|
|
78
|
+
|
|
79
|
+
# Wait for network idle
|
|
80
|
+
idle_network do
|
|
81
|
+
timeout { seconds 5; nanoseconds 0 }
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Wait for delay
|
|
85
|
+
delay do
|
|
86
|
+
timeout { seconds 2; nanoseconds 0 }
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Browser Configuration
|
|
92
|
+
|
|
93
|
+
| Option | Type | Description |
|
|
94
|
+
|--------|------|-------------|
|
|
95
|
+
| `stealth` | Boolean | Stealth mode |
|
|
96
|
+
| `fingerprint` | Boolean | Use fingerprint detection |
|
|
97
|
+
| `scroll` | Integer | Scroll duration before capture (ms) |
|
|
98
|
+
| `block_ads` | Boolean | Block ads |
|
|
99
|
+
| `virtual_display` | Boolean | Use virtual display |
|
|
100
|
+
|
|
101
|
+
### Proxy Configuration
|
|
102
|
+
|
|
103
|
+
| Option | Type | Description |
|
|
104
|
+
|--------|------|-------------|
|
|
105
|
+
| `proxy` | Symbol | Proxy pool: `:residential`, `:mobile`, `:isp` |
|
|
106
|
+
| `proxy_enabled` | Boolean | Enable proxy |
|
|
107
|
+
| `country_code` | String | ISO country code |
|
|
108
|
+
|
|
109
|
+
### Authentication
|
|
110
|
+
|
|
111
|
+
| Option | Type | Description |
|
|
112
|
+
|--------|------|-------------|
|
|
113
|
+
| `cookies` | String | HTTP cookies |
|
|
114
|
+
| `headers` | Hash | Custom HTTP headers |
|
|
115
|
+
| `automation_scripts` | Hash | Path-based automation |
|
|
116
|
+
|
|
117
|
+
## Response
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
response = SpiderCloud.screenshot( 'https://example.com' )
|
|
121
|
+
|
|
122
|
+
response.result.success? # => true
|
|
123
|
+
response.result.content # => "iVBORw0KGgo..." (base64)
|
|
124
|
+
response.result.image_data # => binary PNG/JPEG data
|
|
125
|
+
response.result.url # => "https://example.com"
|
|
126
|
+
response.result.status # => 200
|
|
127
|
+
|
|
128
|
+
# Save directly to file
|
|
129
|
+
response.result.save_to( 'screenshot.png' )
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Examples
|
|
133
|
+
|
|
134
|
+
### Full Page Screenshot
|
|
135
|
+
|
|
136
|
+
```ruby
|
|
137
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
138
|
+
full_page true
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
142
|
+
response.result.save_to( 'full-page.png' )
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Viewport Screenshot
|
|
146
|
+
|
|
147
|
+
```ruby
|
|
148
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
149
|
+
full_page false
|
|
150
|
+
viewport do
|
|
151
|
+
width 1280
|
|
152
|
+
height 720
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Mobile Screenshot
|
|
160
|
+
|
|
161
|
+
```ruby
|
|
162
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
163
|
+
device :mobile
|
|
164
|
+
viewport do
|
|
165
|
+
width 375
|
|
166
|
+
height 812
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### JPEG with Quality
|
|
174
|
+
|
|
175
|
+
```ruby
|
|
176
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
177
|
+
cdp_params do
|
|
178
|
+
format :jpeg
|
|
179
|
+
quality 85
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
184
|
+
response.result.save_to( 'screenshot.jpg' )
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Capture Specific Region
|
|
188
|
+
|
|
189
|
+
```ruby
|
|
190
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
191
|
+
cdp_params do
|
|
192
|
+
clip do
|
|
193
|
+
x 100
|
|
194
|
+
y 100
|
|
195
|
+
width 400
|
|
196
|
+
height 300
|
|
197
|
+
scale 1
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Wait for Content
|
|
206
|
+
|
|
207
|
+
```ruby
|
|
208
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
209
|
+
wait_for do
|
|
210
|
+
selector '.chart-loaded'
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
response = SpiderCloud.screenshot( 'https://example.com/dashboard', options )
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### With Proxy
|
|
218
|
+
|
|
219
|
+
```ruby
|
|
220
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
221
|
+
proxy :residential
|
|
222
|
+
proxy_enabled true
|
|
223
|
+
  country_code 'GB'
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Transparent Background
|
|
230
|
+
|
|
231
|
+
```ruby
|
|
232
|
+
options = SpiderCloud::ScreenshotOptions.build do
|
|
233
|
+
omit_background true
|
|
234
|
+
cdp_params do
|
|
235
|
+
format :png
|
|
236
|
+
end
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
response = SpiderCloud.screenshot( 'https://example.com', options )
|
|
240
|
+
```
|
data/spidercloud.gemspec
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Gem specification for `spidercloud`: declares the package identity, the files
# that ship in the gem, and its runtime and development dependencies.

# The version constant lives in the library itself so there is a single source
# of truth; it is loaded relatively because the gem is not installed at build time.
require_relative 'lib/spider_cloud/version'

Gem::Specification.new do | spec |

  spec.name     = 'spidercloud'
  spec.version  = SpiderCloud::VERSION
  spec.authors  = [ 'Kristoph Cichocki-Romanov' ]
  spec.email    = [ 'rubygems.org@kristoph.net' ]

  spec.summary =
    "The SpiderCloud gem implements a lightweight interface to the Spider Cloud API for " \
    "web scraping, crawling, screenshots, and link extraction."
  spec.description =
    "The SpiderCloud gem implements a lightweight interface to the Spider Cloud API. Spider " \
    "Cloud provides powerful web scraping and crawling capabilities with support for " \
    "JavaScript rendering, proxy rotation, and anti-bot measures.\n" \
    "\n" \
    "This gem supports scrape, crawl, screenshot, and links endpoints with comprehensive " \
    "options for content extraction, filtering, and automation."
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/EndlessInternational/spider-cloud'
  spec.metadata = {
    'source_code_uri' => 'https://github.com/EndlessInternational/spider-cloud',
    'bug_tracker_uri' => 'https://github.com/EndlessInternational/spider-cloud/issues',
  }

  spec.required_ruby_version = '>= 3.0'

  # Ship only the library sources, the per-endpoint readme pages, and the
  # top-level project files. The globs are resolved against the current working
  # directory, so the gem must be built from the repository root.
  spec.files = Dir[ 'lib/**/*.rb', 'readme/**/*.md', 'LICENSE', 'README.md',
                    'spidercloud.gemspec' ]
  spec.require_paths = [ 'lib' ]

  # Runtime dependencies. `add_dependency` is the canonical method;
  # `add_runtime_dependency` is only an alias for it.
  spec.add_dependency 'faraday', '~> 2'
  spec.add_dependency 'dynamicschema', '~> 2'
  spec.add_dependency 'base64', '~> 0.2'

  # Development-only dependencies; not installed for consumers of the gem.
  spec.add_development_dependency 'minitest', '~> 6'
  spec.add_development_dependency 'debug', '~> 1.11'
  spec.add_development_dependency 'vcr', '~> 6.4'

end
|
metadata
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: spidercloud
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Kristoph Cichocki-Romanov
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: faraday
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '2'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '2'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: dynamicschema
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '2'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '2'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: base64
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0.2'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0.2'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: minitest
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '6'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '6'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: debug
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '1.11'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '1.11'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: vcr
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '6.4'
|
|
89
|
+
type: :development
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '6.4'
|
|
96
|
+
description: |-
|
|
97
|
+
The SpiderCloud gem implements a lightweight interface to the Spider Cloud API. Spider Cloud provides powerful web scraping and crawling capabilities with support for JavaScript rendering, proxy rotation, and anti-bot measures.
|
|
98
|
+
|
|
99
|
+
This gem supports scrape, crawl, screenshot, and links endpoints with comprehensive options for content extraction, filtering, and automation.
|
|
100
|
+
email:
|
|
101
|
+
- rubygems.org@kristoph.net
|
|
102
|
+
executables: []
|
|
103
|
+
extensions: []
|
|
104
|
+
extra_rdoc_files: []
|
|
105
|
+
files:
|
|
106
|
+
- LICENSE
|
|
107
|
+
- README.md
|
|
108
|
+
- lib/spider_cloud.rb
|
|
109
|
+
- lib/spider_cloud/costs.rb
|
|
110
|
+
- lib/spider_cloud/crawl_options.rb
|
|
111
|
+
- lib/spider_cloud/crawl_request.rb
|
|
112
|
+
- lib/spider_cloud/crawl_result.rb
|
|
113
|
+
- lib/spider_cloud/error_result.rb
|
|
114
|
+
- lib/spider_cloud/helpers.rb
|
|
115
|
+
- lib/spider_cloud/links_options.rb
|
|
116
|
+
- lib/spider_cloud/links_request.rb
|
|
117
|
+
- lib/spider_cloud/links_result.rb
|
|
118
|
+
- lib/spider_cloud/module_methods.rb
|
|
119
|
+
- lib/spider_cloud/request.rb
|
|
120
|
+
- lib/spider_cloud/response_methods.rb
|
|
121
|
+
- lib/spider_cloud/scrape_options.rb
|
|
122
|
+
- lib/spider_cloud/scrape_request.rb
|
|
123
|
+
- lib/spider_cloud/scrape_result.rb
|
|
124
|
+
- lib/spider_cloud/screenshot_options.rb
|
|
125
|
+
- lib/spider_cloud/screenshot_request.rb
|
|
126
|
+
- lib/spider_cloud/screenshot_result.rb
|
|
127
|
+
- lib/spider_cloud/shared_schemas.rb
|
|
128
|
+
- lib/spider_cloud/version.rb
|
|
129
|
+
- lib/spidercloud.rb
|
|
130
|
+
- readme/crawl.md
|
|
131
|
+
- readme/links.md
|
|
132
|
+
- readme/scrape.md
|
|
133
|
+
- readme/screenshot.md
|
|
134
|
+
- spidercloud.gemspec
|
|
135
|
+
homepage: https://github.com/EndlessInternational/spider-cloud
|
|
136
|
+
licenses:
|
|
137
|
+
- MIT
|
|
138
|
+
metadata:
|
|
139
|
+
source_code_uri: https://github.com/EndlessInternational/spider-cloud
|
|
140
|
+
bug_tracker_uri: https://github.com/EndlessInternational/spider-cloud/issues
|
|
141
|
+
rdoc_options: []
|
|
142
|
+
require_paths:
|
|
143
|
+
- lib
|
|
144
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
145
|
+
requirements:
|
|
146
|
+
- - ">="
|
|
147
|
+
- !ruby/object:Gem::Version
|
|
148
|
+
version: '3.0'
|
|
149
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
|
+
requirements:
|
|
151
|
+
- - ">="
|
|
152
|
+
- !ruby/object:Gem::Version
|
|
153
|
+
version: '0'
|
|
154
|
+
requirements: []
|
|
155
|
+
rubygems_version: 3.6.7
|
|
156
|
+
specification_version: 4
|
|
157
|
+
summary: The SpiderCloud gem implements a lightweight interface to the Spider Cloud
|
|
158
|
+
API for web scraping, crawling, screenshots, and link extraction.
|
|
159
|
+
test_files: []
|