spidercloud 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/spider_cloud/costs.rb +15 -0
- data/lib/spider_cloud/crawl_options.rb +154 -0
- data/lib/spider_cloud/crawl_request.rb +28 -0
- data/lib/spider_cloud/crawl_result.rb +62 -0
- data/lib/spider_cloud/error_result.rb +52 -0
- data/lib/spider_cloud/helpers.rb +33 -0
- data/lib/spider_cloud/links_options.rb +52 -0
- data/lib/spider_cloud/links_request.rb +29 -0
- data/lib/spider_cloud/links_result.rb +55 -0
- data/lib/spider_cloud/module_methods.rb +31 -0
- data/lib/spider_cloud/request.rb +41 -0
- data/lib/spider_cloud/response_methods.rb +15 -0
- data/lib/spider_cloud/scrape_options.rb +164 -0
- data/lib/spider_cloud/scrape_request.rb +29 -0
- data/lib/spider_cloud/scrape_result.rb +62 -0
- data/lib/spider_cloud/screenshot_options.rb +84 -0
- data/lib/spider_cloud/screenshot_request.rb +29 -0
- data/lib/spider_cloud/screenshot_result.rb +69 -0
- data/lib/spider_cloud/shared_schemas.rb +80 -0
- data/lib/spider_cloud/version.rb +3 -0
- data/lib/spider_cloud.rb +37 -0
- data/lib/spidercloud.rb +1 -0
- data/readme/crawl.md +218 -0
- data/readme/links.md +198 -0
- data/readme/scrape.md +248 -0
- data/readme/screenshot.md +240 -0
- data/spidercloud.gemspec +40 -0
- metadata +159 -0
data/lib/spider_cloud.rb
ADDED
@@ -0,0 +1,37 @@
require 'json'
require 'base64'
require 'uri'

require 'faraday'
require 'dynamic_schema'

require_relative 'spider_cloud/version'

require_relative 'spider_cloud/helpers'
require_relative 'spider_cloud/shared_schemas'
require_relative 'spider_cloud/costs'
require_relative 'spider_cloud/error_result'
require_relative 'spider_cloud/request'
require_relative 'spider_cloud/response_methods'

require_relative 'spider_cloud/scrape_options'
require_relative 'spider_cloud/scrape_result'
require_relative 'spider_cloud/scrape_request'

require_relative 'spider_cloud/crawl_options'
require_relative 'spider_cloud/crawl_result'
require_relative 'spider_cloud/crawl_request'

require_relative 'spider_cloud/screenshot_options'
require_relative 'spider_cloud/screenshot_result'
require_relative 'spider_cloud/screenshot_request'

require_relative 'spider_cloud/links_options'
require_relative 'spider_cloud/links_result'
require_relative 'spider_cloud/links_request'

require_relative 'spider_cloud/module_methods'

module SpiderCloud
  extend ModuleMethods
end
data/lib/spidercloud.rb
ADDED
@@ -0,0 +1 @@
require_relative 'spider_cloud'
data/readme/crawl.md
ADDED
@@ -0,0 +1,218 @@
# Crawl Endpoint

The Crawl endpoint discovers and extracts content from multiple pages starting
from a given URL.

**API Reference:** https://spider.cloud/docs/api#crawl

## Basic Usage

```ruby
# Always use a limit to control credit usage
response = SpiderCloud.crawl( 'https://example.com', limit: 5 )

response.result.each do | page |
  puts "#{ page.url }: #{ page.content&.length } chars"
end
```

## With Options

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 5                   # max pages to crawl
  depth 2                   # max link depth
  return_format :markdown
  readability true
end

response = SpiderCloud.crawl( 'https://example.com', options )
```

## Options Reference

### Core Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `limit` | Integer | 0 | Max pages to crawl (0 = unlimited) |
| `depth` | Integer | 25 | Max crawl depth |
| `return_format` | Symbol | `:raw` | Output format |
| `request` | Symbol | `:smart` | Request type |

### Crawl Scope

| Option | Type | Description |
|--------|------|-------------|
| `subdomains` | Boolean | Include subdomains |
| `tld` | Boolean | Include TLD variations |
| `external_domains` | Array | External domains to include (`["*"]` for all) |
| `redirect_policy` | Symbol | `:loose`, `:strict`, `:none` |
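
For example, a minimal sketch of widening the crawl scope; the option names come from the table above, and the values are illustrative:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 25
  subdomains true                     # also follow blog.example.com, docs.example.com, ...
  tld true                            # include TLD variations of the start domain
  external_domains [ 'example.org' ]  # follow links into this external domain
  redirect_policy :strict
end

response = SpiderCloud.crawl( 'https://example.com', options )
```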

### URL Filtering

| Option | Type | Description |
|--------|------|-------------|
| `blacklist` | Array | Paths to exclude (regex supported) |
| `whitelist` | Array | Paths to include only |
| `budget` | Hash | Path-based page limits |
| `link_rewrite` | Hash | URL rewrite rules |

### Budget Example

Control how many pages to crawl per path:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 100
  budget( {
    '*' => 5,        # default: 5 pages per path
    '/docs/' => 50,  # up to 50 pages under /docs/
    '/blog/' => 20   # up to 20 pages under /blog/
  } )
end
```

### Sitemap Options

| Option | Type | Description |
|--------|------|-------------|
| `sitemap` | Boolean | Use sitemap for discovery |
| `sitemap_only` | Boolean | Only crawl sitemap URLs |
| `sitemap_path` | String | Custom sitemap path |

### Content Extraction

| Option | Type | Description |
|--------|------|-------------|
| `readability` | Boolean | Safari Reader Mode |
| `root_selector` | String | CSS selector for content |
| `exclude_selector` | String | CSS selector to ignore |
| `css_extraction_map` | Hash | Structured data extraction |
| `filter_main_only` | Boolean | Main content only |
| `full_resources` | Boolean | Download images, videos |
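
A sketch of narrowing extraction to part of each page; the selector values are illustrative assumptions, not values shipped with the gem:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 20
  return_format :markdown
  root_selector 'main.article'     # extract only content inside this element
  exclude_selector 'nav, footer'   # drop navigation and footer markup
  filter_main_only true
end

response = SpiderCloud.crawl( 'https://example.com', options )
```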

### Output Options

| Option | Type | Description |
|--------|------|-------------|
| `return_json_data` | Boolean | Return SSR JSON data |
| `return_headers` | Boolean | Include HTTP headers |
| `return_cookies` | Boolean | Include cookies |
| `return_page_links` | Boolean | Include discovered links |
| `return_embeddings` | Boolean | Include embeddings |
| `metadata` | Boolean | Collect page metadata |
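
For instance, a sketch that asks for extra per-page data alongside the content; the option names are from the table above, and how each extra field is surfaced on the result is not shown here:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 10
  return_format :markdown
  metadata true            # collect page metadata
  return_headers true      # include the HTTP response headers per page
  return_page_links true   # include the links discovered on each page
end

response = SpiderCloud.crawl( 'https://example.com', options )
```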

### Performance

| Option | Type | Description |
|--------|------|-------------|
| `request_timeout` | Integer | Timeout per page (5-255 seconds) |
| `cache` | Boolean | Enable caching |
| `concurrency_limit` | Integer | Concurrent requests |
| `delay` | Integer | Delay between requests (ms) |
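
A sketch of a gentler, cache-friendly crawl; the specific values are illustrative:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 50
  request_timeout 30    # seconds per page
  cache true            # reuse previously fetched pages where possible
  concurrency_limit 2   # at most two requests in flight
  delay 500             # wait 500 ms between requests
end

response = SpiderCloud.crawl( 'https://example.com', options )
```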

### Cost Control

| Option | Type | Description |
|--------|------|-------------|
| `max_credits_per_page` | Integer | Max credits per page |
| `max_credits_allowed` | Integer | Total credit limit |
| `crawl_timeout` | Hash | Max crawl duration `{seconds:, nanoseconds:}` |
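
A sketch of capping spend and duration for a crawl; the values are illustrative, and the `crawl_timeout` hash is assumed to follow the shape given in the table:

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 200
  max_credits_per_page 5
  max_credits_allowed 100
  crawl_timeout( { seconds: 120, nanoseconds: 0 } )   # stop the crawl after two minutes
end

response = SpiderCloud.crawl( 'https://example.com', options )
```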

### Webhooks

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 100
  webhooks do
    destination 'https://your-server.com/webhook'
    on_credits_depleted true
    on_find true
  end
end
```

## Response

```ruby
response = SpiderCloud.crawl( 'https://example.com', limit: 5 )

response.result.success?    # => true
response.result.count       # => 5
response.result.urls        # => ["https://...", ...]
response.result.contents    # => ["...", ...]
response.result.total_cost  # => 0.0002

# Iterate over pages
response.result.each do | page |
  page.url               # => "https://..."
  page.content           # => "..."
  page.status            # => 200
  page.costs.total_cost  # => 0.00004
end

# Filter by success
response.result.succeeded  # => [successful pages]
response.result.failed     # => [failed pages]
```

## Examples

### Crawl Documentation

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 50
  whitelist [ '/docs/' ]
  return_format :markdown
  readability true
end

response = SpiderCloud.crawl( 'https://example.com', options )
```

### Crawl with Depth Limit

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 20
  depth 2
end

response = SpiderCloud.crawl( 'https://example.com', options )
```

### Exclude Paths

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 50
  blacklist [ '/admin/', '/private/', '/api/' ]
end
```

### Use Sitemap

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 100
  sitemap true
  sitemap_only true
end
```

### With Automation

```ruby
options = SpiderCloud::CrawlOptions.build do
  limit 10
  automation_scripts( {
    '/login' => [
      { 'Fill' => { 'selector' => '#email', 'value' => 'user@example.com' } },
      { 'Click' => 'button[type=submit]' },
      { 'WaitForNavigation' => true }
    ]
  } )
end
```
data/readme/links.md
ADDED
@@ -0,0 +1,198 @@
# Links Endpoint

The Links endpoint discovers and extracts all links from a website without
returning page content. This is more efficient than crawling when you only
need URLs.

**API Reference:** https://spider.cloud/docs/api#links

## Basic Usage

```ruby
# Always use a limit to control credit usage
response = SpiderCloud.links( 'https://example.com', limit: 5 )

response.result.urls.each do | url |
  puts url
end
```

## With Options

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 10
  depth 2
  subdomains false
end

response = SpiderCloud.links( 'https://example.com', options )
```

## Options Reference

### Core Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `limit` | Integer | 0 | Max pages to check (0 = unlimited) |
| `depth` | Integer | 25 | Max link depth |
| `request` | Symbol | `:smart` | Request type |
| `request_timeout` | Integer | 60 | Timeout per request (5-255 seconds) |

### Scope

| Option | Type | Description |
|--------|------|-------------|
| `subdomains` | Boolean | Include subdomains |
| `tld` | Boolean | Include TLD variations |
| `external_domains` | Array | External domains to include |
| `redirect_policy` | Symbol | `:loose`, `:strict`, `:none` |

### URL Filtering

| Option | Type | Description |
|--------|------|-------------|
| `blacklist` | Array | Paths to exclude (regex supported) |
| `whitelist` | Array | Paths to include only |
| `budget` | Hash | Path-based page limits |

### Sitemap

| Option | Type | Description |
|--------|------|-------------|
| `sitemap` | Boolean | Use sitemap for discovery |
| `sitemap_only` | Boolean | Only check sitemap URLs |
| `sitemap_path` | String | Custom sitemap path |

### Performance

| Option | Type | Description |
|--------|------|-------------|
| `cache` | Boolean | Enable caching |
| `respect_robots` | Boolean | Respect robots.txt |
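
A sketch of a polite, cache-friendly link check; both options come from the table above and the values are a judgment call:

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 50
  cache true            # reuse cached responses where available
  respect_robots true   # skip URLs disallowed by robots.txt
end

response = SpiderCloud.links( 'https://example.com', options )
```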

### Proxy

| Option | Type | Description |
|--------|------|-------------|
| `proxy` | Symbol | Proxy pool: `:residential`, `:mobile`, `:isp` |
| `proxy_enabled` | Boolean | Enable proxy |
| `country_code` | String | ISO country code |
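
A sketch of routing the link check through a residential proxy pool, mirroring the proxy options shown for the scrape endpoint:

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 50
  proxy :residential
  proxy_enabled true
  country_code 'US'
end

response = SpiderCloud.links( 'https://example.com', options )
```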

## Response

```ruby
response = SpiderCloud.links( 'https://example.com', limit: 5 )

response.result.success?  # => true
response.result.count     # => 5
response.result.urls      # => ["https://...", ...]

# Iterate over results
response.result.each do | item |
  item.url                  # => "https://..."
  item.status               # => 200
  item.duration_elapsed_ms  # => 123
  item.error                # => nil
end

# Filter results
response.result.succeeded           # => [successful items]
response.result.failed              # => [failed items]
response.result.with_status( 200 )  # => [items with 200 status]
```

## Examples

### Discover All Links

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 100
  depth 3
end

response = SpiderCloud.links( 'https://example.com', options )
puts "Found #{ response.result.count } URLs"
```

### Filter by Path

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 50
  whitelist [ '/docs/', '/api/' ]
end

response = SpiderCloud.links( 'https://example.com', options )
```

### Exclude Paths

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 50
  blacklist [ '/admin/', '/private/', '/cdn/' ]
end

response = SpiderCloud.links( 'https://example.com', options )
```

### Use Sitemap

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 100
  sitemap true
  sitemap_only true
end

response = SpiderCloud.links( 'https://example.com', options )
```

### Include Subdomains

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 50
  subdomains true
end

response = SpiderCloud.links( 'https://example.com', options )
```

### With Budget

```ruby
options = SpiderCloud::LinksOptions.build do
  limit 100
  budget( {
    '*' => 5,
    '/docs/' => 50,
    '/blog/' => 20
  } )
end

response = SpiderCloud.links( 'https://example.com', options )
```

### Get Only Successful URLs

```ruby
response = SpiderCloud.links( 'https://example.com', limit: 20 )

successful_urls = response.result.succeeded.map( &:url )
puts successful_urls
```

### Check for Broken Links

```ruby
response = SpiderCloud.links( 'https://example.com', limit: 50 )

broken = response.result.failed
broken.each do | item |
  puts "Broken: #{ item.url } - Status: #{ item.status }"
end
```
data/readme/scrape.md
ADDED
@@ -0,0 +1,248 @@
# Scrape Endpoint

The Scrape endpoint extracts content from a single URL with full JavaScript
rendering support.

**API Reference:** https://spider.cloud/docs/api#scrape

## Basic Usage

```ruby
response = SpiderCloud.scrape( 'https://example.com' )
puts response.result.content
```

## With Options

```ruby
options = SpiderCloud::ScrapeOptions.build do
  return_format :markdown
  readability true
  metadata true
end

response = SpiderCloud.scrape( 'https://example.com', options )
```

## Understanding the Response

The scrape endpoint takes a single URL and returns a `ScrapeResult` containing
an array of result items. For single URL scrapes, convenience methods delegate
to the first item:

```ruby
response = SpiderCloud.scrape( 'https://example.com' )

# the result is enumerable
response.result.count  # => 1

# convenience methods for single URL scrapes
response.result.success?             # => true
response.result.content              # => "# Page Title..."
response.result.url                  # => "https://example.com"
response.result.status               # => 200
response.result.error                # => nil
response.result.costs                # => Costs object
response.result.duration_elapsed_ms  # => 1234

# access the underlying item directly
response.result.first.content  # same as response.result.content
```

### Iterating Results

The result is enumerable, so you can iterate over items:

```ruby
response.result.each do | item |
  puts item.url
  puts item.content
  puts item.status
end
```

### Checking Success

```ruby
# check if the HTTP request succeeded
response.success?  # => true/false

# check if the scrape operation succeeded
response.result.success?  # => true/false
```
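
For example, a minimal failure check built only from the accessors shown above:

```ruby
response = SpiderCloud.scrape( 'https://example.com/missing' )

unless response.result.success?
  warn "Scrape failed (status #{ response.result.status }): #{ response.result.error }"
end
```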

## Options Reference

### Core Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `request` | Symbol | `:smart` | Request type: `:http`, `:chrome`, `:smart` |
| `return_format` | Symbol | `:raw` | Output format (see formats below) |
| `request_timeout` | Integer | 60 | Timeout in seconds (5-255) |
| `lite_mode` | Boolean | false | 50% cost reduction mode |

### Return Formats

- `:markdown` - Markdown format
- `:commonmark` - CommonMark format
- `:raw` - Raw HTML
- `:text` - Plain text
- `:html2text` - HTML to text
- `:xml` - XML format
- `:bytes` - Raw bytes
- `:empty` - No content

### Content Extraction

| Option | Type | Description |
|--------|------|-------------|
| `readability` | Boolean | Safari Reader Mode algorithm |
| `root_selector` | String | CSS selector for content extraction |
| `exclude_selector` | String | CSS selector to ignore |
| `css_extraction_map` | Hash | Structured data extraction |
| `clean_html` | Boolean | Clean HTML output |
| `filter_svg` | Boolean | Filter SVG elements |
| `filter_images` | Boolean | Filter image elements |
| `filter_main_only` | Boolean | Extract main content only |

### Output Options

| Option | Type | Description |
|--------|------|-------------|
| `return_json_data` | Boolean | Return SSR JSON data |
| `return_headers` | Boolean | Include HTTP headers |
| `return_cookies` | Boolean | Include cookies |
| `return_page_links` | Boolean | Include discovered links |
| `return_embeddings` | Boolean | Include OpenAI embeddings |
| `metadata` | Boolean | Collect page metadata |
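
A sketch requesting extra response data for a single scrape; the option names are from the table above, and how each extra field is surfaced on the result is not shown here:

```ruby
options = SpiderCloud::ScrapeOptions.build do
  return_format :markdown
  metadata true            # collect page metadata
  return_headers true      # include the HTTP response headers
  return_page_links true   # include links discovered on the page
end

response = SpiderCloud.scrape( 'https://example.com', options )
```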

### Network Configuration

| Option | Type | Description |
|--------|------|-------------|
| `network_blacklist` | Array | Block matching network requests |
| `network_whitelist` | Array | Allow only matching requests |
| `anti_bot` | Boolean | Enable anti-bot measures |
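
A sketch that blocks third-party requests during rendering; the URL patterns are illustrative assumptions, not values documented by the gem:

```ruby
options = SpiderCloud::ScrapeOptions.build do
  request :chrome
  anti_bot true
  network_blacklist [ 'https://*.doubleclick.net', 'https://*.google-analytics.com' ]
end

response = SpiderCloud.scrape( 'https://example.com', options )
```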

### Session & Authentication

| Option | Type | Description |
|--------|------|-------------|
| `session` | Boolean | Persist session |
| `cookies` | String | HTTP cookies |
| `headers` | Hash | Custom HTTP headers |
| `user_agent` | String | Custom user agent |
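
A sketch of sending authenticated requests; the header name, cookie string, and user agent are placeholder assumptions:

```ruby
options = SpiderCloud::ScrapeOptions.build do
  session true
  cookies 'session_id=abc123; theme=dark'
  headers( { 'Authorization' => 'Bearer YOUR_TOKEN' } )
  user_agent 'Mozilla/5.0 (compatible; MyCrawler/1.0)'
end

response = SpiderCloud.scrape( 'https://example.com/account', options )
```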

### Proxy Configuration

| Option | Type | Description |
|--------|------|-------------|
| `proxy` | Symbol | Proxy pool: `:residential`, `:mobile`, `:isp` |
| `proxy_enabled` | Boolean | Enable proxy |
| `remote_proxy` | String | External proxy URL |
| `country_code` | String | ISO country code |
| `locale` | String | Locale (e.g., `"en-US"`) |

### Browser Configuration

| Option | Type | Description |
|--------|------|-------------|
| `fingerprint` | Boolean | Use fingerprint detection |
| `stealth` | Boolean | Stealth mode |
| `viewport` | Hash | Browser viewport `{width:, height:}` |
| `device` | Symbol | Device: `:mobile`, `:tablet`, `:desktop` |
| `scroll` | Integer | Scroll duration (ms) |
| `block_ads` | Boolean | Block ads |
| `block_analytics` | Boolean | Block analytics |
| `block_stylesheets` | Boolean | Block stylesheets |
| `block_images` | Boolean | Block images |
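
A sketch of emulating a mobile browser and trimming page weight; the values are illustrative, and the `viewport` hash is assumed to follow the shape given in the table:

```ruby
options = SpiderCloud::ScrapeOptions.build do
  request :chrome
  device :mobile
  viewport( { width: 390, height: 844 } )
  scroll 2000           # scroll for 2 seconds to trigger lazy loading
  block_ads true
  block_analytics true
  block_images true
end

response = SpiderCloud.scrape( 'https://example.com', options )
```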

### Wait Conditions

```ruby
wait_for do
  # Wait for CSS selector
  selector '#content'

  # Wait for network idle
  idle_network do
    timeout { seconds 5; nanoseconds 0 }
  end

  # Wait for delay
  delay do
    timeout { seconds 2; nanoseconds 0 }
  end
end
```

### AI/LLM Integration

```ruby
gpt_config do
  prompt 'Summarize this page'
  model 'gpt-4'
  max_tokens 1000
  temperature 0.7
  top_p 0.9
end
```

### Chunking

```ruby
chunking_algorithm do
  type :by_words   # :no, :by_words, :by_lines, :by_character_length, :by_sentence
  value 500
end
```

## Examples

### Extract Clean Content

```ruby
options = SpiderCloud::ScrapeOptions.build do
  return_format :markdown
  readability true
  filter_main_only true
end

response = SpiderCloud.scrape( 'https://example.com/article', options )
```

### With CSS Extraction

```ruby
options = SpiderCloud::ScrapeOptions.build do
  css_extraction_map( {
    'title' => 'h1',
    'price' => '.price',
    'description' => '.product-description'
  } )
end
```

### With Proxy and Stealth

```ruby
options = SpiderCloud::ScrapeOptions.build do
  proxy :residential
  proxy_enabled true
  country_code 'US'
  stealth true
  fingerprint true
end
```

### Wait for Dynamic Content

```ruby
options = SpiderCloud::ScrapeOptions.build do
  request :chrome
  wait_for do
    selector '.loaded-content'
  end
end
```