spidercloud 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
+ require 'json'
+ require 'base64'
+ require 'uri'
+
+ require 'faraday'
+ require 'dynamic_schema'
+
+ require_relative 'spider_cloud/version'
+
+ require_relative 'spider_cloud/helpers'
+ require_relative 'spider_cloud/shared_schemas'
+ require_relative 'spider_cloud/costs'
+ require_relative 'spider_cloud/error_result'
+ require_relative 'spider_cloud/request'
+ require_relative 'spider_cloud/response_methods'
+
+ require_relative 'spider_cloud/scrape_options'
+ require_relative 'spider_cloud/scrape_result'
+ require_relative 'spider_cloud/scrape_request'
+
+ require_relative 'spider_cloud/crawl_options'
+ require_relative 'spider_cloud/crawl_result'
+ require_relative 'spider_cloud/crawl_request'
+
+ require_relative 'spider_cloud/screenshot_options'
+ require_relative 'spider_cloud/screenshot_result'
+ require_relative 'spider_cloud/screenshot_request'
+
+ require_relative 'spider_cloud/links_options'
+ require_relative 'spider_cloud/links_result'
+ require_relative 'spider_cloud/links_request'
+
+ require_relative 'spider_cloud/module_methods'
+
+ module SpiderCloud
+   extend ModuleMethods
+ end
@@ -0,0 +1 @@
+ require_relative 'spider_cloud'
data/readme/crawl.md ADDED
@@ -0,0 +1,218 @@
+ # Crawl Endpoint
+
+ The Crawl endpoint discovers and extracts content from multiple pages starting
+ from a given URL.
+
+ **API Reference:** https://spider.cloud/docs/api#crawl
+
+ ## Basic Usage
+
+ ```ruby
+ # Always use a limit to control credit usage
+ response = SpiderCloud.crawl( 'https://example.com', limit: 5 )
+
+ response.result.each do | page |
+   puts "#{ page.url }: #{ page.content&.length } chars"
+ end
+ ```
+
+ ## With Options
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 5                   # max pages to crawl
+   depth 2                   # max link depth
+   return_format :markdown
+   readability true
+ end
+
+ response = SpiderCloud.crawl( 'https://example.com', options )
+ ```
+
+ ## Options Reference
+
+ ### Core Options
+
+ | Option | Type | Default | Description |
+ |--------|------|---------|-------------|
+ | `limit` | Integer | 0 | Max pages to crawl (0 = unlimited) |
+ | `depth` | Integer | 25 | Max crawl depth |
+ | `return_format` | Symbol | `:raw` | Output format |
+ | `request` | Symbol | `:smart` | Request type |
+
+ ### Crawl Scope
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `subdomains` | Boolean | Include subdomains |
+ | `tld` | Boolean | Include TLD variations |
+ | `external_domains` | Array | External domains to include (`["*"]` for all) |
+ | `redirect_policy` | Symbol | `:loose`, `:strict`, `:none` |
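+
+ For example, to extend a crawl to subdomains while keeping redirects strict
+ (illustrative values; each option is set as a builder method, as in the
+ examples above):
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 25                  # illustrative values
+   subdomains true
+   tld false
+   redirect_policy :strict
+ end
+
+ response = SpiderCloud.crawl( 'https://example.com', options )
+ ```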
+
+ ### URL Filtering
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `blacklist` | Array | Paths to exclude (regex supported) |
+ | `whitelist` | Array | Paths to include only |
+ | `budget` | Hash | Path-based page limits |
+ | `link_rewrite` | Hash | URL rewrite rules |
+
+ ### Budget Example
+
+ Control how many pages to crawl per path:
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 100
+   budget( {
+     '*' => 5,         # default: 5 pages per path
+     '/docs/' => 50,   # up to 50 pages under /docs/
+     '/blog/' => 20    # up to 20 pages under /blog/
+   } )
+ end
+ ```
+
+ ### Sitemap Options
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `sitemap` | Boolean | Use sitemap for discovery |
+ | `sitemap_only` | Boolean | Only crawl sitemap URLs |
+ | `sitemap_path` | String | Custom sitemap path |
+
+ ### Content Extraction
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `readability` | Boolean | Safari Reader Mode |
+ | `root_selector` | String | CSS selector for content |
+ | `exclude_selector` | String | CSS selector to ignore |
+ | `css_extraction_map` | Hash | Structured data extraction |
+ | `filter_main_only` | Boolean | Main content only |
+ | `full_resources` | Boolean | Download images, videos |
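+
+ For example, to reduce each page to its main article content (the selectors
+ below are illustrative):
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 10
+   return_format :markdown
+   readability true
+   root_selector 'main'             # illustrative selectors
+   exclude_selector 'nav, footer'
+ end
+ ```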
+
+ ### Output Options
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `return_json_data` | Boolean | Return SSR JSON data |
+ | `return_headers` | Boolean | Include HTTP headers |
+ | `return_cookies` | Boolean | Include cookies |
+ | `return_page_links` | Boolean | Include discovered links |
+ | `return_embeddings` | Boolean | Include embeddings |
+ | `metadata` | Boolean | Collect page metadata |
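+
+ For example, to collect page metadata and the links discovered on each page
+ (an illustrative combination of the options above):
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 10
+   metadata true            # illustrative combination
+   return_page_links true
+   return_headers true
+ end
+ ```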
+
+ ### Performance
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `request_timeout` | Integer | Timeout per page (5-255 seconds) |
+ | `cache` | Boolean | Enable caching |
+ | `concurrency_limit` | Integer | Concurrent requests |
+ | `delay` | Integer | Delay between requests (ms) |
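+
+ For example, a gentler crawl profile (illustrative values):
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 50
+   request_timeout 30     # seconds, within the 5-255 range
+   concurrency_limit 2    # illustrative values
+   delay 500              # milliseconds between requests
+   cache true
+ end
+ ```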
+
+ ### Cost Control
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `max_credits_per_page` | Integer | Max credits per page |
+ | `max_credits_allowed` | Integer | Total credit limit |
+ | `crawl_timeout` | Hash | Max crawl duration `{seconds:, nanoseconds:}` |
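+
+ For example, to cap spend per page and overall (illustrative values; the
+ `crawl_timeout` hash is passed the same way as the `budget` hash above):
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 100
+   max_credits_per_page 10      # illustrative values
+   max_credits_allowed 500
+   crawl_timeout( { seconds: 120, nanoseconds: 0 } )
+ end
+ ```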
+
+ ### Webhooks
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 100
+   webhooks do
+     destination 'https://your-server.com/webhook'
+     on_credits_depleted true
+     on_find true
+   end
+ end
+ ```
+
+ ## Response
+
+ ```ruby
+ response = SpiderCloud.crawl( 'https://example.com', limit: 5 )
+
+ response.result.success? # => true
+ response.result.count # => 5
+ response.result.urls # => ["https://...", ...]
+ response.result.contents # => ["...", ...]
+ response.result.total_cost # => 0.0002
+
+ # Iterate over pages
+ response.result.each do | page |
+   page.url # => "https://..."
+   page.content # => "..."
+   page.status # => 200
+   page.costs.total_cost # => 0.00004
+ end
+
+ # Filter by success
+ response.result.succeeded # => [successful pages]
+ response.result.failed # => [failed pages]
+ ```
+
+ ## Examples
+
+ ### Crawl Documentation
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 50
+   whitelist [ '/docs/' ]
+   return_format :markdown
+   readability true
+ end
+
+ response = SpiderCloud.crawl( 'https://example.com', options )
+ ```
+
+ ### Crawl with Depth Limit
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 20
+   depth 2
+ end
+
+ response = SpiderCloud.crawl( 'https://example.com', options )
+ ```
+
+ ### Exclude Paths
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 50
+   blacklist [ '/admin/', '/private/', '/api/' ]
+ end
+ ```
+
+ ### Use Sitemap
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 100
+   sitemap true
+   sitemap_only true
+ end
+ ```
+
+ ### With Automation
+
+ ```ruby
+ options = SpiderCloud::CrawlOptions.build do
+   limit 10
+   automation_scripts( {
+     '/login' => [
+       { 'Fill' => { 'selector' => '#email', 'value' => 'user@example.com' } },
+       { 'Click' => 'button[type=submit]' },
+       { 'WaitForNavigation' => true }
+     ]
+   } )
+ end
+ ```
data/readme/links.md ADDED
@@ -0,0 +1,198 @@
+ # Links Endpoint
+
+ The Links endpoint discovers and extracts all links from a website without
+ returning page content. This is more efficient than crawling when you only
+ need URLs.
+
+ **API Reference:** https://spider.cloud/docs/api#links
+
+ ## Basic Usage
+
+ ```ruby
+ # Always use a limit to control credit usage
+ response = SpiderCloud.links( 'https://example.com', limit: 5 )
+
+ response.result.urls.each do | url |
+   puts url
+ end
+ ```
+
+ ## With Options
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 10
+   depth 2
+   subdomains false
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ## Options Reference
+
+ ### Core Options
+
+ | Option | Type | Default | Description |
+ |--------|------|---------|-------------|
+ | `limit` | Integer | 0 | Max pages to check (0 = unlimited) |
+ | `depth` | Integer | 25 | Max link depth |
+ | `request` | Symbol | `:smart` | Request type |
+ | `request_timeout` | Integer | 60 | Timeout per request (5-255 seconds) |
+
+ ### Scope
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `subdomains` | Boolean | Include subdomains |
+ | `tld` | Boolean | Include TLD variations |
+ | `external_domains` | Array | External domains to include |
+ | `redirect_policy` | Symbol | `:loose`, `:strict`, `:none` |
+
+ ### URL Filtering
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `blacklist` | Array | Paths to exclude (regex supported) |
+ | `whitelist` | Array | Paths to include only |
+ | `budget` | Hash | Path-based page limits |
+
+ ### Sitemap
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `sitemap` | Boolean | Use sitemap for discovery |
+ | `sitemap_only` | Boolean | Only check sitemap URLs |
+ | `sitemap_path` | String | Custom sitemap path |
+
+ ### Performance
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `cache` | Boolean | Enable caching |
+ | `respect_robots` | Boolean | Respect robots.txt |
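+
+ For example (illustrative values):
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 50
+   cache true            # illustrative values
+   respect_robots true
+ end
+ ```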
+
+ ### Proxy
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `proxy` | Symbol | Proxy pool: `:residential`, `:mobile`, `:isp` |
+ | `proxy_enabled` | Boolean | Enable proxy |
+ | `country_code` | String | ISO country code |
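+
+ For example, to route discovery through a residential proxy in a specific
+ country (illustrative values):
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 50
+   proxy :residential    # illustrative values
+   proxy_enabled true
+   country_code 'US'
+ end
+ ```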
+
+ ## Response
+
+ ```ruby
+ response = SpiderCloud.links( 'https://example.com', limit: 5 )
+
+ response.result.success? # => true
+ response.result.count # => 5
+ response.result.urls # => ["https://...", ...]
+
+ # Iterate over results
+ response.result.each do | item |
+   item.url # => "https://..."
+   item.status # => 200
+   item.duration_elapsed_ms # => 123
+   item.error # => nil
+ end
+
+ # Filter results
+ response.result.succeeded # => [successful items]
+ response.result.failed # => [failed items]
+ response.result.with_status( 200 ) # => [items with 200 status]
+ ```
+
+ ## Examples
+
+ ### Discover All Links
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 100
+   depth 3
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ puts "Found #{ response.result.count } URLs"
+ ```
+
+ ### Filter by Path
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 50
+   whitelist [ '/docs/', '/api/' ]
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ### Exclude Paths
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 50
+   blacklist [ '/admin/', '/private/', '/cdn/' ]
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ### Use Sitemap
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 100
+   sitemap true
+   sitemap_only true
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ### Include Subdomains
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 50
+   subdomains true
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ### With Budget
+
+ ```ruby
+ options = SpiderCloud::LinksOptions.build do
+   limit 100
+   budget( {
+     '*' => 5,
+     '/docs/' => 50,
+     '/blog/' => 20
+   } )
+ end
+
+ response = SpiderCloud.links( 'https://example.com', options )
+ ```
+
+ ### Get Only Successful URLs
+
+ ```ruby
+ response = SpiderCloud.links( 'https://example.com', limit: 20 )
+
+ successful_urls = response.result.succeeded.map( &:url )
+ puts successful_urls
+ ```
+
+ ### Check for Broken Links
+
+ ```ruby
+ response = SpiderCloud.links( 'https://example.com', limit: 50 )
+
+ broken = response.result.failed
+ broken.each do | item |
+   puts "Broken: #{ item.url } - Status: #{ item.status }"
+ end
+ ```
data/readme/scrape.md ADDED
@@ -0,0 +1,248 @@
+ # Scrape Endpoint
+
+ The Scrape endpoint extracts content from a single URL with full JavaScript
+ rendering support.
+
+ **API Reference:** https://spider.cloud/docs/api#scrape
+
+ ## Basic Usage
+
+ ```ruby
+ response = SpiderCloud.scrape( 'https://example.com' )
+ puts response.result.content
+ ```
+
+ ## With Options
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   return_format :markdown
+   readability true
+   metadata true
+ end
+
+ response = SpiderCloud.scrape( 'https://example.com', options )
+ ```
+
+ ## Understanding the Response
+
+ The scrape endpoint takes a single URL and returns a `ScrapeResult` containing
+ an array of result items. For single URL scrapes, convenience methods delegate
+ to the first item:
+
+ ```ruby
+ response = SpiderCloud.scrape( 'https://example.com' )
+
+ # the result is enumerable
+ response.result.count # => 1
+
+ # convenience methods for single URL scrapes
+ response.result.success? # => true
+ response.result.content # => "# Page Title..."
+ response.result.url # => "https://example.com"
+ response.result.status # => 200
+ response.result.error # => nil
+ response.result.costs # => Costs object
+ response.result.duration_elapsed_ms # => 1234
+
+ # access the underlying item directly
+ response.result.first.content # same as response.result.content
+ ```
+
+ ### Iterating Results
+
+ The result is enumerable, so you can iterate over items:
+
+ ```ruby
+ response.result.each do | item |
+   puts item.url
+   puts item.content
+   puts item.status
+ end
+ ```
+
+ ### Checking Success
+
+ ```ruby
+ # check if the HTTP request succeeded
+ response.success? # => true/false
+
+ # check if the scrape operation succeeded
+ response.result.success? # => true/false
+ ```
+
+ ## Options Reference
+
+ ### Core Options
+
+ | Option | Type | Default | Description |
+ |--------|------|---------|-------------|
+ | `request` | Symbol | `:smart` | Request type: `:http`, `:chrome`, `:smart` |
+ | `return_format` | Symbol | `:raw` | Output format (see formats below) |
+ | `request_timeout` | Integer | 60 | Timeout in seconds (5-255) |
+ | `lite_mode` | Boolean | false | 50% cost reduction mode |
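+
+ For example, a lightweight plain-HTTP fetch (illustrative values):
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   request :http
+   return_format :text
+   request_timeout 30    # illustrative values
+   lite_mode true
+ end
+
+ response = SpiderCloud.scrape( 'https://example.com', options )
+ ```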
+
+ ### Return Formats
+
+ - `:markdown` - Markdown format
+ - `:commonmark` - CommonMark format
+ - `:raw` - Raw HTML
+ - `:text` - Plain text
+ - `:html2text` - HTML to text
+ - `:xml` - XML format
+ - `:bytes` - Raw bytes
+ - `:empty` - No content
+
+ ### Content Extraction
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `readability` | Boolean | Safari Reader Mode algorithm |
+ | `root_selector` | String | CSS selector for content extraction |
+ | `exclude_selector` | String | CSS selector to ignore |
+ | `css_extraction_map` | Hash | Structured data extraction |
+ | `clean_html` | Boolean | Clean HTML output |
+ | `filter_svg` | Boolean | Filter SVG elements |
+ | `filter_images` | Boolean | Filter image elements |
+ | `filter_main_only` | Boolean | Extract main content only |
+
+ ### Output Options
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `return_json_data` | Boolean | Return SSR JSON data |
+ | `return_headers` | Boolean | Include HTTP headers |
+ | `return_cookies` | Boolean | Include cookies |
+ | `return_page_links` | Boolean | Include discovered links |
+ | `return_embeddings` | Boolean | Include OpenAI embeddings |
+ | `metadata` | Boolean | Collect page metadata |
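+
+ For example, to return headers and discovered links along with the content
+ (an illustrative combination):
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   return_format :markdown
+   return_headers true       # illustrative combination
+   return_page_links true
+   metadata true
+ end
+ ```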
+
+ ### Network Configuration
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `network_blacklist` | Array | Block matching network requests |
+ | `network_whitelist` | Array | Allow only matching requests |
+ | `anti_bot` | Boolean | Enable anti-bot measures |
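+
+ For example, to block third-party requests during rendering (the patterns
+ below are illustrative):
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   request :chrome
+   network_blacklist [ 'doubleclick.net', 'googletagmanager.com' ]  # illustrative patterns
+   anti_bot true
+ end
+ ```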
+
+ ### Session & Authentication
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `session` | Boolean | Persist session |
+ | `cookies` | String | HTTP cookies |
+ | `headers` | Hash | Custom HTTP headers |
+ | `user_agent` | String | Custom user agent |
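+
+ For example, to send an authenticated request (the header, cookie, and user
+ agent values are illustrative):
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   session true
+   cookies 'session_id=abc123'                            # illustrative values
+   headers( { 'Authorization' => 'Bearer YOUR_TOKEN' } )
+   user_agent 'Mozilla/5.0 (compatible; MyBot/1.0)'
+ end
+ ```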
+
+ ### Proxy Configuration
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `proxy` | Symbol | Proxy pool: `:residential`, `:mobile`, `:isp` |
+ | `proxy_enabled` | Boolean | Enable proxy |
+ | `remote_proxy` | String | External proxy URL |
+ | `country_code` | String | ISO country code |
+ | `locale` | String | Locale (e.g., `"en-US"`) |
+
+ ### Browser Configuration
+
+ | Option | Type | Description |
+ |--------|------|-------------|
+ | `fingerprint` | Boolean | Use fingerprint detection |
+ | `stealth` | Boolean | Stealth mode |
+ | `viewport` | Hash | Browser viewport `{width:, height:}` |
+ | `device` | Symbol | Device: `:mobile`, `:tablet`, `:desktop` |
+ | `scroll` | Integer | Scroll duration (ms) |
+ | `block_ads` | Boolean | Block ads |
+ | `block_analytics` | Boolean | Block analytics |
+ | `block_stylesheets` | Boolean | Block stylesheets |
+ | `block_images` | Boolean | Block images |
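+
+ For example, to emulate a mobile device and skip heavyweight assets
+ (illustrative values):
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   request :chrome
+   device :mobile                            # illustrative values
+   viewport( { width: 390, height: 844 } )
+   scroll 2000                               # scroll for 2 seconds
+   block_ads true
+   block_images true
+ end
+ ```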
+
+ ### Wait Conditions
+
+ ```ruby
+ wait_for do
+   # Wait for CSS selector
+   selector '#content'
+
+   # Wait for network idle
+   idle_network do
+     timeout { seconds 5; nanoseconds 0 }
+   end
+
+   # Wait for delay
+   delay do
+     timeout { seconds 2; nanoseconds 0 }
+   end
+ end
+ ```
+
+ ### AI/LLM Integration
+
+ ```ruby
+ gpt_config do
+   prompt 'Summarize this page'
+   model 'gpt-4'
+   max_tokens 1000
+   temperature 0.7
+   top_p 0.9
+ end
+ ```
+
+ ### Chunking
+
+ ```ruby
+ chunking_algorithm do
+   type :by_words   # :no, :by_words, :by_lines, :by_character_length, :by_sentence
+   value 500
+ end
+ ```
+
+ ## Examples
+
+ ### Extract Clean Content
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   return_format :markdown
+   readability true
+   filter_main_only true
+ end
+
+ response = SpiderCloud.scrape( 'https://example.com/article', options )
+ ```
+
+ ### With CSS Extraction
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   css_extraction_map( {
+     'title' => 'h1',
+     'price' => '.price',
+     'description' => '.product-description'
+   } )
+ end
+ ```
+
+ ### With Proxy and Stealth
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   proxy :residential
+   proxy_enabled true
+   country_code 'US'
+   stealth true
+   fingerprint true
+ end
+ ```
+
+ ### Wait for Dynamic Content
+
+ ```ruby
+ options = SpiderCloud::ScrapeOptions.build do
+   request :chrome
+   wait_for do
+     selector '.loaded-content'
+   end
+ end
+ ```