spidercloud 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/spider_cloud/costs.rb +15 -0
- data/lib/spider_cloud/crawl_options.rb +154 -0
- data/lib/spider_cloud/crawl_request.rb +28 -0
- data/lib/spider_cloud/crawl_result.rb +62 -0
- data/lib/spider_cloud/error_result.rb +52 -0
- data/lib/spider_cloud/helpers.rb +33 -0
- data/lib/spider_cloud/links_options.rb +52 -0
- data/lib/spider_cloud/links_request.rb +29 -0
- data/lib/spider_cloud/links_result.rb +55 -0
- data/lib/spider_cloud/module_methods.rb +31 -0
- data/lib/spider_cloud/request.rb +41 -0
- data/lib/spider_cloud/response_methods.rb +15 -0
- data/lib/spider_cloud/scrape_options.rb +164 -0
- data/lib/spider_cloud/scrape_request.rb +29 -0
- data/lib/spider_cloud/scrape_result.rb +62 -0
- data/lib/spider_cloud/screenshot_options.rb +84 -0
- data/lib/spider_cloud/screenshot_request.rb +29 -0
- data/lib/spider_cloud/screenshot_result.rb +69 -0
- data/lib/spider_cloud/shared_schemas.rb +80 -0
- data/lib/spider_cloud/version.rb +3 -0
- data/lib/spider_cloud.rb +37 -0
- data/lib/spidercloud.rb +1 -0
- data/readme/crawl.md +218 -0
- data/readme/links.md +198 -0
- data/readme/scrape.md +248 -0
- data/readme/screenshot.md +240 -0
- data/spidercloud.gemspec +40 -0
- metadata +159 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 8ebacc4a7294c02a50c8e49d4e1dda6d75747330af8d17a427ea1e9871e4d025
|
|
4
|
+
data.tar.gz: 1dacbdcbbb2191d4c81741c1527912ea461538f11bce5e078d7d38b6c6bbb16e
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: a873ad78cbc6d96e5aa014d3b61f822e8f94696e2979747f8bc16734419161f8011a3e8f16194ad72690a40a74d7efc85344d0e4969b69b054e6993ecd5fdc02
|
|
7
|
+
data.tar.gz: bb317f6e76f0c6d6d1fb2f079b24f74c4bc864062f38f0a0871039e42ae521946f8f7cb087834ca23607576975d3085e1d63cc89691be10af60409360183678c
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Kristoph Cichocki-Romanov
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# SpiderCloud
|
|
2
|
+
|
|
3
|
+
The SpiderCloud gem provides a lightweight Ruby interface to the
|
|
4
|
+
[Spider Cloud API](https://spider.cloud) for web scraping, crawling, screenshots,
|
|
5
|
+
and link extraction.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Add this line to your application's Gemfile:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem 'spidercloud'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Or install directly:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
gem install spidercloud
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
require 'spider_cloud'
|
|
25
|
+
|
|
26
|
+
# Configure your API key
|
|
27
|
+
SpiderCloud.api_key 'your-api-key'
|
|
28
|
+
|
|
29
|
+
# Scrape a single page
|
|
30
|
+
response = SpiderCloud.scrape( 'https://example.com' )
|
|
31
|
+
puts response.result.content
|
|
32
|
+
|
|
33
|
+
# Crawl a website (limited to 5 pages)
|
|
34
|
+
response = SpiderCloud.crawl( 'https://example.com', limit: 5 )
|
|
35
|
+
response.result.each { | page | puts page.url }
|
|
36
|
+
|
|
37
|
+
# Take a screenshot
|
|
38
|
+
response = SpiderCloud.screenshot( 'https://example.com' )
|
|
39
|
+
response.result.save_to( 'screenshot.png' )
|
|
40
|
+
|
|
41
|
+
# Extract links
|
|
42
|
+
response = SpiderCloud.links( 'https://example.com', limit: 5 )
|
|
43
|
+
puts response.result.urls
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Configuration
|
|
47
|
+
|
|
48
|
+
Set your API key globally:
|
|
49
|
+
|
|
50
|
+
```ruby
|
|
51
|
+
SpiderCloud.api_key 'your-api-key'
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or pass it per-request:
|
|
55
|
+
|
|
56
|
+
```ruby
|
|
57
|
+
request = SpiderCloud::ScrapeRequest.new( api_key: 'your-api-key' )
|
|
58
|
+
response = request.submit( 'https://example.com' )
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Endpoints
|
|
62
|
+
|
|
63
|
+
SpiderCloud supports four main endpoints:
|
|
64
|
+
|
|
65
|
+
- **[Scrape](readme/scrape.md)** - Extract content from a single URL
|
|
66
|
+
- **[Crawl](readme/crawl.md)** - Crawl multiple pages from a starting URL
|
|
67
|
+
- **[Screenshot](readme/screenshot.md)** - Capture screenshots of web pages
|
|
68
|
+
- **[Links](readme/links.md)** - Discover and extract links from a website
|
|
69
|
+
|
|
70
|
+
## Using Options
|
|
71
|
+
|
|
72
|
+
Each endpoint accepts options that can be built using the options builder:
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
76
|
+
return_format :markdown
|
|
77
|
+
readability true
|
|
78
|
+
stealth true
|
|
79
|
+
wait_for do
|
|
80
|
+
selector '#content'
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
response = SpiderCloud.scrape( 'https://example.com', options )
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Or pass options as a hash:
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
response = SpiderCloud.scrape( 'https://example.com', {
|
|
91
|
+
return_format: :markdown,
|
|
92
|
+
readability: true
|
|
93
|
+
} )
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Response Handling
|
|
97
|
+
|
|
98
|
+
All endpoints return a Faraday response with an attached `result` object:
|
|
99
|
+
|
|
100
|
+
```ruby
|
|
101
|
+
response = SpiderCloud.scrape( 'https://example.com' )
|
|
102
|
+
|
|
103
|
+
# Check if the HTTP request succeeded
|
|
104
|
+
response.success? # => true/false
|
|
105
|
+
|
|
106
|
+
# Access the parsed result
|
|
107
|
+
response.result.success? # => true/false
|
|
108
|
+
response.result.content # => "# Page Title\n\nContent..."
|
|
109
|
+
response.result.url # => "https://example.com"
|
|
110
|
+
response.result.status # => 200
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Error Handling
|
|
114
|
+
|
|
115
|
+
When a request fails, the result will be an `ErrorResult`:
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
response = SpiderCloud.scrape( 'https://example.com' )
|
|
119
|
+
|
|
120
|
+
unless response.result.success?
|
|
121
|
+
puts response.result.error_type # => :authentication_error
|
|
122
|
+
puts response.result.error_description # => "The API key is invalid."
|
|
123
|
+
end
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Content Formats
|
|
127
|
+
|
|
128
|
+
The `return_format` option controls the output format:
|
|
129
|
+
|
|
130
|
+
- `:markdown` - Markdown format
|
|
131
|
+
- `:commonmark` - CommonMark format
|
|
132
|
+
- `:raw` - Raw HTML (default)
|
|
133
|
+
- `:text` - Plain text
|
|
134
|
+
- `:html2text` - HTML converted to text
|
|
135
|
+
- `:xml` - XML format
|
|
136
|
+
- `:bytes` - Raw bytes
|
|
137
|
+
- `:empty` - No content (useful for links-only)
|
|
138
|
+
|
|
139
|
+
## Proxy Support
|
|
140
|
+
|
|
141
|
+
Spider Cloud supports multiple proxy types:
|
|
142
|
+
|
|
143
|
+
```ruby
|
|
144
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
145
|
+
proxy :residential
|
|
146
|
+
proxy_enabled true
|
|
147
|
+
country_code 'US'
|
|
148
|
+
end
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Proxy types: `:residential`, `:mobile`, `:isp`
|
|
152
|
+
|
|
153
|
+
## Wait Conditions
|
|
154
|
+
|
|
155
|
+
Wait for specific conditions before extracting content:
|
|
156
|
+
|
|
157
|
+
```ruby
|
|
158
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
159
|
+
wait_for do
|
|
160
|
+
# Wait for a CSS selector
|
|
161
|
+
selector '#loaded'
|
|
162
|
+
|
|
163
|
+
# Or wait for network idle
|
|
164
|
+
idle_network do
|
|
165
|
+
timeout do
|
|
166
|
+
seconds 5
|
|
167
|
+
nanoseconds 0
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Or wait for a delay
|
|
172
|
+
delay do
|
|
173
|
+
timeout do
|
|
174
|
+
seconds 2
|
|
175
|
+
nanoseconds 0
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## AI/LLM Integration
|
|
183
|
+
|
|
184
|
+
Configure GPT to process scraped content:
|
|
185
|
+
|
|
186
|
+
```ruby
|
|
187
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
188
|
+
gpt_config do
|
|
189
|
+
prompt 'Summarize this page in 3 sentences'
|
|
190
|
+
model 'gpt-4'
|
|
191
|
+
max_tokens 500
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Browser Configuration
|
|
197
|
+
|
|
198
|
+
Control browser behavior:
|
|
199
|
+
|
|
200
|
+
```ruby
|
|
201
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
202
|
+
stealth true
|
|
203
|
+
fingerprint true
|
|
204
|
+
block_ads true
|
|
205
|
+
block_analytics true
|
|
206
|
+
viewport do
|
|
207
|
+
width 1920
|
|
208
|
+
height 1080
|
|
209
|
+
end
|
|
210
|
+
device :desktop # :mobile, :tablet, :desktop
|
|
211
|
+
end
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Automation Scripts
|
|
215
|
+
|
|
216
|
+
Execute actions before scraping:
|
|
217
|
+
|
|
218
|
+
```ruby
|
|
219
|
+
options = SpiderCloud::ScrapeOptions.build do
|
|
220
|
+
automation_scripts( {
|
|
221
|
+
'/login' => [
|
|
222
|
+
{ 'Fill' => { 'selector' => '#email', 'value' => 'user@example.com' } },
|
|
223
|
+
{ 'Fill' => { 'selector' => '#password', 'value' => 'secret' } },
|
|
224
|
+
{ 'Click' => 'button[type=submit]' },
|
|
225
|
+
{ 'WaitForNavigation' => true }
|
|
226
|
+
]
|
|
227
|
+
} )
|
|
228
|
+
end
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
The gem is available under the MIT License. See LICENSE for details.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module SpiderCloud

  # Schema for the per-request cost breakdown. Every field is declared as a
  # Float and mapped ( via as: ) to a key of the same name.
  CostsSchema = DynamicSchema::Struct.define do
    ai_cost                 Float, as: :ai_cost
    compute_cost            Float, as: :compute_cost
    file_cost               Float, as: :file_cost
    bytes_transferred_cost  Float, as: :bytes_transferred_cost
    total_cost              Float, as: :total_cost
    transform_cost          Float, as: :transform_cost
  end

  # Concrete costs value object; it adds no behavior beyond the schema struct.
  class Costs < CostsSchema; end

end
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class CrawlOptions
  include DynamicSchema::Definable
  include Helpers

  # Declares every option accepted for a crawl. Nested `do` blocks declare
  # nested option groups; `as:` maps a Ruby-facing name to a different key.
  schema do
    # --- scope of the crawl ---
    limit               Integer
    return_format       Symbol, in: RETURN_FORMATS
    request             Symbol, in: REQUEST_TYPES

    depth               Integer
    subdomains          [ TrueClass, FalseClass ]
    tld                 [ TrueClass, FalseClass ]
    external_domains    String, array: true
    redirect_policy     Symbol, in: REDIRECT_POLICIES

    blacklist           String, array: true
    whitelist           String, array: true
    budget              Hash
    link_rewrite        Hash

    # --- sitemap handling ---
    sitemap             [ TrueClass, FalseClass ]
    sitemap_only        [ TrueClass, FalseClass ]
    sitemap_path        String

    # --- content extraction ---
    readability         [ TrueClass, FalseClass ]
    root_selector       String
    exclude_selector    String
    css_extraction_map  Hash
    filter_main_only    [ TrueClass, FalseClass ]
    full_resources      [ TrueClass, FalseClass ]

    # --- additional data returned with each page ---
    return_json_data    [ TrueClass, FalseClass ]
    return_headers      [ TrueClass, FalseClass ]
    return_cookies      [ TrueClass, FalseClass ]
    return_page_links   [ TrueClass, FalseClass ]
    return_embeddings   [ TrueClass, FalseClass ]
    metadata            [ TrueClass, FalseClass ]

    # --- AI processing ---
    gpt_config do
      prompt            String
      model             String
      max_tokens        Integer
      temperature       Float
      top_p             Float
      api_key           String
      extra_ai_data     [ TrueClass, FalseClass ]
      screenshot        [ TrueClass, FalseClass ]
    end
    custom_prompt       String
    model               String

    chunking_algorithm as: :chunking_alg do
      type              Symbol, in: CHUNKING_TYPES
      value             Integer
    end

    # --- request behavior ---
    request_timeout     Integer, in: 5..255
    lite_mode           [ TrueClass, FalseClass ]
    network_blacklist   String, array: true
    network_whitelist   String, array: true
    anti_bot            [ TrueClass, FalseClass ]
    respect_robots      [ TrueClass, FalseClass ]

    # --- wait conditions; each timeout is sent as secs / nanos ---
    wait_for do
      idle_network do
        timeout do
          seconds       Integer, as: :secs
          nanoseconds   Integer, as: :nanos
        end
      end
      selector          String
      dom do
        timeout do
          seconds       Integer, as: :secs
          nanoseconds   Integer, as: :nanos
        end
      end
      delay do
        timeout do
          seconds       Integer, as: :secs
          nanoseconds   Integer, as: :nanos
        end
      end
      page_navigations do
        timeout do
          seconds       Integer, as: :secs
          nanoseconds   Integer, as: :nanos
        end
      end
    end

    # --- session and headers ---
    session             [ TrueClass, FalseClass ]
    cookies             String
    headers             Hash
    user_agent          String

    # --- proxy ---
    proxy               Symbol, in: PROXY_TYPES
    proxy_enabled       [ TrueClass, FalseClass ]
    remote_proxy        String
    country_code        String

    # --- browser ---
    stealth             [ TrueClass, FalseClass ]
    fingerprint         [ TrueClass, FalseClass ]
    viewport do
      width             Integer
      height            Integer
    end
    device              Symbol, in: DEVICE_TYPES
    scroll              Integer
    block_ads           [ TrueClass, FalseClass ]
    virtual_display     [ TrueClass, FalseClass ]

    automation_scripts  Hash

    # --- credits and limits ---
    max_credits_per_page Integer
    max_credits_allowed  Integer
    crawl_timeout do
      seconds           Integer, as: :secs
      nanoseconds       Integer, as: :nanos
    end

    # --- caching, storage and pacing ---
    cache               [ TrueClass, FalseClass ]
    storageless         [ TrueClass, FalseClass ]
    store_data          [ TrueClass, FalseClass ]
    concurrency_limit   Integer
    delay               Integer

    webhooks do
      destination         String
      on_credits_depleted [ TrueClass, FalseClass ]
      on_find             [ TrueClass, FalseClass ]
    end
  end

  class << self

    # Builds an options object from a hash and/or a builder block, using the
    # schema builder's `build`.
    def build( options = nil, &block )
      built = builder.build( options, &block )
      new( api_options: built )
    end

    # Same as `build` but goes through the schema builder's `build!`
    # ( presumably the validating variant - confirm against DynamicSchema ).
    def build!( options = nil, &block )
      built = builder.build!( options, &block )
      new( api_options: built )
    end

  end

  # options     - raw options run through the schema builder; nil is treated as {}.
  # api_options - already built options; when given, the built `options` are
  #               merged on top of them.
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # Returns the built options as a hash.
  def to_h
    @options.to_h
  end

end
|
|
154
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class CrawlRequest < Request

  # Submits a crawl for the given url and returns the response with a `result` attached.
  #
  # url     - the starting url; normalized with Helpers.normalize_url before sending.
  # options - nil, a CrawlOptions instance, or anything responding to to_h, in which case
  #           it is run through CrawlOptions.build!.
  # block   - forwarded unchanged to `post`.
  #
  # The result is a CrawlResult when the response succeeded and the body parsed to an
  # Array; in every other case it is an ErrorResult built from the status and the parsed
  # body ( if any ).
  def submit( url, options = nil, &block )
    payload =
      if options
        options = CrawlOptions.build!( options.to_h ) unless options.is_a?( CrawlOptions )
        options.to_h
      else
        {}
      end
    # merge into a new hash: to_h may hand back the options object's own hash, and
    # writing the url into it would leak into the caller's options
    payload = payload.merge( url: Helpers.normalize_url( url ) )

    response = post( "#{ BASE_URI }/crawl", payload, &block )

    # a body that cannot be parsed is deliberately treated as "no payload"
    attributes =
      begin
        JSON.parse( response.body, symbolize_names: true )
      rescue StandardError
        nil
      end

    result =
      if response.success? && attributes.is_a?( Array )
        CrawlResult.from_array( attributes )
      else
        # covers both failed responses and successful responses without a valid payload
        ErrorResult.new( response.status, attributes )
      end

    ResponseMethods.install( response, result )
  end

end
|
|
28
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
module SpiderCloud

  # Schema of a single crawled page.
  CrawlResultItemSchema = DynamicSchema::Struct.define do
    content               String
    error                 String
    status                Integer
    duration_elapsed_ms   Integer, as: :duration_elapsed_ms
    costs                 Costs
    url                   String
  end

  class CrawlResultItem < CrawlResultItemSchema
    # an item succeeded when it carries no error and its status, if present, is 2xx
    def success?
      return false unless error.nil?
      return true if status.nil?

      status >= 200 && status < 300
    end
  end

  # Schema of a complete crawl: a list of crawled pages.
  CrawlResultSchema = DynamicSchema::Struct.define do
    items CrawlResultItem, array: true
  end

  class CrawlResult < CrawlResultSchema
    extend Forwardable
    include Enumerable

    def_delegators :items, :each, :[], :count, :size, :length, :first, :last, :empty?

    # wraps the array returned by the API into a result
    def self.from_array( array )
      new( items: array )
    end

    # true only when there are items and every one of them succeeded
    def success?
      items ? items.all?( &:success? ) : false
    end

    # the url of every item
    def urls
      ( items || [] ).map( &:url )
    end

    # the content of every item
    def contents
      ( items || [] ).map( &:content )
    end

    # the items that did not succeed
    def failed
      ( items || [] ).reject( &:success? )
    end

    # the items that succeeded
    def succeeded
      ( items || [] ).select( &:success? )
    end

    # the summed total cost of all items; items without costs count as 0
    def total_cost
      ( items || [] ).sum do | item |
        item.costs&.total_cost || 0
      end
    end
  end

end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class ErrorResult

  attr_reader :error_type, :error_description

  # Builds an error result from an HTTP status code and an optional parsed response body.
  #
  # status_code - the HTTP status of the response; selects the error type and the default
  #               description.
  # attributes  - the parsed response body, if any. Only a Hash carrying a truthy :error
  #               entry overrides the default description.
  def initialize( status_code, attributes = nil )
    @error_type, @error_description = status_code_to_error( status_code )
    # a parsed body may be an Array or a String; both respond to :[] but raise a
    # TypeError when indexed with a Symbol, so only a Hash is consulted
    message = attributes[ :error ] if attributes.is_a?( Hash )
    @error_description = message if message
  end

  # An error result is never successful.
  def success?
    false
  end

  private

  # Maps an HTTP status code to an [ error_type, error_description ] pair.
  def _status_code_to_error( status_code )
    case status_code
    when 200
      [ :unexpected_error,
        "The response was successful but it did not include a valid payload." ]
    when 400
      [ :invalid_request_error,
        "There was an issue with the format or content of your request." ]
    when 401
      [ :authentication_error,
        "There's an issue with your API key." ]
    when 402
      [ :payment_required,
        "The request requires a paid account or you have insufficient credits." ]
    when 404
      [ :not_found_error,
        "The requested resource was not found." ]
    when 429
      [ :rate_limit_error,
        "Your account has hit a rate limit." ]
    # 529 must be tested before the 500..595 range, which would otherwise shadow it
    when 529
      [ :overloaded_error,
        "The Spider Cloud service is overloaded." ]
    when 500..595
      [ :server_error,
        "The Spider Cloud service encountered an unexpected server error." ]
    else
      [ :unknown_error,
        "The Spider Cloud service returned an unexpected status code: '#{ status_code }'." ]
    end
  end

  alias_method :status_code_to_error, :_status_code_to_error

end
|
|
52
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
module Helpers

  # Converts a string separated by whitespace, underscores or dashes to lower camel case,
  # e.g. 'foo_bar' becomes 'fooBar'. Anything responding to to_s ( such as a Symbol ) is
  # accepted, matching string_underscore.
  def string_camelize( string )
    words = string.to_s.split( /[\s_\-]/ )
    words.map.with_index do | word, index |
      index.zero? ? word.downcase : word.capitalize
    end.join
  end

  # Converts a camel case string to lower snake case by inserting an underscore between
  # a lowercase letter and the uppercase letter following it.
  def string_underscore( string )
    string.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
  end

  # Normalizes a url by ensuring it has a trailing slash for root urls and no trailing
  # slash for paths. The stripped input is returned unchanged when it cannot be parsed
  # or when it has no path component at all.
  def normalize_url( url )
    url = url.to_s.strip
    uri = URI.parse( url )
    path = uri.path
    # opaque URIs ( e.g. mailto: ) have a nil path; there is nothing to normalize
    return url if path.nil?

    uri.path = ( path.empty? || path == '/' ) ? '/' : path.chomp( '/' )
    uri.to_s
  rescue URI::InvalidURIError
    url
  end

  module_function :string_camelize, :string_underscore, :normalize_url
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class LinksOptions
  include DynamicSchema::Definable
  include Helpers

  # Declares every option accepted for link extraction.
  schema do
    # --- scope ---
    limit             Integer
    return_format     Symbol

    depth             Integer
    subdomains        [ TrueClass, FalseClass ]
    tld               [ TrueClass, FalseClass ]
    external_domains  String, array: true

    blacklist         String, array: true
    whitelist         String, array: true
    budget            Hash
    redirect_policy   Symbol, in: REDIRECT_POLICIES

    # --- sitemap handling ---
    sitemap           [ TrueClass, FalseClass ]
    sitemap_only      [ TrueClass, FalseClass ]
    sitemap_path      String

    # --- request behavior ---
    request           Symbol, in: REQUEST_TYPES
    request_timeout   Integer, in: 5..255
    cache             [ TrueClass, FalseClass ]
    respect_robots    [ TrueClass, FalseClass ]

    # --- proxy ---
    proxy             Symbol, in: PROXY_TYPES
    proxy_enabled     [ TrueClass, FalseClass ]
    country_code      String
  end

  class << self

    # Builds an options object from a hash and/or a builder block, using the
    # schema builder's `build`.
    def build( options = nil, &block )
      built = builder.build( options, &block )
      new( api_options: built )
    end

    # Same as `build` but goes through the schema builder's `build!`
    # ( presumably the validating variant - confirm against DynamicSchema ).
    def build!( options = nil, &block )
      built = builder.build!( options, &block )
      new( api_options: built )
    end

  end

  # options     - raw options run through the schema builder; nil is treated as {}.
  # api_options - already built options; when given, the built `options` are
  #               merged on top of them.
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # Returns the built options as a hash.
  def to_h
    @options.to_h
  end

end
|
|
52
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module SpiderCloud
|
|
2
|
+
class LinksRequest < Request

  # Submits a link extraction for the given url and returns the response with a `result`
  # attached.
  #
  # url     - the starting url; normalized with Helpers.normalize_url before sending.
  # options - nil, a LinksOptions instance, or anything responding to to_h, in which case
  #           it is run through LinksOptions.build!.
  # block   - forwarded unchanged to `post`.
  #
  # The result is a LinksResult when the response succeeded and the body parsed to an
  # Array; in every other case it is an ErrorResult built from the status and the parsed
  # body ( if any ).
  def submit( url, options = nil, &block )
    payload =
      if options
        options = LinksOptions.build!( options.to_h ) unless options.is_a?( LinksOptions )
        options.to_h
      else
        {}
      end
    # merge into a new hash: to_h may hand back the options object's own hash, and
    # writing the url into it would leak into the caller's options
    payload = payload.merge( url: Helpers.normalize_url( url ) )

    response = post( "#{ BASE_URI }/links", payload, &block )

    # a body that cannot be parsed is deliberately treated as "no payload"
    attributes =
      begin
        JSON.parse( response.body, symbolize_names: true )
      rescue StandardError
        nil
      end

    result =
      if response.success? && attributes.is_a?( Array )
        LinksResult.from_array( attributes )
      else
        # covers both failed responses and successful responses without a valid payload
        ErrorResult.new( response.status, attributes )
      end

    ResponseMethods.install( response, result )
  end

end
|
|
29
|
+
end
|