proxycrawl 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +116 -0
- data/lib/proxycrawl.rb +1 -0
- data/lib/proxycrawl/api.rb +7 -13
- data/lib/proxycrawl/leads_api.rb +10 -3
- data/lib/proxycrawl/storage_api.rb +116 -0
- data/lib/proxycrawl/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 464bcfcfd4be75be12ec870b5cb2ac5d9f38ed01e1fa2ba44ee1746cf9031795
|
4
|
+
data.tar.gz: 579e140f00efd51ec451b1e33d926d079b3cd5f4353e3a1c001bf37ce04e4e51
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00d314fe67826d76c4ec0470cc5419495d047ed0aa5a408d9d625c45919bd5f1f3c207e0fca30feb66b1b8360e9e7c80f3e9b99967ea6661d5bafb8b981930e1
|
7
|
+
data.tar.gz: 67e1685970d48281b357bcda337fa7bbab40794e9488c5afd3af2810b2cb66a7494e3c8135ca6ff0c085b1d3aa329297b527788d614d2ae578f3b2adb9e5894e
|
data/README.md
CHANGED
@@ -149,6 +149,7 @@ Example:
|
|
149
149
|
```ruby
|
150
150
|
begin
|
151
151
|
response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
|
152
|
+
puts response.remaining_requests
|
152
153
|
puts response.status_code
|
153
154
|
puts response.body
|
154
155
|
rescue => exception
|
@@ -160,11 +161,15 @@ end
|
|
160
161
|
|
161
162
|
Initialize with your Leads API token and call the `get` method.
|
162
163
|
|
164
|
+
For more details on the implementation, please visit the [Leads API documentation](https://proxycrawl.com/docs/leads-api).
|
165
|
+
|
163
166
|
```ruby
|
164
167
|
leads_api = ProxyCrawl::LeadsAPI.new(token: 'YOUR_TOKEN')
|
165
168
|
|
166
169
|
begin
|
167
170
|
response = leads_api.get('stripe.com')
|
171
|
+
puts response.success
|
172
|
+
puts response.remaining_requests
|
168
173
|
puts response.status_code
|
169
174
|
puts response.body
|
170
175
|
rescue => exception
|
@@ -184,6 +189,8 @@ screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
|
|
184
189
|
|
185
190
|
begin
|
186
191
|
response = screenshots_api.get('https://www.apple.com')
|
192
|
+
puts response.success
|
193
|
+
puts response.remaining_requests
|
187
194
|
puts response.status_code
|
188
195
|
puts response.screenshot_path # do something with screenshot_path here
|
189
196
|
rescue => exception
|
@@ -200,6 +207,8 @@ begin
|
|
200
207
|
response = screenshots_api.get('https://www.apple.com') do |file|
|
201
208
|
# do something (reading/writing) with the image file here
|
202
209
|
end
|
210
|
+
puts response.success
|
211
|
+
puts response.remaining_requests
|
203
212
|
puts response.status_code
|
204
213
|
rescue => exception
|
205
214
|
puts exception.backtrace
|
@@ -215,6 +224,8 @@ begin
|
|
215
224
|
response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
|
216
225
|
# do something (reading/writing) with the image file here
|
217
226
|
end
|
227
|
+
puts response.success
|
228
|
+
puts response.remaining_requests
|
218
229
|
puts response.status_code
|
219
230
|
rescue => exception
|
220
231
|
puts exception.backtrace
|
@@ -223,6 +234,111 @@ end
|
|
223
234
|
|
224
235
|
Note that the `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters) argument.
|
225
236
|
|
237
|
+
## Storage API usage
|
238
|
+
|
239
|
+
Initialize the Storage API using your private token.
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
storage_api = ProxyCrawl::StorageAPI.new(token: 'YOUR_TOKEN')
|
243
|
+
```
|
244
|
+
|
245
|
+
Pass the [url](https://proxycrawl.com/docs/storage-api/parameters/#url) that you want to get from [Proxycrawl Storage](https://proxycrawl.com/dashboard/storage).
|
246
|
+
|
247
|
+
```ruby
|
248
|
+
begin
|
249
|
+
response = storage_api.get('https://www.apple.com')
|
250
|
+
puts response.original_status
|
251
|
+
puts response.pc_status
|
252
|
+
puts response.url
|
253
|
+
puts response.status_code
|
254
|
+
puts response.rid
|
255
|
+
puts response.body
|
256
|
+
puts response.stored_at
|
257
|
+
rescue => exception
|
258
|
+
puts exception.backtrace
|
259
|
+
end
|
260
|
+
```
|
261
|
+
|
262
|
+
or you can use the [RID](https://proxycrawl.com/docs/storage-api/parameters/#rid)
|
263
|
+
|
264
|
+
```ruby
|
265
|
+
begin
|
266
|
+
response = storage_api.get(RID)
|
267
|
+
puts response.original_status
|
268
|
+
puts response.pc_status
|
269
|
+
puts response.url
|
270
|
+
puts response.status_code
|
271
|
+
puts response.rid
|
272
|
+
puts response.body
|
273
|
+
puts response.stored_at
|
274
|
+
rescue => exception
|
275
|
+
puts exception.backtrace
|
276
|
+
end
|
277
|
+
```
|
278
|
+
|
279
|
+
Note: Either a RID or a URL must be sent. Each one is optional on its own, but at least one of the two is required.
|
280
|
+
|
281
|
+
### [Delete](https://proxycrawl.com/docs/storage-api/delete/) request
|
282
|
+
|
283
|
+
To delete a storage item from your storage area, use the correct RID
|
284
|
+
|
285
|
+
```ruby
|
286
|
+
if storage_api.delete(RID)
|
287
|
+
puts 'delete success'
|
288
|
+
else
|
289
|
+
puts "Unable to delete: #{storage_api.body['error']}"
|
290
|
+
end
|
291
|
+
```
|
292
|
+
|
293
|
+
### [Bulk](https://proxycrawl.com/docs/storage-api/bulk/) request
|
294
|
+
|
295
|
+
To do a bulk request, pass the list of RIDs as an array.
|
296
|
+
|
297
|
+
```ruby
|
298
|
+
begin
|
299
|
+
response = storage_api.bulk([RID1, RID2, RID3, ...])
|
300
|
+
puts response.original_status
|
301
|
+
puts response.pc_status
|
302
|
+
puts response.url
|
303
|
+
puts response.status_code
|
304
|
+
puts response.rid
|
305
|
+
puts response.body
|
306
|
+
puts response.stored_at
|
307
|
+
rescue => exception
|
308
|
+
puts exception.backtrace
|
309
|
+
end
|
310
|
+
```
|
311
|
+
|
312
|
+
### [RIDs](https://proxycrawl.com/docs/storage-api/rids) request
|
313
|
+
|
314
|
+
To request a bulk list of RIDs from your storage area
|
315
|
+
|
316
|
+
```ruby
|
317
|
+
begin
|
318
|
+
response = storage_api.rids
|
319
|
+
puts response.status_code
|
320
|
+
puts response.rid
|
321
|
+
puts response.body
|
322
|
+
rescue => exception
|
323
|
+
puts exception.backtrace
|
324
|
+
end
|
325
|
+
```
|
326
|
+
|
327
|
+
You can also specify a limit as a parameter
|
328
|
+
|
329
|
+
```ruby
|
330
|
+
storage_api.rids(100)
|
331
|
+
```
|
332
|
+
|
333
|
+
### [Total Count](https://proxycrawl.com/docs/storage-api/total_count)
|
334
|
+
|
335
|
+
To get the total number of documents in your storage area
|
336
|
+
|
337
|
+
```ruby
|
338
|
+
total_count = storage_api.total_count
|
339
|
+
puts "total_count: #{total_count}"
|
340
|
+
```
|
341
|
+
|
226
342
|
If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
|
227
343
|
|
228
344
|
## Development
|
data/lib/proxycrawl.rb
CHANGED
data/lib/proxycrawl/api.rb
CHANGED
@@ -6,7 +6,7 @@ require 'uri'
|
|
6
6
|
|
7
7
|
module ProxyCrawl
|
8
8
|
class API
|
9
|
-
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url
|
9
|
+
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
|
10
10
|
|
11
11
|
INVALID_TOKEN = 'Token is required'
|
12
12
|
INVALID_URL = 'URL is required'
|
@@ -69,19 +69,13 @@ module ProxyCrawl
|
|
69
69
|
end
|
70
70
|
|
71
71
|
def prepare_response(response, format)
|
72
|
-
|
73
|
-
json_body = JSON.parse(response.body)
|
74
|
-
@original_status = json_body['original_status'].to_i
|
75
|
-
@pc_status = json_body['pc_status'].to_i
|
76
|
-
@url = json_body['url']
|
77
|
-
@status_code = response.code.to_i
|
78
|
-
else
|
79
|
-
@original_status = response['original_status'].to_i
|
80
|
-
@status_code = response.code.to_i
|
81
|
-
@pc_status = response['pc_status'].to_i
|
82
|
-
@url = response['url']
|
83
|
-
end
|
72
|
+
res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
|
84
73
|
|
74
|
+
@original_status = res['original_status'].to_i
|
75
|
+
@pc_status = res['pc_status'].to_i
|
76
|
+
@url = res['url']
|
77
|
+
@storage_url = res['storage_url']
|
78
|
+
@status_code = response.code.to_i
|
85
79
|
@body = response.body
|
86
80
|
end
|
87
81
|
end
|
data/lib/proxycrawl/leads_api.rb
CHANGED
@@ -6,13 +6,13 @@ require 'uri'
|
|
6
6
|
|
7
7
|
module ProxyCrawl
|
8
8
|
class LeadsAPI
|
9
|
-
attr_reader :token, :body, :status_code
|
9
|
+
attr_reader :token, :body, :status_code, :success, :remaining_requests
|
10
10
|
|
11
11
|
INVALID_TOKEN = 'Token is required'
|
12
12
|
INVALID_DOMAIN = 'Domain is required'
|
13
13
|
|
14
14
|
def initialize(options = {})
|
15
|
-
raise INVALID_TOKEN if options[:token].nil?
|
15
|
+
raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
|
16
16
|
|
17
17
|
@token = options[:token]
|
18
18
|
end
|
@@ -24,11 +24,18 @@ module ProxyCrawl
|
|
24
24
|
uri.query = URI.encode_www_form({ token: token, domain: domain })
|
25
25
|
|
26
26
|
response = Net::HTTP.get_response(uri)
|
27
|
-
|
28
27
|
@status_code = response.code.to_i
|
29
28
|
@body = response.body
|
30
29
|
|
30
|
+
json_body = JSON.parse(response.body)
|
31
|
+
@success = json_body['success']
|
32
|
+
@remaining_requests = json_body['remaining_requests'].to_i
|
33
|
+
|
31
34
|
self
|
32
35
|
end
|
36
|
+
|
37
|
+
def post
|
38
|
+
raise 'Only GET is allowed for the LeadsAPI'
|
39
|
+
end
|
33
40
|
end
|
34
41
|
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
require 'json'
|
5
|
+
require 'uri'
|
6
|
+
|
7
|
+
module ProxyCrawl
  # Client for the ProxyCrawl Storage API.
  #
  # Each request method records the outcome of the most recent call on the
  # instance; the values are exposed through the readers declared below.
  class StorageAPI
    attr_reader :token, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at

    INVALID_TOKEN = 'Token is required'
    INVALID_RID = 'RID is required'
    INVALID_RID_ARRAY = 'One or more RIDs are required'
    INVALID_URL_OR_RID = 'Either URL or RID is required'
    BASE_URL = 'https://api.proxycrawl.com/storage'

    # @param options [Hash] must contain a non-empty +:token+
    # @raise [RuntimeError] with INVALID_TOKEN when the token is nil or empty
    def initialize(options = {})
      raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?

      @token = options[:token]
    end

    # Fetches one stored item, identified either by its URL or by its RID.
    #
    # @param url_or_rid [String] values starting with http:// or https:// are sent as +url+,
    #   anything else is sent as +rid+
    # @param format [String] 'html' (default) or 'json'
    # @return [StorageAPI] self, with the readers populated from the response
    # @raise [RuntimeError] with INVALID_URL_OR_RID when the argument is nil or empty
    def get(url_or_rid, format = 'html')
      raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?

      uri = URI(BASE_URL)
      uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
      response = Net::HTTP.get_response(uri)

      # For the 'json' format the item metadata is read from the parsed body;
      # for any other format it is read from the response headers.
      res = format == 'json' ? JSON.parse(response.body) : response

      @original_status = res['original_status'].to_i
      @pc_status = res['pc_status'].to_i
      @url = res['url']
      @rid = res['rid']
      @stored_at = res['stored_at']

      @status_code = response.code.to_i
      @body = response.body

      self
    end

    # Deletes the stored item with the given RID.
    #
    # @param rid [String]
    # @return [Boolean] true when the parsed response body contains a 'success' key
    # @raise [RuntimeError] with INVALID_RID when the RID is nil or empty
    def delete(rid)
      raise INVALID_RID if rid.nil? || rid.empty?

      uri = URI(BASE_URL)
      uri.query = URI.encode_www_form(token: token, rid: rid)
      response = perform(uri, Net::HTTP::Delete.new(uri.request_uri))

      reset_item_fields
      @status_code = response.code.to_i
      @rid = rid
      @body = JSON.parse(response.body)

      @body.key?('success')
    end

    # Fetches several stored items in a single request.
    #
    # @param rids_array [Array<String>] RIDs to fetch
    # @return [StorageAPI] self; +original_status+, +pc_status+, +url+, +rid+ and +stored_at+
    #   each hold an array with one entry per item of the parsed body
    # @raise [RuntimeError] with INVALID_RID_ARRAY when the list is nil or empty
    def bulk(rids_array = [])
      raise INVALID_RID_ARRAY if rids_array.nil? || rids_array.empty?

      uri = URI("#{BASE_URL}/bulk")
      uri.query = URI.encode_www_form(token: token)
      request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type' => 'application/json' })
      request.body = { rids: rids_array }.to_json
      response = perform(uri, request)

      @body = JSON.parse(response.body)
      @original_status = @body.map { |item| item['original_status'].to_i }
      @status_code = response.code.to_i
      @pc_status = @body.map { |item| item['pc_status'].to_i }
      @url = @body.map { |item| item['url'] }
      @rid = @body.map { |item| item['rid'] }
      @stored_at = @body.map { |item| item['stored_at'] }

      self
    end

    # Requests the RIDs held in the storage area.
    #
    # @param limit [Integer] sent as the +limit+ query parameter only when it is >= 0
    # @return [Object] the parsed JSON body (also stored in +body+ and +rid+)
    def rids(limit = -1)
      uri = URI("#{BASE_URL}/rids")
      query_hash = { token: token }
      query_hash[:limit] = limit if limit >= 0
      uri.query = URI.encode_www_form(query_hash)

      response = Net::HTTP.get_response(uri)
      reset_item_fields
      @status_code = response.code.to_i
      @body = JSON.parse(response.body)
      @rid = @body

      @body
    end

    # Requests the total-count endpoint. +rid+ is not modified by this call.
    #
    # @return [Object] the 'totalCount' entry of the parsed JSON body
    def total_count
      uri = URI("#{BASE_URL}/total_count")
      uri.query = URI.encode_www_form(token: token)

      response = Net::HTTP.get_response(uri)
      reset_item_fields
      @status_code = response.code.to_i
      @body = JSON.parse(response.body)

      body['totalCount']
    end

    private

    # Sends +request+ to the host of +uri+, honouring the URI's scheme and port so that
    # an https BASE_URL is really contacted over TLS instead of plain HTTP on port 80.
    def perform(uri, request)
      Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
        http.request(request)
      end
    end

    # Clears the per-item readers that have no meaning for the current request.
    def reset_item_fields
      @url = nil
      @original_status = nil
      @pc_status = nil
      @stored_at = nil
    end

    # Builds the query parameter identifying the item: +url+ when the value starts with
    # http:// or https://, +rid+ otherwise. \A anchors at the start of the whole string,
    # so a URL on a later line of a multi-line value is not mistaken for a URL argument.
    def decide_url_or_rid(url_or_rid)
      url_or_rid.match?(%r{\Ahttps?://}) ? { url: url_or_rid } : { rid: url_or_rid }
    end
  end
end
|
data/lib/proxycrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxycrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- proxycrawl
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-07-
|
11
|
+
date: 2021-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -87,6 +87,7 @@ files:
|
|
87
87
|
- lib/proxycrawl/leads_api.rb
|
88
88
|
- lib/proxycrawl/scraper_api.rb
|
89
89
|
- lib/proxycrawl/screenshots_api.rb
|
90
|
+
- lib/proxycrawl/storage_api.rb
|
90
91
|
- lib/proxycrawl/version.rb
|
91
92
|
- proxycrawl.gemspec
|
92
93
|
homepage: https://github.com/proxycrawl/proxycrawl-ruby
|