proxycrawl 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +116 -0
- data/lib/proxycrawl.rb +1 -0
- data/lib/proxycrawl/api.rb +7 -13
- data/lib/proxycrawl/leads_api.rb +10 -3
- data/lib/proxycrawl/storage_api.rb +116 -0
- data/lib/proxycrawl/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 464bcfcfd4be75be12ec870b5cb2ac5d9f38ed01e1fa2ba44ee1746cf9031795
|
4
|
+
data.tar.gz: 579e140f00efd51ec451b1e33d926d079b3cd5f4353e3a1c001bf37ce04e4e51
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00d314fe67826d76c4ec0470cc5419495d047ed0aa5a408d9d625c45919bd5f1f3c207e0fca30feb66b1b8360e9e7c80f3e9b99967ea6661d5bafb8b981930e1
|
7
|
+
data.tar.gz: 67e1685970d48281b357bcda337fa7bbab40794e9488c5afd3af2810b2cb66a7494e3c8135ca6ff0c085b1d3aa329297b527788d614d2ae578f3b2adb9e5894e
|
data/README.md
CHANGED
@@ -149,6 +149,7 @@ Example:
|
|
149
149
|
```ruby
|
150
150
|
begin
|
151
151
|
response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
|
152
|
+
puts response.remaining_requests
|
152
153
|
puts response.status_code
|
153
154
|
puts response.body
|
154
155
|
rescue => exception
|
@@ -160,11 +161,15 @@ end
|
|
160
161
|
|
161
162
|
Initialize with your Leads API token and call the `get` method.
|
162
163
|
|
164
|
+
For more details on the implementation, please visit the [Leads API documentation](https://proxycrawl.com/docs/leads-api).
|
165
|
+
|
163
166
|
```ruby
|
164
167
|
leads_api = ProxyCrawl::LeadsAPI.new(token: 'YOUR_TOKEN')
|
165
168
|
|
166
169
|
begin
|
167
170
|
response = leads_api.get('stripe.com')
|
171
|
+
puts response.success
|
172
|
+
puts response.remaining_requests
|
168
173
|
puts response.status_code
|
169
174
|
puts response.body
|
170
175
|
rescue => exception
|
@@ -184,6 +189,8 @@ screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
|
|
184
189
|
|
185
190
|
begin
|
186
191
|
response = screenshots_api.get('https://www.apple.com')
|
192
|
+
puts response.success
|
193
|
+
puts response.remaining_requests
|
187
194
|
puts response.status_code
|
188
195
|
puts response.screenshot_path # do something with screenshot_path here
|
189
196
|
rescue => exception
|
@@ -200,6 +207,8 @@ begin
|
|
200
207
|
response = screenshots_api.get('https://www.apple.com') do |file|
|
201
208
|
# do something (reading/writing) with the image file here
|
202
209
|
end
|
210
|
+
puts response.success
|
211
|
+
puts response.remaining_requests
|
203
212
|
puts response.status_code
|
204
213
|
rescue => exception
|
205
214
|
puts exception.backtrace
|
@@ -215,6 +224,8 @@ begin
|
|
215
224
|
response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
|
216
225
|
# do something (reading/writing) with the image file here
|
217
226
|
end
|
227
|
+
puts response.success
|
228
|
+
puts response.remaining_requests
|
218
229
|
puts response.status_code
|
219
230
|
rescue => exception
|
220
231
|
puts exception.backtrace
|
@@ -223,6 +234,111 @@ end
|
|
223
234
|
|
224
235
|
Note that `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters)
|
225
236
|
|
237
|
+
## Storage API usage
|
238
|
+
|
239
|
+
Initialize the Storage API using your private token.
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
storage_api = ProxyCrawl::StorageAPI.new(token: 'YOUR_TOKEN')
|
243
|
+
```
|
244
|
+
|
245
|
+
Pass the [url](https://proxycrawl.com/docs/storage-api/parameters/#url) that you want to get from [Proxycrawl Storage](https://proxycrawl.com/dashboard/storage).
|
246
|
+
|
247
|
+
```ruby
|
248
|
+
begin
|
249
|
+
response = storage_api.get('https://www.apple.com')
|
250
|
+
puts response.original_status
|
251
|
+
puts response.pc_status
|
252
|
+
puts response.url
|
253
|
+
puts response.status_code
|
254
|
+
puts response.rid
|
255
|
+
puts response.body
|
256
|
+
puts response.stored_at
|
257
|
+
rescue => exception
|
258
|
+
puts exception.backtrace
|
259
|
+
end
|
260
|
+
```
|
261
|
+
|
262
|
+
or you can use the [RID](https://proxycrawl.com/docs/storage-api/parameters/#rid)
|
263
|
+
|
264
|
+
```ruby
|
265
|
+
begin
|
266
|
+
response = storage_api.get(RID)
|
267
|
+
puts response.original_status
|
268
|
+
puts response.pc_status
|
269
|
+
puts response.url
|
270
|
+
puts response.status_code
|
271
|
+
puts response.rid
|
272
|
+
puts response.body
|
273
|
+
puts response.stored_at
|
274
|
+
rescue => exception
|
275
|
+
puts exception.backtrace
|
276
|
+
end
|
277
|
+
```
|
278
|
+
|
279
|
+
Note: Either the RID or the URL must be sent; both parameters are optional on their own, but it is mandatory to provide at least one of the two.
|
280
|
+
|
281
|
+
### [Delete](https://proxycrawl.com/docs/storage-api/delete/) request
|
282
|
+
|
283
|
+
To delete a storage item from your storage area, use the correct RID
|
284
|
+
|
285
|
+
```ruby
|
286
|
+
if storage_api.delete(RID)
|
287
|
+
puts 'delete success'
|
288
|
+
else
|
289
|
+
puts "Unable to delete: #{storage_api.body['error']}"
|
290
|
+
end
|
291
|
+
```
|
292
|
+
|
293
|
+
### [Bulk](https://proxycrawl.com/docs/storage-api/bulk/) request
|
294
|
+
|
295
|
+
To do a bulk request with a list of RIDs, please send the list of RIDs as an array
|
296
|
+
|
297
|
+
```ruby
|
298
|
+
begin
|
299
|
+
response = storage_api.bulk([RID1, RID2, RID3, ...])
|
300
|
+
puts response.original_status
|
301
|
+
puts response.pc_status
|
302
|
+
puts response.url
|
303
|
+
puts response.status_code
|
304
|
+
puts response.rid
|
305
|
+
puts response.body
|
306
|
+
puts response.stored_at
|
307
|
+
rescue => exception
|
308
|
+
puts exception.backtrace
|
309
|
+
end
|
310
|
+
```
|
311
|
+
|
312
|
+
### [RIDs](https://proxycrawl.com/docs/storage-api/rids) request
|
313
|
+
|
314
|
+
To request a bulk list of RIDs from your storage area
|
315
|
+
|
316
|
+
```ruby
|
317
|
+
begin
|
318
|
+
response = storage_api.rids
|
319
|
+
puts response.status_code
|
320
|
+
puts response.rid
|
321
|
+
puts response.body
|
322
|
+
rescue => exception
|
323
|
+
puts exception.backtrace
|
324
|
+
end
|
325
|
+
```
|
326
|
+
|
327
|
+
You can also specify a limit as a parameter
|
328
|
+
|
329
|
+
```ruby
|
330
|
+
storage_api.rids(100)
|
331
|
+
```
|
332
|
+
|
333
|
+
### [Total Count](https://proxycrawl.com/docs/storage-api/total_count)
|
334
|
+
|
335
|
+
To get the total number of documents in your storage area
|
336
|
+
|
337
|
+
```ruby
|
338
|
+
total_count = storage_api.total_count
|
339
|
+
puts "total_count: #{total_count}"
|
340
|
+
```
|
341
|
+
|
226
342
|
If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
|
227
343
|
|
228
344
|
## Development
|
data/lib/proxycrawl.rb
CHANGED
data/lib/proxycrawl/api.rb
CHANGED
@@ -6,7 +6,7 @@ require 'uri'
|
|
6
6
|
|
7
7
|
module ProxyCrawl
|
8
8
|
class API
|
9
|
-
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url
|
9
|
+
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
|
10
10
|
|
11
11
|
INVALID_TOKEN = 'Token is required'
|
12
12
|
INVALID_URL = 'URL is required'
|
@@ -69,19 +69,13 @@ module ProxyCrawl
|
|
69
69
|
end
|
70
70
|
|
71
71
|
def prepare_response(response, format)
|
72
|
-
|
73
|
-
json_body = JSON.parse(response.body)
|
74
|
-
@original_status = json_body['original_status'].to_i
|
75
|
-
@pc_status = json_body['pc_status'].to_i
|
76
|
-
@url = json_body['url']
|
77
|
-
@status_code = response.code.to_i
|
78
|
-
else
|
79
|
-
@original_status = response['original_status'].to_i
|
80
|
-
@status_code = response.code.to_i
|
81
|
-
@pc_status = response['pc_status'].to_i
|
82
|
-
@url = response['url']
|
83
|
-
end
|
72
|
+
res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
|
84
73
|
|
74
|
+
@original_status = res['original_status'].to_i
|
75
|
+
@pc_status = res['pc_status'].to_i
|
76
|
+
@url = res['url']
|
77
|
+
@storage_url = res['storage_url']
|
78
|
+
@status_code = response.code.to_i
|
85
79
|
@body = response.body
|
86
80
|
end
|
87
81
|
end
|
data/lib/proxycrawl/leads_api.rb
CHANGED
@@ -6,13 +6,13 @@ require 'uri'
|
|
6
6
|
|
7
7
|
module ProxyCrawl
|
8
8
|
class LeadsAPI
|
9
|
-
attr_reader :token, :body, :status_code
|
9
|
+
attr_reader :token, :body, :status_code, :success, :remaining_requests
|
10
10
|
|
11
11
|
INVALID_TOKEN = 'Token is required'
|
12
12
|
INVALID_DOMAIN = 'Domain is required'
|
13
13
|
|
14
14
|
def initialize(options = {})
|
15
|
-
raise INVALID_TOKEN if options[:token].nil?
|
15
|
+
raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
|
16
16
|
|
17
17
|
@token = options[:token]
|
18
18
|
end
|
@@ -24,11 +24,18 @@ module ProxyCrawl
|
|
24
24
|
uri.query = URI.encode_www_form({ token: token, domain: domain })
|
25
25
|
|
26
26
|
response = Net::HTTP.get_response(uri)
|
27
|
-
|
28
27
|
@status_code = response.code.to_i
|
29
28
|
@body = response.body
|
30
29
|
|
30
|
+
json_body = JSON.parse(response.body)
|
31
|
+
@success = json_body['success']
|
32
|
+
@remaining_requests = json_body['remaining_requests'].to_i
|
33
|
+
|
31
34
|
self
|
32
35
|
end
|
36
|
+
|
37
|
+
def post
|
38
|
+
raise 'Only GET is allowed for the LeadsAPI'
|
39
|
+
end
|
33
40
|
end
|
34
41
|
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
require 'json'
|
5
|
+
require 'uri'
|
6
|
+
|
7
|
+
module ProxyCrawl
|
8
|
+
class StorageAPI
|
9
|
+
attr_reader :token, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at
|
10
|
+
|
11
|
+
INVALID_TOKEN = 'Token is required'
|
12
|
+
INVALID_RID = 'RID is required'
|
13
|
+
INVALID_RID_ARRAY = 'One or more RIDs are required'
|
14
|
+
INVALID_URL_OR_RID = 'Either URL or RID is required'
|
15
|
+
BASE_URL = 'https://api.proxycrawl.com/storage'
|
16
|
+
|
17
|
+
def initialize(options = {})
|
18
|
+
raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
|
19
|
+
|
20
|
+
@token = options[:token]
|
21
|
+
end
|
22
|
+
|
23
|
+
def get(url_or_rid, format = 'html')
|
24
|
+
raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?
|
25
|
+
|
26
|
+
uri = URI(BASE_URL)
|
27
|
+
uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
|
28
|
+
response = Net::HTTP.get_response(uri)
|
29
|
+
|
30
|
+
res = format == 'json' ? JSON.parse(response.body) : response
|
31
|
+
|
32
|
+
@original_status = res['original_status'].to_i
|
33
|
+
@pc_status = res['pc_status'].to_i
|
34
|
+
@url = res['url']
|
35
|
+
@rid = res['rid']
|
36
|
+
@stored_at = res['stored_at']
|
37
|
+
|
38
|
+
@status_code = response.code.to_i
|
39
|
+
@body = response.body
|
40
|
+
|
41
|
+
self
|
42
|
+
end
|
43
|
+
|
44
|
+
def delete(rid)
|
45
|
+
raise INVALID_RID if rid.nil? || rid.empty?
|
46
|
+
|
47
|
+
uri = URI(BASE_URL)
|
48
|
+
uri.query = URI.encode_www_form(token: token, rid: rid)
|
49
|
+
http = Net::HTTP.new(uri.host)
|
50
|
+
request = Net::HTTP::Delete.new(uri.request_uri)
|
51
|
+
response = http.request(request)
|
52
|
+
|
53
|
+
@url, @original_status, @pc_status, @stored_at = nil
|
54
|
+
@status_code = response.code.to_i
|
55
|
+
@rid = rid
|
56
|
+
@body = JSON.parse(response.body)
|
57
|
+
|
58
|
+
@body.key?('success')
|
59
|
+
end
|
60
|
+
|
61
|
+
def bulk(rids_array = [])
|
62
|
+
raise INVALID_RID_ARRAY if rids_array.empty?
|
63
|
+
|
64
|
+
uri = URI("#{BASE_URL}/bulk")
|
65
|
+
uri.query = URI.encode_www_form(token: token)
|
66
|
+
http = Net::HTTP.new(uri.host)
|
67
|
+
request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
|
68
|
+
request.body = { rids: rids_array }.to_json
|
69
|
+
response = http.request(request)
|
70
|
+
|
71
|
+
@body = JSON.parse(response.body)
|
72
|
+
@original_status = @body.map { |item| item['original_status'].to_i }
|
73
|
+
@status_code = response.code.to_i
|
74
|
+
@pc_status = @body.map { |item| item['pc_status'].to_i }
|
75
|
+
@url = @body.map { |item| item['url'] }
|
76
|
+
@rid = @body.map { |item| item['rid'] }
|
77
|
+
@stored_at = @body.map { |item| item['stored_at'] }
|
78
|
+
|
79
|
+
self
|
80
|
+
end
|
81
|
+
|
82
|
+
def rids(limit = -1)
|
83
|
+
uri = URI("#{BASE_URL}/rids")
|
84
|
+
query_hash = { token: token }
|
85
|
+
query_hash.merge!({ limit: limit }) if limit >= 0
|
86
|
+
uri.query = URI.encode_www_form(query_hash)
|
87
|
+
|
88
|
+
response = Net::HTTP.get_response(uri)
|
89
|
+
@url, @original_status, @pc_status, @stored_at = nil
|
90
|
+
@status_code = response.code.to_i
|
91
|
+
@body = JSON.parse(response.body)
|
92
|
+
@rid = @body
|
93
|
+
|
94
|
+
@body
|
95
|
+
end
|
96
|
+
|
97
|
+
def total_count
|
98
|
+
uri = URI("#{BASE_URL}/total_count")
|
99
|
+
uri.query = URI.encode_www_form(token: token)
|
100
|
+
|
101
|
+
response = Net::HTTP.get_response(uri)
|
102
|
+
@url, @original_status, @pc_status, @stored_at = nil
|
103
|
+
@status_code = response.code.to_i
|
104
|
+
@rid = rid
|
105
|
+
@body = JSON.parse(response.body)
|
106
|
+
|
107
|
+
body['totalCount']
|
108
|
+
end
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
def decide_url_or_rid(url_or_rid)
|
113
|
+
%r{^https?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
data/lib/proxycrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxycrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- proxycrawl
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-07-
|
11
|
+
date: 2021-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -87,6 +87,7 @@ files:
|
|
87
87
|
- lib/proxycrawl/leads_api.rb
|
88
88
|
- lib/proxycrawl/scraper_api.rb
|
89
89
|
- lib/proxycrawl/screenshots_api.rb
|
90
|
+
- lib/proxycrawl/storage_api.rb
|
90
91
|
- lib/proxycrawl/version.rb
|
91
92
|
- proxycrawl.gemspec
|
92
93
|
homepage: https://github.com/proxycrawl/proxycrawl-ruby
|