proxycrawl 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8217ab7a72ae67d28e565375f1903aa2c485c250ebf17bd4afa24e03f1d124b1
4
- data.tar.gz: 8626caed930b16ef6287d9075b45d430f6e01de11565abc5d15352908e0b187c
3
+ metadata.gz: 464bcfcfd4be75be12ec870b5cb2ac5d9f38ed01e1fa2ba44ee1746cf9031795
4
+ data.tar.gz: 579e140f00efd51ec451b1e33d926d079b3cd5f4353e3a1c001bf37ce04e4e51
5
5
  SHA512:
6
- metadata.gz: c60c5baea5a6f7638a5d0ee773f1225a05b40d31db781d5e44db6b3673c0c02d4d32a868663ddaf8476b756dae68ed6b49d4e682b50bd74dc874e7759a604fc2
7
- data.tar.gz: 336172ab96bd5b80f3a44ea1fe8efd7717eaadd3ad6b06f2a534ca9df2177b6e42fc8e02ee7595705879bfce95b8788d22789ed8a9ae82d1805087a33f636623
6
+ metadata.gz: 00d314fe67826d76c4ec0470cc5419495d047ed0aa5a408d9d625c45919bd5f1f3c207e0fca30feb66b1b8360e9e7c80f3e9b99967ea6661d5bafb8b981930e1
7
+ data.tar.gz: 67e1685970d48281b357bcda337fa7bbab40794e9488c5afd3af2810b2cb66a7494e3c8135ca6ff0c085b1d3aa329297b527788d614d2ae578f3b2adb9e5894e
data/README.md CHANGED
@@ -149,6 +149,7 @@ Example:
149
149
  ```ruby
150
150
  begin
151
151
  response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
152
+ puts response.remaining_requests
152
153
  puts response.status_code
153
154
  puts response.body
154
155
  rescue => exception
@@ -160,11 +161,15 @@ end
160
161
 
161
162
  Initialize with your Leads API token and call the `get` method.
162
163
 
164
+ For more details on the implementation, please visit the [Leads API documentation](https://proxycrawl.com/docs/leads-api).
165
+
163
166
  ```ruby
164
167
  leads_api = ProxyCrawl::LeadsAPI.new(token: 'YOUR_TOKEN')
165
168
 
166
169
  begin
167
170
  response = leads_api.get('stripe.com')
171
+ puts response.success
172
+ puts response.remaining_requests
168
173
  puts response.status_code
169
174
  puts response.body
170
175
  rescue => exception
@@ -184,6 +189,8 @@ screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
184
189
 
185
190
  begin
186
191
  response = screenshots_api.get('https://www.apple.com')
192
+ puts response.success
193
+ puts response.remaining_requests
187
194
  puts response.status_code
188
195
  puts response.screenshot_path # do something with screenshot_path here
189
196
  rescue => exception
@@ -200,6 +207,8 @@ begin
200
207
  response = screenshots_api.get('https://www.apple.com') do |file|
201
208
  # do something (reading/writing) with the image file here
202
209
  end
210
+ puts response.success
211
+ puts response.remaining_requests
203
212
  puts response.status_code
204
213
  rescue => exception
205
214
  puts exception.backtrace
@@ -215,6 +224,8 @@ begin
215
224
  response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
216
225
  # do something (reading/writing) with the image file here
217
226
  end
227
+ puts response.success
228
+ puts response.remaining_requests
218
229
  puts response.status_code
219
230
  rescue => exception
220
231
  puts exception.backtrace
@@ -223,6 +234,111 @@ end
223
234
 
224
235
  Note that the `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters) argument
225
236
 
237
+ ## Storage API usage
238
+
239
+ Initialize the Storage API using your private token.
240
+
241
+ ```ruby
242
+ storage_api = ProxyCrawl::StorageAPI.new(token: 'YOUR_TOKEN')
243
+ ```
244
+
245
+ Pass the [url](https://proxycrawl.com/docs/storage-api/parameters/#url) that you want to get from [Proxycrawl Storage](https://proxycrawl.com/dashboard/storage).
246
+
247
+ ```ruby
248
+ begin
249
+ response = storage_api.get('https://www.apple.com')
250
+ puts response.original_status
251
+ puts response.pc_status
252
+ puts response.url
253
+ puts response.status_code
254
+ puts response.rid
255
+ puts response.body
256
+ puts response.stored_at
257
+ rescue => exception
258
+ puts exception.backtrace
259
+ end
260
+ ```
261
+
262
+ or you can use the [RID](https://proxycrawl.com/docs/storage-api/parameters/#rid)
263
+
264
+ ```ruby
265
+ begin
266
+ response = storage_api.get(RID)
267
+ puts response.original_status
268
+ puts response.pc_status
269
+ puts response.url
270
+ puts response.status_code
271
+ puts response.rid
272
+ puts response.body
273
+ puts response.stored_at
274
+ rescue => exception
275
+ puts exception.backtrace
276
+ end
277
+ ```
278
+
279
+ Note: Either the RID or the URL must be sent. Both parameters are optional on their own, but it is mandatory to send one of the two.
280
+
281
+ ### [Delete](https://proxycrawl.com/docs/storage-api/delete/) request
282
+
283
+ To delete a storage item from your storage area, use the correct RID
284
+
285
+ ```ruby
286
+ if storage_api.delete(RID)
287
+ puts 'delete success'
288
+ else
289
+ puts "Unable to delete: #{storage_api.body['error']}"
290
+ end
291
+ ```
292
+
293
+ ### [Bulk](https://proxycrawl.com/docs/storage-api/bulk/) request
294
+
295
+ To make a bulk request with a list of RIDs, send the RIDs as an array
296
+
297
+ ```ruby
298
+ begin
299
+ response = storage_api.bulk([RID1, RID2, RID3, ...])
300
+ puts response.original_status
301
+ puts response.pc_status
302
+ puts response.url
303
+ puts response.status_code
304
+ puts response.rid
305
+ puts response.body
306
+ puts response.stored_at
307
+ rescue => exception
308
+ puts exception.backtrace
309
+ end
310
+ ```
311
+
312
+ ### [RIDs](https://proxycrawl.com/docs/storage-api/rids) request
313
+
314
+ To request a bulk list of RIDs from your storage area
315
+
316
+ ```ruby
317
+ begin
318
+ response = storage_api.rids
319
+ puts response.status_code
320
+ puts response.rid
321
+ puts response.body
322
+ rescue => exception
323
+ puts exception.backtrace
324
+ end
325
+ ```
326
+
327
+ You can also specify a limit as a parameter
328
+
329
+ ```ruby
330
+ storage_api.rids(100)
331
+ ```
332
+
333
+ ### [Total Count](https://proxycrawl.com/docs/storage-api/total_count)
334
+
335
+ To get the total number of documents in your storage area
336
+
337
+ ```ruby
338
+ total_count = storage_api.total_count
339
+ puts "total_count: #{total_count}"
340
+ ```
341
+
226
342
  If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
227
343
 
228
344
  ## Development
data/lib/proxycrawl.rb CHANGED
@@ -5,6 +5,7 @@ require 'proxycrawl/api'
5
5
  require 'proxycrawl/scraper_api'
6
6
  require 'proxycrawl/leads_api'
7
7
  require 'proxycrawl/screenshots_api'
8
+ require 'proxycrawl/storage_api'
8
9
 
9
10
  module ProxyCrawl
10
11
  end
@@ -6,7 +6,7 @@ require 'uri'
6
6
 
7
7
  module ProxyCrawl
8
8
  class API
9
- attr_reader :token, :body, :status_code, :original_status, :pc_status, :url
9
+ attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
10
10
 
11
11
  INVALID_TOKEN = 'Token is required'
12
12
  INVALID_URL = 'URL is required'
@@ -69,19 +69,13 @@ module ProxyCrawl
69
69
  end
70
70
 
71
71
  def prepare_response(response, format)
72
- if format == 'json' || base_url.include?('/scraper')
73
- json_body = JSON.parse(response.body)
74
- @original_status = json_body['original_status'].to_i
75
- @pc_status = json_body['pc_status'].to_i
76
- @url = json_body['url']
77
- @status_code = response.code.to_i
78
- else
79
- @original_status = response['original_status'].to_i
80
- @status_code = response.code.to_i
81
- @pc_status = response['pc_status'].to_i
82
- @url = response['url']
83
- end
72
+ res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
84
73
 
74
+ @original_status = res['original_status'].to_i
75
+ @pc_status = res['pc_status'].to_i
76
+ @url = res['url']
77
+ @storage_url = res['storage_url']
78
+ @status_code = response.code.to_i
85
79
  @body = response.body
86
80
  end
87
81
  end
@@ -6,13 +6,13 @@ require 'uri'
6
6
 
7
7
  module ProxyCrawl
8
8
  class LeadsAPI
9
- attr_reader :token, :body, :status_code
9
+ attr_reader :token, :body, :status_code, :success, :remaining_requests
10
10
 
11
11
  INVALID_TOKEN = 'Token is required'
12
12
  INVALID_DOMAIN = 'Domain is required'
13
13
 
14
14
  def initialize(options = {})
15
- raise INVALID_TOKEN if options[:token].nil?
15
+ raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
16
16
 
17
17
  @token = options[:token]
18
18
  end
@@ -24,11 +24,18 @@ module ProxyCrawl
24
24
  uri.query = URI.encode_www_form({ token: token, domain: domain })
25
25
 
26
26
  response = Net::HTTP.get_response(uri)
27
-
28
27
  @status_code = response.code.to_i
29
28
  @body = response.body
30
29
 
30
+ json_body = JSON.parse(response.body)
31
+ @success = json_body['success']
32
+ @remaining_requests = json_body['remaining_requests'].to_i
33
+
31
34
  self
32
35
  end
36
+
37
+ def post
38
+ raise 'Only GET is allowed for the LeadsAPI'
39
+ end
33
40
  end
34
41
  end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'json'
5
+ require 'uri'
6
+
7
+ module ProxyCrawl
8
+ class StorageAPI
9
+ attr_reader :token, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at
10
+
11
+ INVALID_TOKEN = 'Token is required'
12
+ INVALID_RID = 'RID is required'
13
+ INVALID_RID_ARRAY = 'One or more RIDs are required'
14
+ INVALID_URL_OR_RID = 'Either URL or RID is required'
15
+ BASE_URL = 'https://api.proxycrawl.com/storage'
16
+
17
+ def initialize(options = {})
18
+ raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
19
+
20
+ @token = options[:token]
21
+ end
22
+
23
+ def get(url_or_rid, format = 'html')
24
+ raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?
25
+
26
+ uri = URI(BASE_URL)
27
+ uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
28
+ response = Net::HTTP.get_response(uri)
29
+
30
+ res = format == 'json' ? JSON.parse(response.body) : response
31
+
32
+ @original_status = res['original_status'].to_i
33
+ @pc_status = res['pc_status'].to_i
34
+ @url = res['url']
35
+ @rid = res['rid']
36
+ @stored_at = res['stored_at']
37
+
38
+ @status_code = response.code.to_i
39
+ @body = response.body
40
+
41
+ self
42
+ end
43
+
44
+ def delete(rid)
45
+ raise INVALID_RID if rid.nil? || rid.empty?
46
+
47
+ uri = URI(BASE_URL)
48
+ uri.query = URI.encode_www_form(token: token, rid: rid)
49
+ http = Net::HTTP.new(uri.host)
50
+ request = Net::HTTP::Delete.new(uri.request_uri)
51
+ response = http.request(request)
52
+
53
+ @url, @original_status, @pc_status, @stored_at = nil
54
+ @status_code = response.code.to_i
55
+ @rid = rid
56
+ @body = JSON.parse(response.body)
57
+
58
+ @body.key?('success')
59
+ end
60
+
61
+ def bulk(rids_array = [])
62
+ raise INVALID_RID_ARRAY if rids_array.empty?
63
+
64
+ uri = URI("#{BASE_URL}/bulk")
65
+ uri.query = URI.encode_www_form(token: token)
66
+ http = Net::HTTP.new(uri.host)
67
+ request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
68
+ request.body = { rids: rids_array }.to_json
69
+ response = http.request(request)
70
+
71
+ @body = JSON.parse(response.body)
72
+ @original_status = @body.map { |item| item['original_status'].to_i }
73
+ @status_code = response.code.to_i
74
+ @pc_status = @body.map { |item| item['pc_status'].to_i }
75
+ @url = @body.map { |item| item['url'] }
76
+ @rid = @body.map { |item| item['rid'] }
77
+ @stored_at = @body.map { |item| item['stored_at'] }
78
+
79
+ self
80
+ end
81
+
82
+ def rids(limit = -1)
83
+ uri = URI("#{BASE_URL}/rids")
84
+ query_hash = { token: token }
85
+ query_hash.merge!({ limit: limit }) if limit >= 0
86
+ uri.query = URI.encode_www_form(query_hash)
87
+
88
+ response = Net::HTTP.get_response(uri)
89
+ @url, @original_status, @pc_status, @stored_at = nil
90
+ @status_code = response.code.to_i
91
+ @body = JSON.parse(response.body)
92
+ @rid = @body
93
+
94
+ @body
95
+ end
96
+
97
+ def total_count
98
+ uri = URI("#{BASE_URL}/total_count")
99
+ uri.query = URI.encode_www_form(token: token)
100
+
101
+ response = Net::HTTP.get_response(uri)
102
+ @url, @original_status, @pc_status, @stored_at = nil
103
+ @status_code = response.code.to_i
104
+ @rid = rid
105
+ @body = JSON.parse(response.body)
106
+
107
+ body['totalCount']
108
+ end
109
+
110
+ private
111
+
112
+ def decide_url_or_rid(url_or_rid)
113
+ %r{^https?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
114
+ end
115
+ end
116
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ProxyCrawl
4
- VERSION = '0.3.0'
4
+ VERSION = '0.3.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proxycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - proxycrawl
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-07 00:00:00.000000000 Z
11
+ date: 2021-07-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -87,6 +87,7 @@ files:
87
87
  - lib/proxycrawl/leads_api.rb
88
88
  - lib/proxycrawl/scraper_api.rb
89
89
  - lib/proxycrawl/screenshots_api.rb
90
+ - lib/proxycrawl/storage_api.rb
90
91
  - lib/proxycrawl/version.rb
91
92
  - proxycrawl.gemspec
92
93
  homepage: https://github.com/proxycrawl/proxycrawl-ruby