proxycrawl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8217ab7a72ae67d28e565375f1903aa2c485c250ebf17bd4afa24e03f1d124b1
4
- data.tar.gz: 8626caed930b16ef6287d9075b45d430f6e01de11565abc5d15352908e0b187c
3
+ metadata.gz: 464bcfcfd4be75be12ec870b5cb2ac5d9f38ed01e1fa2ba44ee1746cf9031795
4
+ data.tar.gz: 579e140f00efd51ec451b1e33d926d079b3cd5f4353e3a1c001bf37ce04e4e51
5
5
  SHA512:
6
- metadata.gz: c60c5baea5a6f7638a5d0ee773f1225a05b40d31db781d5e44db6b3673c0c02d4d32a868663ddaf8476b756dae68ed6b49d4e682b50bd74dc874e7759a604fc2
7
- data.tar.gz: 336172ab96bd5b80f3a44ea1fe8efd7717eaadd3ad6b06f2a534ca9df2177b6e42fc8e02ee7595705879bfce95b8788d22789ed8a9ae82d1805087a33f636623
6
+ metadata.gz: 00d314fe67826d76c4ec0470cc5419495d047ed0aa5a408d9d625c45919bd5f1f3c207e0fca30feb66b1b8360e9e7c80f3e9b99967ea6661d5bafb8b981930e1
7
+ data.tar.gz: 67e1685970d48281b357bcda337fa7bbab40794e9488c5afd3af2810b2cb66a7494e3c8135ca6ff0c085b1d3aa329297b527788d614d2ae578f3b2adb9e5894e
data/README.md CHANGED
@@ -149,6 +149,7 @@ Example:
149
149
  ```ruby
150
150
  begin
151
151
  response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
152
+ puts response.remaining_requests
152
153
  puts response.status_code
153
154
  puts response.body
154
155
  rescue => exception
@@ -160,11 +161,15 @@ end
160
161
 
161
162
  Initialize with your Leads API token and call the `get` method.
162
163
 
164
+ For more details on the implementation, please visit the [Leads API documentation](https://proxycrawl.com/docs/leads-api).
165
+
163
166
  ```ruby
164
167
  leads_api = ProxyCrawl::LeadsAPI.new(token: 'YOUR_TOKEN')
165
168
 
166
169
  begin
167
170
  response = leads_api.get('stripe.com')
171
+ puts response.success
172
+ puts response.remaining_requests
168
173
  puts response.status_code
169
174
  puts response.body
170
175
  rescue => exception
@@ -184,6 +189,8 @@ screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
184
189
 
185
190
  begin
186
191
  response = screenshots_api.get('https://www.apple.com')
192
+ puts response.success
193
+ puts response.remaining_requests
187
194
  puts response.status_code
188
195
  puts response.screenshot_path # do something with screenshot_path here
189
196
  rescue => exception
@@ -200,6 +207,8 @@ begin
200
207
  response = screenshots_api.get('https://www.apple.com') do |file|
201
208
  # do something (reading/writing) with the image file here
202
209
  end
210
+ puts response.success
211
+ puts response.remaining_requests
203
212
  puts response.status_code
204
213
  rescue => exception
205
214
  puts exception.backtrace
@@ -215,6 +224,8 @@ begin
215
224
  response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
216
225
  # do something (reading/writing) with the image file here
217
226
  end
227
+ puts response.success
228
+ puts response.remaining_requests
218
229
  puts response.status_code
219
230
  rescue => exception
220
231
  puts exception.backtrace
@@ -223,6 +234,111 @@ end
223
234
 
224
235
  Note that the `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters) parameter
225
236
 
237
+ ## Storage API usage
238
+
239
+ Initialize the Storage API using your private token.
240
+
241
+ ```ruby
242
+ storage_api = ProxyCrawl::StorageAPI.new(token: 'YOUR_TOKEN')
243
+ ```
244
+
245
+ Pass the [url](https://proxycrawl.com/docs/storage-api/parameters/#url) that you want to get from [Proxycrawl Storage](https://proxycrawl.com/dashboard/storage).
246
+
247
+ ```ruby
248
+ begin
249
+ response = storage_api.get('https://www.apple.com')
250
+ puts response.original_status
251
+ puts response.pc_status
252
+ puts response.url
253
+ puts response.status_code
254
+ puts response.rid
255
+ puts response.body
256
+ puts response.stored_at
257
+ rescue => exception
258
+ puts exception.backtrace
259
+ end
260
+ ```
261
+
262
+ or you can use the [RID](https://proxycrawl.com/docs/storage-api/parameters/#rid)
263
+
264
+ ```ruby
265
+ begin
266
+ response = storage_api.get(RID)
267
+ puts response.original_status
268
+ puts response.pc_status
269
+ puts response.url
270
+ puts response.status_code
271
+ puts response.rid
272
+ puts response.body
273
+ puts response.stored_at
274
+ rescue => exception
275
+ puts exception.backtrace
276
+ end
277
+ ```
278
+
279
+ Note: Either the RID or the URL must be sent. Each parameter is optional on its own, but it is mandatory to provide one of the two.
280
+
281
+ ### [Delete](https://proxycrawl.com/docs/storage-api/delete/) request
282
+
283
+ To delete a storage item from your storage area, use the correct RID
284
+
285
+ ```ruby
286
+ if storage_api.delete(RID)
287
+ puts 'delete success'
288
+ else
289
+ puts "Unable to delete: #{storage_api.body['error']}"
290
+ end
291
+ ```
292
+
293
+ ### [Bulk](https://proxycrawl.com/docs/storage-api/bulk/) request
294
+
295
+ To do a bulk request with a list of RIDs, please send the list of RIDs as an array
296
+
297
+ ```ruby
298
+ begin
299
+ response = storage_api.bulk([RID1, RID2, RID3, ...])
300
+ puts response.original_status
301
+ puts response.pc_status
302
+ puts response.url
303
+ puts response.status_code
304
+ puts response.rid
305
+ puts response.body
306
+ puts response.stored_at
307
+ rescue => exception
308
+ puts exception.backtrace
309
+ end
310
+ ```
311
+
312
+ ### [RIDs](https://proxycrawl.com/docs/storage-api/rids) request
313
+
314
+ To request a bulk list of RIDs from your storage area
315
+
316
+ ```ruby
317
+ begin
318
+ response = storage_api.rids
319
+ puts response.status_code
320
+ puts response.rid
321
+ puts response.body
322
+ rescue => exception
323
+ puts exception.backtrace
324
+ end
325
+ ```
326
+
327
+ You can also specify a limit as a parameter
328
+
329
+ ```ruby
330
+ storage_api.rids(100)
331
+ ```
332
+
333
+ ### [Total Count](https://proxycrawl.com/docs/storage-api/total_count)
334
+
335
+ To get the total number of documents in your storage area
336
+
337
+ ```ruby
338
+ total_count = storage_api.total_count
339
+ puts "total_count: #{total_count}"
340
+ ```
341
+
226
342
  If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
227
343
 
228
344
  ## Development
data/lib/proxycrawl.rb CHANGED
@@ -5,6 +5,7 @@ require 'proxycrawl/api'
5
5
  require 'proxycrawl/scraper_api'
6
6
  require 'proxycrawl/leads_api'
7
7
  require 'proxycrawl/screenshots_api'
8
+ require 'proxycrawl/storage_api'
8
9
 
9
10
  module ProxyCrawl
10
11
  end
@@ -6,7 +6,7 @@ require 'uri'
6
6
 
7
7
  module ProxyCrawl
8
8
  class API
9
- attr_reader :token, :body, :status_code, :original_status, :pc_status, :url
9
+ attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
10
10
 
11
11
  INVALID_TOKEN = 'Token is required'
12
12
  INVALID_URL = 'URL is required'
@@ -69,19 +69,13 @@ module ProxyCrawl
69
69
  end
70
70
 
71
71
  def prepare_response(response, format)
72
- if format == 'json' || base_url.include?('/scraper')
73
- json_body = JSON.parse(response.body)
74
- @original_status = json_body['original_status'].to_i
75
- @pc_status = json_body['pc_status'].to_i
76
- @url = json_body['url']
77
- @status_code = response.code.to_i
78
- else
79
- @original_status = response['original_status'].to_i
80
- @status_code = response.code.to_i
81
- @pc_status = response['pc_status'].to_i
82
- @url = response['url']
83
- end
72
+ res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
84
73
 
74
+ @original_status = res['original_status'].to_i
75
+ @pc_status = res['pc_status'].to_i
76
+ @url = res['url']
77
+ @storage_url = res['storage_url']
78
+ @status_code = response.code.to_i
85
79
  @body = response.body
86
80
  end
87
81
  end
@@ -6,13 +6,13 @@ require 'uri'
6
6
 
7
7
  module ProxyCrawl
8
8
  class LeadsAPI
9
- attr_reader :token, :body, :status_code
9
+ attr_reader :token, :body, :status_code, :success, :remaining_requests
10
10
 
11
11
  INVALID_TOKEN = 'Token is required'
12
12
  INVALID_DOMAIN = 'Domain is required'
13
13
 
14
14
  def initialize(options = {})
15
- raise INVALID_TOKEN if options[:token].nil?
15
+ raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
16
16
 
17
17
  @token = options[:token]
18
18
  end
@@ -24,11 +24,18 @@ module ProxyCrawl
24
24
  uri.query = URI.encode_www_form({ token: token, domain: domain })
25
25
 
26
26
  response = Net::HTTP.get_response(uri)
27
-
28
27
  @status_code = response.code.to_i
29
28
  @body = response.body
30
29
 
30
+ json_body = JSON.parse(response.body)
31
+ @success = json_body['success']
32
+ @remaining_requests = json_body['remaining_requests'].to_i
33
+
31
34
  self
32
35
  end
36
+
37
+ def post
38
+ raise 'Only GET is allowed for the LeadsAPI'
39
+ end
33
40
  end
34
41
  end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'json'
5
+ require 'uri'
6
+
7
+ module ProxyCrawl
8
+ class StorageAPI
9
+ attr_reader :token, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at
10
+
11
+ INVALID_TOKEN = 'Token is required'
12
+ INVALID_RID = 'RID is required'
13
+ INVALID_RID_ARRAY = 'One or more RIDs are required'
14
+ INVALID_URL_OR_RID = 'Either URL or RID is required'
15
+ BASE_URL = 'https://api.proxycrawl.com/storage'
16
+
17
+ def initialize(options = {})
18
+ raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
19
+
20
+ @token = options[:token]
21
+ end
22
+
23
+ def get(url_or_rid, format = 'html')
24
+ raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?
25
+
26
+ uri = URI(BASE_URL)
27
+ uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
28
+ response = Net::HTTP.get_response(uri)
29
+
30
+ res = format == 'json' ? JSON.parse(response.body) : response
31
+
32
+ @original_status = res['original_status'].to_i
33
+ @pc_status = res['pc_status'].to_i
34
+ @url = res['url']
35
+ @rid = res['rid']
36
+ @stored_at = res['stored_at']
37
+
38
+ @status_code = response.code.to_i
39
+ @body = response.body
40
+
41
+ self
42
+ end
43
+
44
+ def delete(rid)
45
+ raise INVALID_RID if rid.nil? || rid.empty?
46
+
47
+ uri = URI(BASE_URL)
48
+ uri.query = URI.encode_www_form(token: token, rid: rid)
49
+ http = Net::HTTP.new(uri.host)
50
+ request = Net::HTTP::Delete.new(uri.request_uri)
51
+ response = http.request(request)
52
+
53
+ @url, @original_status, @pc_status, @stored_at = nil
54
+ @status_code = response.code.to_i
55
+ @rid = rid
56
+ @body = JSON.parse(response.body)
57
+
58
+ @body.key?('success')
59
+ end
60
+
61
+ def bulk(rids_array = [])
62
+ raise INVALID_RID_ARRAY if rids_array.empty?
63
+
64
+ uri = URI("#{BASE_URL}/bulk")
65
+ uri.query = URI.encode_www_form(token: token)
66
+ http = Net::HTTP.new(uri.host)
67
+ request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
68
+ request.body = { rids: rids_array }.to_json
69
+ response = http.request(request)
70
+
71
+ @body = JSON.parse(response.body)
72
+ @original_status = @body.map { |item| item['original_status'].to_i }
73
+ @status_code = response.code.to_i
74
+ @pc_status = @body.map { |item| item['pc_status'].to_i }
75
+ @url = @body.map { |item| item['url'] }
76
+ @rid = @body.map { |item| item['rid'] }
77
+ @stored_at = @body.map { |item| item['stored_at'] }
78
+
79
+ self
80
+ end
81
+
82
+ def rids(limit = -1)
83
+ uri = URI("#{BASE_URL}/rids")
84
+ query_hash = { token: token }
85
+ query_hash.merge!({ limit: limit }) if limit >= 0
86
+ uri.query = URI.encode_www_form(query_hash)
87
+
88
+ response = Net::HTTP.get_response(uri)
89
+ @url, @original_status, @pc_status, @stored_at = nil
90
+ @status_code = response.code.to_i
91
+ @body = JSON.parse(response.body)
92
+ @rid = @body
93
+
94
+ @body
95
+ end
96
+
97
+ def total_count
98
+ uri = URI("#{BASE_URL}/total_count")
99
+ uri.query = URI.encode_www_form(token: token)
100
+
101
+ response = Net::HTTP.get_response(uri)
102
+ @url, @original_status, @pc_status, @stored_at = nil
103
+ @status_code = response.code.to_i
104
+ @rid = rid
105
+ @body = JSON.parse(response.body)
106
+
107
+ body['totalCount']
108
+ end
109
+
110
+ private
111
+
112
+ def decide_url_or_rid(url_or_rid)
113
+ %r{^https?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
114
+ end
115
+ end
116
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ProxyCrawl
4
- VERSION = '0.3.0'
4
+ VERSION = '0.3.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proxycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - proxycrawl
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-07 00:00:00.000000000 Z
11
+ date: 2021-07-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -87,6 +87,7 @@ files:
87
87
  - lib/proxycrawl/leads_api.rb
88
88
  - lib/proxycrawl/scraper_api.rb
89
89
  - lib/proxycrawl/screenshots_api.rb
90
+ - lib/proxycrawl/storage_api.rb
90
91
  - lib/proxycrawl/version.rb
91
92
  - proxycrawl.gemspec
92
93
  homepage: https://github.com/proxycrawl/proxycrawl-ruby