datahen 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea12b1c12b5a5db4a650b35869de91b9b2ccc8c0c5b4e35da904fc77bfee5ebc
4
- data.tar.gz: bd96345cc669816cc281d76065cf64d150268aa8f14659e6395796d2aebd52ec
3
+ metadata.gz: d90c6eca445a5ffc51a59c784e7a297801938864dfd4b9f22984ebb1917028de
4
+ data.tar.gz: e2b68ac1b025f8c24efbed1ba01d8b0e87edbfe3630de2f01d01bffe258f0bf1
5
5
  SHA512:
6
- metadata.gz: 763c11bb6d96fdd92c8d2eb8c7965729b3812dbc0dfa9abb47151a61175f695870369d98cac0663ebdf2c644eda028833be313fb8e7924a353f82049c6430c22
7
- data.tar.gz: 43e074b6acde5a0367fc11f74c0a3dab0c7e1aecfc781c1e927c8e55bb6e367701ec0dcec2aa90d63c988eca16af90577a63e5f8191a5c7c055e9d0fb9e5bbea
6
+ metadata.gz: 60c0c0013d454e3c805f67ae3450ee9229c0da00d0ce4e4fca4bb716ffd2b6a45da234024d12e04f5ce0952b553063e21f9d144e24c7b99875e623c0f5f924e7
7
+ data.tar.gz: 9a30a97aaa2a6e5d07e45cc6616fdca80a9f99ab04c308d82d06c346654c28cfdf3e111a0d9ebcb74d7dacc0b3d8e1e2e403512887ec04bbfd123c906dbda868
@@ -7,6 +7,12 @@ module Datahen
7
7
 
8
8
  default_timeout 60
9
9
 
10
+ DEFAULT_RETRY_LIMIT = {
11
+ seeder: nil,
12
+ parser: 2,
13
+ finisher: nil
14
+ }
15
+
10
16
  def self.env_auth_token
11
17
  ENV['DATAHEN_TOKEN']
12
18
  end
@@ -33,6 +39,42 @@ module Datahen
33
39
  @auth_token = value
34
40
  end
35
41
 
42
+ def default_retry_limit
43
+ @default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
44
+ end
45
+
46
+ def left_merge target, source
47
+ # validate source and target
48
+ return {} if target.nil? || !target.is_a?(Hash)
49
+ return target if source.nil? || !source.is_a?(Hash)
50
+
51
+ # left merge source into target
52
+ target.merge(source.select{|k,v|target.has_key?(k)})
53
+ end
54
+
55
+ def retry times, delay = nil, err_msg = nil
56
+ limit = times.nil? ? nil : times.to_i
57
+ delay = delay.nil? ? 5 : delay.to_i
58
+ count = 0
59
+ begin
60
+ yield
61
+ rescue StandardError => e
62
+ STDERR.puts(e.inspect)
63
+
64
+ # wait before retry (default 5 sec)
65
+ sleep(delay) if delay > 0
66
+
67
+ # raise error when retry limit is reached
68
+ raise e unless limit.nil? || count < limit
69
+
70
+ # retry with a 100+ failsafe to prevent an overflow error due to the integer limit
71
+ should_aprox = limit.nil? && count > 99
72
+ count += 1 unless should_aprox
73
+ puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
74
+ retry
75
+ end
76
+ end
77
+
36
78
  def initialize(opts={})
37
79
  @ignore_ssl = opts[:ignore_ssl]
38
80
  self.class.base_uri(env_api_url)
@@ -45,6 +87,9 @@ module Datahen
45
87
  verify: !ignore_ssl
46
88
  }
47
89
 
90
+ # extract and merge retry limits
91
+ @default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
92
+
48
93
  query = {}
49
94
  query[:p] = opts[:page] if opts[:page]
50
95
  query[:pp] = opts[:per_page] if opts[:per_page]
@@ -55,7 +55,10 @@ module Datahen
55
55
 
56
56
  params = @options.merge({body: body.to_json})
57
57
 
58
- self.class.put("/jobs/#{job_id}/seeding_update", params)
58
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
59
+ self.retry(limit, 5, "Error while updating the seeder.") do
60
+ self.class.put("/jobs/#{job_id}/seeding_update", params)
61
+ end
59
62
  end
60
63
 
61
64
  def finisher_update(job_id, opts={})
@@ -66,7 +69,10 @@ module Datahen
66
69
 
67
70
  params = @options.merge({body: body.to_json})
68
71
 
69
- self.class.put("/jobs/#{job_id}/finisher_update", params)
72
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
73
+ self.retry(limit, 5, "Error while updating the finisher.") do
74
+ self.class.put("/jobs/#{job_id}/finisher_update", params)
75
+ end
70
76
  end
71
77
 
72
78
  def profile(job_id, opts={})
@@ -5,9 +5,11 @@ module Datahen
5
5
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
6
  end
7
7
 
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
8
+ def all(job_id, collection = 'default', opts = {})
9
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
10
+ self.retry(limit, 10, "Error while updating the seeder.") do
11
+ self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
12
+ end
11
13
  end
12
14
 
13
15
  def collections(job_id)
@@ -16,4 +18,3 @@ module Datahen
16
18
  end
17
19
  end
18
20
  end
19
-
@@ -68,7 +68,10 @@ module Datahen
68
68
 
69
69
  params = @options.merge({body: body.to_json})
70
70
 
71
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
71
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
72
+ self.retry(limit, 5, "Error while updating the parser.") do
73
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
74
+ end
72
75
  end
73
76
 
74
77
  def find_content(job_id, gid)
@@ -152,7 +152,7 @@ module Datahen
152
152
  @page_types = []
153
153
  @parsers = Concurrent::Hash.new
154
154
  @config = YAML.load_file(config_file)
155
- self.config['parsers'].each do |v|
155
+ (self.config['parsers'] || []).each do |v|
156
156
  next if !v['disabled'].nil? && !!v['disabled']
157
157
  @page_types << v['page_type']
158
158
  self.parsers[v['page_type']] = v['file']
@@ -5,6 +5,7 @@ module Datahen
5
5
  class Executor
6
6
  # Max allowed page size when querying outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
+ FIND_OUTPUTS_RETRY_LIMIT = 0
8
9
 
9
10
  attr_accessor :filename, :page, :gid, :job_id
10
11
 
@@ -159,13 +160,18 @@ module Datahen
159
160
  options = {
160
161
  query: query,
161
162
  page: page,
162
- per_page: per_page}
163
+ per_page: per_page
164
+ }
163
165
 
164
166
  # Get job_id
165
167
  query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
166
168
 
169
+ # find outputs
170
+ retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
167
171
  client = Client::JobOutput.new(options)
168
- response = client.all(query_job_id, collection)
172
+ response = client.all(query_job_id, collection, {
173
+ retry_limit: retry_limit
174
+ })
169
175
 
170
176
  if response.code != 200
171
177
  raise "response_code: #{response.code}|#{response.parsed_response}"
@@ -304,6 +310,7 @@ module Datahen
304
310
  end
305
311
 
306
312
  # saving to server
313
+
307
314
  response = update_to_server(
308
315
  job_id: job_id,
309
316
  gid: gid,
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubyFinisherExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -14,6 +14,8 @@ module Datahen
14
14
  # @return [Boolean]
15
15
  attr_accessor :limbo_self
16
16
 
17
+ FIND_OUTPUTS_RETRY_LIMIT = 2
18
+
17
19
  def initialize(options={})
18
20
  @filename = options.fetch(:filename) { raise "Filename is required"}
19
21
  @page = options.fetch(:page) { nil }
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubySeederExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-03 00:00:00.000000000 Z
11
+ date: 2022-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -277,7 +277,7 @@ metadata:
277
277
  allowed_push_host: https://rubygems.org
278
278
  homepage_uri: https://datahen.com
279
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
280
- post_install_message:
280
+ post_install_message:
281
281
  rdoc_options: []
282
282
  require_paths:
283
283
  - lib
@@ -293,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
293
293
  version: '0'
294
294
  requirements: []
295
295
  rubygems_version: 3.0.3
296
- signing_key:
296
+ signing_key:
297
297
  specification_version: 4
298
298
  summary: DataHen toolbelt for developers
299
299
  test_files: []