datahen 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea12b1c12b5a5db4a650b35869de91b9b2ccc8c0c5b4e35da904fc77bfee5ebc
4
- data.tar.gz: bd96345cc669816cc281d76065cf64d150268aa8f14659e6395796d2aebd52ec
3
+ metadata.gz: d90c6eca445a5ffc51a59c784e7a297801938864dfd4b9f22984ebb1917028de
4
+ data.tar.gz: e2b68ac1b025f8c24efbed1ba01d8b0e87edbfe3630de2f01d01bffe258f0bf1
5
5
  SHA512:
6
- metadata.gz: 763c11bb6d96fdd92c8d2eb8c7965729b3812dbc0dfa9abb47151a61175f695870369d98cac0663ebdf2c644eda028833be313fb8e7924a353f82049c6430c22
7
- data.tar.gz: 43e074b6acde5a0367fc11f74c0a3dab0c7e1aecfc781c1e927c8e55bb6e367701ec0dcec2aa90d63c988eca16af90577a63e5f8191a5c7c055e9d0fb9e5bbea
6
+ metadata.gz: 60c0c0013d454e3c805f67ae3450ee9229c0da00d0ce4e4fca4bb716ffd2b6a45da234024d12e04f5ce0952b553063e21f9d144e24c7b99875e623c0f5f924e7
7
+ data.tar.gz: 9a30a97aaa2a6e5d07e45cc6616fdca80a9f99ab04c308d82d06c346654c28cfdf3e111a0d9ebcb74d7dacc0b3d8e1e2e403512887ec04bbfd123c906dbda868
@@ -7,6 +7,12 @@ module Datahen
7
7
 
8
8
  default_timeout 60
9
9
 
10
+ DEFAULT_RETRY_LIMIT = {
11
+ seeder: nil,
12
+ parser: 2,
13
+ finisher: nil
14
+ }
15
+
10
16
  def self.env_auth_token
11
17
  ENV['DATAHEN_TOKEN']
12
18
  end
@@ -33,6 +39,42 @@ module Datahen
33
39
  @auth_token = value
34
40
  end
35
41
 
42
+ def default_retry_limit
43
+ @default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
44
+ end
45
+
46
+ def left_merge target, source
47
+ # validate source and target
48
+ return {} if target.nil? || !target.is_a?(Hash)
49
+ return target if source.nil? || !source.is_a?(Hash)
50
+
51
+ # left merge source into target
52
+ target.merge(source.select{|k,v|target.has_key?(k)})
53
+ end
54
+
55
+ def retry times, delay = nil, err_msg = nil
56
+ limit = times.nil? ? nil : times.to_i
57
+ delay = delay.nil? ? 5 : delay.to_i
58
+ count = 0
59
+ begin
60
+ yield
61
+ rescue StandardError => e
62
+ STDERR.puts(e.inspect)
63
+
64
+ # wait before retry (default 5 sec)
65
+ sleep(delay) if delay > 0
66
+
67
+ # raise error when retry limit is reached
68
+ raise e unless limit.nil? || count < limit
69
+
70
+ # retry with a 100+ failsafe to prevent overflow error due to integer limit
71
+ should_aprox = limit.nil? && count > 99
72
+ count += 1 unless should_aprox
73
+ puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
74
+ retry
75
+ end
76
+ end
77
+
36
78
  def initialize(opts={})
37
79
  @ignore_ssl = opts[:ignore_ssl]
38
80
  self.class.base_uri(env_api_url)
@@ -45,6 +87,9 @@ module Datahen
45
87
  verify: !ignore_ssl
46
88
  }
47
89
 
90
+ # extract and merge retry limits
91
+ @default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
92
+
48
93
  query = {}
49
94
  query[:p] = opts[:page] if opts[:page]
50
95
  query[:pp] = opts[:per_page] if opts[:per_page]
@@ -55,7 +55,10 @@ module Datahen
55
55
 
56
56
  params = @options.merge({body: body.to_json})
57
57
 
58
- self.class.put("/jobs/#{job_id}/seeding_update", params)
58
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
59
+ self.retry(limit, 5, "Error while updating the seeder.") do
60
+ self.class.put("/jobs/#{job_id}/seeding_update", params)
61
+ end
59
62
  end
60
63
 
61
64
  def finisher_update(job_id, opts={})
@@ -66,7 +69,10 @@ module Datahen
66
69
 
67
70
  params = @options.merge({body: body.to_json})
68
71
 
69
- self.class.put("/jobs/#{job_id}/finisher_update", params)
72
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
73
+ self.retry(limit, 5, "Error while updating the finisher.") do
74
+ self.class.put("/jobs/#{job_id}/finisher_update", params)
75
+ end
70
76
  end
71
77
 
72
78
  def profile(job_id, opts={})
@@ -5,9 +5,11 @@ module Datahen
5
5
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
6
  end
7
7
 
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
8
+ def all(job_id, collection = 'default', opts = {})
9
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
10
+ self.retry(limit, 10, "Error while updating the seeder.") do
11
+ self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
12
+ end
11
13
  end
12
14
 
13
15
  def collections(job_id)
@@ -16,4 +18,3 @@ module Datahen
16
18
  end
17
19
  end
18
20
  end
19
-
@@ -68,7 +68,10 @@ module Datahen
68
68
 
69
69
  params = @options.merge({body: body.to_json})
70
70
 
71
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
71
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
72
+ self.retry(limit, 5, "Error while updating the parser.") do
73
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
74
+ end
72
75
  end
73
76
 
74
77
  def find_content(job_id, gid)
@@ -152,7 +152,7 @@ module Datahen
152
152
  @page_types = []
153
153
  @parsers = Concurrent::Hash.new
154
154
  @config = YAML.load_file(config_file)
155
- self.config['parsers'].each do |v|
155
+ (self.config['parsers'] || []).each do |v|
156
156
  next if !v['disabled'].nil? && !!v['disabled']
157
157
  @page_types << v['page_type']
158
158
  self.parsers[v['page_type']] = v['file']
@@ -5,6 +5,7 @@ module Datahen
5
5
  class Executor
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
+ FIND_OUTPUTS_RETRY_LIMIT = 0
8
9
 
9
10
  attr_accessor :filename, :page, :gid, :job_id
10
11
 
@@ -159,13 +160,18 @@ module Datahen
159
160
  options = {
160
161
  query: query,
161
162
  page: page,
162
- per_page: per_page}
163
+ per_page: per_page
164
+ }
163
165
 
164
166
  # Get job_id
165
167
  query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
166
168
 
169
+ # find outputs
170
+ retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
167
171
  client = Client::JobOutput.new(options)
168
- response = client.all(query_job_id, collection)
172
+ response = client.all(query_job_id, collection, {
173
+ retry_limit: retry_limit
174
+ })
169
175
 
170
176
  if response.code != 200
171
177
  raise "response_code: #{response.code}|#{response.parsed_response}"
@@ -304,6 +310,7 @@ module Datahen
304
310
  end
305
311
 
306
312
  # saving to server
313
+
307
314
  response = update_to_server(
308
315
  job_id: job_id,
309
316
  gid: gid,
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubyFinisherExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -14,6 +14,8 @@ module Datahen
14
14
  # @return [Boolean]
15
15
  attr_accessor :limbo_self
16
16
 
17
+ FIND_OUTPUTS_RETRY_LIMIT = 2
18
+
17
19
  def initialize(options={})
18
20
  @filename = options.fetch(:filename) { raise "Filename is required"}
19
21
  @page = options.fetch(:page) { nil }
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubySeederExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-03 00:00:00.000000000 Z
11
+ date: 2022-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -277,7 +277,7 @@ metadata:
277
277
  allowed_push_host: https://rubygems.org
278
278
  homepage_uri: https://datahen.com
279
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
280
- post_install_message:
280
+ post_install_message:
281
281
  rdoc_options: []
282
282
  require_paths:
283
283
  - lib
@@ -293,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
293
293
  version: '0'
294
294
  requirements: []
295
295
  rubygems_version: 3.0.3
296
- signing_key:
296
+ signing_key:
297
297
  specification_version: 4
298
298
  summary: DataHen toolbelt for developers
299
299
  test_files: []