datahen 0.20.0 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11c43658f61faff627da448abff393b5cc658c15e1dfd6765c6cfdee96958d01
4
- data.tar.gz: 5dc4481c9755d33dcee2539a24fe1ba2b0336fc4320beaa54204d1636067ee75
3
+ metadata.gz: 4aa3927b9865f2815f64463f1d48b2dadddeaa73d2bd446a98ea9eb9ecb3ff5f
4
+ data.tar.gz: 91b7a370e740721202e9f14c043cc5c28cb9e6102dcd701c5121506042ba753b
5
5
  SHA512:
6
- metadata.gz: e25db9321dfb26cb88d38aa9adf353c9955c3932553fb87affe235bd58f0182e7043775d102ce2bb5e7d6d0764cd76bd71f3be024ced38eadd17c05cfa0efd69
7
- data.tar.gz: b07e2655df12424db7f859da29f0921fa6417dfbfa6b72537615089d6fabac7451e3e61e23ba4f4e8cfff0c0555ef50bf751dd8b91b8be8ae51b23d855145586
6
+ metadata.gz: c643100e60ea20686d882377b7e982829f93b4d4d8750342d47370f9d649688e94517462041b08ec1387901fcc8f33a0b0105e5f4ab43cd378dca5768cc190c4
7
+ data.tar.gz: ba3bae8b462aa6894520dc115452a27fcb3d64571820970ad7b69592670549824ec53daa1c8194392750f746a14ced4baf2e552eafc2684735dc185c57af8e8e
@@ -24,8 +24,9 @@ module Datahen
24
24
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
25
25
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
26
26
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
27
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
28
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
27
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
28
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
29
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
29
30
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
30
31
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
31
32
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -52,8 +53,9 @@ module Datahen
52
53
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
53
54
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
54
55
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
55
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
56
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
56
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
57
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
58
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
57
59
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
58
60
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
59
61
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -97,8 +99,9 @@ module Datahen
97
99
  long_desc <<-LONGDESC
98
100
  Starts a scraper by creating an active scrape job\x5
99
101
  LONGDESC
100
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
101
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
103
106
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
104
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
@@ -99,8 +99,9 @@ module Datahen
99
99
  long_desc <<-LONGDESC
100
100
  Updates a scraper's current job.
101
101
  LONGDESC
102
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
103
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 0. '
104
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
105
106
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
107
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
@@ -7,6 +7,12 @@ module Datahen
7
7
 
8
8
  default_timeout 60
9
9
 
10
+ DEFAULT_RETRY_LIMIT = {
11
+ seeder: nil,
12
+ parser: nil,
13
+ finisher: nil
14
+ }
15
+
10
16
  def self.env_auth_token
11
17
  ENV['DATAHEN_TOKEN']
12
18
  end
@@ -33,6 +39,42 @@ module Datahen
33
39
  @auth_token = value
34
40
  end
35
41
 
42
+ def default_retry_limit
43
+ @default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
44
+ end
45
+
46
+ def left_merge target, source
47
+ # validate source and target
48
+ return {} if target.nil? || !target.is_a?(Hash)
49
+ return target if source.nil? || !source.is_a?(Hash)
50
+
51
+ # left merge source into target
52
+ target.merge(source.select{|k,v|target.has_key?(k)})
53
+ end
54
+
55
+ def retry times, delay = nil, err_msg = nil
56
+ limit = times.nil? ? nil : times.to_i
57
+ delay = delay.nil? ? 5 : delay.to_i
58
+ count = 0
59
+ begin
60
+ yield
61
+ rescue StandardError => e
62
+ STDERR.puts(e.inspect)
63
+
64
+ # wait before retry (default 5 sec)
65
+ sleep(delay) if delay > 0
66
+
67
+ # raise error when retry limit is reached
68
+ raise e unless limit.nil? || count < limit
69
+
70
+ # retry with a 100+ failsafe to prevent overflow error due to the integer limit
71
+ should_aprox = limit.nil? && count > 99
72
+ count += 1 unless should_aprox
73
+ puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
74
+ retry
75
+ end
76
+ end
77
+
36
78
  def initialize(opts={})
37
79
  @ignore_ssl = opts[:ignore_ssl]
38
80
  self.class.base_uri(env_api_url)
@@ -45,6 +87,9 @@ module Datahen
45
87
  verify: !ignore_ssl
46
88
  }
47
89
 
90
+ # extract and merge retry limits
91
+ @default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
92
+
48
93
  query = {}
49
94
  query[:p] = opts[:page] if opts[:page]
50
95
  query[:pp] = opts[:per_page] if opts[:per_page]
@@ -17,7 +17,8 @@ module Datahen
17
17
  def update(job_id, opts={})
18
18
  body = {}
19
19
  body[:status] = opts[:status] if opts[:status]
20
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
20
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
21
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
21
22
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
23
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
24
  body[:profile] = opts[:profile] if opts[:profile]
@@ -54,7 +55,10 @@ module Datahen
54
55
 
55
56
  params = @options.merge({body: body.to_json})
56
57
 
57
- self.class.put("/jobs/#{job_id}/seeding_update", params)
58
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
59
+ self.retry(limit, 5, "Error while updating the seeder.") do
60
+ self.class.put("/jobs/#{job_id}/seeding_update", params)
61
+ end
58
62
  end
59
63
 
60
64
  def finisher_update(job_id, opts={})
@@ -65,7 +69,10 @@ module Datahen
65
69
 
66
70
  params = @options.merge({body: body.to_json})
67
71
 
68
- self.class.put("/jobs/#{job_id}/finisher_update", params)
72
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
73
+ self.retry(limit, 5, "Error while updating the finisher.") do
74
+ self.class.put("/jobs/#{job_id}/finisher_update", params)
75
+ end
69
76
  end
70
77
 
71
78
  def profile(job_id, opts={})
@@ -5,9 +5,11 @@ module Datahen
5
5
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
6
  end
7
7
 
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
8
+ def all(job_id, collection = 'default', opts = {})
9
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
10
+ self.retry(limit, 10, "Error while fetching the job output records.") do
11
+ self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
12
+ end
11
13
  end
12
14
 
13
15
  def collections(job_id)
@@ -16,4 +18,3 @@ module Datahen
16
18
  end
17
19
  end
18
20
  end
19
-
@@ -68,7 +68,10 @@ module Datahen
68
68
 
69
69
  params = @options.merge({body: body.to_json})
70
70
 
71
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
71
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
72
+ self.retry(limit, 5, "Error while updating the parser.") do
73
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
74
+ end
72
75
  end
73
76
 
74
77
  def find_content(job_id, gid)
@@ -18,7 +18,8 @@ module Datahen
18
18
  body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
19
19
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
20
20
  body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
21
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
21
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
22
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
22
23
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
23
24
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
24
25
  body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
@@ -42,7 +43,8 @@ module Datahen
42
43
  body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
43
44
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
44
45
  body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
45
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
46
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
47
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
46
48
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
47
49
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
48
50
  body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
@@ -8,7 +8,8 @@ module Datahen
8
8
 
9
9
  def create(scraper_name, opts={})
10
10
  body = {}
11
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
11
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
12
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
12
13
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
14
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
15
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
@@ -36,7 +37,8 @@ module Datahen
36
37
  def update(scraper_name, opts={})
37
38
  body = {}
38
39
  body[:status] = opts[:status] if opts[:status]
39
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
40
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
41
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
40
42
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
41
43
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
42
44
  body[:profile] = opts[:profile] if opts[:profile]
@@ -152,7 +152,7 @@ module Datahen
152
152
  @page_types = []
153
153
  @parsers = Concurrent::Hash.new
154
154
  @config = YAML.load_file(config_file)
155
- self.config['parsers'].each do |v|
155
+ (self.config['parsers'] || []).each do |v|
156
156
  next if !v['disabled'].nil? && !!v['disabled']
157
157
  @page_types << v['page_type']
158
158
  self.parsers[v['page_type']] = v['file']
@@ -5,6 +5,7 @@ module Datahen
5
5
  class Executor
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
+ FIND_OUTPUTS_RETRY_LIMIT = 0
8
9
 
9
10
  attr_accessor :filename, :page, :gid, :job_id
10
11
 
@@ -159,13 +160,18 @@ module Datahen
159
160
  options = {
160
161
  query: query,
161
162
  page: page,
162
- per_page: per_page}
163
+ per_page: per_page
164
+ }
163
165
 
164
166
  # Get job_id
165
167
  query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
166
168
 
169
+ # find outputs
170
+ retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
167
171
  client = Client::JobOutput.new(options)
168
- response = client.all(query_job_id, collection)
172
+ response = client.all(query_job_id, collection, {
173
+ retry_limit: retry_limit
174
+ })
169
175
 
170
176
  if response.code != 200
171
177
  raise "response_code: #{response.code}|#{response.parsed_response}"
@@ -304,6 +310,7 @@ module Datahen
304
310
  end
305
311
 
306
312
  # saving to server
313
+
307
314
  response = update_to_server(
308
315
  job_id: job_id,
309
316
  gid: gid,
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubyFinisherExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -14,6 +14,8 @@ module Datahen
14
14
  # @return [Boollean]
15
15
  attr_accessor :limbo_self
16
16
 
17
+ FIND_OUTPUTS_RETRY_LIMIT = nil
18
+
17
19
  def initialize(options={})
18
20
  @filename = options.fetch(:filename) { raise "Filename is required"}
19
21
  @page = options.fetch(:page) { nil }
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubySeederExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.20.0"
2
+ VERSION = "1.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.0
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-11-29 00:00:00.000000000 Z
11
+ date: 2022-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -277,7 +277,7 @@ metadata:
277
277
  allowed_push_host: https://rubygems.org
278
278
  homepage_uri: https://datahen.com
279
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
280
- post_install_message:
280
+ post_install_message:
281
281
  rdoc_options: []
282
282
  require_paths:
283
283
  - lib
@@ -293,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
293
293
  version: '0'
294
294
  requirements: []
295
295
  rubygems_version: 3.0.3
296
- signing_key:
296
+ signing_key:
297
297
  specification_version: 4
298
298
  summary: DataHen toolbelt for developers
299
299
  test_files: []