datahen 0.19.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 382b0aa0e8191e8ba977e9e17c8f4db7b49d71e3e5791681d0c22c6ab3234851
4
- data.tar.gz: 50ded716d3f79419735e84e2d8edd55ddb845ca9b09b7a24820e09d912b0f52b
3
+ metadata.gz: d90c6eca445a5ffc51a59c784e7a297801938864dfd4b9f22984ebb1917028de
4
+ data.tar.gz: e2b68ac1b025f8c24efbed1ba01d8b0e87edbfe3630de2f01d01bffe258f0bf1
5
5
  SHA512:
6
- metadata.gz: e9f280b1857ac04e918a9179ffcc2afae27e8f36e54613a6fc3b2a446b52be9cb81f8ea74d23634fcd7c0cd87292fefe1c851c32c5b19b77227dfd7ef2e6ba99
7
- data.tar.gz: 58eaa407c775945303f33f6fa46185b0486c2871664cf29e6eb513fbbe76d7386b926675e8bef6e43f34e28995ed13f973e4d0ca2a5843cf21b6d2ca96877c98
6
+ metadata.gz: 60c0c0013d454e3c805f67ae3450ee9229c0da00d0ce4e4fca4bb716ffd2b6a45da234024d12e04f5ce0952b553063e21f9d144e24c7b99875e623c0f5f924e7
7
+ data.tar.gz: 9a30a97aaa2a6e5d07e45cc6616fdca80a9f99ab04c308d82d06c346654c28cfdf3e111a0d9ebcb74d7dacc0b3d8e1e2e403512887ec04bbfd123c906dbda868
data/lib/datahen/cli/account.rb CHANGED
@@ -11,6 +11,9 @@ module Datahen
11
11
  puts "#{client.profile()}"
12
12
  end
13
13
 
14
+ desc "deploy_key SUBCOMMAND ...ARGS", "manage deploy key"
15
+ subcommand "deploy_key", AccountDeployKey
16
+
14
17
  end
15
18
  end
16
19
 
data/lib/datahen/cli/account_deploy_key.rb ADDED
@@ -0,0 +1,26 @@
1
+ module Datahen
2
+ class CLI < Thor
3
+ class AccountDeployKey < Thor
4
+ package_name "account deploy_key"
5
+ def self.banner(command, namespace = nil, subcommand = false)
6
+ "#{basename} #{@package_name} #{command.usage}"
7
+ end
8
+
9
+ desc "show", "Show public deploy key"
10
+ def show()
11
+ client = Client::DeployKey.new()
12
+ puts "#{client.find()}"
13
+ end
14
+
15
+ desc "recreate", "Recreate public deploy key"
16
+ long_desc <<-LONGDESC
17
+ Recreate public deploy key.
18
+ LONGDESC
19
+ def recreate()
20
+ client = Client::DeployKey.new()
21
+ puts "#{client.create()}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -24,8 +24,9 @@ module Datahen
24
24
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
25
25
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
26
26
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
27
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
28
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
27
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
28
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
29
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
29
30
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
30
31
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
31
32
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -52,8 +53,9 @@ module Datahen
52
53
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
53
54
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
54
55
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
55
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
56
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
56
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
57
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
58
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
57
59
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
58
60
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
59
61
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -97,8 +99,9 @@ module Datahen
97
99
  long_desc <<-LONGDESC
98
100
  Starts a scraper by creating an active scrape job\x5
99
101
  LONGDESC
100
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
101
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
103
106
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
104
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
@@ -99,8 +99,9 @@ module Datahen
99
99
  long_desc <<-LONGDESC
100
100
  Updates a scraper's current job.
101
101
  LONGDESC
102
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
103
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 0. '
104
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
105
106
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
107
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
data/lib/datahen/cli.rb CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
16
16
  require 'datahen/cli/seeder'
17
17
  require 'datahen/cli/finisher'
18
18
  require 'datahen/cli/env_var'
19
+ require 'datahen/cli/account_deploy_key'
19
20
  require 'datahen/cli/account'
20
21
 
21
-
22
-
23
22
  module Datahen
24
23
  class CLI < Thor
25
24
  desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
@@ -7,6 +7,12 @@ module Datahen
7
7
 
8
8
  default_timeout 60
9
9
 
10
+ DEFAULT_RETRY_LIMIT = {
11
+ seeder: nil,
12
+ parser: 2,
13
+ finisher: nil
14
+ }
15
+
10
16
  def self.env_auth_token
11
17
  ENV['DATAHEN_TOKEN']
12
18
  end
@@ -33,6 +39,42 @@ module Datahen
33
39
  @auth_token = value
34
40
  end
35
41
 
42
+ def default_retry_limit
43
+ @default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
44
+ end
45
+
46
+ def left_merge target, source
47
+ # validate source and target
48
+ return {} if target.nil? || !target.is_a?(Hash)
49
+ return target if source.nil? || !source.is_a?(Hash)
50
+
51
+ # left merge source into target
52
+ target.merge(source.select{|k,v|target.has_key?(k)})
53
+ end
54
+
55
+ def retry times, delay = nil, err_msg = nil
56
+ limit = times.nil? ? nil : times.to_i
57
+ delay = delay.nil? ? 5 : delay.to_i
58
+ count = 0
59
+ begin
60
+ yield
61
+ rescue StandardError => e
62
+ STDERR.puts(e.inspect)
63
+
64
+ # wait before retry (default 5 sec)
65
+ sleep(delay) if delay > 0
66
+
67
+ # raise error when retry limit is reached
68
+ raise e unless limit.nil? || count < limit
69
+
70
+ # retry with a 100+ failsafe to prevent overflow error due integer limit
71
+ should_aprox = limit.nil? && count > 99
72
+ count += 1 unless should_aprox
73
+ puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
74
+ retry
75
+ end
76
+ end
77
+
36
78
  def initialize(opts={})
37
79
  @ignore_ssl = opts[:ignore_ssl]
38
80
  self.class.base_uri(env_api_url)
@@ -45,6 +87,9 @@ module Datahen
45
87
  verify: !ignore_ssl
46
88
  }
47
89
 
90
+ # extract and merge retry limits
91
+ @default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
92
+
48
93
  query = {}
49
94
  query[:p] = opts[:page] if opts[:page]
50
95
  query[:pp] = opts[:per_page] if opts[:per_page]
@@ -17,7 +17,8 @@ module Datahen
17
17
  def update(job_id, opts={})
18
18
  body = {}
19
19
  body[:status] = opts[:status] if opts[:status]
20
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
20
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
21
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
21
22
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
23
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
24
  body[:profile] = opts[:profile] if opts[:profile]
@@ -54,7 +55,10 @@ module Datahen
54
55
 
55
56
  params = @options.merge({body: body.to_json})
56
57
 
57
- self.class.put("/jobs/#{job_id}/seeding_update", params)
58
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
59
+ self.retry(limit, 5, "Error while updating the seeder.") do
60
+ self.class.put("/jobs/#{job_id}/seeding_update", params)
61
+ end
58
62
  end
59
63
 
60
64
  def finisher_update(job_id, opts={})
@@ -65,7 +69,10 @@ module Datahen
65
69
 
66
70
  params = @options.merge({body: body.to_json})
67
71
 
68
- self.class.put("/jobs/#{job_id}/finisher_update", params)
72
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
73
+ self.retry(limit, 5, "Error while updating the finisher.") do
74
+ self.class.put("/jobs/#{job_id}/finisher_update", params)
75
+ end
69
76
  end
70
77
 
71
78
  def profile(job_id, opts={})
@@ -5,9 +5,11 @@ module Datahen
5
5
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
6
  end
7
7
 
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
8
+ def all(job_id, collection = 'default', opts = {})
9
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
10
+ self.retry(limit, 10, "Error while updating the seeder.") do
11
+ self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
12
+ end
11
13
  end
12
14
 
13
15
  def collections(job_id)
@@ -16,4 +18,3 @@ module Datahen
16
18
  end
17
19
  end
18
20
  end
19
-
@@ -68,7 +68,10 @@ module Datahen
68
68
 
69
69
  params = @options.merge({body: body.to_json})
70
70
 
71
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
71
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
72
+ self.retry(limit, 5, "Error while updating the parser.") do
73
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
74
+ end
72
75
  end
73
76
 
74
77
  def find_content(job_id, gid)
@@ -18,7 +18,8 @@ module Datahen
18
18
  body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
19
19
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
20
20
  body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
21
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
21
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
22
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
22
23
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
23
24
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
24
25
  body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
@@ -42,7 +43,8 @@ module Datahen
42
43
  body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
43
44
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
44
45
  body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
45
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
46
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
47
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
46
48
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
47
49
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
48
50
  body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
@@ -8,7 +8,8 @@ module Datahen
8
8
 
9
9
  def create(scraper_name, opts={})
10
10
  body = {}
11
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
11
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
12
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
12
13
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
14
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
15
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
@@ -36,7 +37,8 @@ module Datahen
36
37
  def update(scraper_name, opts={})
37
38
  body = {}
38
39
  body[:status] = opts[:status] if opts[:status]
39
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
40
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
41
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
40
42
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
41
43
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
42
44
  body[:profile] = opts[:profile] if opts[:profile]
@@ -152,7 +152,7 @@ module Datahen
152
152
  @page_types = []
153
153
  @parsers = Concurrent::Hash.new
154
154
  @config = YAML.load_file(config_file)
155
- self.config['parsers'].each do |v|
155
+ (self.config['parsers'] || []).each do |v|
156
156
  next if !v['disabled'].nil? && !!v['disabled']
157
157
  @page_types << v['page_type']
158
158
  self.parsers[v['page_type']] = v['file']
@@ -5,6 +5,7 @@ module Datahen
5
5
  class Executor
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
+ FIND_OUTPUTS_RETRY_LIMIT = 0
8
9
 
9
10
  attr_accessor :filename, :page, :gid, :job_id
10
11
 
@@ -159,13 +160,18 @@ module Datahen
159
160
  options = {
160
161
  query: query,
161
162
  page: page,
162
- per_page: per_page}
163
+ per_page: per_page
164
+ }
163
165
 
164
166
  # Get job_id
165
167
  query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
166
168
 
169
+ # find outputs
170
+ retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
167
171
  client = Client::JobOutput.new(options)
168
- response = client.all(query_job_id, collection)
172
+ response = client.all(query_job_id, collection, {
173
+ retry_limit: retry_limit
174
+ })
169
175
 
170
176
  if response.code != 200
171
177
  raise "response_code: #{response.code}|#{response.parsed_response}"
@@ -304,6 +310,7 @@ module Datahen
304
310
  end
305
311
 
306
312
  # saving to server
313
+
307
314
  response = update_to_server(
308
315
  job_id: job_id,
309
316
  gid: gid,
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubyFinisherExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -14,6 +14,8 @@ module Datahen
14
14
  # @return [Boollean]
15
15
  attr_accessor :limbo_self
16
16
 
17
+ FIND_OUTPUTS_RETRY_LIMIT = 2
18
+
17
19
  def initialize(options={})
18
20
  @filename = options.fetch(:filename) { raise "Filename is required"}
19
21
  @page = options.fetch(:page) { nil }
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubySeederExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.19.0"
2
+ VERSION = "1.0.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.19.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-09-17 00:00:00.000000000 Z
11
+ date: 2022-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -212,6 +212,7 @@ files:
212
212
  - lib/datahen.rb
213
213
  - lib/datahen/cli.rb
214
214
  - lib/datahen/cli/account.rb
215
+ - lib/datahen/cli/account_deploy_key.rb
215
216
  - lib/datahen/cli/env_var.rb
216
217
  - lib/datahen/cli/finisher.rb
217
218
  - lib/datahen/cli/global_page.rb
@@ -276,7 +277,7 @@ metadata:
276
277
  allowed_push_host: https://rubygems.org
277
278
  homepage_uri: https://datahen.com
278
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
279
- post_install_message:
280
+ post_install_message:
280
281
  rdoc_options: []
281
282
  require_paths:
282
283
  - lib
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
292
293
  version: '0'
293
294
  requirements: []
294
295
  rubygems_version: 3.0.3
295
- signing_key:
296
+ signing_key:
296
297
  specification_version: 4
297
298
  summary: DataHen toolbelt for developers
298
299
  test_files: []