datahen 0.19.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 382b0aa0e8191e8ba977e9e17c8f4db7b49d71e3e5791681d0c22c6ab3234851
4
- data.tar.gz: 50ded716d3f79419735e84e2d8edd55ddb845ca9b09b7a24820e09d912b0f52b
3
+ metadata.gz: d90c6eca445a5ffc51a59c784e7a297801938864dfd4b9f22984ebb1917028de
4
+ data.tar.gz: e2b68ac1b025f8c24efbed1ba01d8b0e87edbfe3630de2f01d01bffe258f0bf1
5
5
  SHA512:
6
- metadata.gz: e9f280b1857ac04e918a9179ffcc2afae27e8f36e54613a6fc3b2a446b52be9cb81f8ea74d23634fcd7c0cd87292fefe1c851c32c5b19b77227dfd7ef2e6ba99
7
- data.tar.gz: 58eaa407c775945303f33f6fa46185b0486c2871664cf29e6eb513fbbe76d7386b926675e8bef6e43f34e28995ed13f973e4d0ca2a5843cf21b6d2ca96877c98
6
+ metadata.gz: 60c0c0013d454e3c805f67ae3450ee9229c0da00d0ce4e4fca4bb716ffd2b6a45da234024d12e04f5ce0952b553063e21f9d144e24c7b99875e623c0f5f924e7
7
+ data.tar.gz: 9a30a97aaa2a6e5d07e45cc6616fdca80a9f99ab04c308d82d06c346654c28cfdf3e111a0d9ebcb74d7dacc0b3d8e1e2e403512887ec04bbfd123c906dbda868
@@ -11,6 +11,9 @@ module Datahen
11
11
  puts "#{client.profile()}"
12
12
  end
13
13
 
14
+ desc "deploy_key SUBCOMMAND ...ARGS", "manage deploy key"
15
+ subcommand "deploy_key", AccountDeployKey
16
+
14
17
  end
15
18
  end
16
19
 
@@ -0,0 +1,26 @@
1
+ module Datahen
2
+ class CLI < Thor
3
+ class AccountDeployKey < Thor
4
+ package_name "account deploy_key"
5
+ def self.banner(command, namespace = nil, subcommand = false)
6
+ "#{basename} #{@package_name} #{command.usage}"
7
+ end
8
+
9
+ desc "show", "Show public deploy key"
10
+ def show()
11
+ client = Client::DeployKey.new()
12
+ puts "#{client.find()}"
13
+ end
14
+
15
+ desc "recreate", "Recreate public deploy key"
16
+ long_desc <<-LONGDESC
17
+ Recreate public deploy key.
18
+ LONGDESC
19
+ def recreate()
20
+ client = Client::DeployKey.new()
21
+ puts "#{client.create()}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -24,8 +24,9 @@ module Datahen
24
24
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
25
25
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
26
26
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
27
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
28
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
27
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
28
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
29
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
29
30
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
30
31
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
31
32
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -52,8 +53,9 @@ module Datahen
52
53
  option :freshness_type, :aliases => :t, desc: 'Set how fresh the page cache is. Possible values: day, week, month, year. Default: any'
53
54
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
54
55
  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
55
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
56
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
56
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
57
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
58
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
57
59
  option :disable_scheduler, type: :boolean, desc: 'Set true to disable scheduler. Default: false'
58
60
  option :cancel_current_job, type: :boolean, desc: 'Set true to cancel currently active job if scheduler starts. Default: false'
59
61
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
@@ -97,8 +99,9 @@ module Datahen
97
99
  long_desc <<-LONGDESC
98
100
  Starts a scraper by creating an active scrape job\x5
99
101
  LONGDESC
100
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
101
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Default: 1'
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Default: 1'
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
102
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
103
106
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
104
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
@@ -99,8 +99,9 @@ module Datahen
99
99
  long_desc <<-LONGDESC
100
100
  Updates a scraper's current job.
101
101
  LONGDESC
102
- option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
103
- option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
102
+ option :parsers, :aliases => :pw, type: :numeric, desc: 'Set how many parser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
103
+ option :fetchers, :aliases => :fw, type: :numeric, desc: 'Set how many fetcher workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 1. '
104
+ option :browsers, :aliases => :bw, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted (paused then resumed) for it to take effect. Default: 0. '
104
105
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
105
106
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
107
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
data/lib/datahen/cli.rb CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
16
16
  require 'datahen/cli/seeder'
17
17
  require 'datahen/cli/finisher'
18
18
  require 'datahen/cli/env_var'
19
+ require 'datahen/cli/account_deploy_key'
19
20
  require 'datahen/cli/account'
20
21
 
21
-
22
-
23
22
  module Datahen
24
23
  class CLI < Thor
25
24
  desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
@@ -7,6 +7,12 @@ module Datahen
7
7
 
8
8
  default_timeout 60
9
9
 
10
+ DEFAULT_RETRY_LIMIT = {
11
+ seeder: nil,
12
+ parser: 2,
13
+ finisher: nil
14
+ }
15
+
10
16
  def self.env_auth_token
11
17
  ENV['DATAHEN_TOKEN']
12
18
  end
@@ -33,6 +39,42 @@ module Datahen
33
39
  @auth_token = value
34
40
  end
35
41
 
42
+ def default_retry_limit
43
+ @default_retry_limit ||= DEFAULT_RETRY_LIMIT.dup
44
+ end
45
+
46
+ def left_merge target, source
47
+ # validate source and target
48
+ return {} if target.nil? || !target.is_a?(Hash)
49
+ return target if source.nil? || !source.is_a?(Hash)
50
+
51
+ # left merge source into target
52
+ target.merge(source.select{|k,v|target.has_key?(k)})
53
+ end
54
+
55
+ def retry times, delay = nil, err_msg = nil
56
+ limit = times.nil? ? nil : times.to_i
57
+ delay = delay.nil? ? 5 : delay.to_i
58
+ count = 0
59
+ begin
60
+ yield
61
+ rescue StandardError => e
62
+ STDERR.puts(e.inspect)
63
+
64
+ # wait before retry (default 5 sec)
65
+ sleep(delay) if delay > 0
66
+
67
+ # raise error when retry limit is reached
68
+ raise e unless limit.nil? || count < limit
69
+
70
+ # retry with a 100+ failsafe to prevent overflow error due to integer limit
71
+ should_aprox = limit.nil? && count > 99
72
+ count += 1 unless should_aprox
73
+ puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
74
+ retry
75
+ end
76
+ end
77
+
36
78
  def initialize(opts={})
37
79
  @ignore_ssl = opts[:ignore_ssl]
38
80
  self.class.base_uri(env_api_url)
@@ -45,6 +87,9 @@ module Datahen
45
87
  verify: !ignore_ssl
46
88
  }
47
89
 
90
+ # extract and merge retry limits
91
+ @default_retry_limit = self.left_merge(DEFAULT_RETRY_LIMIT, opts[:retry_limit])
92
+
48
93
  query = {}
49
94
  query[:p] = opts[:page] if opts[:page]
50
95
  query[:pp] = opts[:per_page] if opts[:per_page]
@@ -17,7 +17,8 @@ module Datahen
17
17
  def update(job_id, opts={})
18
18
  body = {}
19
19
  body[:status] = opts[:status] if opts[:status]
20
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
20
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
21
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
21
22
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
22
23
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
24
  body[:profile] = opts[:profile] if opts[:profile]
@@ -54,7 +55,10 @@ module Datahen
54
55
 
55
56
  params = @options.merge({body: body.to_json})
56
57
 
57
- self.class.put("/jobs/#{job_id}/seeding_update", params)
58
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:seeder]
59
+ self.retry(limit, 5, "Error while updating the seeder.") do
60
+ self.class.put("/jobs/#{job_id}/seeding_update", params)
61
+ end
58
62
  end
59
63
 
60
64
  def finisher_update(job_id, opts={})
@@ -65,7 +69,10 @@ module Datahen
65
69
 
66
70
  params = @options.merge({body: body.to_json})
67
71
 
68
- self.class.put("/jobs/#{job_id}/finisher_update", params)
72
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:finisher]
73
+ self.retry(limit, 5, "Error while updating the finisher.") do
74
+ self.class.put("/jobs/#{job_id}/finisher_update", params)
75
+ end
69
76
  end
70
77
 
71
78
  def profile(job_id, opts={})
@@ -5,9 +5,11 @@ module Datahen
5
5
  self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
6
  end
7
7
 
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
8
+ def all(job_id, collection = 'default', opts = {})
9
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
10
+ self.retry(limit, 10, "Error while updating the seeder.") do
11
+ self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
12
+ end
11
13
  end
12
14
 
13
15
  def collections(job_id)
@@ -16,4 +18,3 @@ module Datahen
16
18
  end
17
19
  end
18
20
  end
19
-
@@ -68,7 +68,10 @@ module Datahen
68
68
 
69
69
  params = @options.merge({body: body.to_json})
70
70
 
71
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
71
+ limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : self.default_retry_limit[:parser]
72
+ self.retry(limit, 5, "Error while updating the parser.") do
73
+ self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
74
+ end
72
75
  end
73
76
 
74
77
  def find_content(job_id, gid)
@@ -18,7 +18,8 @@ module Datahen
18
18
  body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
19
19
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
20
20
  body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
21
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
21
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
22
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
22
23
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
23
24
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
24
25
  body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
@@ -42,7 +43,8 @@ module Datahen
42
43
  body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
43
44
  body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
44
45
  body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
45
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
46
+ body[:parser_worker_count] = opts[:parsers] || opts[:parser_worker_count] if opts[:parsers] || opts[:parser_worker_count]
47
+ body[:fetcher_worker_count] = opts[:fetchers] || opts[:fetcher_worker_count] if opts[:fetchers] || opts[:fetcher_worker_count]
46
48
  body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
47
49
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
48
50
  body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
@@ -8,7 +8,8 @@ module Datahen
8
8
 
9
9
  def create(scraper_name, opts={})
10
10
  body = {}
11
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
11
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
12
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
12
13
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
14
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
15
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
@@ -36,7 +37,8 @@ module Datahen
36
37
  def update(scraper_name, opts={})
37
38
  body = {}
38
39
  body[:status] = opts[:status] if opts[:status]
39
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
40
+ body[:parser_worker_count] = opts[:parsers] if opts[:parsers]
41
+ body[:fetcher_worker_count] = opts[:fetchers] if opts[:fetchers]
40
42
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
41
43
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
42
44
  body[:profile] = opts[:profile] if opts[:profile]
@@ -152,7 +152,7 @@ module Datahen
152
152
  @page_types = []
153
153
  @parsers = Concurrent::Hash.new
154
154
  @config = YAML.load_file(config_file)
155
- self.config['parsers'].each do |v|
155
+ (self.config['parsers'] || []).each do |v|
156
156
  next if !v['disabled'].nil? && !!v['disabled']
157
157
  @page_types << v['page_type']
158
158
  self.parsers[v['page_type']] = v['file']
@@ -5,6 +5,7 @@ module Datahen
5
5
  class Executor
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
+ FIND_OUTPUTS_RETRY_LIMIT = 0
8
9
 
9
10
  attr_accessor :filename, :page, :gid, :job_id
10
11
 
@@ -159,13 +160,18 @@ module Datahen
159
160
  options = {
160
161
  query: query,
161
162
  page: page,
162
- per_page: per_page}
163
+ per_page: per_page
164
+ }
163
165
 
164
166
  # Get job_id
165
167
  query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
166
168
 
169
+ # find outputs
170
+ retry_limit = opts.has_key?(:retry_limit) ? opts[:retry_limit] : self.class::FIND_OUTPUTS_RETRY_LIMIT
167
171
  client = Client::JobOutput.new(options)
168
- response = client.all(query_job_id, collection)
172
+ response = client.all(query_job_id, collection, {
173
+ retry_limit: retry_limit
174
+ })
169
175
 
170
176
  if response.code != 200
171
177
  raise "response_code: #{response.code}|#{response.parsed_response}"
@@ -304,6 +310,7 @@ module Datahen
304
310
  end
305
311
 
306
312
  # saving to server
313
+
307
314
  response = update_to_server(
308
315
  job_id: job_id,
309
316
  gid: gid,
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubyFinisherExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -14,6 +14,8 @@ module Datahen
14
14
  # @return [Boolean]
15
15
  attr_accessor :limbo_self
16
16
 
17
+ FIND_OUTPUTS_RETRY_LIMIT = 2
18
+
17
19
  def initialize(options={})
18
20
  @filename = options.fetch(:filename) { raise "Filename is required"}
19
21
  @page = options.fetch(:page) { nil }
@@ -3,6 +3,8 @@ module Datahen
3
3
  class RubySeederExecutor < Executor
4
4
  attr_accessor :save
5
5
 
6
+ FIND_OUTPUTS_RETRY_LIMIT = nil
7
+
6
8
  def initialize(options={})
7
9
  @filename = options.fetch(:filename) { raise "Filename is required"}
8
10
  @job_id = options[:job_id]
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.19.0"
2
+ VERSION = "1.0.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.19.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-09-17 00:00:00.000000000 Z
11
+ date: 2022-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -212,6 +212,7 @@ files:
212
212
  - lib/datahen.rb
213
213
  - lib/datahen/cli.rb
214
214
  - lib/datahen/cli/account.rb
215
+ - lib/datahen/cli/account_deploy_key.rb
215
216
  - lib/datahen/cli/env_var.rb
216
217
  - lib/datahen/cli/finisher.rb
217
218
  - lib/datahen/cli/global_page.rb
@@ -276,7 +277,7 @@ metadata:
276
277
  allowed_push_host: https://rubygems.org
277
278
  homepage_uri: https://datahen.com
278
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
279
- post_install_message:
280
+ post_install_message:
280
281
  rdoc_options: []
281
282
  require_paths:
282
283
  - lib
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
292
293
  version: '0'
293
294
  requirements: []
294
295
  rubygems_version: 3.0.3
295
- signing_key:
296
+ signing_key:
296
297
  specification_version: 4
297
298
  summary: DataHen toolbelt for developers
298
299
  test_files: []