datahen 0.16.2 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
4
- data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
3
+ metadata.gz: 11c43658f61faff627da448abff393b5cc658c15e1dfd6765c6cfdee96958d01
4
+ data.tar.gz: 5dc4481c9755d33dcee2539a24fe1ba2b0336fc4320beaa54204d1636067ee75
5
5
  SHA512:
6
- metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
7
- data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
6
+ metadata.gz: e25db9321dfb26cb88d38aa9adf353c9955c3932553fb87affe235bd58f0182e7043775d102ce2bb5e7d6d0764cd76bd71f3be024ced38eadd17c05cfa0efd69
7
+ data.tar.gz: b07e2655df12424db7f859da29f0921fa6417dfbfa6b72537615089d6fabac7451e3e61e23ba4f4e8cfff0c0555ef50bf751dd8b91b8be8ae51b23d855145586
@@ -11,6 +11,9 @@ module Datahen
11
11
  puts "#{client.profile()}"
12
12
  end
13
13
 
14
+ desc "deploy_key SUBCOMMAND ...ARGS", "manage deploy key"
15
+ subcommand "deploy_key", AccountDeployKey
16
+
14
17
  end
15
18
  end
16
19
 
@@ -0,0 +1,26 @@
1
+ module Datahen
2
+ class CLI < Thor
3
+ class AccountDeployKey < Thor
4
+ package_name "account deploy_key"
5
+ def self.banner(command, namespace = nil, subcommand = false)
6
+ "#{basename} #{@package_name} #{command.usage}"
7
+ end
8
+
9
+ desc "show", "Show public deploy key"
10
+ def show()
11
+ client = Client::DeployKey.new()
12
+ puts "#{client.find()}"
13
+ end
14
+
15
+ desc "recreate", "Recreate public deploy key"
16
+ long_desc <<-LONGDESC
17
+ Recreate public deploy key.
18
+ LONGDESC
19
+ def recreate()
20
+ client = Client::DeployKey.new()
21
+ puts "#{client.create()}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -8,6 +8,7 @@ module Datahen
8
8
  LONGDESC
9
9
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
10
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
+ option :status, :aliases => :s, type: :string, desc: 'Scraper status. Status can be: done, cancelled, paused, finishing.'
11
12
  def list
12
13
  client = Client::Scraper.new(options)
13
14
  puts "#{client.all}"
@@ -33,6 +34,8 @@ module Datahen
33
34
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
35
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
35
36
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
37
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
38
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
36
39
  def create(scraper_name, git_repository)
37
40
  # puts "options #{options}"
38
41
  client = Client::Scraper.new(options)
@@ -59,6 +62,8 @@ module Datahen
59
62
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
60
63
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
61
64
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
65
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
66
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
62
67
  def update(scraper_name)
63
68
  client = Client::Scraper.new(options)
64
69
  puts "#{client.update(scraper_name, options)}"
@@ -97,6 +102,7 @@ module Datahen
97
102
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
98
103
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
99
104
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
105
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
100
106
  def start(scraper_name)
101
107
  client = Client::ScraperJob.new(options)
102
108
  puts "Starting a scrape job..."
@@ -105,6 +105,8 @@ module Datahen
105
105
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
106
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
108
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
109
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
108
110
  def update(scraper_name)
109
111
  if options[:job]
110
112
  client = Client::Job.new(options)
@@ -46,6 +46,7 @@ module Datahen
46
46
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
47
47
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
48
48
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
49
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
49
50
  def add(scraper_name, url)
50
51
  begin
51
52
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -80,6 +81,7 @@ module Datahen
80
81
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
81
82
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
82
83
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
84
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
83
85
  def update(scraper_name, gid)
84
86
  begin
85
87
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli.rb CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
16
16
  require 'datahen/cli/seeder'
17
17
  require 'datahen/cli/finisher'
18
18
  require 'datahen/cli/env_var'
19
+ require 'datahen/cli/account_deploy_key'
19
20
  require 'datahen/cli/account'
20
21
 
21
-
22
-
23
22
  module Datahen
24
23
  class CLI < Thor
25
24
  desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
@@ -22,6 +22,8 @@ module Datahen
22
22
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
23
  body[:profile] = opts[:profile] if opts[:profile]
24
24
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
25
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
26
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
25
27
  params = @options.merge({body: body.to_json})
26
28
 
27
29
  self.class.put("/jobs/#{job_id}", params)
@@ -16,6 +16,8 @@ module Datahen
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
18
  body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
19
21
 
20
22
  params = @options.merge({body: body.to_json})
21
23
 
@@ -38,6 +40,8 @@ module Datahen
38
40
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
39
41
  body[:cookie] = opts[:cookie] if opts[:cookie]
40
42
  body[:max_size] = opts[:max_size] if opts[:max_size]
43
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
44
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
41
45
 
42
46
  params = @options.merge({body: body.to_json})
43
47
 
@@ -29,6 +29,8 @@ module Datahen
29
29
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
30
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
31
31
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
32
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
33
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
32
34
  params = @options.merge({body: body.to_json})
33
35
  self.class.post("/scrapers", params)
34
36
  end
@@ -51,6 +53,8 @@ module Datahen
51
53
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
52
54
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
53
55
  body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
56
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
57
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
54
58
  params = @options.merge({body: body.to_json})
55
59
 
56
60
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -12,6 +12,8 @@ module Datahen
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
14
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
15
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
16
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
15
17
  if opts[:vars]
16
18
  if opts[:vars].is_a?(Array)
17
19
  body[:vars] = opts[:vars]
@@ -39,6 +41,8 @@ module Datahen
39
41
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
40
42
  body[:profile] = opts[:profile] if opts[:profile]
41
43
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
44
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
45
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
42
46
  params = @options.merge({body: body.to_json})
43
47
 
44
48
  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -16,6 +16,8 @@ module Datahen
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
18
  body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
19
21
 
20
22
  params = @options.merge({body: body.to_json})
21
23
 
@@ -61,6 +63,8 @@ module Datahen
61
63
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
62
64
  body[:cookie] = opts[:cookie] if opts[:cookie]
63
65
  body[:max_size] = opts[:max_size] if opts[:max_size]
66
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
67
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
64
68
 
65
69
  params = @options.merge({body: body.to_json})
66
70
 
@@ -8,7 +8,11 @@ module Datahen
8
8
  attr_accessor :refetch_self
9
9
  # Reparse self page flag.
10
10
  # @return [Boollean]
11
+ # @note It is stronger than #limbo_self flag.
11
12
  attr_accessor :reparse_self
13
+ # Limbo self page flag.
14
+ # @return [Boollean]
15
+ attr_accessor :limbo_self
12
16
 
13
17
  def initialize(options={})
14
18
  @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
31
35
  :find_output,
32
36
  :find_outputs,
33
37
  :refetch,
34
- :reparse
38
+ :reparse,
39
+ :limbo
35
40
  ].freeze
36
41
  end
37
42
 
@@ -104,7 +109,7 @@ module Datahen
104
109
 
105
110
  def refetch_page gid
106
111
  if save
107
- Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
112
+ Client::JobPage.new({gid: gid}).refetch(self.job_id)
108
113
  puts "Refetch page #{gid}"
109
114
  else
110
115
  puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
122
127
 
123
128
  def reparse_page gid
124
129
  if save
125
- Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
130
+ Client::JobPage.new({gid: gid}).reparse(self.job_id)
126
131
  puts "Reparse page #{gid}"
127
132
  else
128
133
  puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
138
143
  reparse_page page_gid
139
144
  end
140
145
 
146
+ def limbo_page gid
147
+ if save
148
+ Client::JobPage.new({gid: gid}).limbo(self.job_id)
149
+ puts "Limbo page #{gid}"
150
+ else
151
+ puts "Would have limbo page #{gid}"
152
+ end
153
+ end
154
+
155
+ def limbo page_gid
156
+ raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
157
+ if page_gid == gid
158
+ self.limbo_self = true
159
+ raise Error::SafeTerminateError
160
+ end
161
+ limbo_page page_gid
162
+ end
163
+
141
164
  def eval_parser_script(save=false)
142
165
  update_parsing_starting_status
143
166
 
@@ -148,6 +171,7 @@ module Datahen
148
171
  page = init_page_vars(page)
149
172
  self.refetch_self = false
150
173
  self.reparse_self = false
174
+ self.limbo_self = false
151
175
 
152
176
  begin
153
177
  context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
178
202
  refetch_page gid
179
203
  elsif reparse_self
180
204
  reparse_page gid
205
+ elsif limbo_self
206
+ limbo_page gid
181
207
  else
182
208
  update_parsing_done_status
183
209
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.16.2"
2
+ VERSION = "0.20.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.2
4
+ version: 0.20.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-11 00:00:00.000000000 Z
11
+ date: 2021-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -212,6 +212,7 @@ files:
212
212
  - lib/datahen.rb
213
213
  - lib/datahen/cli.rb
214
214
  - lib/datahen/cli/account.rb
215
+ - lib/datahen/cli/account_deploy_key.rb
215
216
  - lib/datahen/cli/env_var.rb
216
217
  - lib/datahen/cli/finisher.rb
217
218
  - lib/datahen/cli/global_page.rb
@@ -276,7 +277,7 @@ metadata:
276
277
  allowed_push_host: https://rubygems.org
277
278
  homepage_uri: https://datahen.com
278
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
279
- post_install_message:
280
+ post_install_message:
280
281
  rdoc_options: []
281
282
  require_paths:
282
283
  - lib
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
292
293
  version: '0'
293
294
  requirements: []
294
295
  rubygems_version: 3.0.3
295
- signing_key:
296
+ signing_key:
296
297
  specification_version: 4
297
298
  summary: DataHen toolbelt for developers
298
299
  test_files: []