datahen 0.16.2 → 0.20.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
4
- data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
3
+ metadata.gz: 11c43658f61faff627da448abff393b5cc658c15e1dfd6765c6cfdee96958d01
4
+ data.tar.gz: 5dc4481c9755d33dcee2539a24fe1ba2b0336fc4320beaa54204d1636067ee75
5
5
  SHA512:
6
- metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
7
- data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
6
+ metadata.gz: e25db9321dfb26cb88d38aa9adf353c9955c3932553fb87affe235bd58f0182e7043775d102ce2bb5e7d6d0764cd76bd71f3be024ced38eadd17c05cfa0efd69
7
+ data.tar.gz: b07e2655df12424db7f859da29f0921fa6417dfbfa6b72537615089d6fabac7451e3e61e23ba4f4e8cfff0c0555ef50bf751dd8b91b8be8ae51b23d855145586
@@ -11,6 +11,9 @@ module Datahen
11
11
  puts "#{client.profile()}"
12
12
  end
13
13
 
14
+ desc "deploy_key SUBCOMMAND ...ARGS", "manage deploy key"
15
+ subcommand "deploy_key", AccountDeployKey
16
+
14
17
  end
15
18
  end
16
19
 
@@ -0,0 +1,26 @@
1
+ module Datahen
2
+ class CLI < Thor
3
+ class AccountDeployKey < Thor
4
+ package_name "account deploy_key"
5
+ def self.banner(command, namespace = nil, subcommand = false)
6
+ "#{basename} #{@package_name} #{command.usage}"
7
+ end
8
+
9
+ desc "show", "Show public deploy key"
10
+ def show()
11
+ client = Client::DeployKey.new()
12
+ puts "#{client.find()}"
13
+ end
14
+
15
+ desc "recreate", "Recreate public deploy key"
16
+ long_desc <<-LONGDESC
17
+ Recreate public deploy key.
18
+ LONGDESC
19
+ def recreate()
20
+ client = Client::DeployKey.new()
21
+ puts "#{client.create()}"
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -8,6 +8,7 @@ module Datahen
8
8
  LONGDESC
9
9
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
10
10
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
11
+ option :status, :aliases => :s, type: :string, desc: 'Scraper status. Status can be: done, cancelled, paused, finishing.'
11
12
  def list
12
13
  client = Client::Scraper.new(options)
13
14
  puts "#{client.all}"
@@ -33,6 +34,8 @@ module Datahen
33
34
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
35
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
35
36
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
37
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
38
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
36
39
  def create(scraper_name, git_repository)
37
40
  # puts "options #{options}"
38
41
  client = Client::Scraper.new(options)
@@ -59,6 +62,8 @@ module Datahen
59
62
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
60
63
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
61
64
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
65
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
66
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
62
67
  def update(scraper_name)
63
68
  client = Client::Scraper.new(options)
64
69
  puts "#{client.update(scraper_name, options)}"
@@ -97,6 +102,7 @@ module Datahen
97
102
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
98
103
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
99
104
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
105
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
100
106
  def start(scraper_name)
101
107
  client = Client::ScraperJob.new(options)
102
108
  puts "Starting a scrape job..."
@@ -105,6 +105,8 @@ module Datahen
105
105
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
106
106
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
107
107
  option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
108
+ option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
109
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
108
110
  def update(scraper_name)
109
111
  if options[:job]
110
112
  client = Client::Job.new(options)
@@ -46,6 +46,7 @@ module Datahen
46
46
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
47
47
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
48
48
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
49
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
49
50
  def add(scraper_name, url)
50
51
  begin
51
52
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -80,6 +81,7 @@ module Datahen
80
81
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
81
82
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
82
83
  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
84
+ option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
83
85
  def update(scraper_name, gid)
84
86
  begin
85
87
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli.rb CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
16
16
  require 'datahen/cli/seeder'
17
17
  require 'datahen/cli/finisher'
18
18
  require 'datahen/cli/env_var'
19
+ require 'datahen/cli/account_deploy_key'
19
20
  require 'datahen/cli/account'
20
21
 
21
-
22
-
23
22
  module Datahen
24
23
  class CLI < Thor
25
24
  desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
@@ -22,6 +22,8 @@ module Datahen
22
22
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
23
23
  body[:profile] = opts[:profile] if opts[:profile]
24
24
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
25
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
26
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
25
27
  params = @options.merge({body: body.to_json})
26
28
 
27
29
  self.class.put("/jobs/#{job_id}", params)
@@ -16,6 +16,8 @@ module Datahen
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
18
  body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
19
21
 
20
22
  params = @options.merge({body: body.to_json})
21
23
 
@@ -38,6 +40,8 @@ module Datahen
38
40
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
39
41
  body[:cookie] = opts[:cookie] if opts[:cookie]
40
42
  body[:max_size] = opts[:max_size] if opts[:max_size]
43
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
44
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
41
45
 
42
46
  params = @options.merge({body: body.to_json})
43
47
 
@@ -29,6 +29,8 @@ module Datahen
29
29
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
30
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
31
31
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
32
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
33
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
32
34
  params = @options.merge({body: body.to_json})
33
35
  self.class.post("/scrapers", params)
34
36
  end
@@ -51,6 +53,8 @@ module Datahen
51
53
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
52
54
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
53
55
  body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
56
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
57
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
54
58
  params = @options.merge({body: body.to_json})
55
59
 
56
60
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -12,6 +12,8 @@ module Datahen
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
14
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
15
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
16
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
15
17
  if opts[:vars]
16
18
  if opts[:vars].is_a?(Array)
17
19
  body[:vars] = opts[:vars]
@@ -39,6 +41,8 @@ module Datahen
39
41
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
40
42
  body[:profile] = opts[:profile] if opts[:profile]
41
43
  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
44
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
45
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
42
46
  params = @options.merge({body: body.to_json})
43
47
 
44
48
  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -16,6 +16,8 @@ module Datahen
16
16
  body[:priority] = opts[:priority] if opts[:priority]
17
17
  body[:vars] = opts[:vars] if opts[:vars]
18
18
  body[:max_size] = opts[:max_size] if opts[:max_size]
19
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
20
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
19
21
 
20
22
  params = @options.merge({body: body.to_json})
21
23
 
@@ -61,6 +63,8 @@ module Datahen
61
63
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
62
64
  body[:cookie] = opts[:cookie] if opts[:cookie]
63
65
  body[:max_size] = opts[:max_size] if opts[:max_size]
66
+ body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
67
+ body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
64
68
 
65
69
  params = @options.merge({body: body.to_json})
66
70
 
@@ -8,7 +8,11 @@ module Datahen
8
8
  attr_accessor :refetch_self
9
9
  # Reparse self page flag.
10
10
  # @return [Boollean]
11
+ # @note It is stronger than #limbo_self flag.
11
12
  attr_accessor :reparse_self
13
+ # Limbo self page flag.
14
+ # @return [Boollean]
15
+ attr_accessor :limbo_self
12
16
 
13
17
  def initialize(options={})
14
18
  @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
31
35
  :find_output,
32
36
  :find_outputs,
33
37
  :refetch,
34
- :reparse
38
+ :reparse,
39
+ :limbo
35
40
  ].freeze
36
41
  end
37
42
 
@@ -104,7 +109,7 @@ module Datahen
104
109
 
105
110
  def refetch_page gid
106
111
  if save
107
- Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
112
+ Client::JobPage.new({gid: gid}).refetch(self.job_id)
108
113
  puts "Refetch page #{gid}"
109
114
  else
110
115
  puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
122
127
 
123
128
  def reparse_page gid
124
129
  if save
125
- Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
130
+ Client::JobPage.new({gid: gid}).reparse(self.job_id)
126
131
  puts "Reparse page #{gid}"
127
132
  else
128
133
  puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
138
143
  reparse_page page_gid
139
144
  end
140
145
 
146
+ def limbo_page gid
147
+ if save
148
+ Client::JobPage.new({gid: gid}).limbo(self.job_id)
149
+ puts "Limbo page #{gid}"
150
+ else
151
+ puts "Would have limbo page #{gid}"
152
+ end
153
+ end
154
+
155
+ def limbo page_gid
156
+ raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
157
+ if page_gid == gid
158
+ self.limbo_self = true
159
+ raise Error::SafeTerminateError
160
+ end
161
+ limbo_page page_gid
162
+ end
163
+
141
164
  def eval_parser_script(save=false)
142
165
  update_parsing_starting_status
143
166
 
@@ -148,6 +171,7 @@ module Datahen
148
171
  page = init_page_vars(page)
149
172
  self.refetch_self = false
150
173
  self.reparse_self = false
174
+ self.limbo_self = false
151
175
 
152
176
  begin
153
177
  context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
178
202
  refetch_page gid
179
203
  elsif reparse_self
180
204
  reparse_page gid
205
+ elsif limbo_self
206
+ limbo_page gid
181
207
  else
182
208
  update_parsing_done_status
183
209
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.16.2"
2
+ VERSION = "0.20.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.2
4
+ version: 0.20.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-11 00:00:00.000000000 Z
11
+ date: 2021-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -212,6 +212,7 @@ files:
212
212
  - lib/datahen.rb
213
213
  - lib/datahen/cli.rb
214
214
  - lib/datahen/cli/account.rb
215
+ - lib/datahen/cli/account_deploy_key.rb
215
216
  - lib/datahen/cli/env_var.rb
216
217
  - lib/datahen/cli/finisher.rb
217
218
  - lib/datahen/cli/global_page.rb
@@ -276,7 +277,7 @@ metadata:
276
277
  allowed_push_host: https://rubygems.org
277
278
  homepage_uri: https://datahen.com
278
279
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
279
- post_install_message:
280
+ post_install_message:
280
281
  rdoc_options: []
281
282
  require_paths:
282
283
  - lib
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
292
293
  version: '0'
293
294
  requirements: []
294
295
  rubygems_version: 3.0.3
295
- signing_key:
296
+ signing_key:
296
297
  specification_version: 4
297
298
  summary: DataHen toolbelt for developers
298
299
  test_files: []