datahen 0.16.2 → 0.20.0
This diff covers publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/lib/datahen/cli/account.rb +3 -0
- data/lib/datahen/cli/account_deploy_key.rb +26 -0
- data/lib/datahen/cli/scraper.rb +6 -0
- data/lib/datahen/cli/scraper_job.rb +2 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/cli.rb +1 -2
- data/lib/datahen/client/job.rb +2 -0
- data/lib/datahen/client/job_page.rb +4 -0
- data/lib/datahen/client/scraper.rb +4 -0
- data/lib/datahen/client/scraper_job.rb +4 -0
- data/lib/datahen/client/scraper_job_page.rb +4 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +29 -3
- data/lib/datahen/version.rb +1 -1
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 11c43658f61faff627da448abff393b5cc658c15e1dfd6765c6cfdee96958d01
+  data.tar.gz: 5dc4481c9755d33dcee2539a24fe1ba2b0336fc4320beaa54204d1636067ee75
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e25db9321dfb26cb88d38aa9adf353c9955c3932553fb87affe235bd58f0182e7043775d102ce2bb5e7d6d0764cd76bd71f3be024ced38eadd17c05cfa0efd69
+  data.tar.gz: b07e2655df12424db7f859da29f0921fa6417dfbfa6b72537615089d6fabac7451e3e61e23ba4f4e8cfff0c0555ef50bf751dd8b91b8be8ae51b23d855145586
data/lib/datahen/cli/account.rb
CHANGED
data/lib/datahen/cli/account_deploy_key.rb
ADDED
@@ -0,0 +1,26 @@
+module Datahen
+  class CLI < Thor
+    class AccountDeployKey < Thor
+      package_name "account deploy_key"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "show", "Show public deploy key"
+      def show()
+        client = Client::DeployKey.new()
+        puts "#{client.find()}"
+      end
+
+      desc "recreate", "Recreate public deploy key"
+      long_desc <<-LONGDESC
+        Recreate public deploy key.
+      LONGDESC
+      def recreate()
+        client = Client::DeployKey.new()
+        puts "#{client.create()}"
+      end
+    end
+  end
+
+end
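For reference, both new commands are thin wrappers around the existing deploy-key client, so the equivalent direct calls look roughly like the sketch below (assuming the gem's API token is already configured the way the CLI expects):

require 'datahen'

key_client = Datahen::Client::DeployKey.new
puts key_client.find    # what `datahen account deploy_key show` prints
# key_client.create     # what `datahen account deploy_key recreate` calls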
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -8,6 +8,7 @@ module Datahen
 LONGDESC
 option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
 option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+option :status, :aliases => :s, type: :string, desc: 'Scraper status. Status can be: done, cancelled, paused, finishing.'
 def list
 client = Client::Scraper.new(options)
 puts "#{client.all}"
@@ -33,6 +34,8 @@ module Datahen
 option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
 option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
 option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def create(scraper_name, git_repository)
 # puts "options #{options}"
 client = Client::Scraper.new(options)
@@ -59,6 +62,8 @@ module Datahen
 option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
 option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
 option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def update(scraper_name)
 client = Client::Scraper.new(options)
 puts "#{client.update(scraper_name, options)}"
@@ -97,6 +102,7 @@ module Datahen
 option :proxy_type, desc: 'Set the Proxy type. Default: standard'
 option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
 option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def start(scraper_name)
 client = Client::ScraperJob.new(options)
 puts "Starting a scrape job..."
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -105,6 +105,8 @@ module Datahen
 option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
 option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
 option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def update(scraper_name)
 if options[:job]
 client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -46,6 +46,7 @@ module Datahen
 option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
 option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
 option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def add(scraper_name, url)
 begin
 options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -80,6 +81,7 @@ module Datahen
 option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
 option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
 option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
 def update(scraper_name, gid)
 begin
 options[:vars] = JSON.parse(options[:vars]) if options[:vars]
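Each of the new CLI flags above is passed through unchanged: the Thor options hash goes straight into the corresponding client call. A rough sketch of that path for `scraper update`, with made-up values and a placeholder scraper name (the real CLI hands over Thor's indifferent-access options hash rather than a plain Hash):

require 'datahen'

# Hypothetical option values; the CLI would populate these from the new flags.
options = { enable_global_cache: true, retry_interval: 60 }

client = Datahen::Client::Scraper.new(options)
client.update("my-scraper", options)   # "my-scraper" is a placeholder name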
data/lib/datahen/cli.rb
CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
 require 'datahen/cli/finisher'
 require 'datahen/cli/env_var'
+require 'datahen/cli/account_deploy_key'
 require 'datahen/cli/account'
 
-
-
 module Datahen
 class CLI < Thor
 desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
data/lib/datahen/client/job.rb
CHANGED
@@ -22,6 +22,8 @@ module Datahen
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
 body[:profile] = opts[:profile] if opts[:profile]
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 params = @options.merge({body: body.to_json})
 
 self.class.put("/jobs/#{job_id}", params)
data/lib/datahen/client/job_page.rb
CHANGED
@@ -16,6 +16,8 @@ module Datahen
 body[:priority] = opts[:priority] if opts[:priority]
 body[:vars] = opts[:vars] if opts[:vars]
 body[:max_size] = opts[:max_size] if opts[:max_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
 params = @options.merge({body: body.to_json})
 
@@ -38,6 +40,8 @@ module Datahen
 body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
 body[:cookie] = opts[:cookie] if opts[:cookie]
 body[:max_size] = opts[:max_size] if opts[:max_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
 params = @options.merge({body: body.to_json})
 
data/lib/datahen/client/scraper.rb
CHANGED
@@ -29,6 +29,8 @@ module Datahen
 body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
 body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 params = @options.merge({body: body.to_json})
 self.class.post("/scrapers", params)
 end
@@ -51,6 +53,8 @@ module Datahen
 body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
 body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
 body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 params = @options.merge({body: body.to_json})
 
 self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -12,6 +12,8 @@ module Datahen
 body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 if opts[:vars]
 if opts[:vars].is_a?(Array)
 body[:vars] = opts[:vars]
@@ -39,6 +41,8 @@ module Datahen
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
 body[:profile] = opts[:profile] if opts[:profile]
 body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 params = @options.merge({body: body.to_json})
 
 self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -16,6 +16,8 @@ module Datahen
 body[:priority] = opts[:priority] if opts[:priority]
 body[:vars] = opts[:vars] if opts[:vars]
 body[:max_size] = opts[:max_size] if opts[:max_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
 params = @options.merge({body: body.to_json})
 
@@ -61,6 +63,8 @@ module Datahen
 body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
 body[:cookie] = opts[:cookie] if opts[:cookie]
 body[:max_size] = opts[:max_size] if opts[:max_size]
+body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
 params = @options.merge({body: body.to_json})
 
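The same pair of lines is added to every client method above, and the two guards differ on purpose: `enable_global_cache` is boolean, so the client checks `has_key?` to let an explicit `false` reach the API, while `retry_interval` uses the usual truthiness check and is simply omitted when absent. A small standalone sketch of what those guards admit (option values are made up):

require 'json'

opts = { enable_global_cache: false, retry_interval: 30 }   # hypothetical values

body = {}
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]

puts body.to_json   # => {"enable_global_cache":false,"retry_interval":30}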
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -8,7 +8,11 @@ module Datahen
 attr_accessor :refetch_self
 # Reparse self page flag.
 # @return [Boollean]
+# @note It is stronger than #limbo_self flag.
 attr_accessor :reparse_self
+# Limbo self page flag.
+# @return [Boollean]
+attr_accessor :limbo_self
 
 def initialize(options={})
 @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
 :find_output,
 :find_outputs,
 :refetch,
-:reparse
+:reparse,
+:limbo
 ].freeze
 end
 
@@ -104,7 +109,7 @@ module Datahen
 
 def refetch_page gid
 if save
-Client::
+Client::JobPage.new({gid: gid}).refetch(self.job_id)
 puts "Refetch page #{gid}"
 else
 puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
 
 def reparse_page gid
 if save
-Client::
+Client::JobPage.new({gid: gid}).reparse(self.job_id)
 puts "Reparse page #{gid}"
 else
 puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
 reparse_page page_gid
 end
 
+def limbo_page gid
+if save
+Client::JobPage.new({gid: gid}).limbo(self.job_id)
+puts "Limbo page #{gid}"
+else
+puts "Would have limbo page #{gid}"
+end
+end
+
+def limbo page_gid
+raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
+if page_gid == gid
+self.limbo_self = true
+raise Error::SafeTerminateError
+end
+limbo_page page_gid
+end
+
 def eval_parser_script(save=false)
 update_parsing_starting_status
 
@@ -148,6 +171,7 @@ module Datahen
 page = init_page_vars(page)
 self.refetch_self = false
 self.reparse_self = false
+self.limbo_self = false
 
 begin
 context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
 refetch_page gid
 elsif reparse_self
 reparse_page gid
+elsif limbo_self
+limbo_page gid
 else
 update_parsing_done_status
 end
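With this change `limbo` joins `refetch` and `reparse` in the executor's list of exposed methods, so parser scripts can call it directly. A hypothetical parser-script fragment (`page['gid']` is the usual page variable; `some_other_gid` is a placeholder):

limbo page['gid']       # current page: sets limbo_self and raises Error::SafeTerminateError so the script stops cleanly
# limbo some_other_gid  # any other page GID (a String): sent to limbo right away via limbo_page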
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-version: 0.
+version: 0.20.0
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: thor
@@ -212,6 +212,7 @@ files:
 - lib/datahen.rb
 - lib/datahen/cli.rb
 - lib/datahen/cli/account.rb
+- lib/datahen/cli/account_deploy_key.rb
 - lib/datahen/cli/env_var.rb
 - lib/datahen/cli/finisher.rb
 - lib/datahen/cli/global_page.rb
@@ -276,7 +277,7 @@ metadata:
 allowed_push_host: https://rubygems.org
 homepage_uri: https://datahen.com
 source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 version: '0'
 requirements: []
 rubygems_version: 3.0.3
-signing_key:
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []