datahen 0.16.2 → 0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/account.rb +3 -0
- data/lib/datahen/cli/account_deploy_key.rb +26 -0
- data/lib/datahen/cli/scraper.rb +6 -0
- data/lib/datahen/cli/scraper_job.rb +2 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/cli.rb +1 -2
- data/lib/datahen/client/job.rb +2 -0
- data/lib/datahen/client/job_page.rb +4 -0
- data/lib/datahen/client/scraper.rb +4 -0
- data/lib/datahen/client/scraper_job.rb +4 -0
- data/lib/datahen/client/scraper_job_page.rb +4 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +29 -3
- data/lib/datahen/version.rb +1 -1
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 11c43658f61faff627da448abff393b5cc658c15e1dfd6765c6cfdee96958d01
|
4
|
+
data.tar.gz: 5dc4481c9755d33dcee2539a24fe1ba2b0336fc4320beaa54204d1636067ee75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e25db9321dfb26cb88d38aa9adf353c9955c3932553fb87affe235bd58f0182e7043775d102ce2bb5e7d6d0764cd76bd71f3be024ced38eadd17c05cfa0efd69
|
7
|
+
data.tar.gz: b07e2655df12424db7f859da29f0921fa6417dfbfa6b72537615089d6fabac7451e3e61e23ba4f4e8cfff0c0555ef50bf751dd8b91b8be8ae51b23d855145586
|
data/lib/datahen/cli/account.rb
CHANGED
data/lib/datahen/cli/account_deploy_key.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
module Datahen
|
2
|
+
class CLI < Thor
|
3
|
+
class AccountDeployKey < Thor
|
4
|
+
package_name "account deploy_key"
|
5
|
+
def self.banner(command, namespace = nil, subcommand = false)
|
6
|
+
"#{basename} #{@package_name} #{command.usage}"
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "show", "Show public deploy key"
|
10
|
+
def show()
|
11
|
+
client = Client::DeployKey.new()
|
12
|
+
puts "#{client.find()}"
|
13
|
+
end
|
14
|
+
|
15
|
+
desc "recreate", "Recreate public deploy key"
|
16
|
+
long_desc <<-LONGDESC
|
17
|
+
Recreate public deploy key.
|
18
|
+
LONGDESC
|
19
|
+
def recreate()
|
20
|
+
client = Client::DeployKey.new()
|
21
|
+
puts "#{client.create()}"
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -8,6 +8,7 @@ module Datahen
|
|
8
8
|
LONGDESC
|
9
9
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
10
10
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
11
|
+
option :status, :aliases => :s, type: :string, desc: 'Scraper status. Status can be: done, cancelled, paused, finishing.'
|
11
12
|
def list
|
12
13
|
client = Client::Scraper.new(options)
|
13
14
|
puts "#{client.all}"
|
@@ -33,6 +34,8 @@ module Datahen
|
|
33
34
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
34
35
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
35
36
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
37
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
38
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
36
39
|
def create(scraper_name, git_repository)
|
37
40
|
# puts "options #{options}"
|
38
41
|
client = Client::Scraper.new(options)
|
@@ -59,6 +62,8 @@ module Datahen
|
|
59
62
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
60
63
|
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
61
64
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
65
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
66
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
62
67
|
def update(scraper_name)
|
63
68
|
client = Client::Scraper.new(options)
|
64
69
|
puts "#{client.update(scraper_name, options)}"
|
@@ -97,6 +102,7 @@ module Datahen
|
|
97
102
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
98
103
|
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
99
104
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
105
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
100
106
|
def start(scraper_name)
|
101
107
|
client = Client::ScraperJob.new(options)
|
102
108
|
puts "Starting a scrape job..."
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -105,6 +105,8 @@ module Datahen
|
|
105
105
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
106
106
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
107
107
|
option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
108
|
+
option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
|
109
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
108
110
|
def update(scraper_name)
|
109
111
|
if options[:job]
|
110
112
|
client = Client::Job.new(options)
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -46,6 +46,7 @@ module Datahen
|
|
46
46
|
option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
|
47
47
|
option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
|
48
48
|
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
49
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
49
50
|
def add(scraper_name, url)
|
50
51
|
begin
|
51
52
|
options[:headers] = JSON.parse(options[:headers]) if options[:headers]
|
@@ -80,6 +81,7 @@ module Datahen
|
|
80
81
|
option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
|
81
82
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
82
83
|
option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
|
84
|
+
option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
|
83
85
|
def update(scraper_name, gid)
|
84
86
|
begin
|
85
87
|
options[:vars] = JSON.parse(options[:vars]) if options[:vars]
|
data/lib/datahen/cli.rb
CHANGED
@@ -16,10 +16,9 @@ require 'datahen/cli/parser'
|
|
16
16
|
require 'datahen/cli/seeder'
|
17
17
|
require 'datahen/cli/finisher'
|
18
18
|
require 'datahen/cli/env_var'
|
19
|
+
require 'datahen/cli/account_deploy_key'
|
19
20
|
require 'datahen/cli/account'
|
20
21
|
|
21
|
-
|
22
|
-
|
23
22
|
module Datahen
|
24
23
|
class CLI < Thor
|
25
24
|
desc "scraper SUBCOMMAND ...ARGS", "manage scrapers"
|
data/lib/datahen/client/job.rb
CHANGED
@@ -22,6 +22,8 @@ module Datahen
|
|
22
22
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
23
23
|
body[:profile] = opts[:profile] if opts[:profile]
|
24
24
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
25
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
26
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
25
27
|
params = @options.merge({body: body.to_json})
|
26
28
|
|
27
29
|
self.class.put("/jobs/#{job_id}", params)
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -16,6 +16,8 @@ module Datahen
|
|
16
16
|
body[:priority] = opts[:priority] if opts[:priority]
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
20
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
19
21
|
|
20
22
|
params = @options.merge({body: body.to_json})
|
21
23
|
|
@@ -38,6 +40,8 @@ module Datahen
|
|
38
40
|
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
39
41
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
40
42
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
43
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
44
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
41
45
|
|
42
46
|
params = @options.merge({body: body.to_json})
|
43
47
|
|
data/lib/datahen/client/scraper.rb
CHANGED
@@ -29,6 +29,8 @@ module Datahen
|
|
29
29
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
|
30
30
|
body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
|
31
31
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
32
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
33
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
32
34
|
params = @options.merge({body: body.to_json})
|
33
35
|
self.class.post("/scrapers", params)
|
34
36
|
end
|
@@ -51,6 +53,8 @@ module Datahen
|
|
51
53
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
|
52
54
|
body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
|
53
55
|
body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
|
56
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
57
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
54
58
|
params = @options.merge({body: body.to_json})
|
55
59
|
|
56
60
|
self.class.put("/scrapers/#{scraper_name}", params)
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -12,6 +12,8 @@ module Datahen
|
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
14
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
15
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
16
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
15
17
|
if opts[:vars]
|
16
18
|
if opts[:vars].is_a?(Array)
|
17
19
|
body[:vars] = opts[:vars]
|
@@ -39,6 +41,8 @@ module Datahen
|
|
39
41
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
40
42
|
body[:profile] = opts[:profile] if opts[:profile]
|
41
43
|
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
|
44
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
45
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
42
46
|
params = @options.merge({body: body.to_json})
|
43
47
|
|
44
48
|
self.class.put("/scrapers/#{scraper_name}/current_job", params)
|
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -16,6 +16,8 @@ module Datahen
|
|
16
16
|
body[:priority] = opts[:priority] if opts[:priority]
|
17
17
|
body[:vars] = opts[:vars] if opts[:vars]
|
18
18
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
19
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
20
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
19
21
|
|
20
22
|
params = @options.merge({body: body.to_json})
|
21
23
|
|
@@ -61,6 +63,8 @@ module Datahen
|
|
61
63
|
body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
|
62
64
|
body[:cookie] = opts[:cookie] if opts[:cookie]
|
63
65
|
body[:max_size] = opts[:max_size] if opts[:max_size]
|
66
|
+
body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
|
67
|
+
body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
|
64
68
|
|
65
69
|
params = @options.merge({body: body.to_json})
|
66
70
|
|
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -8,7 +8,11 @@ module Datahen
|
|
8
8
|
attr_accessor :refetch_self
|
9
9
|
# Reparse self page flag.
|
10
10
|
# @return [Boolean]
|
11
|
+
# @note It is stronger than #limbo_self flag.
|
11
12
|
attr_accessor :reparse_self
|
13
|
+
# Limbo self page flag.
|
14
|
+
# @return [Boolean]
|
15
|
+
attr_accessor :limbo_self
|
12
16
|
|
13
17
|
def initialize(options={})
|
14
18
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
@@ -31,7 +35,8 @@ module Datahen
|
|
31
35
|
:find_output,
|
32
36
|
:find_outputs,
|
33
37
|
:refetch,
|
34
|
-
:reparse
|
38
|
+
:reparse,
|
39
|
+
:limbo
|
35
40
|
].freeze
|
36
41
|
end
|
37
42
|
|
@@ -104,7 +109,7 @@ module Datahen
|
|
104
109
|
|
105
110
|
def refetch_page gid
|
106
111
|
if save
|
107
|
-
Client::
|
112
|
+
Client::JobPage.new({gid: gid}).refetch(self.job_id)
|
108
113
|
puts "Refetch page #{gid}"
|
109
114
|
else
|
110
115
|
puts "Would have refetch page #{gid}"
|
@@ -122,7 +127,7 @@ module Datahen
|
|
122
127
|
|
123
128
|
def reparse_page gid
|
124
129
|
if save
|
125
|
-
Client::
|
130
|
+
Client::JobPage.new({gid: gid}).reparse(self.job_id)
|
126
131
|
puts "Reparse page #{gid}"
|
127
132
|
else
|
128
133
|
puts "Would have reparse page #{gid}"
|
@@ -138,6 +143,24 @@ module Datahen
|
|
138
143
|
reparse_page page_gid
|
139
144
|
end
|
140
145
|
|
146
|
+
def limbo_page gid
|
147
|
+
if save
|
148
|
+
Client::JobPage.new({gid: gid}).limbo(self.job_id)
|
149
|
+
puts "Limbo page #{gid}"
|
150
|
+
else
|
151
|
+
puts "Would have limbo page #{gid}"
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
def limbo page_gid
|
156
|
+
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
157
|
+
if page_gid == gid
|
158
|
+
self.limbo_self = true
|
159
|
+
raise Error::SafeTerminateError
|
160
|
+
end
|
161
|
+
limbo_page page_gid
|
162
|
+
end
|
163
|
+
|
141
164
|
def eval_parser_script(save=false)
|
142
165
|
update_parsing_starting_status
|
143
166
|
|
@@ -148,6 +171,7 @@ module Datahen
|
|
148
171
|
page = init_page_vars(page)
|
149
172
|
self.refetch_self = false
|
150
173
|
self.reparse_self = false
|
174
|
+
self.limbo_self = false
|
151
175
|
|
152
176
|
begin
|
153
177
|
context = isolated_binding({
|
@@ -178,6 +202,8 @@ module Datahen
|
|
178
202
|
refetch_page gid
|
179
203
|
elsif reparse_self
|
180
204
|
reparse_page gid
|
205
|
+
elsif limbo_self
|
206
|
+
limbo_page gid
|
181
207
|
else
|
182
208
|
update_parsing_done_status
|
183
209
|
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.16.2
|
4
|
+
version: 0.20.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -212,6 +212,7 @@ files:
|
|
212
212
|
- lib/datahen.rb
|
213
213
|
- lib/datahen/cli.rb
|
214
214
|
- lib/datahen/cli/account.rb
|
215
|
+
- lib/datahen/cli/account_deploy_key.rb
|
215
216
|
- lib/datahen/cli/env_var.rb
|
216
217
|
- lib/datahen/cli/finisher.rb
|
217
218
|
- lib/datahen/cli/global_page.rb
|
@@ -276,7 +277,7 @@ metadata:
|
|
276
277
|
allowed_push_host: https://rubygems.org
|
277
278
|
homepage_uri: https://datahen.com
|
278
279
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
279
|
-
post_install_message:
|
280
|
+
post_install_message:
|
280
281
|
rdoc_options: []
|
281
282
|
require_paths:
|
282
283
|
- lib
|
@@ -292,7 +293,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
292
293
|
version: '0'
|
293
294
|
requirements: []
|
294
295
|
rubygems_version: 3.0.3
|
295
|
-
signing_key:
|
296
|
+
signing_key:
|
296
297
|
specification_version: 4
|
297
298
|
summary: DataHen toolbelt for developers
|
298
299
|
test_files: []
|