datahen 0.16.0 → 0.18.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/datahen/cli/scraper.rb +5 -0
- data/lib/datahen/cli/scraper_job.rb +2 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +2 -0
- data/lib/datahen/client/job_page.rb +4 -0
- data/lib/datahen/client/scraper.rb +4 -0
- data/lib/datahen/client/scraper_job.rb +4 -0
- data/lib/datahen/client/scraper_job_page.rb +6 -0
- data/lib/datahen/scraper/batch_parser.rb +2 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +29 -3
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0e1fcf7422236924fd818a1527337a6089cd444b1f35510b72fe140facbed7b0
+  data.tar.gz: 05be57d3e058ee9969d210ded0b1d043b388390d5f2ac834ece490691683f39d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4e076509fa8a0fa7fa78406916530bfe2c1b6075ac1007baab43a447911d1c7d8e90bddd8a1438c339a4fadef4e05e629e4907f1f1bae3f4c1f283dba63c25c9
+  data.tar.gz: e7ceb1208c87cd75fa7202f55549c6b2f2ce24980f7642827aab6f721107ca8ddb59829b93742126e465d6930c6c5574de2d045d3a968a7f3a826bf099ee3c4b
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -33,6 +33,8 @@ module Datahen
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
       option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def create(scraper_name, git_repository)
         # puts "options #{options}"
         client = Client::Scraper.new(options)
@@ -59,6 +61,8 @@ module Datahen
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
       option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def update(scraper_name)
         client = Client::Scraper.new(options)
         puts "#{client.update(scraper_name, options)}"
@@ -97,6 +101,7 @@ module Datahen
       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
       option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
       option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def start(scraper_name)
         client = Client::ScraperJob.new(options)
         puts "Starting a scrape job..."
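Every CLI change in this release follows the same pattern: a new Thor option is declared on a command and the whole options hash is forwarded, untouched, to the API client. The sketch below is a minimal, self-contained imitation of that pattern, assuming Thor's usual underscore-to-dash flag mapping; FakeClient is a hypothetical stand-in for Datahen::Client::Scraper and is not part of the gem.

require 'thor'
require 'json'

# Hypothetical client used only for this sketch.
class FakeClient
  def initialize(opts = {})
    @opts = opts
  end

  def create(name, opts = {})
    { name: name, opts: opts }.to_json
  end
end

class ScraperCLI < Thor
  desc "create NAME REPO", "Create a scraper"
  option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
  option :retry_interval, type: :numeric, desc: 'Retry interval in seconds when refetching a page. Default: 0'
  def create(name, repo)
    # Thor collects the declared flags into `options`, which is handed to the client as-is.
    client = FakeClient.new(options)
    puts client.create(name, options.merge(git_repository: repo))
  end
end

# Example invocation (flags map from the option names declared above):
#   ruby scraper_cli.rb create my-scraper https://example.com/me/repo.git \
#     --enable-global-cache --retry-interval 30
ScraperCLI.start(ARGV)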
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -105,6 +105,8 @@ module Datahen
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
       option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def update(scraper_name)
         if options[:job]
           client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -46,6 +46,7 @@ module Datahen
       option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
       option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
       option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def add(scraper_name, url)
         begin
           options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -80,6 +81,7 @@ module Datahen
       option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
       option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
+      option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
       def update(scraper_name, gid)
         begin
           options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/client/job.rb
CHANGED
@@ -22,6 +22,8 @@ module Datahen
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
         body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         params = @options.merge({body: body.to_json})
 
         self.class.put("/jobs/#{job_id}", params)
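Note the two different guards in this hunk: retry_interval is only sent when truthy (0 simply falls back to the server default, matching the CLI description), while enable_global_cache is keyed off has_key? so that an explicit false still reaches the API; both string and symbol keys are checked because the options hash can arrive with either. The snippet below is an illustrative helper, not the gem's own method, showing why the distinction matters.

require 'json'

# Mirrors the two guard styles used when building the request body.
def build_body(opts)
  body = {}
  # Truthiness guard: nil, false and 0 are all dropped, which is acceptable
  # here because 0 means "use the default retry interval".
  body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
  # Presence guard: an explicit false must still be sent to turn the cache off.
  body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?(:enable_global_cache)
  body.to_json
end

puts build_body(retry_interval: 30, enable_global_cache: false)
# => {"retry_interval":30,"enable_global_cache":false}
puts build_body(retry_interval: 0)
# => {} (0 is dropped, so the server default applies)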
data/lib/datahen/client/job_page.rb
CHANGED
@@ -16,6 +16,8 @@ module Datahen
         body[:priority] = opts[:priority] if opts[:priority]
         body[:vars] = opts[:vars] if opts[:vars]
         body[:max_size] = opts[:max_size] if opts[:max_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
         params = @options.merge({body: body.to_json})
 
@@ -38,6 +40,8 @@ module Datahen
         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         body[:cookie] = opts[:cookie] if opts[:cookie]
         body[:max_size] = opts[:max_size] if opts[:max_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
         params = @options.merge({body: body.to_json})
 
data/lib/datahen/client/scraper.rb
CHANGED
@@ -29,6 +29,8 @@ module Datahen
         body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
         body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
         body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         params = @options.merge({body: body.to_json})
         self.class.post("/scrapers", params)
       end
@@ -51,6 +53,8 @@ module Datahen
         body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
         body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
         body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         params = @options.merge({body: body.to_json})
 
         self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -12,6 +12,8 @@ module Datahen
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         if opts[:vars]
           if opts[:vars].is_a?(Array)
             body[:vars] = opts[:vars]
@@ -39,6 +41,8 @@ module Datahen
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
         body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         params = @options.merge({body: body.to_json})
 
         self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -15,6 +15,9 @@ module Datahen
         body[:page_type] = opts[:page_type] if opts[:page_type]
         body[:priority] = opts[:priority] if opts[:priority]
         body[:vars] = opts[:vars] if opts[:vars]
+        body[:max_size] = opts[:max_size] if opts[:max_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
         params = @options.merge({body: body.to_json})
 
@@ -59,6 +62,9 @@ module Datahen
         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         body[:cookie] = opts[:cookie] if opts[:cookie]
+        body[:max_size] = opts[:max_size] if opts[:max_size]
+        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
+        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
 
         params = @options.merge({body: body.to_json})
 
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -334,6 +334,7 @@ module Datahen
       Parallel.each(dequeue, in_threads: (worker_count)) do |page|
         parser_file = self.parsers[page['page_type']]
         begin
+          self.repeat_puts("Parsing page with GID #{page['gid']}")
           puts Datahen::Scraper::Parser.exec_parser_by_page(
             parser_file,
             page,
@@ -342,6 +343,7 @@ module Datahen
             nil,
             keep_outputs
           )
+          self.repeat_puts("Finish parsing page with GID #{page['gid']}")
         rescue Parallel::Kill => e
           puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
         rescue Parallel::Break => e
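The batch parser change only brackets each parse with progress output inside the existing Parallel loop. Below is a runnable sketch of that surrounding pattern, with placeholder page data (real pages come from the dequeue call shown above) and a plain puts standing in for the gem's repeat_puts helper.

require 'parallel'

# Placeholder pages for illustration only.
pages = [
  { 'gid' => 'example-gid-1', 'page_type' => 'listing' },
  { 'gid' => 'example-gid-2', 'page_type' => 'details' }
]

Parallel.each(pages, in_threads: 2) do |page|
  begin
    puts "Parsing page with GID #{page['gid']}"
    # ... Datahen::Scraper::Parser.exec_parser_by_page would run here ...
    puts "Finish parsing page with GID #{page['gid']}"
  rescue Parallel::Kill
    puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
  rescue Parallel::Break
    puts "[Worker #{Parallel.worker_number}]: parsing was stopped early"
  end
end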
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -8,7 +8,11 @@ module Datahen
       attr_accessor :refetch_self
       # Reparse self page flag.
       # @return [Boollean]
+      # @note It is stronger than #limbo_self flag.
       attr_accessor :reparse_self
+      # Limbo self page flag.
+      # @return [Boollean]
+      attr_accessor :limbo_self
 
       def initialize(options={})
         @filename = options.fetch(:filename) { raise "Filename is required"}
@@ -31,7 +35,8 @@ module Datahen
           :find_output,
           :find_outputs,
           :refetch,
-          :reparse
+          :reparse,
+          :limbo
         ].freeze
       end
 
@@ -104,7 +109,7 @@ module Datahen
 
       def refetch_page gid
         if save
-          Client::
+          Client::JobPage.new({gid: gid}).refetch(self.job_id)
           puts "Refetch page #{gid}"
         else
           puts "Would have refetch page #{gid}"
@@ -122,7 +127,7 @@ module Datahen
 
       def reparse_page gid
         if save
-          Client::
+          Client::JobPage.new({gid: gid}).reparse(self.job_id)
           puts "Reparse page #{gid}"
         else
           puts "Would have reparse page #{gid}"
@@ -138,6 +143,24 @@ module Datahen
         reparse_page page_gid
       end
 
+      def limbo_page gid
+        if save
+          Client::JobPage.new({gid: gid}).limbo(self.job_id)
+          puts "Limbo page #{gid}"
+        else
+          puts "Would have limbo page #{gid}"
+        end
+      end
+
+      def limbo page_gid
+        raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
+        if page_gid == gid
+          self.limbo_self = true
+          raise Error::SafeTerminateError
+        end
+        limbo_page page_gid
+      end
+
       def eval_parser_script(save=false)
         update_parsing_starting_status
 
@@ -148,6 +171,7 @@ module Datahen
         page = init_page_vars(page)
         self.refetch_self = false
         self.reparse_self = false
+        self.limbo_self = false
 
         begin
           context = isolated_binding({
@@ -178,6 +202,8 @@ module Datahen
           refetch_page gid
         elsif reparse_self
           reparse_page gid
+        elsif limbo_self
+          limbo_page gid
         else
           update_parsing_done_status
         end
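Taken together, the executor changes expose a new limbo helper to parser scripts, alongside refetch and reparse in the exposed-methods list. The sketch below shows how a parser script might call it, assuming the usual parser-script context where the page hash and the exposed helpers are in scope; the triggering condition is a placeholder, not a real page field.

# Inside a parser script run by RubyParserExecutor (illustrative sketch).
if page['vars']['give_up']  # placeholder condition
  # Passing the current page's own GID sets limbo_self and raises
  # Error::SafeTerminateError, so the script stops cleanly and the page is
  # moved to limbo afterwards (see the elsif limbo_self branch above).
  limbo page['gid']
end

# Any other GID is sent to limbo immediately through Client::JobPage#limbo:
limbo 'gid-of-some-other-page'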
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.18.0
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-09-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor