datahen 0.14.26 → 0.15.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/datahen.gemspec +3 -1
- data/lib/datahen/cli/parser.rb +47 -7
- data/lib/datahen/client/job_page.rb +10 -0
- data/lib/datahen/scraper/batch_parser.rb +177 -0
- data/lib/datahen/scraper/executor.rb +4 -1
- data/lib/datahen/scraper/parser.rb +16 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +4 -1
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/version.rb +1 -1
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
|
4
|
+
data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
|
7
|
+
data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
|
data/datahen.gemspec
CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.bindir = "exe"
|
34
34
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
35
35
|
spec.require_paths = ["lib"]
|
36
|
-
spec.required_ruby_version = '>= 2.
|
36
|
+
spec.required_ruby_version = '>= 2.4.4'
|
37
37
|
spec.add_dependency "thor", "~> 0.20.3"
|
38
38
|
spec.add_dependency 'httparty', '~> 0.16.2'
|
39
39
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
40
|
+
spec.add_dependency 'concurrent-ruby', '~> 1.1'
|
41
|
+
spec.add_dependency 'parallel', '~> 1.20'
|
40
42
|
spec.add_development_dependency 'bundler', '>= 1.16'
|
41
43
|
spec.add_development_dependency 'rake', '>= 10.0'
|
42
44
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -43,17 +43,17 @@ module Datahen
|
|
43
43
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
44
44
|
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
45
45
|
def exec_parse(scraper_name, parser_file, *gids)
|
46
|
+
if options[:job]
|
47
|
+
job_id = options[:job]
|
48
|
+
else
|
49
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
50
|
+
job_id = job['id']
|
51
|
+
end
|
52
|
+
|
46
53
|
gids.each do |gid|
|
47
54
|
begin
|
48
55
|
puts "Parsing #{gid}"
|
49
56
|
|
50
|
-
if options[:job]
|
51
|
-
job_id = options[:job]
|
52
|
-
else
|
53
|
-
job = Client::ScraperJob.new(options).find(scraper_name)
|
54
|
-
job_id = job['id']
|
55
|
-
end
|
56
|
-
|
57
57
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
58
58
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
59
59
|
rescue => e
|
@@ -61,6 +61,46 @@ module Datahen
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
|
66
|
+
long_desc <<-LONGDESC
|
67
|
+
Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
|
68
|
+
LONGDESC
|
69
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
70
|
+
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
71
|
+
option :"workers", type: :numeric, default: 1, desc: "Worker count"
|
72
|
+
option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
|
73
|
+
option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
|
74
|
+
option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
|
75
|
+
def batch_exec_parse(scraper_name, config_file)
|
76
|
+
if options[:job]
|
77
|
+
job_id = options[:job]
|
78
|
+
else
|
79
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
80
|
+
job_id = job['id']
|
81
|
+
end
|
82
|
+
|
83
|
+
# make the stdout and stderr sync to prevent buffering
|
84
|
+
old_stdout_sync = $stdout.sync
|
85
|
+
old_stderr_sync = $stderr.sync
|
86
|
+
$stdout.sync = true
|
87
|
+
$stderr.sync = true
|
88
|
+
|
89
|
+
begin
|
90
|
+
batch = Datahen::Scraper::BatchParser.new job_id, config_file,
|
91
|
+
worker_count: options[:"workers"],
|
92
|
+
max_garbage: options[:"max-garbage"],
|
93
|
+
dequeue_interval: options[:"dequeue-interval"],
|
94
|
+
dequeue_scale: options[:"dequeue-scale"]
|
95
|
+
batch.exec_parse true, options[:"keep-outputs"]
|
96
|
+
rescue => e
|
97
|
+
puts [e.message] + e.backtrace
|
98
|
+
end
|
99
|
+
|
100
|
+
# resume whatever state the stdout and stderr sync were
|
101
|
+
$stdout.sync = old_stdout_sync
|
102
|
+
$stderr.sync = old_stderr_sync
|
103
|
+
end
|
64
104
|
end
|
65
105
|
end
|
66
106
|
|
@@ -42,6 +42,16 @@ module Datahen
|
|
42
42
|
self.class.post("/jobs/#{job_id}/pages", params)
|
43
43
|
end
|
44
44
|
|
45
|
+
def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
|
46
|
+
body = {
|
47
|
+
limit: limit,
|
48
|
+
page_types: page_types,
|
49
|
+
parse_fetching_failed: parse_fetching_failed
|
50
|
+
}
|
51
|
+
params = @options.merge({body: body.to_json})
|
52
|
+
self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
|
53
|
+
end
|
54
|
+
|
45
55
|
def parsing_update(job_id, gid, opts={})
|
46
56
|
body = {}
|
47
57
|
body[:outputs] = opts.fetch(:outputs) {[]}
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'concurrent'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module Datahen
|
5
|
+
module Scraper
|
6
|
+
class BatchParser
|
7
|
+
NOT_FOUND_MSG = "No more pages to parse found"
|
8
|
+
NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
|
9
|
+
NO_WORKERS_MSG = "Warning: There are no parser workers"
|
10
|
+
|
11
|
+
attr_accessor :config_file, :garbage_count, :last_message
|
12
|
+
attr_reader :job_id, :worker_count, :pages, :max_garbage
|
13
|
+
attr_reader :dequeue_interval, :dequeue_scale
|
14
|
+
attr_reader :page_types, :parsers
|
15
|
+
attr_reader :config, :client, :garbage_mutex
|
16
|
+
|
17
|
+
def self.wait time_in_seconds
|
18
|
+
Kernel.sleep time_in_seconds
|
19
|
+
end
|
20
|
+
|
21
|
+
def initialize(job_id, config_file, opts = {})
|
22
|
+
opts = {
|
23
|
+
worker_count: 1,
|
24
|
+
max_garbage: 5,
|
25
|
+
dequeue_interval: 3,
|
26
|
+
dequeue_scale: 2,
|
27
|
+
client_options: {}
|
28
|
+
}.merge opts
|
29
|
+
|
30
|
+
@job_id = job_id
|
31
|
+
@worker_count = opts[:worker_count]
|
32
|
+
@dequeue_interval = opts[:dequeue_interval]
|
33
|
+
@dequeue_scale = opts[:dequeue_scale]
|
34
|
+
@max_garbage = opts[:max_garbage]
|
35
|
+
@pages = Concurrent::Hash.new
|
36
|
+
@garbage_mutex = Mutex.new
|
37
|
+
self.garbage_count = 0
|
38
|
+
self.config_file = config_file
|
39
|
+
self.load_config
|
40
|
+
|
41
|
+
@client = Datahen::Client::JobPage.new(opts[:client_options])
|
42
|
+
end
|
43
|
+
|
44
|
+
def recollect_garbage
|
45
|
+
self.garbage_mutex.synchronize do
|
46
|
+
puts "Recollect garbage"
|
47
|
+
GC.start
|
48
|
+
self.garbage_count = 0
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def load_config
|
53
|
+
# build page type to script file map
|
54
|
+
@page_types = []
|
55
|
+
@parsers = Concurrent::Hash.new
|
56
|
+
@config = YAML.load_file(config_file)
|
57
|
+
self.config['parsers'].each do |v|
|
58
|
+
next if !v['disabled'].nil? && !!v['disabled']
|
59
|
+
@page_types << v['page_type']
|
60
|
+
self.parsers[v['page_type']] = v['file']
|
61
|
+
end
|
62
|
+
self.recollect_garbage
|
63
|
+
end
|
64
|
+
|
65
|
+
def repeat_puts message
|
66
|
+
puts message
|
67
|
+
self.last_message = ''
|
68
|
+
end
|
69
|
+
|
70
|
+
def no_repeat_puts message
|
71
|
+
return if message == self.last_message
|
72
|
+
puts message
|
73
|
+
self.last_message = message
|
74
|
+
end
|
75
|
+
|
76
|
+
def load_pages
|
77
|
+
# calculate dequeue size
|
78
|
+
max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
|
79
|
+
current_size = self.pages.length
|
80
|
+
dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
|
81
|
+
if dequeue_size < 1
|
82
|
+
return 0
|
83
|
+
end
|
84
|
+
dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
|
85
|
+
|
86
|
+
# reserve and get pages to parse
|
87
|
+
response = client.dequeue self.job_id,
|
88
|
+
dequeue_size,
|
89
|
+
self.page_types,
|
90
|
+
config['parse_fetching_failed']
|
91
|
+
|
92
|
+
# ensure a valid response or try again
|
93
|
+
if response.nil? || response.response.code.to_i != 200
|
94
|
+
self.repeat_puts(response.nil? ? 'null' : response.body)
|
95
|
+
self.recollect_garbage
|
96
|
+
return 0
|
97
|
+
end
|
98
|
+
|
99
|
+
# add pages
|
100
|
+
count = 0
|
101
|
+
(JSON.parse(response.body) || []).each do |page|
|
102
|
+
count += 1
|
103
|
+
next if self.pages.has_key? page['gid']
|
104
|
+
self.pages[page['gid']] = page
|
105
|
+
end
|
106
|
+
response = nil
|
107
|
+
|
108
|
+
# recollect garbage to free some memory before parsing
|
109
|
+
if count > 0
|
110
|
+
self.recollect_garbage
|
111
|
+
self.repeat_puts "Found #{count} page(s) to parse"
|
112
|
+
else
|
113
|
+
self.no_repeat_puts NOT_FOUND_MSG
|
114
|
+
end
|
115
|
+
|
116
|
+
# return how many pages were loaded
|
117
|
+
count
|
118
|
+
end
|
119
|
+
|
120
|
+
def dequeue_pages
|
121
|
+
# collect garbage
|
122
|
+
self.garbage_count += 1
|
123
|
+
if self.garbage_count > self.max_garbage
|
124
|
+
self.recollect_garbage
|
125
|
+
end
|
126
|
+
|
127
|
+
# return page if there are loaded pages
|
128
|
+
while true do
|
129
|
+
key_value = self.pages.shift
|
130
|
+
return key_value[1] unless key_value.nil?
|
131
|
+
self.class.wait 1
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def exec_parse save = false, keep_outputs = false
|
136
|
+
if self.worker_count < 1
|
137
|
+
self.no_repeat_puts NO_WORKERS_MSG
|
138
|
+
return
|
139
|
+
else
|
140
|
+
self.no_repeat_puts "Spawing #{self.worker_count} workers"
|
141
|
+
end
|
142
|
+
|
143
|
+
# dequeuing on parallel
|
144
|
+
keep_dequeue = Concurrent::Array.new
|
145
|
+
keep_dequeue[0] = true
|
146
|
+
Thread.new do
|
147
|
+
while keep_dequeue[0]
|
148
|
+
begin
|
149
|
+
self.load_pages
|
150
|
+
self.class.wait self.dequeue_interval
|
151
|
+
rescue => e
|
152
|
+
puts [e.message] + e.backtrace rescue 'error'
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
dequeue = lambda{ self.dequeue_pages }
|
158
|
+
Parallel.each(dequeue, in_threads: (worker_count)) do |page|
|
159
|
+
parser_file = self.parsers[page['page_type']]
|
160
|
+
begin
|
161
|
+
puts Datahen::Scraper::Parser.exec_parser_by_page(
|
162
|
+
parser_file,
|
163
|
+
page,
|
164
|
+
job_id,
|
165
|
+
save,
|
166
|
+
nil,
|
167
|
+
keep_outputs
|
168
|
+
)
|
169
|
+
rescue => e
|
170
|
+
puts [e.message] + e.backtrace rescue 'error'
|
171
|
+
end
|
172
|
+
end
|
173
|
+
keep_dequeue[0] = false
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
@@ -6,7 +6,7 @@ module Datahen
|
|
6
6
|
# Max allowed page size when query outputs (see #find_outputs).
|
7
7
|
MAX_FIND_OUTPUTS_PER_PAGE = 500
|
8
8
|
|
9
|
-
attr_accessor :filename, :gid, :job_id
|
9
|
+
attr_accessor :filename, :page, :gid, :job_id
|
10
10
|
|
11
11
|
include Datahen::Plugin::ContextExposer
|
12
12
|
|
@@ -15,6 +15,9 @@ module Datahen
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def init_page()
|
18
|
+
# skip whenever a page is provided
|
19
|
+
return self.page unless self.page.nil?
|
20
|
+
|
18
21
|
if job_id
|
19
22
|
puts "getting Job Page"
|
20
23
|
init_job_page
|
@@ -18,6 +18,22 @@ module Datahen
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
+
def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
|
22
|
+
extname = File.extname(filename)
|
23
|
+
case extname
|
24
|
+
when '.rb'
|
25
|
+
executor = RubyParserExecutor.new(
|
26
|
+
filename: filename,
|
27
|
+
page: page,
|
28
|
+
job_id: job_id,
|
29
|
+
vars: vars,
|
30
|
+
keep_outputs: keep_outputs
|
31
|
+
)
|
32
|
+
executor.exec_parser(save)
|
33
|
+
else
|
34
|
+
puts "Unable to find a parser executor for file type \"#{extname}\""
|
35
|
+
end
|
36
|
+
end
|
21
37
|
|
22
38
|
end
|
23
39
|
end
|
@@ -12,7 +12,8 @@ module Datahen
|
|
12
12
|
|
13
13
|
def initialize(options={})
|
14
14
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
15
|
-
@
|
15
|
+
@page = options.fetch(:page) { nil }
|
16
|
+
@gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
|
16
17
|
@job_id = options.fetch(:job_id)
|
17
18
|
@page_vars = options.fetch(:vars) { {} }
|
18
19
|
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
@@ -46,6 +47,8 @@ module Datahen
|
|
46
47
|
end
|
47
48
|
|
48
49
|
def init_page_vars(page)
|
50
|
+
return self.page unless self.page.nil?
|
51
|
+
|
49
52
|
if !@page_vars.nil? && !@page_vars.empty?
|
50
53
|
page['vars'] = @page_vars
|
51
54
|
end
|
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.15.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -52,6 +52,34 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: concurrent-ruby
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.1'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.1'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: parallel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.20'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.20'
|
55
83
|
- !ruby/object:Gem::Dependency
|
56
84
|
name: bundler
|
57
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -232,6 +260,7 @@ files:
|
|
232
260
|
- lib/datahen/plugin.rb
|
233
261
|
- lib/datahen/plugin/context_exposer.rb
|
234
262
|
- lib/datahen/scraper.rb
|
263
|
+
- lib/datahen/scraper/batch_parser.rb
|
235
264
|
- lib/datahen/scraper/executor.rb
|
236
265
|
- lib/datahen/scraper/finisher.rb
|
237
266
|
- lib/datahen/scraper/parser.rb
|
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
255
284
|
requirements:
|
256
285
|
- - ">="
|
257
286
|
- !ruby/object:Gem::Version
|
258
|
-
version: 2.
|
287
|
+
version: 2.4.4
|
259
288
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
260
289
|
requirements:
|
261
290
|
- - ">="
|