datahen 0.14.26 → 0.15.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/datahen.gemspec +3 -1
- data/lib/datahen/cli/parser.rb +47 -7
- data/lib/datahen/client/job_page.rb +10 -0
- data/lib/datahen/scraper/batch_parser.rb +177 -0
- data/lib/datahen/scraper/executor.rb +4 -1
- data/lib/datahen/scraper/parser.rb +16 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +4 -1
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/version.rb +1 -1
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
|
4
|
+
data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
|
7
|
+
data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
|
data/datahen.gemspec
CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.bindir = "exe"
|
34
34
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
35
35
|
spec.require_paths = ["lib"]
|
36
|
-
spec.required_ruby_version = '>= 2.
|
36
|
+
spec.required_ruby_version = '>= 2.4.4'
|
37
37
|
spec.add_dependency "thor", "~> 0.20.3"
|
38
38
|
spec.add_dependency 'httparty', '~> 0.16.2'
|
39
39
|
spec.add_dependency 'nokogiri', '~> 1.6'
|
40
|
+
spec.add_dependency 'concurrent-ruby', '~> 1.1'
|
41
|
+
spec.add_dependency 'parallel', '~> 1.20'
|
40
42
|
spec.add_development_dependency 'bundler', '>= 1.16'
|
41
43
|
spec.add_development_dependency 'rake', '>= 10.0'
|
42
44
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -43,17 +43,17 @@ module Datahen
|
|
43
43
|
option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
|
44
44
|
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
|
45
45
|
def exec_parse(scraper_name, parser_file, *gids)
|
46
|
+
if options[:job]
|
47
|
+
job_id = options[:job]
|
48
|
+
else
|
49
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
50
|
+
job_id = job['id']
|
51
|
+
end
|
52
|
+
|
46
53
|
gids.each do |gid|
|
47
54
|
begin
|
48
55
|
puts "Parsing #{gid}"
|
49
56
|
|
50
|
-
if options[:job]
|
51
|
-
job_id = options[:job]
|
52
|
-
else
|
53
|
-
job = Client::ScraperJob.new(options).find(scraper_name)
|
54
|
-
job_id = job['id']
|
55
|
-
end
|
56
|
-
|
57
57
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
58
58
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
|
59
59
|
rescue => e
|
@@ -61,6 +61,46 @@ module Datahen
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
long_desc <<-LONGDESC
  Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
LONGDESC
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
option :"workers", type: :numeric, default: 1, desc: "Worker count"
option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
# CLI entry point: dequeue and parse job pages in parallel via BatchParser.
def batch_exec_parse(scraper_name, config_file)
  # Resolve the job id: an explicit --job wins, otherwise look up the
  # scraper's current job through the API.
  if options[:job]
    job_id = options[:job]
  else
    job = Client::ScraperJob.new(options).find(scraper_name)
    job_id = job['id']
  end

  # make the stdout and stderr sync to prevent buffering
  # (worker threads print interleaved progress, so flush immediately)
  old_stdout_sync = $stdout.sync
  old_stderr_sync = $stderr.sync
  $stdout.sync = true
  $stderr.sync = true

  begin
    batch = Datahen::Scraper::BatchParser.new job_id, config_file,
      worker_count: options[:"workers"],
      max_garbage: options[:"max-garbage"],
      dequeue_interval: options[:"dequeue-interval"],
      dequeue_scale: options[:"dequeue-scale"]
    batch.exec_parse true, options[:"keep-outputs"]
  rescue => e
    # Best-effort report; fall through so the sync state below is restored.
    puts [e.message] + e.backtrace
  end

  # resume whatever state the stdout and stderr sync were
  $stdout.sync = old_stdout_sync
  $stderr.sync = old_stderr_sync
end
|
64
104
|
end
|
65
105
|
end
|
66
106
|
|
@@ -42,6 +42,16 @@ module Datahen
|
|
42
42
|
self.class.post("/jobs/#{job_id}/pages", params)
|
43
43
|
end
|
44
44
|
|
45
|
+
# Reserve up to +limit+ pending pages of the given +page_types+ from a
# job so this process can parse them.
#
# @param job_id [Integer] job to dequeue pages from
# @param limit [Integer] maximum number of pages to reserve
# @param page_types [Array<String>] page types eligible for parsing
# @param parse_fetching_failed [Boolean] also dequeue pages whose fetch failed
# @param opts [Hash] NOTE(review): accepted but never used — confirm whether
#   it should be merged into the request like other client endpoints
def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
  body = {
    limit: limit,
    page_types: page_types,
    parse_fetching_failed: parse_fetching_failed
  }
  # @options carries the client's base request options; body is sent as JSON
  params = @options.merge({body: body.to_json})
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
end
|
54
|
+
|
45
55
|
def parsing_update(job_id, gid, opts={})
|
46
56
|
body = {}
|
47
57
|
body[:outputs] = opts.fetch(:outputs) {[]}
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'concurrent'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module Datahen
  module Scraper
    # Continuously dequeues pending job pages from the Datahen API and
    # executes their parser scripts on a pool of worker threads. One
    # background thread keeps the local page queue filled while
    # +worker_count+ threads consume it.
    class BatchParser
      NOT_FOUND_MSG = "No more pages to parse found"
      NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
      NO_WORKERS_MSG = "Warning: There are no parser workers"

      # config_file: scraper YAML path; garbage_count: pages parsed since the
      # last GC run; last_message: repeat-suppression state for logging.
      attr_accessor :config_file, :garbage_count, :last_message
      attr_reader :job_id, :worker_count, :pages, :max_garbage
      attr_reader :dequeue_interval, :dequeue_scale
      attr_reader :page_types, :parsers
      attr_reader :config, :client, :garbage_mutex

      # Sleep for +time_in_seconds+; a class method so tests can stub it.
      def self.wait time_in_seconds
        Kernel.sleep time_in_seconds
      end

      # @param job_id [Integer] job whose pages will be parsed
      # @param config_file [String] path to the scraper YAML config
      # @param opts [Hash] :worker_count, :max_garbage, :dequeue_interval,
      #   :dequeue_scale, and :client_options (forwarded to the API client)
      def initialize(job_id, config_file, opts = {})
        opts = {
          worker_count: 1,
          max_garbage: 5,
          dequeue_interval: 3,
          dequeue_scale: 2,
          client_options: {}
        }.merge opts

        @job_id = job_id
        @worker_count = opts[:worker_count]
        @dequeue_interval = opts[:dequeue_interval]
        @dequeue_scale = opts[:dequeue_scale]
        @max_garbage = opts[:max_garbage]
        @pages = Concurrent::Hash.new
        @garbage_mutex = Mutex.new
        self.garbage_count = 0
        self.config_file = config_file
        self.load_config

        @client = Datahen::Client::JobPage.new(opts[:client_options])
      end

      # Force a GC run; serialized by a mutex so concurrent workers don't
      # trigger overlapping collections.
      def recollect_garbage
        self.garbage_mutex.synchronize do
          puts "Recollect garbage"
          GC.start
          self.garbage_count = 0
        end
      end

      # Load the scraper config and build the page type -> parser file map.
      # Parsers flagged "disabled" are skipped.
      def load_config
        # build page type to script file map
        @page_types = []
        @parsers = Concurrent::Hash.new
        @config = YAML.load_file(config_file)
        self.config['parsers'].each do |v|
          next if !v['disabled'].nil? && !!v['disabled']
          @page_types << v['page_type']
          self.parsers[v['page_type']] = v['file']
        end
        self.recollect_garbage
      end

      # Print +message+ and reset the repeat-suppression state so the next
      # no_repeat_puts call always prints.
      def repeat_puts message
        puts message
        self.last_message = ''
      end

      # Print +message+ unless it is identical to the previously printed one.
      def no_repeat_puts message
        return if message == self.last_message
        puts message
        self.last_message = message
      end

      # Dequeue a batch of pages from the API into the local queue.
      # @return [Integer] number of pages the API returned
      def load_pages
        # calculate dequeue size
        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
        current_size = self.pages.length
        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
        if dequeue_size < 1
          return 0
        end
        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size

        # reserve and get pages to parse
        response = client.dequeue self.job_id,
          dequeue_size,
          self.page_types,
          config['parse_fetching_failed']

        # ensure a valid response or try again
        if response.nil? || response.response.code.to_i != 200
          self.repeat_puts(response.nil? ? 'null' : response.body)
          self.recollect_garbage
          return 0
        end

        # add pages
        count = 0
        (JSON.parse(response.body) || []).each do |page|
          # NOTE(review): count also includes pages already queued locally,
          # so "Found N page(s)" can overstate newly added work — confirm
          # whether count should only track fresh gids.
          count += 1
          next if self.pages.has_key? page['gid']
          self.pages[page['gid']] = page
        end
        response = nil

        # recollect garbage to free some memory before parsing
        if count > 0
          self.recollect_garbage
          self.repeat_puts "Found #{count} page(s) to parse"
        else
          self.no_repeat_puts NOT_FOUND_MSG
        end

        # return how many pages were loaded
        count
      end

      # Pop the next page off the local queue, blocking (1s polls) until one
      # is available. Also triggers GC every +max_garbage+ calls.
      def dequeue_pages
        # collect garbage
        self.garbage_count += 1
        if self.garbage_count > self.max_garbage
          self.recollect_garbage
        end

        # return page if there are loaded pages
        loop do
          key_value = self.pages.shift
          return key_value[1] unless key_value.nil?
          self.class.wait 1
        end
      end

      # Spawn the dequeuer thread plus +worker_count+ parser workers and
      # parse pages until the worker pool stops.
      # @param save [Boolean] save outputs to the job when true
      # @param keep_outputs [Boolean] don't delete existing outputs
      def exec_parse save = false, keep_outputs = false
        if self.worker_count < 1
          self.no_repeat_puts NO_WORKERS_MSG
          return
        else
          self.no_repeat_puts "Spawning #{self.worker_count} workers"
        end

        # dequeuing on parallel: a single background thread keeps refilling
        # the local queue; the flag array lets us stop it when workers finish
        keep_dequeue = Concurrent::Array.new
        keep_dequeue[0] = true
        Thread.new do
          while keep_dequeue[0]
            begin
              self.load_pages
              self.class.wait self.dequeue_interval
            rescue => e
              puts [e.message] + e.backtrace rescue 'error'
            end
          end
        end

        # the parallel gem accepts a lambda producer: each worker calls it
        # to fetch its next page
        dequeue = lambda{ self.dequeue_pages }
        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
          parser_file = self.parsers[page['page_type']]
          begin
            puts Datahen::Scraper::Parser.exec_parser_by_page(
              parser_file,
              page,
              job_id,
              save,
              nil,
              keep_outputs
            )
          rescue => e
            puts [e.message] + e.backtrace rescue 'error'
          end
        end
        keep_dequeue[0] = false
      end
    end
  end
end
|
@@ -6,7 +6,7 @@ module Datahen
|
|
6
6
|
# Max allowed page size when query outputs (see #find_outputs).
|
7
7
|
MAX_FIND_OUTPUTS_PER_PAGE = 500
|
8
8
|
|
9
|
-
attr_accessor :filename, :gid, :job_id
|
9
|
+
attr_accessor :filename, :page, :gid, :job_id
|
10
10
|
|
11
11
|
include Datahen::Plugin::ContextExposer
|
12
12
|
|
@@ -15,6 +15,9 @@ module Datahen
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def init_page()
|
18
|
+
# skip whenever a page is provided
|
19
|
+
return self.page unless self.page.nil?
|
20
|
+
|
18
21
|
if job_id
|
19
22
|
puts "getting Job Page"
|
20
23
|
init_job_page
|
@@ -18,6 +18,22 @@ module Datahen
|
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
+
# Execute a parser script against an already-dequeued page hash (unlike
# exec_parser_page, no GID lookup is required).
#
# @param filename [String] parser script path; only ".rb" is supported
# @param page [Hash] the page to parse, as returned by the dequeue endpoint
# @param job_id [Integer, nil] job the outputs belong to
# @param save [Boolean] persist outputs when true
# @param vars [Hash] user-defined page variables
# @param keep_outputs [Boolean] don't delete existing outputs
def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
  extname = File.extname(filename)
  case extname
  when '.rb'
    executor = RubyParserExecutor.new(
      filename: filename,
      page: page,
      job_id: job_id,
      vars: vars,
      keep_outputs: keep_outputs
    )
    executor.exec_parser(save)
  else
    # unsupported script type: report rather than raise
    puts "Unable to find a parser executor for file type \"#{extname}\""
  end
end
|
21
37
|
|
22
38
|
end
|
23
39
|
end
|
@@ -12,7 +12,8 @@ module Datahen
|
|
12
12
|
|
13
13
|
def initialize(options={})
|
14
14
|
@filename = options.fetch(:filename) { raise "Filename is required"}
|
15
|
-
@
|
15
|
+
@page = options.fetch(:page) { nil }
|
16
|
+
@gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
|
16
17
|
@job_id = options.fetch(:job_id)
|
17
18
|
@page_vars = options.fetch(:vars) { {} }
|
18
19
|
@keep_outputs = !!(options.fetch(:keep_outputs) { false })
|
@@ -46,6 +47,8 @@ module Datahen
|
|
46
47
|
end
|
47
48
|
|
48
49
|
def init_page_vars(page)
|
50
|
+
return self.page unless self.page.nil?
|
51
|
+
|
49
52
|
if !@page_vars.nil? && !@page_vars.empty?
|
50
53
|
page['vars'] = @page_vars
|
51
54
|
end
|
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.15.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -52,6 +52,34 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: concurrent-ruby
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.1'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.1'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: parallel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.20'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.20'
|
55
83
|
- !ruby/object:Gem::Dependency
|
56
84
|
name: bundler
|
57
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -232,6 +260,7 @@ files:
|
|
232
260
|
- lib/datahen/plugin.rb
|
233
261
|
- lib/datahen/plugin/context_exposer.rb
|
234
262
|
- lib/datahen/scraper.rb
|
263
|
+
- lib/datahen/scraper/batch_parser.rb
|
235
264
|
- lib/datahen/scraper/executor.rb
|
236
265
|
- lib/datahen/scraper/finisher.rb
|
237
266
|
- lib/datahen/scraper/parser.rb
|
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
255
284
|
requirements:
|
256
285
|
- - ">="
|
257
286
|
- !ruby/object:Gem::Version
|
258
|
-
version: 2.
|
287
|
+
version: 2.4.4
|
259
288
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
260
289
|
requirements:
|
261
290
|
- - ">="
|