datahen 0.14.24 → 0.15.11
This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/datahen.gemspec +4 -2
- data/lib/datahen/cli/parser.rb +48 -7
- data/lib/datahen/client/backblaze_content.rb +1 -1
- data/lib/datahen/client/job_page.rb +10 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/batch_parser.rb +358 -0
- data/lib/datahen/scraper/executor.rb +4 -1
- data/lib/datahen/scraper/parser.rb +16 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +4 -1
- data/lib/datahen/version.rb +1 -1
- metadata +31 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f77e31da8e2a7ff08086c4aa9d174608a9c3f186679d456b22310b48384d3572
+  data.tar.gz: 0bf53ae0886b16bf6fe08b0db07b1a631b69f31d8e3a6868a4d483549049e4ed
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a491874347ed6ac97c0a0e4f0d2c5830140b9367c1e01b4c95e7a447b071df643d12793ed7d6e5a0224b8876905cf74bc13b987e6e3e03e937d1f821557b8ec3
+  data.tar.gz: c553a372790654726f2921b6d9d582ca90b314b3fe8d78625cf2442e01cf2ce96ae556e78812d5bd6a03f350beb32aa3158981554179410c12926d480c911887
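For context, a .gem file is a tar archive whose members include metadata.gz and data.tar.gz, and checksums.yaml records a SHA256 and a SHA512 digest for each member. A minimal sketch of reproducing the SHA256 entries with Ruby's standard library, assuming the two members have already been extracted into the working directory:

require 'digest'

# Digest each inner archive the same way checksums.yaml does.
%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member}: #{Digest::SHA256.file(member).hexdigest}"
end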
data/datahen.gemspec
CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
-  spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
+  spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
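The two new runtime dependencies back the batch parser introduced below: concurrent-ruby supplies thread-safe collections and parallel supplies the worker pool. A minimal sketch of the producer/worker pattern they enable (all names here are illustrative, not from the gem):

require 'concurrent'
require 'parallel'

# Thread-safe queue shared between a producer and a pool of workers --
# the same shape BatchParser uses further down in this diff.
pages = Concurrent::Array.new
10.times { |i| pages << { 'gid' => "page-#{i}" } }

Parallel.each(pages, in_threads: 2) do |page|
  puts "[Worker #{Parallel.worker_number}] parsed #{page['gid']}"
end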
data/lib/datahen/cli/parser.rb
CHANGED
@@ -43,17 +43,17 @@ module Datahen
       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
       option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
       def exec_parse(scraper_name, parser_file, *gids)
+        if options[:job]
+          job_id = options[:job]
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end
+
         gids.each do |gid|
           begin
             puts "Parsing #{gid}"
 
-            if options[:job]
-              job_id = options[:job]
-            else
-              job = Client::ScraperJob.new(options).find(scraper_name)
-              job_id = job['id']
-            end
-
             vars = JSON.parse(options[:vars]) if options[:vars]
             puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
           rescue => e
@@ -61,6 +61,47 @@ module Datahen
           end
         end
       end
+
+      desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+      long_desc <<-LONGDESC
+        Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+      option :"workers", type: :numeric, default: 1, desc: "Worker count"
+      option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+      option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+      option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+      option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
+      def batch_exec_parse(scraper_name, config_file)
+        if options[:job]
+          job_id = options[:job]
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end
+
+        # make the stdout and stderr sync to prevent buffering
+        old_stdout_sync = $stdout.sync
+        old_stderr_sync = $stderr.sync
+        $stdout.sync = true
+        $stderr.sync = true
+
+        begin
+          batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+            worker_count: options[:"workers"],
+            max_garbage: options[:"max-garbage"],
+            dequeue_interval: options[:"dequeue-interval"],
+            dequeue_scale: options[:"dequeue-scale"]
+          batch.exec_parse true, options[:"keep-outputs"]
+        rescue => e
+          puts [e.message] + e.backtrace
+        end
+
+        # resume whatever state the stdout and stderr sync were
+        $stdout.sync = old_stdout_sync
+        $stderr.sync = old_stderr_sync
+      end
     end
   end
 
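The new batch command keeps dequeuing and parsing pages until the job runs dry, instead of taking an explicit GID list like exec_parse. Assuming this Thor class is still mounted as the parser subcommand, a hypothetical invocation (scraper name and config path made up) would be:

datahen parser batch my-scraper ./config.yaml --workers 3 --dequeue-scale 2 --keep-outputs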
data/lib/datahen/client/job_page.rb
CHANGED
@@ -42,6 +42,16 @@ module Datahen
         self.class.post("/jobs/#{job_id}/pages", params)
       end
 
+      def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+        body = {
+          limit: limit,
+          page_types: page_types,
+          parse_fetching_failed: parse_fetching_failed
+        }
+        params = @options.merge(opts).merge({body: body.to_json})
+        self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+      end
+
       def parsing_update(job_id, gid, opts={})
         body = {}
         body[:outputs] = opts.fetch(:outputs) {[]}
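This is the endpoint the batch parser polls: it reserves up to limit pending pages of the given page types and returns them as JSON. A sketch of a direct call (the job id and page types are made up; the timeout option is forwarded through opts to the HTTP layer, the same way BatchParser passes it):

require 'datahen'
require 'json'

client = Datahen::Client::JobPage.new
response = client.dequeue(12345, 50, ['listings', 'details'], false, timeout: 30)
pages = JSON.parse(response.body)
puts "dequeued #{pages.length} page(s)"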
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/scraper/batch_parser.rb
ADDED
@@ -0,0 +1,358 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Current config file loaded.
+      # @return [Hash] current loaded configuration
+      attr_reader :config
+      # Datahen job pages client used for API pages dequeuing.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutext used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer last run unix timestamp.
+      # @return [Integer] dequeuer last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whenever the wait time is because there are no more pages.
+      # @return [Boolean] `true` when wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found
+
+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeuing.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
+      #   calculate page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          dequeue_timeout: 30,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
+      end
+
+      # Execute garbage collector after it is requested as many times as
+      #   described by #max_garbage.
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+        nil
+      end
+
+      # Loads the config file into a Hash.
+      def load_config
+        # build page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+        nil
+      end
+
+      # Print the message regardless of it being the same as the last message.
+      # @param [String] message Message to display.
+      def repeat_puts message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Refresh dequeuer's still alive timestamp
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
+      end
+
+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
+      def load_pages
+        self.dequeuer_is_alive!
+
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get to pages parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
+          return 0
+        rescue => e
+          raise e
+        end
+        self.dequeuer_is_alive!
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
+        end
+        response = nil
+        self.dequeuer_is_alive!
+
+        # recolect garbage to free some memory before parsing
+        if count > 0
+          @not_found = false
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          @not_found = true
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
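To make the sizing arithmetic in load_pages concrete, here is a worked example with hypothetical values (the new file continues right after):

worker_count  = 2   # --workers
dequeue_scale = 2   # --dequeue-scale
current_size  = 1   # pages already loaded but not yet parsed

max_dequeue_size = (worker_count * dequeue_scale).ceil                   # => 4
dequeue_size = (dequeue_scale * (max_dequeue_size - current_size)).ceil  # => 6
dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size       # => 4, capped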
+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if thread was alive, or `false` if had to
+      #   create a new thread
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeuing on parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, and waits until there
+      #   are new pages whenever there are no loaded pages.
+      # @return [Hash] dequeued page
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return page if there are loeaded pages
+        is_waiting = false
+        while true do
+          page = self.pages.shift
+          unless page.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            loaded_pages.delete(page['gid'])
+            return page
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1 && !self.not_found
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
+        end
+      end
+
+      # Dequeue pages and execute the parsers associated to them on parallel.
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawing #{self.worker_count} workers"
+        end
+
+        # start dequeuer
+        self.ensure_dequeuer_thread
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+
+        nil
+      end
+    end
+  end
+end
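Outside the CLI, the class can be driven directly. A minimal sketch, assuming a current job id of 123 and a config file whose parsers section maps page types to script files (both values hypothetical):

require 'datahen'

batch = Datahen::Scraper::BatchParser.new 123, './config.yaml',
  worker_count: 3,
  dequeue_interval: 3,
  dequeue_scale: 2
batch.exec_parse true, false  # save outputs; don't keep existing ones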
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -6,7 +6,7 @@ module Datahen
       # Max allowed page size when query outputs (see #find_outputs).
       MAX_FIND_OUTPUTS_PER_PAGE = 500
 
-      attr_accessor :filename, :gid, :job_id
+      attr_accessor :filename, :page, :gid, :job_id
 
       include Datahen::Plugin::ContextExposer
 
@@ -15,6 +15,9 @@ module Datahen
       end
 
       def init_page()
+        # skip whenever a page is provided
+        return self.page unless self.page.nil?
+
         if job_id
           puts "getting Job Page"
           init_job_page
data/lib/datahen/scraper/parser.rb
CHANGED
@@ -18,6 +18,22 @@ module Datahen
         end
       end
 
+      def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+        extname = File.extname(filename)
+        case extname
+        when '.rb'
+          executor = RubyParserExecutor.new(
+            filename: filename,
+            page: page,
+            job_id: job_id,
+            vars: vars,
+            keep_outputs: keep_outputs
+          )
+          executor.exec_parser(save)
+        else
+          puts "Unable to find a parser executor for file type \"#{extname}\""
+        end
+      end
 
     end
   end
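Unlike exec_parser_page, which receives a GID and must fetch the page, this variant takes an already-dequeued page hash, letting the batch parser skip one API round trip per page. A sketch of the call (parser path and page hash are hypothetical and abbreviated):

page = { 'gid' => 'www.example.com-abc123', 'page_type' => 'listings' }

puts Datahen::Scraper::Parser.exec_parser_by_page(
  './parsers/listings.rb', # hypothetical parser script
  page,
  123,   # job_id
  true,  # save
  nil,   # vars (BatchParser passes nil here)
  false  # keep_outputs
)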
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -12,7 +12,8 @@ module Datahen
 
       def initialize(options={})
         @filename = options.fetch(:filename) { raise "Filename is required"}
-        @
+        @page = options.fetch(:page) { nil }
+        @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
         @job_id = options.fetch(:job_id)
         @page_vars = options.fetch(:vars) { {} }
         @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
       end
 
       def init_page_vars(page)
+        return self.page unless self.page.nil?
+
         if !@page_vars.nil? && !@page_vars.empty?
           page['vars'] = @page_vars
         end
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.24
+  version: 0.15.11
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-05-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -45,9 +45,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
-      - !ruby/object:Gem::Version
-        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.10'
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="