datahen 0.15.9 → 0.16.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
- data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
+ metadata.gz: 8059eb96654de953b7a2d5df2f0a3d4f643d4ef3bba654b758e46e10c972bfc9
+ data.tar.gz: 2b57638a05fb85ed896398ffbd96b6af9342af9159a7a1b9964ef8c336a90d26
  SHA512:
- metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
- data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
+ metadata.gz: 5e81e909432bade42c7e3cbe1c38efbf835dd36cab92099ff11dc5b6f7677fdc3ebb7bb13dccedb29311dd1f0a55c8e8dc64252b94fc546b488566d39e330fb0
+ data.tar.gz: 07f44f03c2ae9636e04972fec7eeade9c0ded9a6cdcd4064515a960ba3af1021d77d8226fa79bd87626f27694c69825eda16b64723acc7d1d440a0953d554e3c
@@ -72,6 +72,7 @@ module Datahen
  option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
  option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
  option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+ option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
  def batch_exec_parse(scraper_name, config_file)
  if options[:job]
  job_id = options[:job]
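For reference, a minimal sketch (class and command are hypothetical; only the option line mirrors the diff) of how a Thor numeric flag such as `--dequeue-timeout` surfaces at runtime through `options`:

```ruby
require "thor"

# Hypothetical CLI; only the option line mirrors the actual change above.
class DemoCLI < Thor
  desc "batch_exec_parse SCRAPER_NAME", "demo of the new flag"
  option :"dequeue-timeout", type: :numeric, default: 30,
         desc: "Dequeue pages API request timeout"
  def batch_exec_parse(scraper_name)
    # Thor exposes parsed flags through the `options` hash
    timeout = options[:"dequeue-timeout"]
    puts "#{scraper_name}: dequeue requests time out after #{timeout}s"
  end
end

DemoCLI.start(["batch_exec_parse", "my-scraper", "--dequeue-timeout", "45"])
```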
@@ -32,6 +32,7 @@ module Datahen
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def create(scraper_name, git_repository)
  # puts "options #{options}"
  client = Client::Scraper.new(options)
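The same `max_page_size` option is repeated for the scraper update, job start, and job update commands in the hunks below, and `max_size` plays the equivalent role for individual pages. A hedged illustration of the documented semantics (the guard is illustrative; enforcement happens server-side and is not part of this diff):

```ruby
# Illustrative guard only: per the option text, a value greater than 0 acts
# as the size limit, while 0 (the default) accepts any page size.
def within_max_page_size?(page_bytes, max_page_size = 0)
  return true if max_page_size.zero? # 0 means any size
  page_bytes <= max_page_size
end

puts within_max_page_size?(50_000)          # => true  (no limit)
puts within_max_page_size?(50_000, 10_000)  # => false (over the limit)
```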
@@ -57,6 +58,7 @@ module Datahen
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def update(scraper_name)
  client = Client::Scraper.new(options)
  puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def start(scraper_name)
  client = Client::ScraperJob.new(options)
  puts "Starting a scrape job..."
@@ -104,6 +104,7 @@ module Datahen
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def update(scraper_name)
  if options[:job]
  client = Client::Job.new(options)
@@ -45,6 +45,7 @@ module Datahen
  option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirects. Default: false'
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def add(scraper_name, url)
  begin
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
  option :page_type, :aliases => :t, desc: 'Set page type'
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as the limit; 0 means any size. Default: 0'
  def update(scraper_name, gid)
  begin
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,7 @@ module Datahen
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
  body[:profile] = opts[:profile] if opts[:profile]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})

  self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,7 @@ module Datahen
  body[:page_type] = opts[:page_type] if opts[:page_type]
  body[:priority] = opts[:priority] if opts[:priority]
  body[:vars] = opts[:vars] if opts[:vars]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
  body[:cookie] = opts[:cookie] if opts[:cookie]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
  page_types: page_types,
  parse_fetching_failed: parse_fetching_failed
  }
- params = @options.merge({body: body.to_json})
+ params = @options.merge(opts).merge({body: body.to_json})
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
  end
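This one-line change is what lets the new dequeue timeout reach the HTTP layer: HTTParty accepts a per-request `:timeout`, so merging `opts` into the request params forwards it instead of silently dropping it. A minimal sketch with an illustrative client (base URI and path are placeholders):

```ruby
require "httparty"

class SketchClient
  include HTTParty
  base_uri "https://example.com" # placeholder

  def initialize
    @options = { headers: { "Content-Type" => "application/json" } }
  end

  def dequeue(job_id, opts = {})
    body = { limit: 10 }
    # merging opts forwards caller extras such as timeout: to HTTParty,
    # which raises Net::ReadTimeout / Net::OpenTimeout when exceeded
    params = @options.merge(opts).merge({ body: body.to_json })
    self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
  end
end

SketchClient.new.dequeue(123, timeout: 30)
```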
 
@@ -28,6 +28,7 @@ module Datahen
  body[:profile] = opts[:profile] if opts[:profile]
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})
  self.class.post("/scrapers", params)
  end
@@ -49,6 +50,7 @@ module Datahen
  body[:profile] = opts[:profile] if opts[:profile]
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+ body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
  params = @options.merge({body: body.to_json})

  self.class.put("/scrapers/#{scraper_name}", params)
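Note the `has_key?` guards on the update path, in contrast to the truthiness checks used on create: they let callers explicitly send falsy values such as `false` or `0` to the API. A quick runnable illustration:

```ruby
opts = { multiple_jobs: false, max_page_size: 0 }
body = {}

# truthiness guard: falsy values are silently dropped
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]

# has_key? guard: falsy values still reach the request body
body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)

p body # => {:max_page_size=>0}
```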
@@ -11,6 +11,7 @@ module Datahen
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  if opts[:vars]
  if opts[:vars].is_a?(Array)
  body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
  body[:profile] = opts[:profile] if opts[:profile]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})

  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,7 @@ module Datahen
  body[:page_type] = opts[:page_type] if opts[:page_type]
  body[:priority] = opts[:priority] if opts[:priority]
  body[:vars] = opts[:vars] if opts[:vars]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
  body[:cookie] = opts[:cookie] if opts[:cookie]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -5,25 +5,110 @@ module Datahen
  module Scraper
  class BatchParser
  NOT_FOUND_MSG = "No more pages to parse found"
- NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
- NO_WORKERS_MSG = "Warning: There are no parser workers"
+ NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+ NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

- attr_accessor :config_file, :garbage_count, :last_message
- attr_reader :job_id, :worker_count, :pages, :max_garbage
- attr_reader :dequeue_interval, :dequeue_scale
- attr_reader :page_types, :parsers
- attr_reader :config, :client, :garbage_mutex
+ # Configuration file path.
+ # @return [String] config file path
+ attr_accessor :config_file
+ # Garbage collector request counter.
+ # @return [Integer] garbage collector counter
+ attr_accessor :garbage_count
+ # Last printed message, useful to prevent duplicated log messages.
+ # @return [String] last printed message
+ attr_accessor :last_message
+ # Second dequeue counter used to prevent false negative warning messages.
+ # @return [Integer] second dequeue counter
+ attr_accessor :second_dequeue_count
+ # Dequeue API request timeout in seconds.
+ # @return [Integer] dequeue API request timeout in seconds
+ attr_accessor :dequeue_timeout
+ # Job id to be executed.
+ # @return [Integer] job id
+ attr_reader :job_id
+ # Parallel worker quantity.
+ # @return [Integer] parallel worker quantity
+ attr_reader :worker_count
+ # Loaded pages array.
+ # @return [Concurrent::Array<Hash>] loaded pages as an array
+ attr_reader :pages
+ # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+ # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+ attr_reader :loaded_pages
+ # Max garbage collector requests before actually executing the garbage
+ #   collector.
+ # @return [Integer] max garbage request quantity before actually executing
+ #   it
+ attr_reader :max_garbage
+ # Dequeue interval in seconds.
+ # @return [Integer] dequeue interval in seconds
+ attr_reader :dequeue_interval
+ # Dequeue scale used to calculate the ideal dequeue size.
+ # @return [Numeric] dequeue scale
+ attr_reader :dequeue_scale
+ # Known page types extracted from the config file.
+ # @return [Array<String>] known page types
+ attr_reader :page_types
+ # Known parsers extracted from the config file.
+ # @return [Concurrent::Hash<String, String>] known parsers
+ attr_reader :parsers
+ # Currently loaded config file.
+ # @return [Hash] currently loaded configuration
+ attr_reader :config
+ # Datahen job pages client used for API page dequeuing.
+ # @return [Datahen::Client::JobPage] datahen job pages API client
+ attr_reader :client
+ # Garbage collector mutex used to synchronize garbage collector requests.
+ # @return [Mutex] garbage collector mutex
+ attr_reader :garbage_mutex
+ # Current dequeuer thread.
+ # @return [Thread] dequeuer thread
+ attr_reader :dequeuer_thread
+ # Dequeuer mutex used to synchronize page dequeuing.
+ # @return [Mutex] dequeuer mutex
+ attr_reader :dequeue_mutex
+ # Dequeuer last run unix timestamp.
+ # @return [Integer] dequeuer last run unix timestamp
+ attr_reader :dequeuer_still_alive
+ # Indicates whether the current wait is because there are no more pages.
+ # @return [Boolean] `true` when wait time is due to no more pages,
+ #   else `false`
+ attr_reader :not_found

+ # Wait a specific number of seconds.
+ # @param [Integer] time_in_seconds Seconds to wait.
  def self.wait time_in_seconds
  Kernel.sleep time_in_seconds
  end

+ # Get a unix timestamp.
+ # @return [Integer] unix timestamp
+ def self.timestamp
+ Time.new.utc.to_i
+ end
+
+ # Initialize a batch parser object.
+ # @param [Integer] job_id Job id.
+ # @param [String] config_file Config file path.
+ # @param [Hash] opts ({}) Configuration options.
+ # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+ # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+ #   collector can be requested before actually executing.
+ # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+ #   between page dequeuing.
+ # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+ #   calculate page dequeue size.
+ # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+ #   timeout in seconds.
+ # @option opts [Hash] :client_options ({}) Datahen client gem additional
+ #   options (see Datahen::Client::Base#initialize method).
  def initialize(job_id, config_file, opts = {})
  opts = {
  worker_count: 1,
  max_garbage: 5,
  dequeue_interval: 3,
  dequeue_scale: 2,
+ dequeue_timeout: 30,
  client_options: {}
  }.merge opts
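A hedged construction example (job id, path, and values are placeholders; assumes the gem is installed and the config file exists) showing the new `:dequeue_timeout` flowing through the defaults merge:

```ruby
require "datahen"

parser = Datahen::Scraper::BatchParser.new(
  1234,               # job_id (placeholder)
  "./config.yaml",    # config_file (placeholder)
  worker_count: 4,
  dequeue_scale: 3,
  dequeue_timeout: 60 # overrides the new 30-second default
)
```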
 
@@ -32,23 +117,36 @@ module Datahen
  @dequeue_interval = opts[:dequeue_interval]
  @dequeue_scale = opts[:dequeue_scale]
  @max_garbage = opts[:max_garbage]
- @pages = Concurrent::Hash.new
+ @pages = Concurrent::Array.new
+ @loaded_pages = Concurrent::Hash.new
  @garbage_mutex = Mutex.new
+ @dequeue_mutex = Mutex.new
+ @not_found = false
+ self.dequeue_timeout = opts[:dequeue_timeout]
+ self.second_dequeue_count = 0
  self.garbage_count = 0
  self.config_file = config_file
  self.load_config

  @client = Datahen::Client::JobPage.new(opts[:client_options])
+ nil
  end

+ # Execute garbage collector after it is requested as many times as
+ #   described by #max_garbage.
  def recollect_garbage
  self.garbage_mutex.synchronize do
- puts "Recollect garbage"
- GC.start
- self.garbage_count = 0
+ self.garbage_count += 1
+ if self.garbage_count > self.max_garbage
+ puts "Recollect garbage"
+ GC.start
+ self.garbage_count = 0
+ end
  end
+ nil
  end

+ # Loads the config file into a Hash.
  def load_config
  # build page type to script file map
  @page_types = []
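The GC throttling that previously lived in `dequeue_pages` now sits inside `recollect_garbage` itself, behind the mutex. A standalone sketch of the pattern:

```ruby
# Standalone sketch: each call increments a counter under a mutex, and
# GC.start only runs once the counter exceeds the configured maximum.
class GcThrottle
  def initialize(max_garbage)
    @max_garbage = max_garbage
    @garbage_count = 0
    @garbage_mutex = Mutex.new
  end

  def recollect_garbage
    @garbage_mutex.synchronize do
      @garbage_count += 1
      if @garbage_count > @max_garbage
        GC.start
        @garbage_count = 0
      end
    end
    nil
  end
end

throttle = GcThrottle.new(5)
100.times { throttle.recollect_garbage } # GC.start fires 16 times, not 100
```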
@@ -60,20 +158,40 @@ module Datahen
  self.parsers[v['page_type']] = v['file']
  end
  self.recollect_garbage
+ nil
  end

+ # Print the message regardless of it being the same as the last message.
+ # @param [String] message Message to display.
  def repeat_puts message
  puts message
- self.last_message = ''
+ self.last_message = message
+ nil
  end

+ # Print the message only when it is different from the last recorded
+ #   message.
+ # @param [String] message Message to display.
  def no_repeat_puts message
  return if message == self.last_message
  puts message
  self.last_message = message
+ nil
  end

+ # Refresh the dequeuer's still-alive timestamp.
+ def dequeuer_is_alive!
+ self.dequeue_mutex.synchronize do
+ @dequeuer_still_alive = self.class.timestamp
+ end
+ nil
+ end
+
+ # Load new pages by dequeuing from the API.
+ # @return [Integer] amount of pages loaded
  def load_pages
+ self.dequeuer_is_alive!
+
  # calculate dequeue size
  max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
  current_size = self.pages.length
@@ -84,10 +202,21 @@ module Datahen
  dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size

  # reserve and get pages to parse
- response = client.dequeue self.job_id,
- dequeue_size,
- self.page_types,
- config['parse_fetching_failed']
+ response = nil
+ begin
+ response = client.dequeue self.job_id,
+ dequeue_size,
+ self.page_types,
+ config['parse_fetching_failed'],
+ timeout: self.dequeue_timeout
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
+ self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+ self.dequeuer_is_alive!
+ return 0
+ rescue => e
+ raise e
+ end
+ self.dequeuer_is_alive!

  # ensure a valid response or try again
  if response.nil? || response.response.code.to_i != 200
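The rescue clauses matter because Ruby's HTTP stack raises `Net::ReadTimeout`/`Net::OpenTimeout` when the new timeout is exceeded; catching them keeps the dequeuer loop alive so it can retry on the next interval. A reduced sketch of that control flow (the API call is stubbed):

```ruby
require "net/http" # defines Net::ReadTimeout / Net::OpenTimeout

# Stub standing in for the dequeue API call.
def slow_dequeue
  raise Net::ReadTimeout
end

def load_pages_sketch
  begin
    response = slow_dequeue
  rescue Net::ReadTimeout, Net::OpenTimeout
    puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
    return 0 # give up this round; the dequeuer loop retries next interval
  end
  # ... validate and parse response ...
end

load_pages_sketch # => 0, after printing the warning
```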
@@ -100,16 +229,20 @@ module Datahen
  count = 0
  (JSON.parse(response.body) || []).each do |page|
  count += 1
- next if self.pages.has_key? page['gid']
- self.pages[page['gid']] = page
+ next if self.loaded_pages.has_key? page['gid']
+ self.pages << (self.loaded_pages[page['gid']] = page)
  end
  response = nil
+ self.dequeuer_is_alive!

  # recollect garbage to free some memory before parsing
  if count > 0
+ @not_found = false
  self.recollect_garbage
  self.repeat_puts "Found #{count} page(s) to parse"
+ self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
  else
+ @not_found = true
  self.no_repeat_puts NOT_FOUND_MSG
  end
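Pages now live in two structures at once: a `Concurrent::Array` that preserves FIFO order for the workers, and a `Concurrent::Hash` keyed by gid that gives O(1) duplicate detection across dequeue rounds. A runnable illustration of the pairing:

```ruby
require "concurrent"

pages = Concurrent::Array.new        # work queue, preserves order
loaded_pages = Concurrent::Hash.new  # gid index, blocks duplicates

incoming = [{ "gid" => "a" }, { "gid" => "b" }, { "gid" => "a" }]
incoming.each do |page|
  next if loaded_pages.has_key? page["gid"]
  pages << (loaded_pages[page["gid"]] = page) # assignment returns the page
end

p pages.map { |page| page["gid"] } # => ["a", "b"]
```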
 
@@ -117,21 +250,74 @@ module Datahen
  count
  end

+ # Ensures that the dequeuer thread exists and is running.
+ # @return [Boolean] `true` if the thread was alive, or `false` if a new
+ #   thread had to be created
+ def ensure_dequeuer_thread
+ self.dequeue_mutex.synchronize do
+ # check if dequeuer thread is alive and healthy
+ if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+ still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+ return true if self.class.timestamp < still_alive_timeout
+
+ # kill dequeuer thread
+ self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+ self.dequeuer_thread.kill
+ @dequeuer_thread = nil
+ self.recollect_garbage
+ self.no_repeat_puts "Dequeuer thread was killed!"
+ end
+
+ # dequeuing in parallel (the ride never ends :D)
+ @dequeuer_thread = Thread.new do
+ while true
+ begin
+ self.load_pages
+ self.class.wait self.dequeue_interval
+ rescue => e
+ puts [e.message] + e.backtrace rescue 'error'
+ end
+ end
+ puts "Error: dequeuer died! D:"
+ end
+ self.repeat_puts "Dequeuer thread was started!"
+ end
+ false
+ end
+
+ # Dequeue one page from the previously loaded pages, waiting for new
+ #   pages whenever there are none loaded.
+ # @return [Hash] dequeued page
  def dequeue_pages
  # collect garbage
- self.garbage_count += 1
- if self.garbage_count > self.max_garbage
- self.recollect_garbage
- end
+ self.recollect_garbage

  # return page if there are loaded pages
+ is_waiting = false
  while true do
- key_value = self.pages.shift
- return key_value[1] unless key_value.nil?
+ page = self.pages.shift
+ unless page.nil?
+ puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+ loaded_pages.delete(page['gid'])
+ return page
+ end
+
+ # be more verbose on worker waiting
+ unless is_waiting
+ is_waiting = true
+ puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+ if self.second_dequeue_count > 1 && !self.not_found
+ puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+ end
+ end
  self.class.wait 1
+
+ # ensure the dequeuer thread is alive and healthy
+ self.ensure_dequeuer_thread
  end
  end

+ # Dequeue pages and execute the parsers associated to them in parallel.
  def exec_parse save = false, keep_outputs = false
  if self.worker_count < 1
  self.no_repeat_puts NO_WORKERS_MSG
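`ensure_dequeuer_thread` is a watchdog: the dequeuer refreshes `dequeuer_still_alive` on every pass, and any worker that finds the timestamp stale kills and restarts the thread. A self-contained sketch of the pattern (threshold and work loop are illustrative):

```ruby
class WatchdogSketch
  def initialize(stale_after_seconds)
    @stale_after = stale_after_seconds
    @mutex = Mutex.new
    @last_beat = Time.now.utc.to_i
    start_worker
  end

  def heartbeat!
    @mutex.synchronize { @last_beat = Time.now.utc.to_i }
  end

  # Mirrors ensure_dequeuer_thread's contract: true when the worker is
  # alive and fresh, false after killing and restarting a stale one.
  def ensure_worker
    @mutex.synchronize do
      fresh = !@thread.nil? && @thread.alive? &&
              Time.now.utc.to_i < @last_beat + @stale_after
      return true if fresh
      @thread.kill unless @thread.nil?
      start_worker
      false
    end
  end

  private

  def start_worker
    @thread = Thread.new do
      loop do
        heartbeat! # real work (page dequeuing) would happen here
        sleep 1
      end
    end
  end
end
```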
@@ -140,20 +326,10 @@ module Datahen
  self.no_repeat_puts "Spawning #{self.worker_count} workers"
  end

- # dequeuing on parallel
- keep_dequeue = Concurrent::Array.new
- keep_dequeue[0] = true
- Thread.new do
- while keep_dequeue[0]
- begin
- self.load_pages
- self.class.wait self.dequeue_interval
- rescue => e
- puts [e.message] + e.backtrace rescue 'error'
- end
- end
- end
+ # start dequeuer
+ self.ensure_dequeuer_thread

+ # process the pages
  dequeue = lambda{ self.dequeue_pages }
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
  parser_file = self.parsers[page['page_type']]
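`Parallel.each` accepts a lambda as a producer: each worker thread calls it to fetch its next item, which is how `dequeue_pages` feeds pages to the pool indefinitely. A small runnable demonstration (the queue stands in for the page buffer, and `Parallel::Stop` ends the run, which the batch parser never does):

```ruby
require "parallel"

queue = Queue.new
10.times { |i| queue << "page-#{i}" }
queue << Parallel::Stop # signals shutdown once the queue drains

# Each worker repeatedly invokes the lambda to get its next item.
Parallel.each(lambda { queue.pop }, in_threads: 3) do |page|
  puts "[Worker #{Parallel.worker_number}] parsed #{page}"
end
```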
@@ -166,11 +342,16 @@ module Datahen
  nil,
  keep_outputs
  )
+ rescue Parallel::Kill => e
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+ rescue Parallel::Break => e
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
  rescue => e
  puts [e.message] + e.backtrace rescue 'error'
  end
  end
- keep_dequeue[0] = false
+
+ nil
  end
  end
  end
@@ -1,3 +1,3 @@
  module Datahen
- VERSION = "0.15.9"
+ VERSION = "0.16.1"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: datahen
  version: !ruby/object:Gem::Version
- version: 0.15.9
+ version: 0.16.1
  platform: ruby
  authors:
  - Parama Danoesubroto
- autorequire:
+ autorequire:
  bindir: exe
  cert_chain: []
- date: 2021-05-27 00:00:00.000000000 Z
+ date: 2021-07-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: thor
@@ -276,7 +276,7 @@ metadata:
  allowed_push_host: https://rubygems.org
  homepage_uri: https://datahen.com
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -292,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubygems_version: 3.0.3
- signing_key:
+ signing_key:
  specification_version: 4
  summary: DataHen toolbelt for developers
  test_files: []