datahen 0.15.9 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
- data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
+ metadata.gz: 8059eb96654de953b7a2d5df2f0a3d4f643d4ef3bba654b758e46e10c972bfc9
+ data.tar.gz: 2b57638a05fb85ed896398ffbd96b6af9342af9159a7a1b9964ef8c336a90d26
  SHA512:
- metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
- data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
+ metadata.gz: 5e81e909432bade42c7e3cbe1c38efbf835dd36cab92099ff11dc5b6f7677fdc3ebb7bb13dccedb29311dd1f0a55c8e8dc64252b94fc546b488566d39e330fb0
+ data.tar.gz: 07f44f03c2ae9636e04972fec7eeade9c0ded9a6cdcd4064515a960ba3af1021d77d8226fa79bd87626f27694c69825eda16b64723acc7d1d440a0953d554e3c
@@ -72,6 +72,7 @@ module Datahen
  option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
  option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
  option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+ option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
  def batch_exec_parse(scraper_name, config_file)
  if options[:job]
  job_id = options[:job]
@@ -32,6 +32,7 @@ module Datahen
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def create(scraper_name, git_repository)
  # puts "options #{options}"
  client = Client::Scraper.new(options)
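
The new max_page_size option is forwarded to the API through Client::Scraper (see the client changes further down). A minimal sketch of the equivalent programmatic call; this assumes the client's create method accepts the option hash alongside the CLI arguments, and the scraper name, repository, and size value are placeholders (the diff does not state the unit):

    require 'datahen'

    client = Datahen::Client::Scraper.new
    # A value > 0 caps the fetched page size; 0 (the default) means any size.
    client.create('my-scraper', 'https://github.com/me/my-scraper.git',
                  max_page_size: 100_000)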
@@ -57,6 +58,7 @@ module Datahen
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
  option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def update(scraper_name)
  client = Client::Scraper.new(options)
  puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
  option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def start(scraper_name)
  client = Client::ScraperJob.new(options)
  puts "Starting a scrape job..."
@@ -104,6 +104,7 @@ module Datahen
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+ option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def update(scraper_name)
  if options[:job]
  client = Client::Job.new(options)
@@ -45,6 +45,7 @@ module Datahen
  option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def add(scraper_name, url)
  begin
  options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
  option :page_type, :aliases => :t, desc: 'Set page type'
  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+ option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
  def update(scraper_name, gid)
  begin
  options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,7 @@ module Datahen
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
  body[:profile] = opts[:profile] if opts[:profile]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})

  self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,7 @@ module Datahen
  body[:page_type] = opts[:page_type] if opts[:page_type]
  body[:priority] = opts[:priority] if opts[:priority]
  body[:vars] = opts[:vars] if opts[:vars]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
  body[:cookie] = opts[:cookie] if opts[:cookie]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
  page_types: page_types,
  parse_fetching_failed: parse_fetching_failed
  }
- params = @options.merge({body: body.to_json})
+ params = @options.merge(opts).merge({body: body.to_json})
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
  end

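
Merging opts into the request params lets callers pass per-request HTTParty options such as timeout down to the dequeue call; this is how BatchParser's new dequeue_timeout reaches the API (see load_pages further down). A sketch with placeholder arguments, matching the call made in load_pages:

    client = Datahen::Client::JobPage.new
    # job id, dequeue size, page types, parse_fetching_failed, then HTTParty opts
    client.dequeue(1234, 10, ['products', 'details'], false, timeout: 30)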
@@ -28,6 +28,7 @@ module Datahen
  body[:profile] = opts[:profile] if opts[:profile]
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
  body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})
  self.class.post("/scrapers", params)
  end
@@ -49,6 +50,7 @@ module Datahen
  body[:profile] = opts[:profile] if opts[:profile]
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
  body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+ body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
  params = @options.merge({body: body.to_json})

  self.class.put("/scrapers/#{scraper_name}", params)
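
Note the two guard styles used above: create copies max_page_size only when the value is truthy, while update checks has_key?. In Ruby only nil and false are falsy, so an explicit 0 passes both guards, but only the has_key? form forwards false (which matters for multiple_jobs: false). A standalone sketch of the difference:

    opts = { multiple_jobs: false, max_page_size: 0 }

    body = {}
    body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]  # dropped: false is falsy
    body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]  # kept: 0 is truthy

    body2 = {}
    body2[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?(:multiple_jobs)  # kept
    body2[:max_page_size] = opts[:max_page_size] if opts.has_key?(:max_page_size)  # kept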
@@ -11,6 +11,7 @@ module Datahen
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  if opts[:vars]
  if opts[:vars].is_a?(Array)
  body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
  body[:profile] = opts[:profile] if opts[:profile]
+ body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
  params = @options.merge({body: body.to_json})

  self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,7 @@ module Datahen
  body[:page_type] = opts[:page_type] if opts[:page_type]
  body[:priority] = opts[:priority] if opts[:priority]
  body[:vars] = opts[:vars] if opts[:vars]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
  body[:ua_type] = opts[:ua_type] if opts[:ua_type]
  body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
  body[:cookie] = opts[:cookie] if opts[:cookie]
+ body[:max_size] = opts[:max_size] if opts[:max_size]

  params = @options.merge({body: body.to_json})

@@ -5,25 +5,110 @@ module Datahen
  module Scraper
  class BatchParser
  NOT_FOUND_MSG = "No more pages to parse found"
- NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
- NO_WORKERS_MSG = "Warning: There are no parser workers"
+ NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+ NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

- attr_accessor :config_file, :garbage_count, :last_message
- attr_reader :job_id, :worker_count, :pages, :max_garbage
- attr_reader :dequeue_interval, :dequeue_scale
- attr_reader :page_types, :parsers
- attr_reader :config, :client, :garbage_mutex
+ # Configuration file path.
+ # @return [String] config file path
+ attr_accessor :config_file
+ # Garbage collector request counter.
+ # @return [Integer] garbage collector counter
+ attr_accessor :garbage_count
+ # Last printed message, useful to prevent duplicated log messages.
+ # @return [String] last printed message
+ attr_accessor :last_message
+ # Second dequeue counter used to prevent false negative warning messages.
+ # @return [Integer] second dequeue counter
+ attr_accessor :second_dequeue_count
+ # Dequeue API request timeout in seconds.
+ # @return [Integer] dequeue API request timeout in seconds
+ attr_accessor :dequeue_timeout
+ # Job id to be executed.
+ # @return [Integer] job id
+ attr_reader :job_id
+ # Parallel worker quantity.
+ # @return [Integer] parallel worker quantity
+ attr_reader :worker_count
+ # Loaded pages array.
+ # @return [Concurrent::Array<Hash>] loaded pages as an array
+ attr_reader :pages
+ # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+ # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+ attr_reader :loaded_pages
+ # Max garbage collector requests before actually executing the garbage
+ # collector.
+ # @return [Integer] max garbage request quantity before actually executing
+ # it
+ attr_reader :max_garbage
+ # Dequeue interval in seconds.
+ # @return [Integer] dequeue interval in seconds
+ attr_reader :dequeue_interval
+ # Dequeue scale used to calculate the ideal dequeue size.
+ # @return [Numeric] dequeue scale
+ attr_reader :dequeue_scale
+ # Known page types extracted from the config file.
+ # @return [Array<String>] known page types
+ attr_reader :page_types
+ # Known parsers extracted from the config file.
+ # @return [Concurrent::Hash<String, String>] known parsers
+ attr_reader :parsers
+ # Current config file loaded.
+ # @return [Hash] current loaded configuration
+ attr_reader :config
+ # Datahen job pages client used for API pages dequeuing.
+ # @return [Datahen::Client::JobPage] datahen job pages API client
+ attr_reader :client
+ # Garbage collector mutex used to synchronize garbage collector requests.
+ # @return [Mutex] garbage collector mutex
+ attr_reader :garbage_mutex
+ # Current dequeuer thread.
+ # @return [Thread] dequeuer thread
+ attr_reader :dequeuer_thread
+ # Dequeuer mutex used to synchronize page dequeuing.
+ # @return [Mutex] dequeuer mutex
+ attr_reader :dequeue_mutex
+ # Dequeuer last run unix timestamp.
+ # @return [Integer] dequeuer last run unix timestamp
+ attr_reader :dequeuer_still_alive
+ # Indicates whether the wait time is because there are no more pages.
+ # @return [Boolean] `true` when wait time is due to no more pages,
+ # else `false`
+ attr_reader :not_found

+ # Wait a specific amount of seconds.
+ # @param [Integer] time_in_seconds Seconds to wait.
  def self.wait time_in_seconds
  Kernel.sleep time_in_seconds
  end

+ # Get a unix timestamp.
+ # @return [Integer] unix timestamp
+ def self.timestamp
+ Time.new.utc.to_i
+ end
+
+ # Initialize a batch parser object.
+ # @param [Integer] job_id Job id.
+ # @param [String] config_file Config file path.
+ # @param [Hash] opts ({}) Configuration options.
+ # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+ # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+ # collector can be requested before actually executing.
+ # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+ # between page dequeuing.
+ # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+ # calculate page dequeue size.
+ # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+ # timeout in seconds.
+ # @option opts [Hash] :client_options ({}) Datahen client gem additional
+ # options (see Datahen::Client::Base#initialize method).
  def initialize(job_id, config_file, opts = {})
  opts = {
  worker_count: 1,
  max_garbage: 5,
  dequeue_interval: 3,
  dequeue_scale: 2,
+ dequeue_timeout: 30,
  client_options: {}
  }.merge opts

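
Putting the new options together, a minimal usage sketch of the batch parser; the job id, config path, and option values are placeholders, and the defaults shown match the hash above:

    require 'datahen'

    parser = Datahen::Scraper::BatchParser.new(
      1234,                # job_id (placeholder)
      './config.yaml',     # config_file (placeholder)
      worker_count: 4,
      dequeue_interval: 3,
      dequeue_scale: 2,
      dequeue_timeout: 60, # new in this release; overrides the 30-second default
      client_options: {}
    )
    parser.exec_parse(true) # save = true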
@@ -32,23 +117,36 @@ module Datahen
  @dequeue_interval = opts[:dequeue_interval]
  @dequeue_scale = opts[:dequeue_scale]
  @max_garbage = opts[:max_garbage]
- @pages = Concurrent::Hash.new
+ @pages = Concurrent::Array.new
+ @loaded_pages = Concurrent::Hash.new
  @garbage_mutex = Mutex.new
+ @dequeue_mutex = Mutex.new
+ @not_found = false
+ self.dequeue_timeout = opts[:dequeue_timeout]
+ self.second_dequeue_count = 0
  self.garbage_count = 0
  self.config_file = config_file
  self.load_config

  @client = Datahen::Client::JobPage.new(opts[:client_options])
+ nil
  end

+ # Execute garbage collector after it is requested as many times as
+ # described by #max_garbage.
  def recollect_garbage
  self.garbage_mutex.synchronize do
- puts "Recollect garbage"
- GC.start
- self.garbage_count = 0
+ self.garbage_count += 1
+ if self.garbage_count > self.max_garbage
+ puts "Recollect garbage"
+ GC.start
+ self.garbage_count = 0
+ end
  end
+ nil
  end

+ # Loads the config file into a Hash.
  def load_config
  # build page type to script file map
  @page_types = []
@@ -60,20 +158,40 @@ module Datahen
  self.parsers[v['page_type']] = v['file']
  end
  self.recollect_garbage
+ nil
  end

+ # Print the message regardless of it being the same as the last message.
+ # @param [String] message Message to display.
  def repeat_puts message
  puts message
- self.last_message = ''
+ self.last_message = message
+ nil
  end

+ # Print the message only when it is different from the last recorded
+ # message.
+ # @param [String] message Message to display.
  def no_repeat_puts message
  return if message == self.last_message
  puts message
  self.last_message = message
+ nil
  end

+ # Refresh dequeuer's still alive timestamp
+ def dequeuer_is_alive!
+ self.dequeue_mutex.synchronize do
+ @dequeuer_still_alive = self.class.timestamp
+ end
+ nil
+ end
+
+ # Load new pages by dequeuing from the API.
+ # @return [Integer] amount of pages loaded
  def load_pages
+ self.dequeuer_is_alive!
+
  # calculate dequeue size
  max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
  current_size = self.pages.length
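
A quick illustration of the two logging helpers above, with parser as a BatchParser instance: no_repeat_puts suppresses a message identical to the last one recorded, while repeat_puts always prints and (as of this version) records its message as the new dedup state:

    parser.no_repeat_puts "No more pages to parse found" # prints
    parser.no_repeat_puts "No more pages to parse found" # suppressed (same as last)
    parser.repeat_puts    "Found 3 page(s) to parse"     # always prints, recorded
    parser.no_repeat_puts "No more pages to parse found" # prints (differs from last)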
@@ -84,10 +202,21 @@ module Datahen
  dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size

  # reserve and get pages to parse
- response = client.dequeue self.job_id,
- dequeue_size,
- self.page_types,
- config['parse_fetching_failed']
+ response = nil
+ begin
+ response = client.dequeue self.job_id,
+ dequeue_size,
+ self.page_types,
+ config['parse_fetching_failed'],
+ timeout: self.dequeue_timeout
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
+ self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+ self.dequeuer_is_alive!
+ return 0
+ rescue => e
+ raise e
+ end
+ self.dequeuer_is_alive!

  # ensure a valid response or try again
  if response.nil? || response.response.code.to_i != 200
@@ -100,16 +229,20 @@ module Datahen
  count = 0
  (JSON.parse(response.body) || []).each do |page|
  count += 1
- next if self.pages.has_key? page['gid']
- self.pages[page['gid']] = page
+ next if self.loaded_pages.has_key? page['gid']
+ self.pages << (self.loaded_pages[page['gid']] = page)
  end
  response = nil
+ self.dequeuer_is_alive!

  # recollect garbage to free some memory before parsing
  if count > 0
+ @not_found = false
  self.recollect_garbage
  self.repeat_puts "Found #{count} page(s) to parse"
+ self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
  else
+ @not_found = true
  self.no_repeat_puts NOT_FOUND_MSG
  end

@@ -117,21 +250,74 @@ module Datahen
  count
  end

+ # Ensures that the dequeuer thread exists and is running.
+ # @return [Boolean] `true` if the thread was alive, or `false` if a new
+ # thread had to be created
+ def ensure_dequeuer_thread
+ self.dequeue_mutex.synchronize do
+ # check if dequeuer thread is alive and healthy
+ if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+ still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+ return true if self.class.timestamp < still_alive_timeout
+
+ # kill dequeuer thread
+ self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+ self.dequeuer_thread.kill
+ @dequeuer_thread = nil
+ self.recollect_garbage
+ self.no_repeat_puts "Dequeuer thread was killed!"
+ end
+
+ # dequeuing in parallel (the ride never ends :D)
+ @dequeuer_thread = Thread.new do
+ while true
+ begin
+ self.load_pages
+ self.class.wait self.dequeue_interval
+ rescue => e
+ puts [e.message] + e.backtrace rescue 'error'
+ end
+ end
+ puts "Error: dequeuer died! D:"
+ end
+ self.repeat_puts "Dequeuer thread was started!"
+ end
+ false
+ end
+
+ # Dequeue one page from the previously loaded pages, waiting until new
+ # pages arrive whenever none are loaded.
+ # @return [Hash] dequeued page
  def dequeue_pages
  # collect garbage
- self.garbage_count += 1
- if self.garbage_count > self.max_garbage
- self.recollect_garbage
- end
+ self.recollect_garbage

  # return page if there are loaded pages
+ is_waiting = false
  while true do
- key_value = self.pages.shift
- return key_value[1] unless key_value.nil?
+ page = self.pages.shift
+ unless page.nil?
+ puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+ loaded_pages.delete(page['gid'])
+ return page
+ end
+
+ # be more verbose on worker waiting
+ unless is_waiting
+ is_waiting = true
+ puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+ if self.second_dequeue_count > 1 && !self.not_found
+ puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+ end
+ end
  self.class.wait 1
+
+ # ensure the dequeuer thread is alive and healthy
+ self.ensure_dequeuer_thread
  end
  end

+ # Dequeue pages and execute the parsers associated to them in parallel.
  def exec_parse save = false, keep_outputs = false
  if self.worker_count < 1
  self.no_repeat_puts NO_WORKERS_MSG
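
With the defaults above (dequeue_timeout: 30, dequeue_interval: 3), the health check in ensure_dequeuer_thread treats the dequeuer as stale once its heartbeat is older than (30 + 3) * 2 = 66 seconds, then kills and restarts the thread. The arithmetic as a worked example:

    dequeue_timeout  = 30 # seconds, default from this diff
    dequeue_interval = 3  # seconds, default from this diff
    staleness_window = (dequeue_timeout + dequeue_interval) * 2
    # => 66; a heartbeat older than 66 seconds triggers a kill and restart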
@@ -140,20 +326,10 @@ module Datahen
  self.no_repeat_puts "Spawning #{self.worker_count} workers"
  end

- # dequeuing on parallel
- keep_dequeue = Concurrent::Array.new
- keep_dequeue[0] = true
- Thread.new do
- while keep_dequeue[0]
- begin
- self.load_pages
- self.class.wait self.dequeue_interval
- rescue => e
- puts [e.message] + e.backtrace rescue 'error'
- end
- end
- end
+ # start dequeuer
+ self.ensure_dequeuer_thread

+ # process the pages
  dequeue = lambda{ self.dequeue_pages }
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
  parser_file = self.parsers[page['page_type']]
@@ -166,11 +342,16 @@ module Datahen
  nil,
  keep_outputs
  )
+ rescue Parallel::Kill => e
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+ rescue Parallel::Break => e
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
  rescue => e
  puts [e.message] + e.backtrace rescue 'error'
  end
  end
- keep_dequeue[0] = false
+
+ nil
  end
  end
  end
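
The dequeue lambda works because the parallel gem accepts a lambda as an item producer: each worker calls it for its next item until it returns Parallel::Stop, and since dequeue_pages blocks until a page is available, the workers never stop on their own (hence the Parallel::Kill and Parallel::Break rescues above). A standalone sketch of the producer pattern, independent of Datahen:

    require 'parallel'

    queue = Queue.new
    5.times { |i| queue << i }
    queue.close # once drained, #pop returns nil

    # The lambda is called per item until it returns Parallel::Stop.
    Parallel.each(-> { queue.pop || Parallel::Stop }, in_threads: 2) do |item|
      puts "processed #{item}"
    end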
@@ -1,3 +1,3 @@
  module Datahen
- VERSION = "0.15.9"
+ VERSION = "0.16.1"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: datahen
  version: !ruby/object:Gem::Version
- version: 0.15.9
+ version: 0.16.1
  platform: ruby
  authors:
  - Parama Danoesubroto
- autorequire:
+ autorequire: 
  bindir: exe
  cert_chain: []
- date: 2021-05-27 00:00:00.000000000 Z
+ date: 2021-07-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: thor
@@ -276,7 +276,7 @@ metadata:
  allowed_push_host: https://rubygems.org
  homepage_uri: https://datahen.com
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
- post_install_message:
+ post_install_message: 
  rdoc_options: []
  require_paths:
  - lib
@@ -292,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubygems_version: 3.0.3
- signing_key:
+ signing_key: 
  specification_version: 4
  summary: DataHen toolbelt for developers
  test_files: []