datahen 0.15.10 → 0.16.2
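
This diff covers three related changes: a configurable `dequeue-timeout` for the batch parser's parse-dequeue API calls (previously hardcoded to 30 seconds), a `max_page_size`/`max_size` fetch limit threaded through the CLI and client, and a reworked batch parser whose dequeuer thread is now health-checked and restarted when it stalls.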

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
-  data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
+  metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
+  data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
 SHA512:
-  metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
-  data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
+  metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
+  data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
@@ -72,6 +72,7 @@ module Datahen
     option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
     option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
     option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+    option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
     def batch_exec_parse(scraper_name, config_file)
       if options[:job]
         job_id = options[:job]
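
The new `--dequeue-timeout` flag flows into the batch parser's dequeue API calls. The command body that does the wiring is outside this hunk, so the glue below is a hypothetical sketch only, mapping Thor's `options` hash onto the `Datahen::Scraper::BatchParser` constructor documented later in this diff (the `:workers` flag name is assumed, not shown above):

```ruby
# Hypothetical glue, for illustration only; not part of this diff.
batch_parser = Datahen::Scraper::BatchParser.new(
  job_id,
  config_file,
  worker_count: options[:workers],               # assumed flag name
  max_garbage: options[:"max-garbage"],
  dequeue_interval: options[:"dequeue-interval"],
  dequeue_scale: options[:"dequeue-scale"],
  dequeue_timeout: options[:"dequeue-timeout"]   # new in 0.16.x
)
batch_parser.exec_parse(true) # save outputs
```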
@@ -32,6 +32,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
     option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
@@ -104,6 +104,7 @@ module Datahen
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
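
All of the `max_page_size`/`max_size` declarations in this release map to the same server-side limit: a value greater than 0 caps the fetched page body, 0 means any size. A minimal client-level sketch, assuming `ScraperJob#update(scraper_name, opts)` as the CLI hunks suggest and that API authentication is already configured:

```ruby
require 'datahen'

# Cap fetched pages for the scraper's current job at ~1 MB;
# a value of 0 would mean "any size". Signature assumed from the CLI hunks.
client = Datahen::Client::ScraperJob.new
client.update('my-scraper', max_page_size: 1_000_000)
```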
@@ -45,6 +45,7 @@ module Datahen
     option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
     option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
     option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+    option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def add(scraper_name, url)
       begin
         options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
     option :page_type, :aliases => :t, desc: 'Set page type'
     option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+    option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,7 @@ module Datahen
       body[:page_type] = opts[:page_type] if opts[:page_type]
       body[:priority] = opts[:priority] if opts[:priority]
       body[:vars] = opts[:vars] if opts[:vars]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
       body[:ua_type] = opts[:ua_type] if opts[:ua_type]
       body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
       body[:cookie] = opts[:cookie] if opts[:cookie]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
         page_types: page_types,
         parse_fetching_failed: parse_fetching_failed
       }
-      params = @options.merge({body: body.to_json, timeout: 30})
+      params = @options.merge(opts).merge({body: body.to_json})
       self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
     end

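This is the behavioral pivot of the release: the parse-dequeue request timeout is no longer hardcoded to 30 seconds; whatever the caller passes in `opts` is merged into the HTTParty request options. The batch parser, later in this diff, exercises it like this:

```ruby
# Mirrors the batch parser hunk below: the per-request HTTParty timeout
# now travels through opts instead of being fixed at 30 seconds.
response = client.dequeue job_id,
  dequeue_size,
  page_types,
  parse_fetching_failed,
  timeout: dequeue_timeout # e.g. 60 seconds
```

Note the merge order: `@options.merge(opts).merge({body: body.to_json})` lets callers override client defaults, but never the request body itself.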
@@ -28,6 +28,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
       body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end
@@ -49,6 +50,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
       body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+      body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}", params)
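
Note the guard difference between create and update: create sends the field only when it is truthy, while update uses `has_key?` so an explicitly supplied falsy value still reaches the API. A small self-contained illustration of why that matters:

```ruby
opts = { multiple_jobs: false }
body = {}

# A truthiness guard silently drops the caller's explicit false:
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
body # => {}

# The has_key? guard used in update sends it through:
body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?(:multiple_jobs)
body # => {:multiple_jobs=>false}
```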
@@ -11,6 +11,7 @@ module Datahen
       body[:standard_worker_count] = opts[:workers] if opts[:workers]
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,7 @@ module Datahen
       body[:page_type] = opts[:page_type] if opts[:page_type]
       body[:priority] = opts[:priority] if opts[:priority]
       body[:vars] = opts[:vars] if opts[:vars]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
       body[:ua_type] = opts[:ua_type] if opts[:ua_type]
       body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
       body[:cookie] = opts[:cookie] if opts[:cookie]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -8,22 +8,107 @@ module Datahen
     NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
     NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

-    attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
-    attr_reader :job_id, :worker_count, :pages, :max_garbage
-    attr_reader :dequeue_interval, :dequeue_scale
-    attr_reader :page_types, :parsers
-    attr_reader :config, :client, :garbage_mutex
+    # Configuration file path.
+    # @return [String] config file path
+    attr_accessor :config_file
+    # Garbage collector request counter.
+    # @return [Integer] garbage collector counter
+    attr_accessor :garbage_count
+    # Last printed message, useful to prevent duplicated log messages.
+    # @return [String] last printed message
+    attr_accessor :last_message
+    # Second dequeue counter used to prevent false negative warning messages.
+    # @return [Integer] second dequeue counter
+    attr_accessor :second_dequeue_count
+    # Dequeue API request timeout in seconds.
+    # @return [Integer] dequeue API request timeout in seconds
+    attr_accessor :dequeue_timeout
+    # Job id to be executed.
+    # @return [Integer] job id
+    attr_reader :job_id
+    # Parallel worker quantity.
+    # @return [Integer] parallel worker quantity
+    attr_reader :worker_count
+    # Loaded pages array.
+    # @return [Concurrent::Array<Hash>] loaded pages as an array
+    attr_reader :pages
+    # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+    # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+    attr_reader :loaded_pages
+    # Max garbage collector requests before actually executing the garbage
+    # collector.
+    # @return [Integer] max garbage request quantity before actually executing
+    #   it
+    attr_reader :max_garbage
+    # Dequeue interval in seconds.
+    # @return [Integer] dequeue interval in seconds
+    attr_reader :dequeue_interval
+    # Dequeue scale used to calculate the ideal dequeue size.
+    # @return [Numeric] dequeue scale
+    attr_reader :dequeue_scale
+    # Known page types extracted from the config file.
+    # @return [Array<String>] known page types
+    attr_reader :page_types
+    # Known parsers extracted from the config file.
+    # @return [Concurrent::Hash<String, String>] known parsers
+    attr_reader :parsers
+    # Current config file loaded.
+    # @return [Hash] current loaded configuration
+    attr_reader :config
+    # Datahen job pages client used for API pages dequeuing.
+    # @return [Datahen::Client::JobPage] datahen job pages API client
+    attr_reader :client
+    # Garbage collector mutex used to synchronize garbage collector requests.
+    # @return [Mutex] garbage collector mutex
+    attr_reader :garbage_mutex
+    # Current dequeuer thread.
+    # @return [Thread] dequeuer thread
+    attr_reader :dequeuer_thread
+    # Dequeuer mutex used to synchronize page dequeuing.
+    # @return [Mutex] dequeuer mutex
+    attr_reader :dequeue_mutex
+    # Dequeuer last run unix timestamp.
+    # @return [Integer] dequeuer last run unix timestamp
+    attr_reader :dequeuer_still_alive
+    # Indicates whenever the wait time is because there are no more pages.
+    # @return [Boolean] `true` when wait time is due to no more pages,
+    #   else `false`
+    attr_reader :not_found

+    # Wait a specific amount of seconds.
+    # @param [Integer] time_in_seconds Seconds to wait.
     def self.wait time_in_seconds
       Kernel.sleep time_in_seconds
     end

+    # Get a unix timestamp.
+    # @return [Integer] unix timestamp
+    def self.timestamp
+      Time.new.utc.to_i
+    end
+
+    # Initialize a batch parser object.
+    # @param [Integer] job_id Job id.
+    # @param [String] config_file Config file path.
+    # @param [Hash] opts ({}) Configuration options
+    # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+    # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+    #   collector can be requested before actually executing.
+    # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+    #   between page dequeuing.
+    # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+    #   calculate page dequeue size.
+    # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+    #   timeout in seconds.
+    # @option opts [Hash] :client_options ({}) Datahen client gem additional
+    #   options (see Datahen::Client::Base#initialize method).
     def initialize(job_id, config_file, opts = {})
       opts = {
         worker_count: 1,
         max_garbage: 5,
         dequeue_interval: 3,
         dequeue_scale: 2,
+        dequeue_timeout: 30,
         client_options: {}
       }.merge opts
@@ -32,16 +117,23 @@ module Datahen
       @dequeue_interval = opts[:dequeue_interval]
       @dequeue_scale = opts[:dequeue_scale]
       @max_garbage = opts[:max_garbage]
-      @pages = Concurrent::Hash.new
+      @pages = Concurrent::Array.new
+      @loaded_pages = Concurrent::Hash.new
       @garbage_mutex = Mutex.new
+      @dequeue_mutex = Mutex.new
+      @not_found = false
+      self.dequeue_timeout = opts[:dequeue_timeout]
       self.second_dequeue_count = 0
       self.garbage_count = 0
       self.config_file = config_file
       self.load_config

       @client = Datahen::Client::JobPage.new(opts[:client_options])
+      nil
     end

+    # Execute garbage collector after it is requested as many times as
+    # described by #max_garbage.
     def recollect_garbage
       self.garbage_mutex.synchronize do
         self.garbage_count += 1
@@ -51,8 +143,10 @@ module Datahen
           self.garbage_count = 0
         end
       end
+      nil
     end

+    # Loads the config file into a Hash.
     def load_config
       # build page type to script file map
       @page_types = []
@@ -64,20 +158,40 @@ module Datahen
         self.parsers[v['page_type']] = v['file']
       end
       self.recollect_garbage
+      nil
     end

+    # Print the message regardless of it being the same as the last message.
+    # @param [String] message Message to display.
     def repeat_puts message
       puts message
-      self.last_message = ''
+      self.last_message = message
+      nil
     end

+    # Print the message only when it is different from the last recorded
+    # message.
+    # @param [String] message Message to display.
     def no_repeat_puts message
       return if message == self.last_message
       puts message
       self.last_message = message
+      nil
+    end
+
+    # Refresh dequeuer's still alive timestamp
+    def dequeuer_is_alive!
+      self.dequeue_mutex.synchronize do
+        @dequeuer_still_alive = self.class.timestamp
+      end
+      nil
     end

+    # Load new pages by dequeuing from the API.
+    # @return [Integer] amount of pages loaded
     def load_pages
+      self.dequeuer_is_alive!
+
       # calculate dequeue size
       max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
       current_size = self.pages.length
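
The buffer size scales with the worker pool: with the default `dequeue_scale` of 2 and, say, 4 workers, at most 8 pages are kept loaded. The actual request-size computation sits outside this hunk, so the subtraction below is an assumption for illustration:

```ruby
worker_count  = 4
dequeue_scale = 2
max_dequeue_size = (worker_count * dequeue_scale).ceil # => 8
current_size     = 3 # pages already buffered locally
# Presumably the dequeuer requests the difference (not shown in this diff):
dequeue_size = max_dequeue_size - current_size # => 5
```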
@@ -93,13 +207,16 @@ module Datahen
         response = client.dequeue self.job_id,
           dequeue_size,
           self.page_types,
-          config['parse_fetching_failed']
+          config['parse_fetching_failed'],
+          timeout: self.dequeue_timeout
       rescue Net::ReadTimeout, Net::OpenTimeout => e
-        self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+        self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+        self.dequeuer_is_alive!
         return 0
       rescue => e
         raise e
       end
+      self.dequeuer_is_alive!

       # ensure a valid response or try again
       if response.nil? || response.response.code.to_i != 200
@@ -112,17 +229,20 @@ module Datahen
       count = 0
       (JSON.parse(response.body) || []).each do |page|
         count += 1
-        next if self.pages.has_key? page['gid']
-        self.pages[page['gid']] = page
+        next if self.loaded_pages.has_key? page['gid']
+        self.pages << (self.loaded_pages[page['gid']] = page)
       end
       response = nil
+      self.dequeuer_is_alive!

       # recollect garbage to free some memory before parsing
       if count > 0
+        @not_found = false
         self.recollect_garbage
         self.repeat_puts "Found #{count} page(s) to parse"
         self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
       else
+        @not_found = true
         self.no_repeat_puts NOT_FOUND_MSG
       end

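The 0.16.x dequeuer splits page storage in two: a `Concurrent::Array` preserves FIFO order for the workers, while a `Concurrent::Hash` keyed by GID filters duplicates across dequeue rounds. The pattern in isolation (requires the concurrent-ruby gem):

```ruby
require 'concurrent'

pages        = Concurrent::Array.new # FIFO queue consumed by workers
loaded_pages = Concurrent::Hash.new  # GID -> page, membership check

page = { 'gid' => 'example-gid-123' }
unless loaded_pages.has_key? page['gid']
  # Hash assignment returns the assigned value, so the page lands in both.
  pages << (loaded_pages[page['gid']] = page)
end
```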
@@ -130,6 +250,44 @@ module Datahen
       count
     end

+    # Ensures that the dequeuer thread exists and is running.
+    # @return [Boolean] `true` if thread was alive, or `false` if had to
+    #   create a new thread
+    def ensure_dequeuer_thread
+      self.dequeue_mutex.synchronize do
+        # check if dequeuer thread is alive and healthy
+        if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+          still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+          return true if self.class.timestamp < still_alive_timeout
+
+          # kill dequeuer thread
+          self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+          self.dequeuer_thread.kill
+          @dequeuer_thread = nil
+          self.recollect_garbage
+          self.no_repeat_puts "Dequeuer thread was killed!"
+        end
+
+        # dequeuing on parallel (the ride never ends :D)
+        @dequeuer_thread = Thread.new do
+          while true
+            begin
+              self.load_pages
+              self.class.wait self.dequeue_interval
+            rescue => e
+              puts [e.message] + e.backtrace rescue 'error'
+            end
+          end
+          puts "Error: dequeuer died! D:"
+        end
+        self.repeat_puts "Dequeuer thread was started!"
+      end
+      false
+    end
+
+    # Dequeue one page from the previously loaded pages, and waits until there
+    # are new pages whenever there are no loaded pages.
+    # @return [Hash] dequeued page
     def dequeue_pages
       # collect garbage
       self.recollect_garbage
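
`ensure_dequeuer_thread` declares the dequeuer unhealthy once its heartbeat is older than twice the worst-case loop iteration (one timed-out API call plus one sleep interval). With the defaults (`dequeue_timeout: 30`, `dequeue_interval: 3`) that window is 66 seconds; a worked example of the check:

```ruby
dequeue_timeout  = 30 # seconds an API call may block
dequeue_interval = 3  # sleep between dequeue rounds

last_heartbeat = Time.new.utc.to_i - 70 # pretend the last run was 70s ago
still_alive_timeout = (dequeue_timeout + dequeue_interval) * 2 + last_heartbeat

# 70s exceeds the 66s window, so the thread gets killed and restarted:
Time.new.utc.to_i < still_alive_timeout # => false
```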
@@ -137,24 +295,29 @@ module Datahen
       # return page if there are loaded pages
       is_waiting = false
       while true do
-        key_value = self.pages.shift
-        unless key_value.nil?
+        page = self.pages.shift
+        unless page.nil?
           puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
-          return key_value[1]
+          loaded_pages.delete(page['gid'])
+          return page
         end

         # be more verbose on worker waiting
         unless is_waiting
           is_waiting = true
           puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
-          if self.second_dequeue_count > 1
+          if self.second_dequeue_count > 1 && !self.not_found
             puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
           end
         end
         self.class.wait 1
+
+        # ensure the dequeuer thread is alive and healthy
+        self.ensure_dequeuer_thread
       end
     end

+    # Dequeue pages and execute the parsers associated to them on parallel.
     def exec_parse save = false, keep_outputs = false
       if self.worker_count < 1
         self.no_repeat_puts NO_WORKERS_MSG
@@ -163,24 +326,15 @@ module Datahen
         self.no_repeat_puts "Spawning #{self.worker_count} workers"
       end

-      # dequeuing on parallel (the ride never ends :D)
-      Thread.new do
-        while true
-          begin
-            self.load_pages
-            self.class.wait self.dequeue_interval
-          rescue => e
-            puts [e.message] + e.backtrace rescue 'error'
-          end
-        end
-        puts "Error: dequeuer died! D:"
-      end
+      # start dequeuer
+      self.ensure_dequeuer_thread

       # process the pages
       dequeue = lambda{ self.dequeue_pages }
       Parallel.each(dequeue, in_threads: (worker_count)) do |page|
         parser_file = self.parsers[page['page_type']]
         begin
+          self.repeat_puts("Parsing page with GID #{page['gid']}")
           puts Datahen::Scraper::Parser.exec_parser_by_page(
             parser_file,
             page,
@@ -189,6 +343,7 @@ module Datahen
             nil,
             keep_outputs
           )
+          self.repeat_puts("Finish parsing page with GID #{page['gid']}")
         rescue Parallel::Kill => e
           puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
         rescue Parallel::Break => e
@@ -197,6 +352,8 @@ module Datahen
           puts [e.message] + e.backtrace rescue 'error'
         end
       end
+
+      nil
     end
   end
 end
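
`exec_parse` feeds `Parallel.each` a lambda instead of a collection: the parallel gem treats a lambda source as a producer, calling it once per idle worker until it returns `Parallel::Stop`. Here `dequeue_pages` blocks rather than stopping, so the worker pool runs indefinitely. The producer pattern reduced to a runnable sketch:

```ruby
require 'parallel'

queue = (1..5).to_a
# Parallel pulls the next item by calling the lambda (under its own mutex);
# returning Parallel::Stop ends the iteration.
producer = lambda { queue.shift || Parallel::Stop }

Parallel.each(producer, in_threads: 2) do |item|
  puts "worker #{Parallel.worker_number} got item #{item}"
end
```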
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.15.10"
+  VERSION = "0.16.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.15.10
+  version: 0.16.2
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-05-28 00:00:00.000000000 Z
+date: 2021-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor