datahen 0.15.10 → 0.16.2

This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
-  data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
+  metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
+  data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
 SHA512:
-  metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
-  data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
+  metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
+  data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
@@ -72,6 +72,7 @@ module Datahen
   option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
   option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
   option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+  option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
   def batch_exec_parse(scraper_name, config_file)
     if options[:job]
       job_id = options[:job]
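
The new `dequeue-timeout` flag is declared with the same Thor option pattern as its siblings above and surfaces inside the command through Thor's `options` hash. A minimal, self-contained sketch of that pattern; the CLI class and command name here are hypothetical, only the option line mirrors the diff:

```ruby
require "thor"

class ParserCLI < Thor # hypothetical class, for illustration only
  desc "batch SCRAPER CONFIG", "Batch-parse pages for a scraper"
  option :"dequeue-timeout", type: :numeric, default: 30,
         desc: "Dequeue pages API request timeout"
  def batch(scraper_name, config_file)
    # Thor exposes declared flags through the `options` hash.
    puts "Dequeue timeout: #{options[:"dequeue-timeout"]}s"
  end
end

ParserCLI.start(ARGV)
```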
@@ -32,6 +32,7 @@ module Datahen
   option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
   option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
   option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+  option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def create(scraper_name, git_repository)
     # puts "options #{options}"
     client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
   option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
   option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
   option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+  option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def update(scraper_name)
     client = Client::Scraper.new(options)
     puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
   option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
   option :proxy_type, desc: 'Set the Proxy type. Default: standard'
   option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}]'
+  option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def start(scraper_name)
     client = Client::ScraperJob.new(options)
     puts "Starting a scrape job..."
@@ -104,6 +104,7 @@ module Datahen
   option :proxy_type, desc: 'Set the Proxy type. Default: standard'
   option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
   option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+  option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def update(scraper_name)
     if options[:job]
       client = Client::Job.new(options)
@@ -45,6 +45,7 @@ module Datahen
   option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
   option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
   option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+  option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def add(scraper_name, url)
     begin
       options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
   option :page_type, :aliases => :t, desc: 'Set page type'
   option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
   option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+  option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to set it as limit, 0 means any size. Default: 0'
   def update(scraper_name, gid)
     begin
       options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,7 @@ module Datahen
   body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
   body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
   body[:profile] = opts[:profile] if opts[:profile]
+  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
   params = @options.merge({body: body.to_json})

   self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,7 @@ module Datahen
   body[:page_type] = opts[:page_type] if opts[:page_type]
   body[:priority] = opts[:priority] if opts[:priority]
   body[:vars] = opts[:vars] if opts[:vars]
+  body[:max_size] = opts[:max_size] if opts[:max_size]

   params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
   body[:ua_type] = opts[:ua_type] if opts[:ua_type]
   body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
   body[:cookie] = opts[:cookie] if opts[:cookie]
+  body[:max_size] = opts[:max_size] if opts[:max_size]

   params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
     page_types: page_types,
     parse_fetching_failed: parse_fetching_failed
   }
-  params = @options.merge({body: body.to_json, timeout: 30})
+  params = @options.merge(opts).merge({body: body.to_json})
   self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
 end
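
This is the client-side half of the new dequeue timeout: the hardcoded `timeout: 30` is gone, and whatever the caller passes in `opts` is merged into the request params, where HTTParty honors `timeout:` for the HTTP call. The batch parser (later in this diff) exercises it like this; the values here are illustrative:

```ruby
# Illustrative values; the real call appears in load_pages further down.
client = Datahen::Client::JobPage.new
client.dequeue(
  123,          # job_id
  2,            # dequeue size
  ["products"], # page types
  false,        # parse_fetching_failed
  timeout: 60   # merged into the HTTParty params (was fixed at 30s)
)
```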
 
@@ -28,6 +28,7 @@ module Datahen
   body[:profile] = opts[:profile] if opts[:profile]
   body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
   body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
   params = @options.merge({body: body.to_json})
   self.class.post("/scrapers", params)
 end
@@ -49,6 +50,7 @@ module Datahen
   body[:profile] = opts[:profile] if opts[:profile]
   body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
   body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+  body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
   params = @options.merge({body: body.to_json})

   self.class.put("/scrapers/#{scraper_name}", params)
@@ -11,6 +11,7 @@ module Datahen
   body[:standard_worker_count] = opts[:workers] if opts[:workers]
   body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
   body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
   if opts[:vars]
     if opts[:vars].is_a?(Array)
       body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
   body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
   body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
   body[:profile] = opts[:profile] if opts[:profile]
+  body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
   params = @options.merge({body: body.to_json})

   self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -15,6 +15,7 @@ module Datahen
   body[:page_type] = opts[:page_type] if opts[:page_type]
   body[:priority] = opts[:priority] if opts[:priority]
   body[:vars] = opts[:vars] if opts[:vars]
+  body[:max_size] = opts[:max_size] if opts[:max_size]

   params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
   body[:ua_type] = opts[:ua_type] if opts[:ua_type]
   body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
   body[:cookie] = opts[:cookie] if opts[:cookie]
+  body[:max_size] = opts[:max_size] if opts[:max_size]

   params = @options.merge({body: body.to_json})

@@ -8,22 +8,107 @@ module Datahen
   NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
   NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

-  attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
-  attr_reader :job_id, :worker_count, :pages, :max_garbage
-  attr_reader :dequeue_interval, :dequeue_scale
-  attr_reader :page_types, :parsers
-  attr_reader :config, :client, :garbage_mutex
+  # Configuration file path.
+  # @return [String] config file path
+  attr_accessor :config_file
+  # Garbage collector request counter.
+  # @return [Integer] garbage collector counter
+  attr_accessor :garbage_count
+  # Last printed message, useful to prevent duplicated log messages.
+  # @return [String] last printed message
+  attr_accessor :last_message
+  # Second dequeue counter used to prevent false negative warning messages.
+  # @return [Integer] second dequeue counter
+  attr_accessor :second_dequeue_count
+  # Dequeue API request timeout in seconds.
+  # @return [Integer] dequeue API request timeout in seconds
+  attr_accessor :dequeue_timeout
+  # Job id to be executed.
+  # @return [Integer] job id
+  attr_reader :job_id
+  # Parallel worker quantity.
+  # @return [Integer] parallel worker quantity
+  attr_reader :worker_count
+  # Loaded pages array.
+  # @return [Concurrent::Array<Hash>] loaded pages as an array
+  attr_reader :pages
+  # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+  # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+  attr_reader :loaded_pages
+  # Max garbage collector requests before actually executing the garbage
+  #   collector.
+  # @return [Integer] max garbage request quantity before actually executing it
+  attr_reader :max_garbage
+  # Dequeue interval in seconds.
+  # @return [Integer] dequeue interval in seconds
+  attr_reader :dequeue_interval
+  # Dequeue scale used to calculate the ideal dequeue size.
+  # @return [Numeric] dequeue scale
+  attr_reader :dequeue_scale
+  # Known page types extracted from the config file.
+  # @return [Array<String>] known page types
+  attr_reader :page_types
+  # Known parsers extracted from the config file.
+  # @return [Concurrent::Hash<String, String>] known parsers
+  attr_reader :parsers
+  # Current config file loaded.
+  # @return [Hash] current loaded configuration
+  attr_reader :config
+  # Datahen job pages client used for API pages dequeuing.
+  # @return [Datahen::Client::JobPage] datahen job pages API client
+  attr_reader :client
+  # Garbage collector mutex used to synchronize garbage collector requests.
+  # @return [Mutex] garbage collector mutex
+  attr_reader :garbage_mutex
+  # Current dequeuer thread.
+  # @return [Thread] dequeuer thread
+  attr_reader :dequeuer_thread
+  # Dequeuer mutex used to synchronize page dequeuing.
+  # @return [Mutex] dequeuer mutex
+  attr_reader :dequeue_mutex
+  # Dequeuer last run unix timestamp.
+  # @return [Integer] dequeuer last run unix timestamp
+  attr_reader :dequeuer_still_alive
+  # Indicates whether the wait time is because there are no more pages.
+  # @return [Boolean] `true` when wait time is due to no more pages,
+  #   else `false`
+  attr_reader :not_found

+  # Wait a specific amount of seconds.
+  # @param [Integer] time_in_seconds Seconds to wait.
   def self.wait time_in_seconds
     Kernel.sleep time_in_seconds
   end

+  # Get a unix timestamp.
+  # @return [Integer] unix timestamp
+  def self.timestamp
+    Time.new.utc.to_i
+  end
+
+  # Initialize a batch parser object.
+  # @param [Integer] job_id Job id.
+  # @param [String] config_file Config file path.
+  # @param [Hash] opts ({}) Configuration options
+  # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+  # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+  #   collector can be requested before actually executing.
+  # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+  #   between page dequeuing.
+  # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+  #   calculate page dequeue size.
+  # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+  #   timeout in seconds.
+  # @option opts [Hash] :client_options ({}) Datahen client gem additional
+  #   options (see Datahen::Client::Base#initialize method).
   def initialize(job_id, config_file, opts = {})
     opts = {
       worker_count: 1,
       max_garbage: 5,
       dequeue_interval: 3,
       dequeue_scale: 2,
+      dequeue_timeout: 30,
       client_options: {}
     }.merge opts
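
Taken together, the new attribute and defaults give the batch parser the constructor surface below. A usage sketch; the class name `Datahen::Scraper::BatchParser` is an assumption based on the gem layout (the hunks only show `module Datahen`):

```ruby
require "datahen"

parser = Datahen::Scraper::BatchParser.new( # class name assumed
  123,              # job_id
  "./config.yaml",  # config_file
  worker_count: 4,
  dequeue_interval: 3,
  dequeue_scale: 2,
  dequeue_timeout: 60, # new in 0.16.x: dequeue API request timeout (seconds)
  client_options: {}
)
parser.exec_parse(true) # save = true; signature shown later in this diff
```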
 
@@ -32,16 +117,23 @@ module Datahen
32
117
  @dequeue_interval = opts[:dequeue_interval]
33
118
  @dequeue_scale = opts[:dequeue_scale]
34
119
  @max_garbage = opts[:max_garbage]
35
- @pages = Concurrent::Hash.new
120
+ @pages = Concurrent::Array.new
121
+ @loaded_pages = Concurrent::Hash.new
36
122
  @garbage_mutex = Mutex.new
123
+ @dequeue_mutex = Mutex.new
124
+ @not_found = false
125
+ self.dequeue_timeout = opts[:dequeue_timeout]
37
126
  self.second_dequeue_count = 0
38
127
  self.garbage_count = 0
39
128
  self.config_file = config_file
40
129
  self.load_config
41
130
 
42
131
  @client = Datahen::Client::JobPage.new(opts[:client_options])
132
+ nil
43
133
  end
44
134
 
135
+ # Execute garbage collector after it is requested as many times as
136
+ # described by #max_garbage.
45
137
  def recollect_garbage
46
138
  self.garbage_mutex.synchronize do
47
139
  self.garbage_count += 1
@@ -51,8 +143,10 @@ module Datahen
         self.garbage_count = 0
       end
     end
+    nil
   end

+  # Loads the config file into a Hash.
   def load_config
     # build page type to script file map
     @page_types = []
@@ -64,20 +158,40 @@ module Datahen
       self.parsers[v['page_type']] = v['file']
     end
     self.recollect_garbage
+    nil
   end

+  # Print the message regardless of it being the same as the last message.
+  # @param [String] message Message to display.
   def repeat_puts message
     puts message
-    self.last_message = ''
+    self.last_message = message
+    nil
   end

+  # Print the message only when it is different from the last recorded
+  #   message.
+  # @param [String] message Message to display.
   def no_repeat_puts message
     return if message == self.last_message
     puts message
     self.last_message = message
+    nil
+  end
+
+  # Refresh the dequeuer's still-alive timestamp.
+  def dequeuer_is_alive!
+    self.dequeue_mutex.synchronize do
+      @dequeuer_still_alive = self.class.timestamp
+    end
+    nil
   end

+  # Load new pages by dequeuing from the API.
+  # @return [Integer] amount of pages loaded
   def load_pages
+    self.dequeuer_is_alive!
+
     # calculate dequeue size
     max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
     current_size = self.pages.length
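
For sizing, the ideal batch is `worker_count * dequeue_scale`, rounded up, and pages already sitting in the local buffer count against it. A worked example using the hunk's own variables (the final subtraction is inferred; the hunk ends before `dequeue_size` is computed):

```ruby
worker_count  = 4
dequeue_scale = 2
max_dequeue_size = (worker_count * dequeue_scale).ceil # => 8

current_size = 3                               # pages already buffered
dequeue_size = max_dequeue_size - current_size # => 5 (inferred step)
```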
@@ -93,13 +207,16 @@ module Datahen
       response = client.dequeue self.job_id,
         dequeue_size,
         self.page_types,
-        config['parse_fetching_failed']
+        config['parse_fetching_failed'],
+        timeout: self.dequeue_timeout
     rescue Net::ReadTimeout, Net::OpenTimeout => e
-      self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+      self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+      self.dequeuer_is_alive!
       return 0
     rescue => e
       raise e
     end
+    self.dequeuer_is_alive!

     # ensure a valid response or try again
     if response.nil? || response.response.code.to_i != 200
@@ -112,17 +229,20 @@ module Datahen
     count = 0
     (JSON.parse(response.body) || []).each do |page|
       count += 1
-      next if self.pages.has_key? page['gid']
-      self.pages[page['gid']] = page
+      next if self.loaded_pages.has_key? page['gid']
+      self.pages << (self.loaded_pages[page['gid']] = page)
     end
     response = nil
+    self.dequeuer_is_alive!

     # recollect garbage to free some memory before parsing
     if count > 0
+      @not_found = false
       self.recollect_garbage
       self.repeat_puts "Found #{count} page(s) to parse"
       self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
     else
+      @not_found = true
       self.no_repeat_puts NOT_FOUND_MSG
     end
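
The storage refactor splits one structure into two: a `Concurrent::Array` preserves FIFO order for the workers' `shift`, while a `Concurrent::Hash` keyed by GID gives O(1) duplicate detection. The combined line works because a Ruby hash assignment evaluates to the assigned value. The pattern in isolation:

```ruby
require "concurrent"

pages        = Concurrent::Array.new # FIFO queue for workers
loaded_pages = Concurrent::Hash.new  # GID index for dedup

[{ "gid" => "a" }, { "gid" => "a" }, { "gid" => "b" }].each do |page|
  next if loaded_pages.has_key? page["gid"]
  pages << (loaded_pages[page["gid"]] = page) # assignment returns `page`
end

pages.length # => 2 -- the duplicate "a" was skipped
```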
 
@@ -130,6 +250,44 @@ module Datahen
     count
   end

+  # Ensures that the dequeuer thread exists and is running.
+  # @return [Boolean] `true` if the thread was alive, or `false` if a new
+  #   thread had to be created
+  def ensure_dequeuer_thread
+    self.dequeue_mutex.synchronize do
+      # check if dequeuer thread is alive and healthy
+      if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+        still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+        return true if self.class.timestamp < still_alive_timeout
+
+        # kill dequeuer thread
+        self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+        self.dequeuer_thread.kill
+        @dequeuer_thread = nil
+        self.recollect_garbage
+        self.no_repeat_puts "Dequeuer thread was killed!"
+      end
+
+      # dequeuing in parallel (the ride never ends :D)
+      @dequeuer_thread = Thread.new do
+        while true
+          begin
+            self.load_pages
+            self.class.wait self.dequeue_interval
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+        puts "Error: dequeuer died! D:"
+      end
+      self.repeat_puts "Dequeuer thread was started!"
+    end
+    false
+  end
+
+  # Dequeue one page from the previously loaded pages, and wait until there
+  #   are new pages whenever there are no loaded pages.
+  # @return [Hash] dequeued page
   def dequeue_pages
     # collect garbage
     self.recollect_garbage
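
The health check treats the dequeuer as stale once no heartbeat (`dequeuer_is_alive!`) has been recorded for `(dequeue_timeout + dequeue_interval) * 2` seconds, i.e. two full worst-case dequeue cycles. With the defaults:

```ruby
dequeue_timeout  = 30 # default, seconds
dequeue_interval = 3  # default, seconds

grace = (dequeue_timeout + dequeue_interval) * 2 # => 66 seconds
# After 66s without a heartbeat, ensure_dequeuer_thread kills the
# dequeuer thread and spawns a fresh one.
```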
@@ -137,24 +295,29 @@ module Datahen
     # return a page if there are loaded pages
     is_waiting = false
     while true do
-      key_value = self.pages.shift
-      unless key_value.nil?
+      page = self.pages.shift
+      unless page.nil?
         puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
-        return key_value[1]
+        loaded_pages.delete(page['gid'])
+        return page
       end

       # be more verbose on worker waiting
       unless is_waiting
         is_waiting = true
         puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
-        if self.second_dequeue_count > 1
+        if self.second_dequeue_count > 1 && !self.not_found
           puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
         end
       end
       self.class.wait 1
+
+      # ensure the dequeuer thread is alive and healthy
+      self.ensure_dequeuer_thread
     end
   end

+  # Dequeue pages and execute the parsers associated to them in parallel.
   def exec_parse save = false, keep_outputs = false
     if self.worker_count < 1
       self.no_repeat_puts NO_WORKERS_MSG
@@ -163,24 +326,15 @@ module Datahen
       self.no_repeat_puts "Spawning #{self.worker_count} workers"
     end

-    # dequeuing in parallel (the ride never ends :D)
-    Thread.new do
-      while true
-        begin
-          self.load_pages
-          self.class.wait self.dequeue_interval
-        rescue => e
-          puts [e.message] + e.backtrace rescue 'error'
-        end
-      end
-      puts "Error: dequeuer died! D:"
-    end
+    # start dequeuer
+    self.ensure_dequeuer_thread

     # process the pages
     dequeue = lambda{ self.dequeue_pages }
     Parallel.each(dequeue, in_threads: (worker_count)) do |page|
       parser_file = self.parsers[page['page_type']]
       begin
+        self.repeat_puts("Parsing page with GID #{page['gid']}")
         puts Datahen::Scraper::Parser.exec_parser_by_page(
           parser_file,
           page,
@@ -189,6 +343,7 @@ module Datahen
           nil,
           keep_outputs
         )
+        self.repeat_puts("Finished parsing page with GID #{page['gid']}")
       rescue Parallel::Kill => e
         puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
       rescue Parallel::Break => e
@@ -197,6 +352,8 @@ module Datahen
         puts [e.message] + e.backtrace rescue 'error'
       end
     end
+
+    nil
   end
 end
 end
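
`exec_parse` now delegates dequeuer startup to `ensure_dequeuer_thread` and, as before, feeds `Parallel.each` a lambda producer: the parallel gem calls the lambda each time a worker is free, so `dequeue_pages` acts as an unbounded work queue. The producer mechanism in isolation (finite here, ended with `Parallel::Stop`; the real dequeuer never stops):

```ruby
require "parallel"

items = [1, 2, 3]
producer = lambda { items.shift || Parallel::Stop } # nil -> stop sentinel

Parallel.each(producer, in_threads: 2) do |n|
  puts "processing #{n}" # stands in for exec_parser_by_page
end
```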
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.15.10"
+  VERSION = "0.16.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.15.10
+  version: 0.16.2
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-05-28 00:00:00.000000000 Z
+date: 2021-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor