datahen 0.15.10 → 0.15.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
4
- data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
3
+ metadata.gz: f77e31da8e2a7ff08086c4aa9d174608a9c3f186679d456b22310b48384d3572
4
+ data.tar.gz: 0bf53ae0886b16bf6fe08b0db07b1a631b69f31d8e3a6868a4d483549049e4ed
5
5
  SHA512:
6
- metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
7
- data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
6
+ metadata.gz: a491874347ed6ac97c0a0e4f0d2c5830140b9367c1e01b4c95e7a447b071df643d12793ed7d6e5a0224b8876905cf74bc13b987e6e3e03e937d1f821557b8ec3
7
+ data.tar.gz: c553a372790654726f2921b6d9d582ca90b314b3fe8d78625cf2442e01cf2ce96ae556e78812d5bd6a03f350beb32aa3158981554179410c12926d480c911887
@@ -72,6 +72,7 @@ module Datahen
72
72
  option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
73
73
  option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
74
74
  option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
75
+ option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
75
76
  def batch_exec_parse(scraper_name, config_file)
76
77
  if options[:job]
77
78
  job_id = options[:job]
@@ -48,7 +48,7 @@ module Datahen
48
48
  page_types: page_types,
49
49
  parse_fetching_failed: parse_fetching_failed
50
50
  }
51
- params = @options.merge({body: body.to_json, timeout: 30})
51
+ params = @options.merge(opts).merge({body: body.to_json})
52
52
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
53
53
  end
54
54
 
@@ -8,22 +8,107 @@ module Datahen
8
8
  NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
9
9
  NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
10
10
 
11
- attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
12
- attr_reader :job_id, :worker_count, :pages, :max_garbage
13
- attr_reader :dequeue_interval, :dequeue_scale
14
- attr_reader :page_types, :parsers
15
- attr_reader :config, :client, :garbage_mutex
11
+ # Configuration file path.
12
+ # @return [String] config file path
13
+ attr_accessor :config_file
14
+ # Garbage collector request counter.
15
+ # @return [Integer] garbage collector counter
16
+ attr_accessor :garbage_count
17
+ # Last printed message, useful to prevent duplicated log messages.
18
+ # @return [String] last printed message
19
+ attr_accessor :last_message
20
+ # Second dequeue counter used to prevent false negative warning messages.
21
+ # @return [Integer] second dequeue counter
22
+ attr_accessor :second_dequeue_count
23
+ # Dequeue API request timeout in seconds.
24
+ # @return [Integer] dequeue API request timeout in seconds
25
+ attr_accessor :dequeue_timeout
26
+ # Job id to be executed.
27
+ # @return [Integer] job id
28
+ attr_reader :job_id
29
+ # Parallel worker quantity.
30
+ # @return [Integer] parallel worker quantity
31
+ attr_reader :worker_count
32
+ # Loaded pages array.
33
+ # @return [Concurrent::Array<Hash>] loaded pages as an array
34
+ attr_reader :pages
35
+ # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
36
+ # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
37
+ attr_reader :loaded_pages
38
+ # Max garbage collector requests before actually executing the garbage
39
+ # collector.
40
+ # @return [Integer] max garbage request quantity before actually executing
41
+ # it
42
+ attr_reader :max_garbage
43
+ # Dequeue interval in seconds.
44
+ # @return [Integer] dequeue interval in seconds
45
+ attr_reader :dequeue_interval
46
+ # Dequeue scale used to calculate the ideal dequeue size.
47
+ # @return [Numeric] dequeue scale
48
+ attr_reader :dequeue_scale
49
+ # Known page types extracted from the config file.
50
+ # @return [Array<String>] known page types
51
+ attr_reader :page_types
52
+ # Known parsers extracted from the config file.
53
+ # @return [Concurrent::Hash<String, String>] known parsers
54
+ attr_reader :parsers
55
+ # Current config file loaded.
56
+ # @return [Hash] current loaded configuration
57
+ attr_reader :config
58
+ # Datahen job pages client used for API pages dequeuing.
59
+ # @return [Datahen::Client::JobPage] datahen job pages API client
60
+ attr_reader :client
61
+ # Garbage collector mutex used to synchronize garbage collector requests.
62
+ # @return [Mutex] garbage collector mutex
63
+ attr_reader :garbage_mutex
64
+ # Current dequeuer thread.
65
+ # @return [Thread] dequeuer thread
66
+ attr_reader :dequeuer_thread
67
+ # Dequeuer mutex used to synchronize page dequeuing.
68
+ # @return [Mutex] dequeuer mutex
69
+ attr_reader :dequeue_mutex
70
+ # Dequeuer last run unix timestamp.
71
+ # @return [Integer] dequeuer last run unix timestamp
72
+ attr_reader :dequeuer_still_alive
73
+ # Indicates whether the wait time is because there are no more pages.
74
+ # @return [Boolean] `true` when wait time is due to no more pages,
75
+ # else `false`
76
+ attr_reader :not_found
16
77
 
78
+ # Wait a specific amount of seconds.
79
+ # @param [Integer] time_in_seconds Seconds to wait.
17
80
  def self.wait time_in_seconds
18
81
  Kernel.sleep time_in_seconds
19
82
  end
20
83
 
84
+ # Get a unix timestamp.
85
+ # @return [Integer] unix timestamp
86
+ def self.timestamp
87
+ Time.new.utc.to_i
88
+ end
89
+
90
+ # Initialize a batch parser object.
91
+ # @param [Integer] job_id Job id.
92
+ # @param [String] config_file Config file path.
93
+ # @param [Hash] opts ({}) Configuration options
94
+ # @option opts [Integer] :worker_count (1) Parallel worker quantity.
95
+ # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
96
+ # collector can be requested before actually executing.
97
+ # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
98
+ # between page dequeuing.
99
+ # @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
100
+ # calculate page dequeue size.
101
+ # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
102
+ # timeout in seconds.
103
+ # @option opts [Hash] :client_options ({}) Datahen client gem additional
104
+ # options (see Datahen::Client::Base#initialize method).
21
105
  def initialize(job_id, config_file, opts = {})
22
106
  opts = {
23
107
  worker_count: 1,
24
108
  max_garbage: 5,
25
109
  dequeue_interval: 3,
26
110
  dequeue_scale: 2,
111
+ dequeue_timeout: 30,
27
112
  client_options: {}
28
113
  }.merge opts
29
114
 
@@ -32,16 +117,23 @@ module Datahen
32
117
  @dequeue_interval = opts[:dequeue_interval]
33
118
  @dequeue_scale = opts[:dequeue_scale]
34
119
  @max_garbage = opts[:max_garbage]
35
- @pages = Concurrent::Hash.new
120
+ @pages = Concurrent::Array.new
121
+ @loaded_pages = Concurrent::Hash.new
36
122
  @garbage_mutex = Mutex.new
123
+ @dequeue_mutex = Mutex.new
124
+ @not_found = false
125
+ self.dequeue_timeout = opts[:dequeue_timeout]
37
126
  self.second_dequeue_count = 0
38
127
  self.garbage_count = 0
39
128
  self.config_file = config_file
40
129
  self.load_config
41
130
 
42
131
  @client = Datahen::Client::JobPage.new(opts[:client_options])
132
+ nil
43
133
  end
44
134
 
135
+ # Execute garbage collector after it is requested as many times as
136
+ # described by #max_garbage.
45
137
  def recollect_garbage
46
138
  self.garbage_mutex.synchronize do
47
139
  self.garbage_count += 1
@@ -51,8 +143,10 @@ module Datahen
51
143
  self.garbage_count = 0
52
144
  end
53
145
  end
146
+ nil
54
147
  end
55
148
 
149
+ # Loads the config file into a Hash.
56
150
  def load_config
57
151
  # build page type to script file map
58
152
  @page_types = []
@@ -64,20 +158,40 @@ module Datahen
64
158
  self.parsers[v['page_type']] = v['file']
65
159
  end
66
160
  self.recollect_garbage
161
+ nil
67
162
  end
68
163
 
164
+ # Print the message regardless of it being the same as the last message.
165
+ # @param [String] message Message to display.
69
166
  def repeat_puts message
70
167
  puts message
71
- self.last_message = ''
168
+ self.last_message = message
169
+ nil
72
170
  end
73
171
 
172
+ # Print the message only when it is different from the last recorded
173
+ # message.
174
+ # @param [String] message Message to display.
74
175
  def no_repeat_puts message
75
176
  return if message == self.last_message
76
177
  puts message
77
178
  self.last_message = message
179
+ nil
180
+ end
181
+
182
+ # Refresh dequeuer's still alive timestamp
183
+ def dequeuer_is_alive!
184
+ self.dequeue_mutex.synchronize do
185
+ @dequeuer_still_alive = self.class.timestamp
186
+ end
187
+ nil
78
188
  end
79
189
 
190
+ # Load new pages by dequeuing from the API.
191
+ # @return [Integer] amount of pages loaded
80
192
  def load_pages
193
+ self.dequeuer_is_alive!
194
+
81
195
  # calculate dequeue size
82
196
  max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
83
197
  current_size = self.pages.length
@@ -93,13 +207,16 @@ module Datahen
93
207
  response = client.dequeue self.job_id,
94
208
  dequeue_size,
95
209
  self.page_types,
96
- config['parse_fetching_failed']
210
+ config['parse_fetching_failed'],
211
+ timeout: self.dequeue_timeout
97
212
  rescue Net::ReadTimeout, Net::OpenTimeout => e
98
- self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
213
+ self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
214
+ self.dequeuer_is_alive!
99
215
  return 0
100
216
  rescue => e
101
217
  raise e
102
218
  end
219
+ self.dequeuer_is_alive!
103
220
 
104
221
  # ensure a valid response or try again
105
222
  if response.nil? || response.response.code.to_i != 200
@@ -112,17 +229,20 @@ module Datahen
112
229
  count = 0
113
230
  (JSON.parse(response.body) || []).each do |page|
114
231
  count += 1
115
- next if self.pages.has_key? page['gid']
116
- self.pages[page['gid']] = page
232
+ next if self.loaded_pages.has_key? page['gid']
233
+ self.pages << (self.loaded_pages[page['gid']] = page)
117
234
  end
118
235
  response = nil
236
+ self.dequeuer_is_alive!
119
237
 
120
238
  # recolect garbage to free some memory before parsing
121
239
  if count > 0
240
+ @not_found = false
122
241
  self.recollect_garbage
123
242
  self.repeat_puts "Found #{count} page(s) to parse"
124
243
  self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
125
244
  else
245
+ @not_found = true
126
246
  self.no_repeat_puts NOT_FOUND_MSG
127
247
  end
128
248
 
@@ -130,6 +250,44 @@ module Datahen
130
250
  count
131
251
  end
132
252
 
253
+ # Ensures that the dequeuer thread exists and is running.
254
+ # @return [Boolean] `true` if thread was alive, or `false` if had to
255
+ # create a new thread
256
+ def ensure_dequeuer_thread
257
+ self.dequeue_mutex.synchronize do
258
+ # check if dequeuer thread is alive and healthy
259
+ if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
260
+ still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
261
+ return true if self.class.timestamp < still_alive_timeout
262
+
263
+ # kill dequeuer thread
264
+ self.repeat_puts "Dequeuer isn't healthy, will restart it..."
265
+ self.dequeuer_thread.kill
266
+ @dequeuer_thread = nil
267
+ self.recollect_garbage
268
+ self.no_repeat_puts "Dequeuer thread was killed!"
269
+ end
270
+
271
+ # dequeuing on parallel (the ride never ends :D)
272
+ @dequeuer_thread = Thread.new do
273
+ while true
274
+ begin
275
+ self.load_pages
276
+ self.class.wait self.dequeue_interval
277
+ rescue => e
278
+ puts [e.message] + e.backtrace rescue 'error'
279
+ end
280
+ end
281
+ puts "Error: dequeuer died! D:"
282
+ end
283
+ self.repeat_puts "Dequeuer thread was started!"
284
+ end
285
+ false
286
+ end
287
+
288
+ # Dequeue one page from the previously loaded pages, and waits until there
289
+ # are new pages whenever there are no loaded pages.
290
+ # @return [Hash] dequeued page
133
291
  def dequeue_pages
134
292
  # collect garbage
135
293
  self.recollect_garbage
@@ -137,24 +295,29 @@ module Datahen
137
295
  # return page if there are loaded pages
138
296
  is_waiting = false
139
297
  while true do
140
- key_value = self.pages.shift
141
- unless key_value.nil?
298
+ page = self.pages.shift
299
+ unless page.nil?
142
300
  puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
143
- return key_value[1]
301
+ loaded_pages.delete(page['gid'])
302
+ return page
144
303
  end
145
304
 
146
305
  # be more verbose on worker waiting
147
306
  unless is_waiting
148
307
  is_waiting = true
149
308
  puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
150
- if self.second_dequeue_count > 1
309
+ if self.second_dequeue_count > 1 && !self.not_found
151
310
  puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
152
311
  end
153
312
  end
154
313
  self.class.wait 1
314
+
315
+ # ensure the dequeuer thread is alive and healthy
316
+ self.ensure_dequeuer_thread
155
317
  end
156
318
  end
157
319
 
320
+ # Dequeue pages and execute the parsers associated to them on parallel.
158
321
  def exec_parse save = false, keep_outputs = false
159
322
  if self.worker_count < 1
160
323
  self.no_repeat_puts NO_WORKERS_MSG
@@ -163,18 +326,8 @@ module Datahen
163
326
  self.no_repeat_puts "Spawing #{self.worker_count} workers"
164
327
  end
165
328
 
166
- # dequeuing on parallel (the ride never ends :D)
167
- Thread.new do
168
- while true
169
- begin
170
- self.load_pages
171
- self.class.wait self.dequeue_interval
172
- rescue => e
173
- puts [e.message] + e.backtrace rescue 'error'
174
- end
175
- end
176
- puts "Error: dequeuer died! D:"
177
- end
329
+ # start dequeuer
330
+ self.ensure_dequeuer_thread
178
331
 
179
332
  # process the pages
180
333
  dequeue = lambda{ self.dequeue_pages }
@@ -197,6 +350,8 @@ module Datahen
197
350
  puts [e.message] + e.backtrace rescue 'error'
198
351
  end
199
352
  end
353
+
354
+ nil
200
355
  end
201
356
  end
202
357
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.15.10"
2
+ VERSION = "0.15.11"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.10
4
+ version: 0.15.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-28 00:00:00.000000000 Z
11
+ date: 2021-05-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor