datahen 0.15.9 → 0.15.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
4
- data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
3
+ metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
4
+ data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
5
5
  SHA512:
6
- metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
7
- data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
6
+ metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
7
+ data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
@@ -48,7 +48,7 @@ module Datahen
48
48
  page_types: page_types,
49
49
  parse_fetching_failed: parse_fetching_failed
50
50
  }
51
- params = @options.merge({body: body.to_json})
51
+ params = @options.merge({body: body.to_json, timeout: 30})
52
52
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
53
53
  end
54
54
 
@@ -5,10 +5,10 @@ module Datahen
5
5
  module Scraper
6
6
  class BatchParser
7
7
  NOT_FOUND_MSG = "No more pages to parse found"
8
- NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
9
- NO_WORKERS_MSG = "Warning: There are no parser workers"
8
+ NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
9
+ NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
10
10
 
11
- attr_accessor :config_file, :garbage_count, :last_message
11
+ attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
12
12
  attr_reader :job_id, :worker_count, :pages, :max_garbage
13
13
  attr_reader :dequeue_interval, :dequeue_scale
14
14
  attr_reader :page_types, :parsers
@@ -34,6 +34,7 @@ module Datahen
34
34
  @max_garbage = opts[:max_garbage]
35
35
  @pages = Concurrent::Hash.new
36
36
  @garbage_mutex = Mutex.new
37
+ self.second_dequeue_count = 0
37
38
  self.garbage_count = 0
38
39
  self.config_file = config_file
39
40
  self.load_config
@@ -43,9 +44,12 @@ module Datahen
43
44
 
44
45
  def recollect_garbage
45
46
  self.garbage_mutex.synchronize do
46
- puts "Recollect garbage"
47
- GC.start
48
- self.garbage_count = 0
47
+ self.garbage_count += 1
48
+ if self.garbage_count > self.max_garbage
49
+ puts "Recollect garbage"
50
+ GC.start
51
+ self.garbage_count = 0
52
+ end
49
53
  end
50
54
  end
51
55
 
@@ -84,10 +88,18 @@ module Datahen
84
88
  dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
85
89
 
86
90
  # reserve and get pages to parse
87
- response = client.dequeue self.job_id,
88
- dequeue_size,
89
- self.page_types,
90
- config['parse_fetching_failed']
91
+ response = nil
92
+ begin
93
+ response = client.dequeue self.job_id,
94
+ dequeue_size,
95
+ self.page_types,
96
+ config['parse_fetching_failed']
97
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
98
+ self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
99
+ return 0
100
+ rescue => e
101
+ raise e
102
+ end
91
103
 
92
104
  # ensure a valid response or try again
93
105
  if response.nil? || response.response.code.to_i != 200
@@ -109,6 +121,7 @@ module Datahen
109
121
  if count > 0
110
122
  self.recollect_garbage
111
123
  self.repeat_puts "Found #{count} page(s) to parse"
124
+ self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
112
125
  else
113
126
  self.no_repeat_puts NOT_FOUND_MSG
114
127
  end
@@ -119,15 +132,25 @@ module Datahen
119
132
 
120
133
  def dequeue_pages
121
134
  # collect garbage
122
- self.garbage_count += 1
123
- if self.garbage_count > self.max_garbage
124
- self.recollect_garbage
125
- end
135
+ self.recollect_garbage
126
136
 
127
137
  # return page if there are loaded pages
138
+ is_waiting = false
128
139
  while true do
129
140
  key_value = self.pages.shift
130
- return key_value[1] unless key_value.nil?
141
+ unless key_value.nil?
142
+ puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
143
+ return key_value[1]
144
+ end
145
+
146
+ # be more verbose on worker waiting
147
+ unless is_waiting
148
+ is_waiting = true
149
+ puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
150
+ if self.second_dequeue_count > 1
151
+ puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
152
+ end
153
+ end
131
154
  self.class.wait 1
132
155
  end
133
156
  end
@@ -140,11 +163,9 @@ module Datahen
140
163
  self.no_repeat_puts "Spawing #{self.worker_count} workers"
141
164
  end
142
165
 
143
- # dequeuing on parallel
144
- keep_dequeue = Concurrent::Array.new
145
- keep_dequeue[0] = true
166
+ # dequeuing on parallel (the ride never ends :D)
146
167
  Thread.new do
147
- while keep_dequeue[0]
168
+ while true
148
169
  begin
149
170
  self.load_pages
150
171
  self.class.wait self.dequeue_interval
@@ -152,8 +173,10 @@ module Datahen
152
173
  puts [e.message] + e.backtrace rescue 'error'
153
174
  end
154
175
  end
176
+ puts "Error: dequeuer died! D:"
155
177
  end
156
178
 
179
+ # process the pages
157
180
  dequeue = lambda{ self.dequeue_pages }
158
181
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
159
182
  parser_file = self.parsers[page['page_type']]
@@ -166,11 +189,14 @@ module Datahen
166
189
  nil,
167
190
  keep_outputs
168
191
  )
192
+ rescue Parallel::Kill => e
193
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
194
+ rescue Parallel::Break => e
195
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
169
196
  rescue => e
170
197
  puts [e.message] + e.backtrace rescue 'error'
171
198
  end
172
199
  end
173
- keep_dequeue[0] = false
174
200
  end
175
201
  end
176
202
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.15.9"
2
+ VERSION = "0.15.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.9
4
+ version: 0.15.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-27 00:00:00.000000000 Z
11
+ date: 2021-05-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor