datahen 0.15.9 → 0.15.10

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
4
- data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
3
+ metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
4
+ data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
5
5
  SHA512:
6
- metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
7
- data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
6
+ metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
7
+ data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
@@ -48,7 +48,7 @@ module Datahen
48
48
  page_types: page_types,
49
49
  parse_fetching_failed: parse_fetching_failed
50
50
  }
51
- params = @options.merge({body: body.to_json})
51
+ params = @options.merge({body: body.to_json, timeout: 30})
52
52
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
53
53
  end
54
54
 
@@ -5,10 +5,10 @@ module Datahen
5
5
  module Scraper
6
6
  class BatchParser
7
7
  NOT_FOUND_MSG = "No more pages to parse found"
8
- NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
9
- NO_WORKERS_MSG = "Warning: There are no parser workers"
8
+ NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
9
+ NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
10
10
 
11
- attr_accessor :config_file, :garbage_count, :last_message
11
+ attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
12
12
  attr_reader :job_id, :worker_count, :pages, :max_garbage
13
13
  attr_reader :dequeue_interval, :dequeue_scale
14
14
  attr_reader :page_types, :parsers
@@ -34,6 +34,7 @@ module Datahen
34
34
  @max_garbage = opts[:max_garbage]
35
35
  @pages = Concurrent::Hash.new
36
36
  @garbage_mutex = Mutex.new
37
+ self.second_dequeue_count = 0
37
38
  self.garbage_count = 0
38
39
  self.config_file = config_file
39
40
  self.load_config
@@ -43,9 +44,12 @@ module Datahen
43
44
 
44
45
  def recollect_garbage
45
46
  self.garbage_mutex.synchronize do
46
- puts "Recollect garbage"
47
- GC.start
48
- self.garbage_count = 0
47
+ self.garbage_count += 1
48
+ if self.garbage_count > self.max_garbage
49
+ puts "Recollect garbage"
50
+ GC.start
51
+ self.garbage_count = 0
52
+ end
49
53
  end
50
54
  end
51
55
 
@@ -84,10 +88,18 @@ module Datahen
84
88
  dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
85
89
 
86
90
  # reserve and get to pages parse
87
- response = client.dequeue self.job_id,
88
- dequeue_size,
89
- self.page_types,
90
- config['parse_fetching_failed']
91
+ response = nil
92
+ begin
93
+ response = client.dequeue self.job_id,
94
+ dequeue_size,
95
+ self.page_types,
96
+ config['parse_fetching_failed']
97
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
98
+ self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
99
+ return 0
100
+ rescue => e
101
+ raise e
102
+ end
91
103
 
92
104
  # ensure a valid response or try again
93
105
  if response.nil? || response.response.code.to_i != 200
@@ -109,6 +121,7 @@ module Datahen
109
121
  if count > 0
110
122
  self.recollect_garbage
111
123
  self.repeat_puts "Found #{count} page(s) to parse"
124
+ self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
112
125
  else
113
126
  self.no_repeat_puts NOT_FOUND_MSG
114
127
  end
@@ -119,15 +132,25 @@ module Datahen
119
132
 
120
133
  def dequeue_pages
121
134
  # collect garbage
122
- self.garbage_count += 1
123
- if self.garbage_count > self.max_garbage
124
- self.recollect_garbage
125
- end
135
+ self.recollect_garbage
126
136
 
127
137
  # return page if there are loeaded pages
138
+ is_waiting = false
128
139
  while true do
129
140
  key_value = self.pages.shift
130
- return key_value[1] unless key_value.nil?
141
+ unless key_value.nil?
142
+ puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
143
+ return key_value[1]
144
+ end
145
+
146
+ # be more verbose on worker waiting
147
+ unless is_waiting
148
+ is_waiting = true
149
+ puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
150
+ if self.second_dequeue_count > 1
151
+ puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
152
+ end
153
+ end
131
154
  self.class.wait 1
132
155
  end
133
156
  end
@@ -140,11 +163,9 @@ module Datahen
140
163
  self.no_repeat_puts "Spawing #{self.worker_count} workers"
141
164
  end
142
165
 
143
- # dequeuing on parallel
144
- keep_dequeue = Concurrent::Array.new
145
- keep_dequeue[0] = true
166
+ # dequeuing on parallel (the ride never ends :D)
146
167
  Thread.new do
147
- while keep_dequeue[0]
168
+ while true
148
169
  begin
149
170
  self.load_pages
150
171
  self.class.wait self.dequeue_interval
@@ -152,8 +173,10 @@ module Datahen
152
173
  puts [e.message] + e.backtrace rescue 'error'
153
174
  end
154
175
  end
176
+ puts "Error: dequeuer died! D:"
155
177
  end
156
178
 
179
+ # process the pages
157
180
  dequeue = lambda{ self.dequeue_pages }
158
181
  Parallel.each(dequeue, in_threads: (worker_count)) do |page|
159
182
  parser_file = self.parsers[page['page_type']]
@@ -166,11 +189,14 @@ module Datahen
166
189
  nil,
167
190
  keep_outputs
168
191
  )
192
+ rescue Parallel::Kill => e
193
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
194
+ rescue Parallel::Break => e
195
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
169
196
  rescue => e
170
197
  puts [e.message] + e.backtrace rescue 'error'
171
198
  end
172
199
  end
173
- keep_dequeue[0] = false
174
200
  end
175
201
  end
176
202
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.15.9"
2
+ VERSION = "0.15.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.9
4
+ version: 0.15.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-27 00:00:00.000000000 Z
11
+ date: 2021-05-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor