datahen 0.15.9 → 0.15.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/client/job_page.rb +1 -1
- data/lib/datahen/scraper/batch_parser.rb +46 -20
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
|
4
|
+
data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
|
7
|
+
data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
|
@@ -48,7 +48,7 @@ module Datahen
|
|
48
48
|
page_types: page_types,
|
49
49
|
parse_fetching_failed: parse_fetching_failed
|
50
50
|
}
|
51
|
-
params = @options.merge({body: body.to_json})
|
51
|
+
params = @options.merge({body: body.to_json, timeout: 30})
|
52
52
|
self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
|
53
53
|
end
|
54
54
|
|
@@ -5,10 +5,10 @@ module Datahen
|
|
5
5
|
module Scraper
|
6
6
|
class BatchParser
|
7
7
|
NOT_FOUND_MSG = "No more pages to parse found"
|
8
|
-
NO_DEQUEUE_COUNT_MSG = "
|
9
|
-
NO_WORKERS_MSG = "
|
8
|
+
NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
|
9
|
+
NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
|
10
10
|
|
11
|
-
attr_accessor :config_file, :garbage_count, :last_message
|
11
|
+
attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
|
12
12
|
attr_reader :job_id, :worker_count, :pages, :max_garbage
|
13
13
|
attr_reader :dequeue_interval, :dequeue_scale
|
14
14
|
attr_reader :page_types, :parsers
|
@@ -34,6 +34,7 @@ module Datahen
|
|
34
34
|
@max_garbage = opts[:max_garbage]
|
35
35
|
@pages = Concurrent::Hash.new
|
36
36
|
@garbage_mutex = Mutex.new
|
37
|
+
self.second_dequeue_count = 0
|
37
38
|
self.garbage_count = 0
|
38
39
|
self.config_file = config_file
|
39
40
|
self.load_config
|
@@ -43,9 +44,12 @@ module Datahen
|
|
43
44
|
|
44
45
|
def recollect_garbage
|
45
46
|
self.garbage_mutex.synchronize do
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
self.garbage_count += 1
|
48
|
+
if self.garbage_count > self.max_garbage
|
49
|
+
puts "Recollect garbage"
|
50
|
+
GC.start
|
51
|
+
self.garbage_count = 0
|
52
|
+
end
|
49
53
|
end
|
50
54
|
end
|
51
55
|
|
@@ -84,10 +88,18 @@ module Datahen
|
|
84
88
|
dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
|
85
89
|
|
86
90
|
# reserve and get pages to parse
|
87
|
-
response =
|
88
|
-
|
89
|
-
self.
|
90
|
-
|
91
|
+
response = nil
|
92
|
+
begin
|
93
|
+
response = client.dequeue self.job_id,
|
94
|
+
dequeue_size,
|
95
|
+
self.page_types,
|
96
|
+
config['parse_fetching_failed']
|
97
|
+
rescue Net::ReadTimeout, Net::OpenTimeout => e
|
98
|
+
self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
|
99
|
+
return 0
|
100
|
+
rescue => e
|
101
|
+
raise e
|
102
|
+
end
|
91
103
|
|
92
104
|
# ensure a valid response or try again
|
93
105
|
if response.nil? || response.response.code.to_i != 200
|
@@ -109,6 +121,7 @@ module Datahen
|
|
109
121
|
if count > 0
|
110
122
|
self.recollect_garbage
|
111
123
|
self.repeat_puts "Found #{count} page(s) to parse"
|
124
|
+
self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
|
112
125
|
else
|
113
126
|
self.no_repeat_puts NOT_FOUND_MSG
|
114
127
|
end
|
@@ -119,15 +132,25 @@ module Datahen
|
|
119
132
|
|
120
133
|
def dequeue_pages
|
121
134
|
# collect garbage
|
122
|
-
self.
|
123
|
-
if self.garbage_count > self.max_garbage
|
124
|
-
self.recollect_garbage
|
125
|
-
end
|
135
|
+
self.recollect_garbage
|
126
136
|
|
127
137
|
# return page if there are loaded pages
|
138
|
+
is_waiting = false
|
128
139
|
while true do
|
129
140
|
key_value = self.pages.shift
|
130
|
-
|
141
|
+
unless key_value.nil?
|
142
|
+
puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
|
143
|
+
return key_value[1]
|
144
|
+
end
|
145
|
+
|
146
|
+
# be more verbose on worker waiting
|
147
|
+
unless is_waiting
|
148
|
+
is_waiting = true
|
149
|
+
puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
|
150
|
+
if self.second_dequeue_count > 1
|
151
|
+
puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
|
152
|
+
end
|
153
|
+
end
|
131
154
|
self.class.wait 1
|
132
155
|
end
|
133
156
|
end
|
@@ -140,11 +163,9 @@ module Datahen
|
|
140
163
|
self.no_repeat_puts "Spawing #{self.worker_count} workers"
|
141
164
|
end
|
142
165
|
|
143
|
-
# dequeuing on parallel
|
144
|
-
keep_dequeue = Concurrent::Array.new
|
145
|
-
keep_dequeue[0] = true
|
166
|
+
# dequeuing on parallel (the ride never ends :D)
|
146
167
|
Thread.new do
|
147
|
-
while
|
168
|
+
while true
|
148
169
|
begin
|
149
170
|
self.load_pages
|
150
171
|
self.class.wait self.dequeue_interval
|
@@ -152,8 +173,10 @@ module Datahen
|
|
152
173
|
puts [e.message] + e.backtrace rescue 'error'
|
153
174
|
end
|
154
175
|
end
|
176
|
+
puts "Error: dequeuer died! D:"
|
155
177
|
end
|
156
178
|
|
179
|
+
# process the pages
|
157
180
|
dequeue = lambda{ self.dequeue_pages }
|
158
181
|
Parallel.each(dequeue, in_threads: (worker_count)) do |page|
|
159
182
|
parser_file = self.parsers[page['page_type']]
|
@@ -166,11 +189,14 @@ module Datahen
|
|
166
189
|
nil,
|
167
190
|
keep_outputs
|
168
191
|
)
|
192
|
+
rescue Parallel::Kill => e
|
193
|
+
puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
|
194
|
+
rescue Parallel::Break => e
|
195
|
+
puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
|
169
196
|
rescue => e
|
170
197
|
puts [e.message] + e.backtrace rescue 'error'
|
171
198
|
end
|
172
199
|
end
|
173
|
-
keep_dequeue[0] = false
|
174
200
|
end
|
175
201
|
end
|
176
202
|
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.
|
4
|
+
version: 0.15.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-05-
|
11
|
+
date: 2021-05-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|