datahen 0.15.9 → 0.15.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/client/job_page.rb +1 -1
- data/lib/datahen/scraper/batch_parser.rb +46 -20
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
|
4
|
+
data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
|
7
|
+
data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
|
data/lib/datahen/client/job_page.rb
CHANGED
@@ -48,7 +48,7 @@ module Datahen
|
|
48
48
|
page_types: page_types,
|
49
49
|
parse_fetching_failed: parse_fetching_failed
|
50
50
|
}
|
51
|
-
params = @options.merge({body: body.to_json})
|
51
|
+
params = @options.merge({body: body.to_json, timeout: 30})
|
52
52
|
self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
|
53
53
|
end
|
54
54
|
|
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -5,10 +5,10 @@ module Datahen
|
|
5
5
|
module Scraper
|
6
6
|
class BatchParser
|
7
7
|
NOT_FOUND_MSG = "No more pages to parse found"
|
8
|
-
NO_DEQUEUE_COUNT_MSG = "
|
9
|
-
NO_WORKERS_MSG = "
|
8
|
+
NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
|
9
|
+
NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
|
10
10
|
|
11
|
-
attr_accessor :config_file, :garbage_count, :last_message
|
11
|
+
attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
|
12
12
|
attr_reader :job_id, :worker_count, :pages, :max_garbage
|
13
13
|
attr_reader :dequeue_interval, :dequeue_scale
|
14
14
|
attr_reader :page_types, :parsers
|
@@ -34,6 +34,7 @@ module Datahen
|
|
34
34
|
@max_garbage = opts[:max_garbage]
|
35
35
|
@pages = Concurrent::Hash.new
|
36
36
|
@garbage_mutex = Mutex.new
|
37
|
+
self.second_dequeue_count = 0
|
37
38
|
self.garbage_count = 0
|
38
39
|
self.config_file = config_file
|
39
40
|
self.load_config
|
@@ -43,9 +44,12 @@ module Datahen
|
|
43
44
|
|
44
45
|
def recollect_garbage
|
45
46
|
self.garbage_mutex.synchronize do
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
self.garbage_count += 1
|
48
|
+
if self.garbage_count > self.max_garbage
|
49
|
+
puts "Recollect garbage"
|
50
|
+
GC.start
|
51
|
+
self.garbage_count = 0
|
52
|
+
end
|
49
53
|
end
|
50
54
|
end
|
51
55
|
|
@@ -84,10 +88,18 @@ module Datahen
|
|
84
88
|
dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
|
85
89
|
|
86
90
|
# reserve and get pages to parse
|
87
|
-
response =
|
88
|
-
|
89
|
-
self.
|
90
|
-
|
91
|
+
response = nil
|
92
|
+
begin
|
93
|
+
response = client.dequeue self.job_id,
|
94
|
+
dequeue_size,
|
95
|
+
self.page_types,
|
96
|
+
config['parse_fetching_failed']
|
97
|
+
rescue Net::ReadTimeout, Net::OpenTimeout => e
|
98
|
+
self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
|
99
|
+
return 0
|
100
|
+
rescue => e
|
101
|
+
raise e
|
102
|
+
end
|
91
103
|
|
92
104
|
# ensure a valid response or try again
|
93
105
|
if response.nil? || response.response.code.to_i != 200
|
@@ -109,6 +121,7 @@ module Datahen
|
|
109
121
|
if count > 0
|
110
122
|
self.recollect_garbage
|
111
123
|
self.repeat_puts "Found #{count} page(s) to parse"
|
124
|
+
self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
|
112
125
|
else
|
113
126
|
self.no_repeat_puts NOT_FOUND_MSG
|
114
127
|
end
|
@@ -119,15 +132,25 @@ module Datahen
|
|
119
132
|
|
120
133
|
def dequeue_pages
|
121
134
|
# collect garbage
|
122
|
-
self.
|
123
|
-
if self.garbage_count > self.max_garbage
|
124
|
-
self.recollect_garbage
|
125
|
-
end
|
135
|
+
self.recollect_garbage
|
126
136
|
|
127
137
|
# return page if there are loaded pages
|
138
|
+
is_waiting = false
|
128
139
|
while true do
|
129
140
|
key_value = self.pages.shift
|
130
|
-
|
141
|
+
unless key_value.nil?
|
142
|
+
puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
|
143
|
+
return key_value[1]
|
144
|
+
end
|
145
|
+
|
146
|
+
# be more verbose on worker waiting
|
147
|
+
unless is_waiting
|
148
|
+
is_waiting = true
|
149
|
+
puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
|
150
|
+
if self.second_dequeue_count > 1
|
151
|
+
puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
|
152
|
+
end
|
153
|
+
end
|
131
154
|
self.class.wait 1
|
132
155
|
end
|
133
156
|
end
|
@@ -140,11 +163,9 @@ module Datahen
|
|
140
163
|
self.no_repeat_puts "Spawing #{self.worker_count} workers"
|
141
164
|
end
|
142
165
|
|
143
|
-
# dequeuing on parallel
|
144
|
-
keep_dequeue = Concurrent::Array.new
|
145
|
-
keep_dequeue[0] = true
|
166
|
+
# dequeuing in parallel (the ride never ends :D)
|
146
167
|
Thread.new do
|
147
|
-
while
|
168
|
+
while true
|
148
169
|
begin
|
149
170
|
self.load_pages
|
150
171
|
self.class.wait self.dequeue_interval
|
@@ -152,8 +173,10 @@ module Datahen
|
|
152
173
|
puts [e.message] + e.backtrace rescue 'error'
|
153
174
|
end
|
154
175
|
end
|
176
|
+
puts "Error: dequeuer died! D:"
|
155
177
|
end
|
156
178
|
|
179
|
+
# process the pages
|
157
180
|
dequeue = lambda{ self.dequeue_pages }
|
158
181
|
Parallel.each(dequeue, in_threads: (worker_count)) do |page|
|
159
182
|
parser_file = self.parsers[page['page_type']]
|
@@ -166,11 +189,14 @@ module Datahen
|
|
166
189
|
nil,
|
167
190
|
keep_outputs
|
168
191
|
)
|
192
|
+
rescue Parallel::Kill => e
|
193
|
+
puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
|
194
|
+
rescue Parallel::Break => e
|
195
|
+
puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
|
169
196
|
rescue => e
|
170
197
|
puts [e.message] + e.backtrace rescue 'error'
|
171
198
|
end
|
172
199
|
end
|
173
|
-
keep_dequeue[0] = false
|
174
200
|
end
|
175
201
|
end
|
176
202
|
end
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.9
|
4
|
+
version: 0.15.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-05-
|
11
|
+
date: 2021-05-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|