datahen 0.15.10 → 0.16.2
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +1 -0
- data/lib/datahen/cli/scraper.rb +3 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +3 -1
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/client/scraper_job_page.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +184 -27
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
+  data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
+  data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
data/lib/datahen/cli/parser.rb
CHANGED
@@ -72,6 +72,7 @@ module Datahen
       option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
       option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
       option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+      option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
       def batch_exec_parse(scraper_name, config_file)
         if options[:job]
           job_id = options[:job]
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -32,6 +32,7 @@ module Datahen
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+      option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def create(scraper_name, git_repository)
         # puts "options #{options}"
         client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+      option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def update(scraper_name)
         client = Client::Scraper.new(options)
         puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
       option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
       option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+      option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def start(scraper_name)
         client = Client::ScraperJob.new(options)
         puts "Starting a scrape job..."
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -104,6 +104,7 @@ module Datahen
       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def update(scraper_name)
         if options[:job]
           client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -45,6 +45,7 @@ module Datahen
       option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
       option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
       option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+      option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def add(scraper_name, url)
         begin
           options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
       option :page_type, :aliases => :t, desc: 'Set page type'
       option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+      option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
       def update(scraper_name, gid)
         begin
           options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/client/job.rb
CHANGED
@@ -21,6 +21,7 @@ module Datahen
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})

         self.class.put("/jobs/#{job_id}", params)
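For reference, a minimal sketch of how a caller could push the new limit through this client; the `update(job_id, opts)` signature and the surrounding setup are assumptions inferred from the hunk's context, not shown by this diff:

    require 'datahen'

    # Hypothetical usage: cap fetched page size for a running job at ~1 MB.
    # Per the CLI option descriptions above, 0 means "any size".
    client = Datahen::Client::Job.new
    client.update(job_id, max_page_size: 1_000_000)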
data/lib/datahen/client/job_page.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
         body[:page_type] = opts[:page_type] if opts[:page_type]
         body[:priority] = opts[:priority] if opts[:priority]
         body[:vars] = opts[:vars] if opts[:vars]
+        body[:max_size] = opts[:max_size] if opts[:max_size]

         params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         body[:cookie] = opts[:cookie] if opts[:cookie]
+        body[:max_size] = opts[:max_size] if opts[:max_size]

         params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
           page_types: page_types,
           parse_fetching_failed: parse_fetching_failed
         }
-        params = @options.merge({body: body.to_json
+        params = @options.merge(opts).merge({body: body.to_json})
         self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
       end

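The `parse_dequeue` change is what makes the new dequeue timeout effective: extra request options such as `timeout` are now merged into the request params instead of being dropped. A sketch of the updated call, mirroring how the batch parser invokes it further down (argument values are illustrative):

    client = Datahen::Client::JobPage.new
    # job id, dequeue size, known page types, parse_fetching_failed flag,
    # plus the new per-request timeout in seconds:
    pages = client.dequeue(job_id, 10, ['products'], false, timeout: 30)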
data/lib/datahen/client/scraper.rb
CHANGED
@@ -28,6 +28,7 @@ module Datahen
         body[:profile] = opts[:profile] if opts[:profile]
         body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
         body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})
         self.class.post("/scrapers", params)
       end
@@ -49,6 +50,7 @@ module Datahen
         body[:profile] = opts[:profile] if opts[:profile]
         body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
         body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+        body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
         params = @options.merge({body: body.to_json})

         self.class.put("/scrapers/#{scraper_name}", params)
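Note the deliberate difference between the two hunks: `create` sends `max_page_size` only when the value is truthy, while `update` sends it whenever the key is present, matching how `multiple_jobs` and `max_job_count` are already handled. A sketch assuming the `update(scraper_name, opts)` signature implied by the CLI command above:

    client = Datahen::Client::Scraper.new
    # Hypothetical: cap fetched pages for this scraper at 500 KB.
    client.update('my-scraper', max_page_size: 500_000)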
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,7 @@ module Datahen
         body[:standard_worker_count] = opts[:workers] if opts[:workers]
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         if opts[:vars]
           if opts[:vars].is_a?(Array)
             body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})

         self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
         body[:page_type] = opts[:page_type] if opts[:page_type]
         body[:priority] = opts[:priority] if opts[:priority]
         body[:vars] = opts[:vars] if opts[:vars]
+        body[:max_size] = opts[:max_size] if opts[:max_size]

         params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         body[:cookie] = opts[:cookie] if opts[:cookie]
+        body[:max_size] = opts[:max_size] if opts[:max_size]

         params = @options.merge({body: body.to_json})

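A sketch of the page-level variant; the `update(scraper_name, gid, opts)` signature is an assumption based on the matching `scraper_page update` CLI command shown earlier, not confirmed by this diff:

    client = Datahen::Client::ScraperJobPage.new
    # Hypothetical: limit this page's next fetch to 200 KB (0 = any size).
    client.update('my-scraper', page_gid, max_size: 200_000)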
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -8,22 +8,107 @@ module Datahen
       NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
       NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

-
-
-
-
-
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Current config file loaded.
+      # @return [Hash] current loaded configuration
+      attr_reader :config
+      # Datahen job pages client used for API pages dequeuing.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutext used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer last run unix timestamp.
+      # @return [Integer] dequeuer last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whenever the wait time is because there are no more pages.
+      # @return [Boolean] `true` when wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found

+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
       def self.wait time_in_seconds
         Kernel.sleep time_in_seconds
       end

+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeuing.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
+      #   calculate page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
       def initialize(job_id, config_file, opts = {})
         opts = {
           worker_count: 1,
           max_garbage: 5,
           dequeue_interval: 3,
           dequeue_scale: 2,
+          dequeue_timeout: 30,
           client_options: {}
         }.merge opts

@@ -32,16 +117,23 @@ module Datahen
         @dequeue_interval = opts[:dequeue_interval]
         @dequeue_scale = opts[:dequeue_scale]
         @max_garbage = opts[:max_garbage]
-        @pages = Concurrent::
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
         @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
         self.second_dequeue_count = 0
         self.garbage_count = 0
         self.config_file = config_file
         self.load_config

         @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
       end

+      # Execute garbage collector after it is requested as many times as
+      #   described by #max_garbage.
       def recollect_garbage
         self.garbage_mutex.synchronize do
           self.garbage_count += 1
@@ -51,8 +143,10 @@ module Datahen
             self.garbage_count = 0
           end
         end
+        nil
       end

+      # Loads the config file into a Hash.
       def load_config
         # build page type to script file map
         @page_types = []
@@ -64,20 +158,40 @@ module Datahen
           self.parsers[v['page_type']] = v['file']
         end
         self.recollect_garbage
+        nil
       end

+      # Print the message regardless of it being the same as the last message.
+      # @param [String] message Message to display.
       def repeat_puts message
         puts message
-        self.last_message =
+        self.last_message = message
+        nil
       end

+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
       def no_repeat_puts message
         return if message == self.last_message
         puts message
         self.last_message = message
+        nil
+      end
+
+      # Refresh dequeuer's still alive timestamp
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
       end

+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
       def load_pages
+        self.dequeuer_is_alive!
+
         # calculate dequeue size
         max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
         current_size = self.pages.length
@@ -93,13 +207,16 @@ module Datahen
           response = client.dequeue self.job_id,
             dequeue_size,
             self.page_types,
-            config['parse_fetching_failed']
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
         rescue Net::ReadTimeout, Net::OpenTimeout => e
-          self.
+          self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
           return 0
         rescue => e
           raise e
         end
+        self.dequeuer_is_alive!

         # ensure a valid response or try again
         if response.nil? || response.response.code.to_i != 200
@@ -112,17 +229,20 @@ module Datahen
         count = 0
         (JSON.parse(response.body) || []).each do |page|
           count += 1
-          next if self.
-          self.pages[page['gid']] = page
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
         end
         response = nil
+        self.dequeuer_is_alive!

         # recolect garbage to free some memory before parsing
         if count > 0
+          @not_found = false
           self.recollect_garbage
           self.repeat_puts "Found #{count} page(s) to parse"
           self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
         else
+          @not_found = true
           self.no_repeat_puts NOT_FOUND_MSG
         end

@@ -130,6 +250,44 @@ module Datahen
         count
       end

+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if thread was alive, or `false` if had to
+      #   create a new thread
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeuing on parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, and waits until there
+      #   are new pages whenever there are no loaded pages.
+      # @return [Hash] dequeued page
       def dequeue_pages
         # collect garbage
         self.recollect_garbage
@@ -137,24 +295,29 @@ module Datahen
         # return page if there are loeaded pages
         is_waiting = false
         while true do
-
-          unless
+          page = self.pages.shift
+          unless page.nil?
             puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
-
+            loaded_pages.delete(page['gid'])
+            return page
           end

           # be more verbose on worker waiting
           unless is_waiting
             is_waiting = true
             puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
-            if self.second_dequeue_count > 1
+            if self.second_dequeue_count > 1 && !self.not_found
               puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
             end
           end
           self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
         end
       end

+      # Dequeue pages and execute the parsers associated to them on parallel.
       def exec_parse save = false, keep_outputs = false
         if self.worker_count < 1
           self.no_repeat_puts NO_WORKERS_MSG
@@ -163,24 +326,15 @@ module Datahen
           self.no_repeat_puts "Spawing #{self.worker_count} workers"
         end

-        #
-
-        while true
-          begin
-            self.load_pages
-            self.class.wait self.dequeue_interval
-          rescue => e
-            puts [e.message] + e.backtrace rescue 'error'
-          end
-        end
-        puts "Error: dequeuer died! D:"
-      end
+        # start dequeuer
+        self.ensure_dequeuer_thread

         # process the pages
         dequeue = lambda{ self.dequeue_pages }
         Parallel.each(dequeue, in_threads: (worker_count)) do |page|
           parser_file = self.parsers[page['page_type']]
           begin
+            self.repeat_puts("Parsing page with GID #{page['gid']}")
             puts Datahen::Scraper::Parser.exec_parser_by_page(
               parser_file,
               page,
@@ -189,6 +343,7 @@ module Datahen
               nil,
               keep_outputs
             )
+            self.repeat_puts("Finish parsing page with GID #{page['gid']}")
           rescue Parallel::Kill => e
             puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
           rescue Parallel::Break => e
@@ -197,6 +352,8 @@ module Datahen
           puts [e.message] + e.backtrace rescue 'error'
         end
       end
+
+      nil
     end
   end
 end
data/lib/datahen/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.15.10"
+  VERSION = "0.16.2"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.15.10
+  version: 0.16.2
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor