datahen 0.15.10 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +1 -0
- data/lib/datahen/cli/scraper.rb +3 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +3 -1
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/client/scraper_job_page.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +184 -27
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8945724e5d11f40eba22a9ffca7ca7d024b8565dae4ba1d3da9e486eac262575
+  data.tar.gz: 867319a0c6358c593951e6241b24d9c3fd9dcb759ce1287eeb640ade0c23e69a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bad8dea41951df061c84934fa63a0594bc8caf22b8665c3d6746bb29ca2821103ecd3814a9a9eba020009e8165390900b1c45a1fd0a7175fb7ea52f7a077fdab
+  data.tar.gz: 74342e01eaa21a590ef998219282fb40342cde4bf8db58617d24583bfa00b967df3f8c38e41c9b92868e90113b7cc5f6346fb2062b446edd3bb4612e053c5da7
data/lib/datahen/cli/parser.rb
CHANGED
@@ -72,6 +72,7 @@ module Datahen
 option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
 option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
 option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
 def batch_exec_parse(scraper_name, config_file)
   if options[:job]
     job_id = options[:job]
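The new dequeue-timeout flag maps onto the batch parser's dequeue_timeout option (see the data/lib/datahen/scraper/batch_parser.rb diff below). A minimal usage sketch built only from the initializer and exec_parse signatures visible in that diff; the job id and config path are placeholders:

    require 'datahen'

    parser = Datahen::Scraper::BatchParser.new(
      123,                 # placeholder job id
      './config.yaml',     # placeholder config file path
      worker_count: 4,
      dequeue_timeout: 30  # new in this release: dequeue API request timeout in seconds
    )
    parser.exec_parse(true) # save = true; signature shown in the diff below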
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -32,6 +32,7 @@ module Datahen
 option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
 option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
 option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def create(scraper_name, git_repository)
   # puts "options #{options}"
   client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
 option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
 option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
 option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def update(scraper_name)
   client = Client::Scraper.new(options)
   puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
 option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
 option :proxy_type, desc: 'Set the Proxy type. Default: standard'
 option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def start(scraper_name)
   client = Client::ScraperJob.new(options)
   puts "Starting a scrape job..."

data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -104,6 +104,7 @@ module Datahen
 option :proxy_type, desc: 'Set the Proxy type. Default: standard'
 option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
 option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def update(scraper_name)
   if options[:job]
     client = Client::Job.new(options)

data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -45,6 +45,7 @@ module Datahen
 option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
 option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
 option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def add(scraper_name, url)
   begin
     options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
 option :page_type, :aliases => :t, desc: 'Set page type'
 option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
 option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
 def update(scraper_name, gid)
   begin
     options[:vars] = JSON.parse(options[:vars]) if options[:vars]
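All four CLI commands gain the same cap on fetched page size. A hedged usage sketch assembled only from calls visible in this diff (Client::Scraper.new plus client.update); the scraper name is a placeholder and the byte interpretation of the value is an assumption:

    require 'datahen'

    # Cap fetched pages for an existing scraper; 0 means any size.
    options = { max_page_size: 2_000_000 } # illustrative value
    client = Datahen::Client::Scraper.new(options)
    puts client.update('my-scraper', options)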
data/lib/datahen/client/job.rb
CHANGED
@@ -21,6 +21,7 @@ module Datahen
 body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
 body[:profile] = opts[:profile] if opts[:profile]
+body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 params = @options.merge({body: body.to_json})

 self.class.put("/jobs/#{job_id}", params)

data/lib/datahen/client/job_page.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
 body[:page_type] = opts[:page_type] if opts[:page_type]
 body[:priority] = opts[:priority] if opts[:priority]
 body[:vars] = opts[:vars] if opts[:vars]
+body[:max_size] = opts[:max_size] if opts[:max_size]

 params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
 body[:ua_type] = opts[:ua_type] if opts[:ua_type]
 body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
 body[:cookie] = opts[:cookie] if opts[:cookie]
+body[:max_size] = opts[:max_size] if opts[:max_size]

 params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
   page_types: page_types,
   parse_fetching_failed: parse_fetching_failed
 }
-params = @options.merge({body: body.to_json})
+params = @options.merge(opts).merge({body: body.to_json})
 self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
 end

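The one-line parse_dequeue change above is what lets callers pass request-level options through: opts is merged into the request params before the body, so a caller-supplied timeout (used by the batch parser below) reaches the HTTP layer, while the body is merged last and cannot be overridden. A standalone sketch of the merge order, assuming HTTParty-style option hashes:

    require 'json'

    base_options = { headers: { 'Content-Type' => 'application/json' } } # stand-in for @options
    opts = { timeout: 30 } # e.g. the batch parser's dequeue timeout
    params = base_options.merge(opts).merge({ body: { limit: 2 }.to_json })
    # => { headers: {...}, timeout: 30, body: "{\"limit\":2}" }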
data/lib/datahen/client/scraper.rb
CHANGED
@@ -28,6 +28,7 @@ module Datahen
 body[:profile] = opts[:profile] if opts[:profile]
 body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
 body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 params = @options.merge({body: body.to_json})
 self.class.post("/scrapers", params)
 end
@@ -49,6 +50,7 @@ module Datahen
 body[:profile] = opts[:profile] if opts[:profile]
 body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
 body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
 params = @options.merge({body: body.to_json})

 self.class.put("/scrapers/#{scraper_name}", params)
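Note the two guard styles above: create only sends max_page_size when the value is truthy, while update checks has_key? for both String and Symbol keys, so explicit false/nil values still reach the API and string-keyed option hashes are handled. A small self-contained illustration:

    opts = { multiple_jobs: false }
    body = {}

    # Truthy guard: an explicit false is dropped, so the flag could never
    # be switched back off.
    body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
    body # => {}

    # has_key? guard: the explicit false is preserved and sent.
    body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
    body # => { multiple_jobs: false }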
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,7 @@ module Datahen
 body[:standard_worker_count] = opts[:workers] if opts[:workers]
 body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 if opts[:vars]
   if opts[:vars].is_a?(Array)
     body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
 body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
 body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
 body[:profile] = opts[:profile] if opts[:profile]
+body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
 params = @options.merge({body: body.to_json})

 self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
 body[:page_type] = opts[:page_type] if opts[:page_type]
 body[:priority] = opts[:priority] if opts[:priority]
 body[:vars] = opts[:vars] if opts[:vars]
+body[:max_size] = opts[:max_size] if opts[:max_size]

 params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
 body[:ua_type] = opts[:ua_type] if opts[:ua_type]
 body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
 body[:cookie] = opts[:cookie] if opts[:cookie]
+body[:max_size] = opts[:max_size] if opts[:max_size]

 params = @options.merge({body: body.to_json})

data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -8,22 +8,107 @@ module Datahen
 NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
 NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

-
-
-
-
-
+# Configuration file path.
+# @return [String] config file path
+attr_accessor :config_file
+# Garbage collector request counter.
+# @return [Integer] garbage collector counter
+attr_accessor :garbage_count
+# Last printed message, useful to prevent duplicated log messages.
+# @return [String] last printed message
+attr_accessor :last_message
+# Second dequeue counter used to prevent false negative warning messages.
+# @return [Integer] second dequeue counter
+attr_accessor :second_dequeue_count
+# Dequeue API request timeout in seconds.
+# @return [Integer] dequeue API request timeout in seconds
+attr_accessor :dequeue_timeout
+# Job id to be executed.
+# @return [Integer] job id
+attr_reader :job_id
+# Parallel worker quantity.
+# @return [Integer] parallel worker quantity
+attr_reader :worker_count
+# Loaded pages array.
+# @return [Concurrent::Array<Hash>] loaded pages as an array
+attr_reader :pages
+# Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+# @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+attr_reader :loaded_pages
+# Max garbage collector requests before actually executing the garbage
+#   collector.
+# @return [Integer] max garbage request quantity before actually executing
+#   it
+attr_reader :max_garbage
+# Dequeue interval in seconds.
+# @return [Integer] dequeue interval in seconds
+attr_reader :dequeue_interval
+# Dequeue scale used to calculate the ideal dequeue size.
+# @return [Numeric] dequeue scale
+attr_reader :dequeue_scale
+# Known page types extracted from the config file.
+# @return [Array<String>] known page types
+attr_reader :page_types
+# Known parsers extracted from the config file.
+# @return [Concurrent::Hash<String, String>] known parsers
+attr_reader :parsers
+# Current config file loaded.
+# @return [Hash] current loaded configuration
+attr_reader :config
+# Datahen job pages client used for API pages dequeuing.
+# @return [Datahen::Client::JobPage] datahen job pages API client
+attr_reader :client
+# Garbage collector mutex used to synchronize garbage collector requests.
+# @return [Mutex] garbage collector mutex
+attr_reader :garbage_mutex
+# Current dequeuer thread.
+# @return [Thread] dequeuer thread
+attr_reader :dequeuer_thread
+# Dequeuer mutext used to synchronize page dequeuing.
+# @return [Mutex] dequeuer mutex
+attr_reader :dequeue_mutex
+# Dequeuer last run unix timestamp.
+# @return [Integer] dequeuer last run unix timestamp
+attr_reader :dequeuer_still_alive
+# Indicates whenever the wait time is because there are no more pages.
+# @return [Boolean] `true` when wait time is due to no more pages,
+#   else `false`
+attr_reader :not_found

+# Wait a specific amount of seconds.
+# @param [Integer] time_in_seconds Seconds to wait.
 def self.wait time_in_seconds
   Kernel.sleep time_in_seconds
 end

+# Get a unix timestamp.
+# @return [Integer] unix timestamp
+def self.timestamp
+  Time.new.utc.to_i
+end
+
+# Initialize a batch parser object.
+# @param [Integer] job_id Job id.
+# @param [String] config_file Config file path.
+# @param [Hash] opts ({}) Configuration options
+# @option opts [Integer] :worker_count (1) Parallel worker quantity.
+# @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+#   collector can be requested before actually executing.
+# @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+#   between page dequeuing.
+# @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
+#   calculate page dequeue size.
+# @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+#   timeout in seconds.
+# @option opts [Hash] :client_options ({}) Datahen client gem additional
+#   options (see Datahen::Client::Base#initialize method).
 def initialize(job_id, config_file, opts = {})
   opts = {
     worker_count: 1,
     max_garbage: 5,
     dequeue_interval: 3,
     dequeue_scale: 2,
+    dequeue_timeout: 30,
     client_options: {}
   }.merge opts

@@ -32,16 +117,23 @@ module Datahen
   @dequeue_interval = opts[:dequeue_interval]
   @dequeue_scale = opts[:dequeue_scale]
   @max_garbage = opts[:max_garbage]
-  @pages = Concurrent::Hash.new
+  @pages = Concurrent::Array.new
+  @loaded_pages = Concurrent::Hash.new
   @garbage_mutex = Mutex.new
+  @dequeue_mutex = Mutex.new
+  @not_found = false
+  self.dequeue_timeout = opts[:dequeue_timeout]
   self.second_dequeue_count = 0
   self.garbage_count = 0
   self.config_file = config_file
   self.load_config

   @client = Datahen::Client::JobPage.new(opts[:client_options])
+  nil
 end

+# Execute garbage collector after it is requested as many times as
+#   described by #max_garbage.
 def recollect_garbage
   self.garbage_mutex.synchronize do
     self.garbage_count += 1
@@ -51,8 +143,10 @@ module Datahen
       self.garbage_count = 0
     end
   end
+  nil
 end

+# Loads the config file into a Hash.
 def load_config
   # build page type to script file map
   @page_types = []
@@ -64,20 +158,40 @@ module Datahen
     self.parsers[v['page_type']] = v['file']
   end
   self.recollect_garbage
+  nil
 end

+# Print the message regardless of it being the same as the last message.
+# @param [String] message Message to display.
 def repeat_puts message
   puts message
-  self.last_message =
+  self.last_message = message
+  nil
 end

+# Print the message only when it is different from the last recorded
+#   message.
+# @param [String] message Message to display.
 def no_repeat_puts message
   return if message == self.last_message
   puts message
   self.last_message = message
+  nil
+end
+
+# Refresh dequeuer's still alive timestamp
+def dequeuer_is_alive!
+  self.dequeue_mutex.synchronize do
+    @dequeuer_still_alive = self.class.timestamp
+  end
+  nil
 end

+# Load new pages by dequeuing from the API.
+# @return [Integer] amount of pages loaded
 def load_pages
+  self.dequeuer_is_alive!
+
   # calculate dequeue size
   max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
   current_size = self.pages.length
@@ -93,13 +207,16 @@ module Datahen
   response = client.dequeue self.job_id,
     dequeue_size,
     self.page_types,
-    config['parse_fetching_failed']
+    config['parse_fetching_failed'],
+    timeout: self.dequeue_timeout
 rescue Net::ReadTimeout, Net::OpenTimeout => e
-  self.
+  self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+  self.dequeuer_is_alive!
   return 0
 rescue => e
   raise e
 end
+self.dequeuer_is_alive!

 # ensure a valid response or try again
 if response.nil? || response.response.code.to_i != 200
@@ -112,17 +229,20 @@ module Datahen
 count = 0
 (JSON.parse(response.body) || []).each do |page|
   count += 1
-  next if self.pages.has_key? page['gid']
-  self.pages[page['gid']] = page
+  next if self.loaded_pages.has_key? page['gid']
+  self.pages << (self.loaded_pages[page['gid']] = page)
 end
 response = nil
+self.dequeuer_is_alive!

 # recolect garbage to free some memory before parsing
 if count > 0
+  @not_found = false
   self.recollect_garbage
   self.repeat_puts "Found #{count} page(s) to parse"
   self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
 else
+  @not_found = true
   self.no_repeat_puts NOT_FOUND_MSG
 end

@@ -130,6 +250,44 @@ module Datahen
   count
 end

+# Ensures that the dequeuer thread exists and is running.
+# @return [Boolean] `true` if thread was alive, or `false` if had to
+#   create a new thread
+def ensure_dequeuer_thread
+  self.dequeue_mutex.synchronize do
+    # check if dequeuer thread is alive and healthy
+    if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+      still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+      return true if self.class.timestamp < still_alive_timeout
+
+      # kill dequeuer thread
+      self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+      self.dequeuer_thread.kill
+      @dequeuer_thread = nil
+      self.recollect_garbage
+      self.no_repeat_puts "Dequeuer thread was killed!"
+    end
+
+    # dequeuing on parallel (the ride never ends :D)
+    @dequeuer_thread = Thread.new do
+      while true
+        begin
+          self.load_pages
+          self.class.wait self.dequeue_interval
+        rescue => e
+          puts [e.message] + e.backtrace rescue 'error'
+        end
+      end
+      puts "Error: dequeuer died! D:"
+    end
+    self.repeat_puts "Dequeuer thread was started!"
+  end
+  false
+end
+
+# Dequeue one page from the previously loaded pages, and waits until there
+#   are new pages whenever there are no loaded pages.
+# @return [Hash] dequeued page
 def dequeue_pages
   # collect garbage
   self.recollect_garbage
@@ -137,24 +295,29 @@ module Datahen
   # return page if there are loeaded pages
   is_waiting = false
   while true do
-
-    unless
+    page = self.pages.shift
+    unless page.nil?
       puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
-
+      loaded_pages.delete(page['gid'])
+      return page
     end

     # be more verbose on worker waiting
     unless is_waiting
       is_waiting = true
       puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
-      if self.second_dequeue_count > 1
+      if self.second_dequeue_count > 1 && !self.not_found
         puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
       end
     end
     self.class.wait 1
+
+    # ensure the dequeuer thread is alive and healthy
+    self.ensure_dequeuer_thread
   end
 end

+# Dequeue pages and execute the parsers associated to them on parallel.
 def exec_parse save = false, keep_outputs = false
   if self.worker_count < 1
     self.no_repeat_puts NO_WORKERS_MSG
@@ -163,24 +326,15 @@ module Datahen
     self.no_repeat_puts "Spawing #{self.worker_count} workers"
   end

-  #
-
-  while true
-    begin
-      self.load_pages
-      self.class.wait self.dequeue_interval
-    rescue => e
-      puts [e.message] + e.backtrace rescue 'error'
-    end
-  end
-  puts "Error: dequeuer died! D:"
-  end
+  # start dequeuer
+  self.ensure_dequeuer_thread

   # process the pages
   dequeue = lambda{ self.dequeue_pages }
   Parallel.each(dequeue, in_threads: (worker_count)) do |page|
     parser_file = self.parsers[page['page_type']]
     begin
+      self.repeat_puts("Parsing page with GID #{page['gid']}")
       puts Datahen::Scraper::Parser.exec_parser_by_page(
         parser_file,
         page,
@@ -189,6 +343,7 @@ module Datahen
         nil,
         keep_outputs
       )
+      self.repeat_puts("Finish parsing page with GID #{page['gid']}")
     rescue Parallel::Kill => e
       puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
     rescue Parallel::Break => e
@@ -197,6 +352,8 @@ module Datahen
       puts [e.message] + e.backtrace rescue 'error'
     end
   end
+
+  nil
 end
 end
 end
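The dequeuer health check in ensure_dequeuer_thread is plain unix-timestamp arithmetic. A worked sketch using the defaults shown in this diff (variable names mirror the diff; the 66-second window follows from the default values):

    dequeue_timeout  = 30 # default of the new :dequeue_timeout option
    dequeue_interval = 3  # default seconds between dequeue rounds

    # dequeuer_is_alive! records a heartbeat like this after every step:
    dequeuer_still_alive = Time.new.utc.to_i

    # The thread counts as healthy while "now" is within two full
    # request-plus-wait cycles of the last heartbeat:
    still_alive_timeout = (dequeue_timeout + dequeue_interval) * 2 + dequeuer_still_alive
    Time.new.utc.to_i < still_alive_timeout # => true for the next 66 seconds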
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.15.10
+  version: 0.16.2
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
|