datahen 0.15.9 → 0.16.1
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +1 -0
- data/lib/datahen/cli/scraper.rb +3 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +3 -1
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/client/scraper_job_page.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +219 -38
- data/lib/datahen/version.rb +1 -1
- metadata +5 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8059eb96654de953b7a2d5df2f0a3d4f643d4ef3bba654b758e46e10c972bfc9
+  data.tar.gz: 2b57638a05fb85ed896398ffbd96b6af9342af9159a7a1b9964ef8c336a90d26
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5e81e909432bade42c7e3cbe1c38efbf835dd36cab92099ff11dc5b6f7677fdc3ebb7bb13dccedb29311dd1f0a55c8e8dc64252b94fc546b488566d39e330fb0
+  data.tar.gz: 07f44f03c2ae9636e04972fec7eeade9c0ded9a6cdcd4064515a960ba3af1021d77d8226fa79bd87626f27694c69825eda16b64723acc7d1d440a0953d554e3c
data/lib/datahen/cli/parser.rb CHANGED
@@ -72,6 +72,7 @@ module Datahen
     option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
     option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
     option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+    option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
     def batch_exec_parse(scraper_name, config_file)
       if options[:job]
         job_id = options[:job]
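The new dequeue-timeout flag only governs the page-dequeue API request made by the batch parser. A hypothetical invocation (assuming the Thor command is registered under the method name shown above; the scraper name and config path are placeholders):

  datahen parser batch_exec_parse my-scraper ./config.yaml --dequeue-timeout 60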
data/lib/datahen/cli/scraper.rb CHANGED
@@ -32,6 +32,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
     option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
data/lib/datahen/cli/scraper_job.rb CHANGED
@@ -104,6 +104,7 @@ module Datahen
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb CHANGED
@@ -45,6 +45,7 @@ module Datahen
     option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
     option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
     option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+    option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def add(scraper_name, url)
       begin
         options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
     option :page_type, :aliases => :t, desc: 'Set page type'
     option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+    option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
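These CLI additions all share the same semantics: the default 0 means a page may be any size, while a value greater than 0 caps the fetched page body. Hypothetical invocations (assuming the usual Thor command registration; the scraper name and GID are placeholders):

  datahen scraper update my-scraper --max_page_size 1048576
  datahen scraper page update my-scraper www-abc123 --max_size 1048576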
data/lib/datahen/client/job.rb CHANGED
@@ -21,6 +21,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})

       self.class.put("/jobs/#{job_id}", params)
data/lib/datahen/client/job_page.rb CHANGED
@@ -15,6 +15,7 @@ module Datahen
       body[:page_type] = opts[:page_type] if opts[:page_type]
       body[:priority] = opts[:priority] if opts[:priority]
       body[:vars] = opts[:vars] if opts[:vars]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -36,6 +37,7 @@ module Datahen
       body[:ua_type] = opts[:ua_type] if opts[:ua_type]
       body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
       body[:cookie] = opts[:cookie] if opts[:cookie]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -48,7 +50,7 @@ module Datahen
         page_types: page_types,
         parse_fetching_failed: parse_fetching_failed
       }
-      params = @options.merge({body: body.to_json})
+      params = @options.merge(opts).merge({body: body.to_json})
       self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
     end

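The parse_dequeue change is what makes the new dequeue timeout effective: merging opts into the request params lets callers attach per-request HTTParty options such as :timeout. A minimal sketch of a call (the job id and page types are placeholders; this mirrors how BatchParser invokes it further below):

  require 'datahen'

  client = Datahen::Client::JobPage.new
  # Reserve up to 10 pages of the listed types; fail the HTTP call after 30s.
  response = client.dequeue(12345, 10, ["products"], false, timeout: 30)
  puts response.response.code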
data/lib/datahen/client/scraper.rb CHANGED
@@ -28,6 +28,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
       body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end
@@ -49,6 +50,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
       body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+      body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}", params)
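Note that the update variant guards with has_key? instead of truthiness, so explicitly passed falsy values (0, false) still reach the API; that is what allows resetting max_page_size back to unlimited. A short sketch (the scraper name is a placeholder):

  require 'datahen'

  client = Datahen::Client::Scraper.new
  client.update("my-scraper", max_page_size: 0)  # key present, so 0 is sent
  client.update("my-scraper", {})                # key absent, field omitted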
data/lib/datahen/client/scraper_job.rb CHANGED
@@ -11,6 +11,7 @@ module Datahen
       body[:standard_worker_count] = opts[:workers] if opts[:workers]
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})

       self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_job_page.rb CHANGED
@@ -15,6 +15,7 @@ module Datahen
       body[:page_type] = opts[:page_type] if opts[:page_type]
       body[:priority] = opts[:priority] if opts[:priority]
       body[:vars] = opts[:vars] if opts[:vars]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

@@ -59,6 +60,7 @@ module Datahen
       body[:ua_type] = opts[:ua_type] if opts[:ua_type]
       body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
       body[:cookie] = opts[:cookie] if opts[:cookie]
+      body[:max_size] = opts[:max_size] if opts[:max_size]

       params = @options.merge({body: body.to_json})

data/lib/datahen/scraper/batch_parser.rb CHANGED
@@ -5,25 +5,110 @@ module Datahen
   module Scraper
     class BatchParser
       NOT_FOUND_MSG = "No more pages to parse found"
-      NO_DEQUEUE_COUNT_MSG = "
-      NO_WORKERS_MSG = "
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"

-
-
-
-
-
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Current config file loaded.
+      # @return [Hash] current loaded configuration
+      attr_reader :config
+      # Datahen job pages client used for API pages dequeuing.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutext used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer last run unix timestamp.
+      # @return [Integer] dequeuer last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whenever the wait time is because there are no more pages.
+      # @return [Boolean] `true` when wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found

+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
       def self.wait time_in_seconds
         Kernel.sleep time_in_seconds
       end

+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeuing.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
+      #   calculate page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
       def initialize(job_id, config_file, opts = {})
         opts = {
           worker_count: 1,
           max_garbage: 5,
           dequeue_interval: 3,
           dequeue_scale: 2,
+          dequeue_timeout: 30,
           client_options: {}
         }.merge opts

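A minimal usage sketch based on the initializer and option defaults documented above (the job id and config path are placeholders; exec_parse appears at the end of this file):

  require 'datahen'

  parser = Datahen::Scraper::BatchParser.new(
    12345,                 # job id (placeholder)
    "./config.yaml",       # scraper config file (placeholder)
    worker_count: 4,       # parallel parser workers
    dequeue_scale: 2,      # dequeue up to (worker_count * dequeue_scale) pages
    dequeue_timeout: 60    # new in this release: dequeue API timeout in seconds
  )
  parser.exec_parse(true)  # parse pages and save the outputs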
@@ -32,23 +117,36 @@ module Datahen
         @dequeue_interval = opts[:dequeue_interval]
         @dequeue_scale = opts[:dequeue_scale]
         @max_garbage = opts[:max_garbage]
-        @pages = Concurrent::
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
         @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
+        self.second_dequeue_count = 0
         self.garbage_count = 0
         self.config_file = config_file
         self.load_config

         @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
       end

+      # Execute garbage collector after it is requested as many times as
+      #   described by #max_garbage.
       def recollect_garbage
         self.garbage_mutex.synchronize do
-
-
-
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
         end
+        nil
       end

+      # Loads the config file into a Hash.
       def load_config
         # build page type to script file map
         @page_types = []
@@ -60,20 +158,40 @@ module Datahen
           self.parsers[v['page_type']] = v['file']
         end
         self.recollect_garbage
+        nil
       end

+      # Print the message regardless of it being the same as the last message.
+      # @param [String] message Message to display.
       def repeat_puts message
         puts message
-        self.last_message =
+        self.last_message = message
+        nil
       end

+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
       def no_repeat_puts message
         return if message == self.last_message
         puts message
         self.last_message = message
+        nil
       end

+      # Refresh dequeuer's still alive timestamp
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
+      end
+
+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
       def load_pages
+        self.dequeuer_is_alive!
+
         # calculate dequeue size
         max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
         current_size = self.pages.length
@@ -84,10 +202,21 @@ module Datahen
         dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size

         # reserve and get to pages parse
-        response =
-
-        self.
-
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
+          return 0
+        rescue => e
+          raise e
+        end
+        self.dequeuer_is_alive!

         # ensure a valid response or try again
         if response.nil? || response.response.code.to_i != 200
@@ -100,16 +229,20 @@ module Datahen
         count = 0
         (JSON.parse(response.body) || []).each do |page|
           count += 1
-          next if self.
-          self.pages[page['gid']] = page
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
         end
         response = nil
+        self.dequeuer_is_alive!

         # recolect garbage to free some memory before parsing
         if count > 0
+          @not_found = false
           self.recollect_garbage
           self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
         else
+          @not_found = true
           self.no_repeat_puts NOT_FOUND_MSG
         end

@@ -117,21 +250,74 @@ module Datahen
         count
       end

+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if thread was alive, or `false` if had to
+      #   create a new thread
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeuing on parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, and waits until there
+      #   are new pages whenever there are no loaded pages.
+      # @return [Hash] dequeued page
       def dequeue_pages
         # collect garbage
-        self.
-        if self.garbage_count > self.max_garbage
-          self.recollect_garbage
-        end
+        self.recollect_garbage

         # return page if there are loeaded pages
+        is_waiting = false
         while true do
-
-
+          page = self.pages.shift
+          unless page.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            loaded_pages.delete(page['gid'])
+            return page
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1 && !self.not_found
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
           self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
         end
       end

+      # Dequeue pages and execute the parsers associated to them on parallel.
       def exec_parse save = false, keep_outputs = false
         if self.worker_count < 1
           self.no_repeat_puts NO_WORKERS_MSG
@@ -140,20 +326,10 @@ module Datahen
           self.no_repeat_puts "Spawing #{self.worker_count} workers"
         end

-        #
-
-        keep_dequeue[0] = true
-        Thread.new do
-          while keep_dequeue[0]
-            begin
-              self.load_pages
-              self.class.wait self.dequeue_interval
-            rescue => e
-              puts [e.message] + e.backtrace rescue 'error'
-            end
-          end
-        end
+        # start dequeuer
+        self.ensure_dequeuer_thread

+        # process the pages
         dequeue = lambda{ self.dequeue_pages }
         Parallel.each(dequeue, in_threads: (worker_count)) do |page|
           parser_file = self.parsers[page['page_type']]
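The health check in ensure_dequeuer_thread above allows twice the worst-case cycle time (one API call plus one sleep) before declaring the dequeuer stale and restarting it. With the shipped defaults the window works out as follows (illustration only):

  dequeue_timeout  = 30  # seconds an API call may take
  dequeue_interval = 3   # seconds slept between dequeues
  staleness_window = (dequeue_timeout + dequeue_interval) * 2
  # => 66 seconds without a heartbeat before the thread is killed and restarted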
@@ -166,11 +342,16 @@ module Datahen
             nil,
             keep_outputs
           )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
           rescue => e
             puts [e.message] + e.backtrace rescue 'error'
           end
         end
-
+
+        nil
       end
     end
   end
data/lib/datahen/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.15.9"
+  VERSION = "0.16.1"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.15.9
+  version: 0.16.1
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -276,7 +276,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -292,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubygems_version: 3.0.3
-signing_key:
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []