datahen 0.14.26 → 0.16.0
- checksums.yaml +4 -4
- data/datahen.gemspec +3 -1
- data/lib/datahen/cli/parser.rb +48 -7
- data/lib/datahen/cli/scraper.rb +3 -0
- data/lib/datahen/cli/scraper_job.rb +1 -0
- data/lib/datahen/cli/scraper_page.rb +2 -0
- data/lib/datahen/client/job.rb +1 -0
- data/lib/datahen/client/job_page.rb +12 -0
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +2 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/batch_parser.rb +358 -0
- data/lib/datahen/scraper/executor.rb +4 -1
- data/lib/datahen/scraper/parser.rb +16 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +4 -1
- data/lib/datahen/version.rb +1 -1
- metadata +35 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 39397d5cb4e60a6d24cdec5bd979f543a23019b7c9b9dffe6140a204d330465c
+  data.tar.gz: 1db7c2b448179c2bc4b56e99428dfb4303cbb1451df032ec43cb5264f58935ec
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7058506211d537c8ea3c9a521625fd339b255f41188a70341cc04683ca1abc1fa7f19ed796026b5e07679bb2fd7e57f096d319fbe8a75ae6fb7fd59a704a9824
+  data.tar.gz: b8b60607cd27acbd654afe0816b0b7738871ca61c370fbafa747985d3723fec64f9b07c6078b34c1192db1d7b160682522a4901a7047c3d067860bc2b745b0b0
data/datahen.gemspec
CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
   spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
data/lib/datahen/cli/parser.rb
CHANGED
@@ -43,17 +43,17 @@ module Datahen
       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
       option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
       def exec_parse(scraper_name, parser_file, *gids)
+        if options[:job]
+          job_id = options[:job]
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end
+
         gids.each do |gid|
           begin
             puts "Parsing #{gid}"
 
-            if options[:job]
-              job_id = options[:job]
-            else
-              job = Client::ScraperJob.new(options).find(scraper_name)
-              job_id = job['id']
-            end
-
             vars = JSON.parse(options[:vars]) if options[:vars]
             puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
           rescue => e
@@ -61,6 +61,47 @@ module Datahen
           end
         end
       end
+
+      desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+      long_desc <<-LONGDESC
+        Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+      option :"workers", type: :numeric, default: 1, desc: "Worker count"
+      option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+      option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+      option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+      option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
+      def batch_exec_parse(scraper_name, config_file)
+        if options[:job]
+          job_id = options[:job]
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end
+
+        # make stdout and stderr sync to prevent buffering
+        old_stdout_sync = $stdout.sync
+        old_stderr_sync = $stderr.sync
+        $stdout.sync = true
+        $stderr.sync = true
+
+        begin
+          batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+            worker_count: options[:"workers"],
+            max_garbage: options[:"max-garbage"],
+            dequeue_interval: options[:"dequeue-interval"],
+            dequeue_scale: options[:"dequeue-scale"]
+          batch.exec_parse true, options[:"keep-outputs"]
+        rescue => e
+          puts [e.message] + e.backtrace
+        end
+
+        # restore stdout and stderr sync to their previous state
+        $stdout.sync = old_stdout_sync
+        $stderr.sync = old_stderr_sync
+      end
     end
   end
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -32,6 +32,7 @@ module Datahen
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+      option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def create(scraper_name, git_repository)
         # puts "options #{options}"
         client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
       option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+      option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def update(scraper_name)
         client = Client::Scraper.new(options)
         puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
       option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
       option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+      option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def start(scraper_name)
         client = Client::ScraperJob.new(options)
         puts "Starting a scrape job..."
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -104,6 +104,7 @@ module Datahen
       option :proxy_type, desc: 'Set the Proxy type. Default: standard'
       option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
       option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def update(scraper_name)
         if options[:job]
           client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -45,6 +45,7 @@ module Datahen
       option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
       option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
       option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
+      option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def add(scraper_name, url)
         begin
           options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
       option :page_type, :aliases => :t, desc: 'Set page type'
       option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
       option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+      option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. Set a value greater than 0 to use it as the limit, 0 means any size. Default: 0'
       def update(scraper_name, gid)
         begin
           options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/client/job.rb
CHANGED
@@ -21,6 +21,7 @@ module Datahen
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})
 
         self.class.put("/jobs/#{job_id}", params)
data/lib/datahen/client/job_page.rb
CHANGED
@@ -15,6 +15,7 @@ module Datahen
         body[:page_type] = opts[:page_type] if opts[:page_type]
         body[:priority] = opts[:priority] if opts[:priority]
         body[:vars] = opts[:vars] if opts[:vars]
+        body[:max_size] = opts[:max_size] if opts[:max_size]
 
         params = @options.merge({body: body.to_json})
 
@@ -36,12 +37,23 @@ module Datahen
         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         body[:cookie] = opts[:cookie] if opts[:cookie]
+        body[:max_size] = opts[:max_size] if opts[:max_size]
 
         params = @options.merge({body: body.to_json})
 
         self.class.post("/jobs/#{job_id}/pages", params)
       end
 
+      def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+        body = {
+          limit: limit,
+          page_types: page_types,
+          parse_fetching_failed: parse_fetching_failed
+        }
+        params = @options.merge(opts).merge({body: body.to_json})
+        self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+      end
+
       def parsing_update(job_id, gid, opts={})
         body = {}
         body[:outputs] = opts.fetch(:outputs) {[]}
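The dequeue endpoint added above is what the new batch parser uses to reserve pending pages: it asks the API to mark up to limit pages of the given page types as being parsed and returns them as JSON. A minimal usage sketch, assuming a hypothetical job id and page types (the client options hash is the same one Datahen::Client::Base#initialize normally takes):

  # Reserve up to 10 pending "listings"/"details" pages of job 123 for parsing.
  # The opts hash is merged into the request options, so the HTTParty :timeout
  # can be passed through, just like BatchParser does.
  client = Datahen::Client::JobPage.new({})
  response = client.dequeue(123, 10, ['listings', 'details'], false, timeout: 30)
  pages = JSON.parse(response.body) || []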
data/lib/datahen/client/scraper.rb
CHANGED
@@ -28,6 +28,7 @@ module Datahen
         body[:profile] = opts[:profile] if opts[:profile]
         body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
         body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})
         self.class.post("/scrapers", params)
       end
@@ -49,6 +50,7 @@ module Datahen
         body[:profile] = opts[:profile] if opts[:profile]
         body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
         body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+        body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
         params = @options.merge({body: body.to_json})
 
         self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,7 @@ module Datahen
         body[:standard_worker_count] = opts[:workers] if opts[:workers]
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         if opts[:vars]
           if opts[:vars].is_a?(Array)
             body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
         body[:profile] = opts[:profile] if opts[:profile]
+        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
         params = @options.merge({body: body.to_json})
 
         self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/scraper/batch_parser.rb
ADDED
@@ -0,0 +1,358 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Current config file loaded.
+      # @return [Hash] current loaded configuration
+      attr_reader :config
+      # Datahen job pages client used for API pages dequeuing.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutex used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer last run unix timestamp.
+      # @return [Integer] dequeuer last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whether the wait time is because there are no more pages.
+      # @return [Boolean] `true` when wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found
+
+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeuing.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+      #   calculate page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          dequeue_timeout: 30,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
+      end
+
+      # Execute garbage collector after it is requested as many times as
+      #   described by #max_garbage.
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+        nil
+      end
+
+      # Loads the config file into a Hash.
+      def load_config
+        # build page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+        nil
+      end
+
+      # Print the message regardless of it being the same as the last message.
+      # @param [String] message Message to display.
+      def repeat_puts message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Refresh dequeuer's still alive timestamp
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
+      end
+
+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
+      def load_pages
+        self.dequeuer_is_alive!
+
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get pages to parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
+          return 0
+        rescue => e
+          raise e
+        end
+        self.dequeuer_is_alive!
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
+        end
+        response = nil
+        self.dequeuer_is_alive!
+
+        # recollect garbage to free some memory before parsing
+        if count > 0
+          @not_found = false
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          @not_found = true
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if thread was alive, or `false` if had to
+      #   create a new thread
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeuing on parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, and wait until there
+      #   are new pages whenever there are none loaded.
+      # @return [Hash] dequeued page
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return a page if there are loaded pages
+        is_waiting = false
+        while true do
+          page = self.pages.shift
+          unless page.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            loaded_pages.delete(page['gid'])
+            return page
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1 && !self.not_found
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
+        end
+      end
+
+      # Dequeue pages and execute the parsers associated with them in parallel.
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawning #{self.worker_count} workers"
+        end
+
+        # start dequeuer
+        self.ensure_dequeuer_thread
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+
+        nil
+      end
+    end
+  end
+end
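BatchParser#load_config reads only two things from the YAML config: the parsers list (mapping a page type to a parser script file, honoring the optional disabled flag) and the parse_fetching_failed flag that is forwarded to the dequeue API call. A minimal sketch, with hypothetical page types and file paths, of the Hash that YAML.load_file(config_file) should yield, followed by the programmatic equivalent of the CLI batch command above:

  # Hypothetical config structure expected by BatchParser#load_config
  # (shown inline here; BatchParser itself loads it from config_file via YAML.load_file).
  config = {
    'parse_fetching_failed' => false,   # also dequeue pages whose fetch failed
    'parsers' => [
      { 'page_type' => 'listings', 'file' => 'parsers/listings.rb' },
      { 'page_type' => 'details',  'file' => 'parsers/details.rb', 'disabled' => true }
    ]
  }

  # Programmatic use mirroring the CLI batch command (the job id is hypothetical).
  batch = Datahen::Scraper::BatchParser.new 123, 'config.yaml', worker_count: 4
  batch.exec_parse true, false   # save outputs, don't keep existing outputs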
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -6,7 +6,7 @@ module Datahen
       # Max allowed page size when querying outputs (see #find_outputs).
       MAX_FIND_OUTPUTS_PER_PAGE = 500
 
-      attr_accessor :filename, :gid, :job_id
+      attr_accessor :filename, :page, :gid, :job_id
 
       include Datahen::Plugin::ContextExposer
 
@@ -15,6 +15,9 @@ module Datahen
       end
 
       def init_page()
+        # skip whenever a page is provided
+        return self.page unless self.page.nil?
+
         if job_id
           puts "getting Job Page"
           init_job_page
data/lib/datahen/scraper/parser.rb
CHANGED
@@ -18,6 +18,22 @@ module Datahen
         end
       end
 
+      def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+        extname = File.extname(filename)
+        case extname
+        when '.rb'
+          executor = RubyParserExecutor.new(
+            filename: filename,
+            page: page,
+            job_id: job_id,
+            vars: vars,
+            keep_outputs: keep_outputs
+          )
+          executor.exec_parser(save)
+        else
+          puts "Unable to find a parser executor for file type \"#{extname}\""
+        end
+      end
 
     end
   end
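Unlike exec_parser_page, which receives a GID and has to look the page up, exec_parser_by_page receives the already-dequeued page Hash, so the executor can skip the extra page fetch (see the early return added to init_page above); this is the entry point BatchParser calls for every dequeued page. A minimal sketch with a hypothetical page Hash and parser path:

  # Hypothetical dequeued page; 'page_type' is what BatchParser uses to pick the parser file.
  page = { 'gid' => 'www.example.com-abc123', 'page_type' => 'listings', 'vars' => {} }

  puts Datahen::Scraper::Parser.exec_parser_by_page(
    'parsers/listings.rb',  # parser script (hypothetical path)
    page,                   # full page Hash instead of a GID
    123,                    # job id (hypothetical)
    false,                  # save: when false, outputs are not persisted to the job
    nil,                    # vars: ignored here, the provided page already carries its own vars
    false                   # keep_outputs
  )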
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -12,7 +12,8 @@ module Datahen
 
       def initialize(options={})
         @filename = options.fetch(:filename) { raise "Filename is required"}
-        @
+        @page = options.fetch(:page) { nil }
+        @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
         @job_id = options.fetch(:job_id)
         @page_vars = options.fetch(:vars) { {} }
         @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
       end
 
       def init_page_vars(page)
+        return self.page unless self.page.nil?
+
         if !@page_vars.nil? && !@page_vars.empty?
           page['vars'] = @page_vars
         end
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.26
+  version: 0.16.0
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -52,6 +52,34 @@ dependencies:
   - - "~>"
     - !ruby/object:Gem::Version
       version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -232,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -247,7 +276,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -263,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubygems_version: 3.0.3
-signing_key:
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []