datahen 0.14.26 → 0.16.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dac57d98132102aa9ae8244b6528394473b2bdeb9992c7ea15d6979eaf87d4af
-  data.tar.gz: e68858d2f088b2d7b8538411dd59cf2ae2de7866416fc213c6a6fa009d93c556
+  metadata.gz: 39397d5cb4e60a6d24cdec5bd979f543a23019b7c9b9dffe6140a204d330465c
+  data.tar.gz: 1db7c2b448179c2bc4b56e99428dfb4303cbb1451df032ec43cb5264f58935ec
 SHA512:
-  metadata.gz: 857126b2f7ec4fa058aaa8d5b4a7095108224bdf3f6ece690dbfc930e0527a294853705227f0e63be5af3524982fff21f7d3c9d940c22b31caade5139a3d607b
-  data.tar.gz: 81ecf95378e6f4aa31a87e39a82bc815216fce1b84aa65d8f7f2aa8ee8b19b871f08eb8c86025d9dc8d84617f20864f5f39c21d7b8ac4900a739599c0aa6283c
+  metadata.gz: 7058506211d537c8ea3c9a521625fd339b255f41188a70341cc04683ca1abc1fa7f19ed796026b5e07679bb2fd7e57f096d319fbe8a75ae6fb7fd59a704a9824
+  data.tar.gz: b8b60607cd27acbd654afe0816b0b7738871ca61c370fbafa747985d3723fec64f9b07c6078b34c1192db1d7b160682522a4901a7047c3d067860bc2b745b0b0
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.2.2'
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
   spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
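
The two new runtime dependencies back the new batch parser: concurrent-ruby provides the thread-safe collections and parallel provides the worker pool. For gem consumers nothing changes beyond the higher Ruby floor; a hypothetical Gemfile for the new release:

```ruby
# Hypothetical consumer Gemfile. datahen 0.16.0 requires Ruby >= 2.4.4
# (up from 2.2.2) and resolves concurrent-ruby (~> 1.1) and parallel
# (~> 1.20) on its own as runtime dependencies.
source 'https://rubygems.org'

gem 'datahen', '~> 0.16.0'
```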
@@ -43,17 +43,17 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def exec_parse(scraper_name, parser_file, *gids)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
       gids.each do |gid|
         begin
           puts "Parsing #{gid}"
 
-          if options[:job]
-            job_id = options[:job]
-          else
-            job = Client::ScraperJob.new(options).find(scraper_name)
-            job_id = job['id']
-          end
-
           vars = JSON.parse(options[:vars]) if options[:vars]
           puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
         rescue => e
@@ -61,6 +61,48 @@ module Datahen
         end
       end
     end
+
+    desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+    long_desc <<-LONGDESC
+      Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+    LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+    option :"workers", type: :numeric, default: 1, desc: "Worker count"
+    option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+    option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeues"
+    option :"dequeue-scale", type: :numeric, default: 2, desc: "Multiplier applied to the worker count to decide how many pages to dequeue"
+    option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
+    def batch_exec_parse(scraper_name, config_file)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
+      # make stdout and stderr sync to prevent buffering
+      old_stdout_sync = $stdout.sync
+      old_stderr_sync = $stderr.sync
+      $stdout.sync = true
+      $stderr.sync = true
+
+      begin
+        batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+          worker_count: options[:"workers"],
+          max_garbage: options[:"max-garbage"],
+          dequeue_interval: options[:"dequeue-interval"],
+          dequeue_scale: options[:"dequeue-scale"],
+          dequeue_timeout: options[:"dequeue-timeout"]
+        batch.exec_parse true, options[:"keep-outputs"]
+      rescue => e
+        puts [e.message] + e.backtrace
+      end
+
+      # restore whatever sync state stdout and stderr had
+      $stdout.sync = old_stdout_sync
+      $stderr.sync = old_stderr_sync
+    end
   end
 end
 
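The new command is a thin wrapper around `Datahen::Scraper::BatchParser` (the new file further down in this diff). As a minimal sketch, assuming a placeholder job id and config path, its programmatic equivalent is:

```ruby
require 'datahen'

# Minimal sketch of what `datahen parser batch` does under the hood;
# the job id 12345 and config path are placeholder values.
batch = Datahen::Scraper::BatchParser.new 12345, './config.yaml',
  worker_count: 4,      # parallel parser workers
  max_garbage: 5,       # GC requests before an actual GC.start
  dequeue_interval: 3,  # seconds between dequeue API calls
  dequeue_scale: 2,     # dequeue up to 2x the worker count
  dequeue_timeout: 30   # dequeue API request timeout in seconds

# first argument saves outputs to the job, second keeps existing outputs
batch.exec_parse true, false
```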
@@ -32,6 +32,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -57,6 +58,7 @@ module Datahen
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
     option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
+    option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -94,6 +96,7 @@ module Datahen
     option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
+    option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
@@ -104,6 +104,7 @@ module Datahen
     option :proxy_type, desc: 'Set the Proxy type. Default: standard'
     option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :max_page_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
@@ -45,6 +45,7 @@ module Datahen
     option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestamp format.'
     option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
     option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirects. Default: false'
+    option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def add(scraper_name, url)
       begin
         options[:headers] = JSON.parse(options[:headers]) if options[:headers]
@@ -78,6 +79,7 @@ module Datahen
     option :page_type, :aliases => :t, desc: 'Set page type'
     option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
+    option :max_size, type: :numeric, desc: 'Set the max page size when fetching a page. A value greater than 0 sets the limit; 0 means any size. Default: 0'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
@@ -21,6 +21,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})
 
       self.class.put("/jobs/#{job_id}", params)
@@ -15,6 +15,7 @@ module Datahen
       body[:page_type] = opts[:page_type] if opts[:page_type]
       body[:priority] = opts[:priority] if opts[:priority]
       body[:vars] = opts[:vars] if opts[:vars]
+      body[:max_size] = opts[:max_size] if opts[:max_size]
 
       params = @options.merge({body: body.to_json})
 
@@ -36,12 +37,23 @@ module Datahen
       body[:ua_type] = opts[:ua_type] if opts[:ua_type]
       body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
       body[:cookie] = opts[:cookie] if opts[:cookie]
+      body[:max_size] = opts[:max_size] if opts[:max_size]
 
       params = @options.merge({body: body.to_json})
 
       self.class.post("/jobs/#{job_id}/pages", params)
     end
 
+    def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+      body = {
+        limit: limit,
+        page_types: page_types,
+        parse_fetching_failed: parse_fetching_failed
+      }
+      params = @options.merge(opts).merge({body: body.to_json})
+      self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+    end
+
     def parsing_update(job_id, gid, opts={})
       body = {}
       body[:outputs] = opts.fetch(:outputs) {[]}
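
This `parse_dequeue` endpoint is what BatchParser polls from its dequeuer thread. Called directly it looks roughly like the following sketch; the job id, limit, and page type names are placeholders, and the `timeout:` option rides along into the HTTP request via the merged options hash:

```ruby
require 'datahen'

# Hypothetical direct call to the new dequeue endpoint; 12345 and the
# page type names are placeholder values.
client = Datahen::Client::JobPage.new({})
response = client.dequeue 12345, # job id
  10,                            # limit: max pages to reserve
  ['products', 'details'],       # page types to dequeue
  false,                         # parse_fetching_failed
  timeout: 30                    # merged into the HTTParty request options

puts response.body               # JSON array of reserved pages on success
```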
@@ -28,6 +28,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
       body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})
       self.class.post("/scrapers", params)
     end
@@ -49,6 +50,7 @@ module Datahen
       body[:profile] = opts[:profile] if opts[:profile]
       body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
       body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
+      body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
       params = @options.merge({body: body.to_json})
 
       self.class.put("/scrapers/#{scraper_name}", params)
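
Note the guard difference: create forwards `max_page_size` only when the value is truthy, while update forwards it whenever the key is present, so a caller can deliberately send a nil to clear the setting (whether the API treats an explicit null as a reset is an assumption, not something this diff shows). A minimal sketch of the two guards:

```ruby
# Sketch of the two guard styles used by create vs update.
opts = { max_page_size: nil }

body = {}
body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
body # => {} — the truthiness guard drops nil (and absent) values

body = {}
body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
body # => { max_page_size: nil } — key presence alone forwards the value
```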
@@ -11,6 +11,7 @@ module Datahen
       body[:standard_worker_count] = opts[:workers] if opts[:workers]
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       if opts[:vars]
         if opts[:vars].is_a?(Array)
           body[:vars] = opts[:vars]
@@ -37,6 +38,7 @@ module Datahen
       body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
       body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
       body[:profile] = opts[:profile] if opts[:profile]
+      body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
       params = @options.merge({body: body.to_json})
 
       self.class.put("/scrapers/#{scraper_name}/current_job", params)
@@ -1,6 +1,7 @@
 require "datahen/error"
 require "datahen/plugin"
 require "datahen/scraper/parser"
+require "datahen/scraper/batch_parser"
 require "datahen/scraper/seeder"
 require "datahen/scraper/finisher"
 require "datahen/scraper/executor"
lib/datahen/scraper/batch_parser.rb ADDED
@@ -0,0 +1,358 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates in the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Currently loaded config file.
+      # @return [Hash] currently loaded configuration
+      attr_reader :config
+      # Datahen job pages client used to dequeue pages from the API.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutex used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer's last run unix timestamp.
+      # @return [Integer] dequeuer's last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whether the wait time is because there are no more pages.
+      # @return [Boolean] `true` when the wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found
+
+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeues.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+      #   calculate the page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          dequeue_timeout: 30,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
+      end
+
+      # Execute the garbage collector after it has been requested as many
+      #   times as described by #max_garbage.
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+        nil
+      end
+
+      # Loads the config file into a Hash.
+      def load_config
+        # build the page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+        nil
+      end
+
+      # Print the message regardless of whether it matches the last message.
+      # @param [String] message Message to display.
+      def repeat_puts message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Refresh the dequeuer's still alive timestamp.
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
+      end
+
+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
+      def load_pages
+        self.dequeuer_is_alive!
+
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get pages to parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.repeat_puts "Dequeue API call timeout! Contact the infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
+          return 0
+        rescue => e
+          raise e
+        end
+        self.dequeuer_is_alive!
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
+        end
+        response = nil
+        self.dequeuer_is_alive!
+
+        # recollect garbage to free some memory before parsing
+        if count > 0
+          @not_found = false
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          @not_found = true
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if the thread was alive, or `false` if a new
+      #   thread had to be created
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill the unhealthy dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeue in parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, waiting for new
+      #   pages whenever there are no loaded pages left.
+      # @return [Hash] dequeued page
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return a page if there are loaded pages
+        is_waiting = false
+        while true do
+          page = self.pages.shift
+          unless page.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            loaded_pages.delete(page['gid'])
+            return page
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1 && !self.not_found
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
+        end
+      end
+
+      # Dequeue pages and execute their associated parsers in parallel.
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawning #{self.worker_count} workers"
+        end
+
+        # start the dequeuer
+        self.ensure_dequeuer_thread
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+
+        nil
+      end
+    end
+  end
+end
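
The dequeue sizing in `load_pages` scales with the worker count and shrinks as the internal page buffer fills up. A worked example of that arithmetic, assuming 4 workers and the default dequeue scale of 2:

```ruby
# Worked example of BatchParser#load_pages sizing, assuming
# worker_count = 4, dequeue_scale = 2 and 3 pages already buffered.
worker_count  = 4
dequeue_scale = 2
current_size  = 3

max_dequeue_size = (worker_count * dequeue_scale).ceil
# => 8; the buffer aims to hold scale x workers pages at most

dequeue_size = (dequeue_scale * (max_dequeue_size - current_size)).ceil
# => 10; over-asks so the buffer refills faster as it drains

dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
# => 8; capped at the max. With a full buffer (current_size = 8) the
#    computed size would be 0 and load_pages would return early.
```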
@@ -6,7 +6,7 @@ module Datahen
     # Max allowed page size when querying outputs (see #find_outputs).
     MAX_FIND_OUTPUTS_PER_PAGE = 500
 
-    attr_accessor :filename, :gid, :job_id
+    attr_accessor :filename, :page, :gid, :job_id
 
     include Datahen::Plugin::ContextExposer
 
@@ -15,6 +15,9 @@ module Datahen
     end
 
     def init_page()
+      # skip whenever a page is provided
+      return self.page unless self.page.nil?
+
       if job_id
         puts "getting Job Page"
         init_job_page
@@ -18,6 +18,22 @@ module Datahen
       end
     end
 
+    def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars={}, keep_outputs=false)
+      extname = File.extname(filename)
+      case extname
+      when '.rb'
+        executor = RubyParserExecutor.new(
+          filename: filename,
+          page: page,
+          job_id: job_id,
+          vars: vars,
+          keep_outputs: keep_outputs
+        )
+        executor.exec_parser(save)
+      else
+        puts "Unable to find a parser executor for file type \"#{extname}\""
+      end
+    end
 
   end
 end
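
Unlike `exec_parser_page`, which receives a GID and has the executor fetch the page, this variant takes the already-dequeued page hash, which is what lets BatchParser skip one API round trip per page. A hedged sketch of a direct call; the parser path, page contents, and job id are placeholders, and only the 'gid' key is strictly required by the executor:

```ruby
# Hypothetical page hash shaped like what the dequeue endpoint returns;
# only 'gid' is required by RubyParserExecutor, the rest depends on the
# parser script.
page = {
  'gid'       => 'www.example.com-abc123',
  'page_type' => 'products',
  'vars'      => {}
}

# save = false does a dry run; passing vars as nil (as BatchParser does)
# leaves the page's own vars untouched.
puts Datahen::Scraper::Parser.exec_parser_by_page(
  './parsers/products.rb', page, 12345, false, nil, true
)
```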
@@ -12,7 +12,8 @@ module Datahen
 
     def initialize(options={})
       @filename = options.fetch(:filename) { raise "Filename is required"}
-      @gid = options.fetch(:gid) { raise "GID is required"}
+      @page = options.fetch(:page) { nil }
+      @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
       @job_id = options.fetch(:job_id)
       @page_vars = options.fetch(:vars) { {} }
       @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
     end
 
     def init_page_vars(page)
+      return self.page unless self.page.nil?
+
       if !@page_vars.nil? && !@page_vars.empty?
         page['vars'] = @page_vars
       end
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.14.26"
+  VERSION = "0.16.0"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.26
+  version: 0.16.0
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-04-20 00:00:00.000000000 Z
+date: 2021-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -52,6 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -232,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -247,7 +276,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.2.2
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -263,7 +292,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubygems_version: 3.0.3
-signing_key:
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []