datahen 0.14.22 → 0.15.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 25eb02598ac32462a53995a4b9e72b3bc466b54c2d74be02516f8d04f178a7b8
-   data.tar.gz: 5f6fcedfa7f4a477e18fc1a0ee80126b1a646a3ecefdd8258d1982bf7d7fe06f
+   metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
+   data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
  SHA512:
-   metadata.gz: 69a71d740f9078a5a4c2a77211587c0099a4064cabda690cd6fb4803c153975e3e91f1c05f98278f2852a0bacf8cb444bba8f29f56c3cfbd0fba12cece39b9cd
-   data.tar.gz: df131c11592d2b6192fa74d26fc0e8d823b99f8073b907c82c8e9f04622c7d28aa5e1145419ac0377a99f6efcd3f46ff8fcef88fc436e802d51afc014fd4383a
+   metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
+   data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
- spec.required_ruby_version = '>= 2.2.2'
+ spec.required_ruby_version = '>= 2.4.4'
  spec.add_dependency "thor", "~> 0.20.3"
  spec.add_dependency 'httparty', '~> 0.16.2'
- spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
+ spec.add_dependency 'nokogiri', '~> 1.6'
+ spec.add_dependency 'concurrent-ruby', '~> 1.1'
+ spec.add_dependency 'parallel', '~> 1.20'
  spec.add_development_dependency 'bundler', '>= 1.16'
  spec.add_development_dependency 'rake', '>= 10.0'
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -14,20 +14,19 @@ module Datahen
    def try_parse(scraper_name, parser_file, gid)
      begin

-     if options[:job]
-       job_id = options[:job]
-     elsif options[:global]
-       job_id = nil
-     else
-       job = Client::ScraperJob.new(options).find(scraper_name)
-       job_id = job['id']
-     end
-
+       if options[:job]
+         job_id = options[:job]
+       elsif options[:global]
+         job_id = nil
+       else
+         job = Client::ScraperJob.new(options).find(scraper_name)
+         job_id = job['id']
+       end

        vars = JSON.parse(options[:vars]) if options[:vars]
        puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])

-     rescue JSON::ParserError
+     rescue JSON::ParserError
        if options[:vars]
          puts "Error: #{options[:vars]} on vars is not a valid JSON"
        end
@@ -44,17 +43,17 @@ module Datahen
    option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
    def exec_parse(scraper_name, parser_file, *gids)
+     if options[:job]
+       job_id = options[:job]
+     else
+       job = Client::ScraperJob.new(options).find(scraper_name)
+       job_id = job['id']
+     end
+
      gids.each do |gid|
        begin
          puts "Parsing #{gid}"

-         if options[:job]
-           job_id = options[:job]
-         else
-           job = Client::ScraperJob.new(options).find(scraper_name)
-           job_id = job['id']
-         end
-
          vars = JSON.parse(options[:vars]) if options[:vars]
          puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
        rescue => e
@@ -62,6 +61,46 @@ module Datahen
          end
        end
      end
+
+    desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+    long_desc <<-LONGDESC
+      Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+    LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+    option :"workers", type: :numeric, default: 1, desc: "Worker count"
+    option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+    option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+    option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+    def batch_exec_parse(scraper_name, config_file)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
+      # make stdout and stderr sync to prevent buffering
+      old_stdout_sync = $stdout.sync
+      old_stderr_sync = $stderr.sync
+      $stdout.sync = true
+      $stderr.sync = true
+
+      begin
+        batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+          worker_count: options[:"workers"],
+          max_garbage: options[:"max-garbage"],
+          dequeue_interval: options[:"dequeue-interval"],
+          dequeue_scale: options[:"dequeue-scale"]
+        batch.exec_parse true, options[:"keep-outputs"]
+      rescue => e
+        puts [e.message] + e.backtrace
+      end
+
+      # restore stdout and stderr sync to whatever state they were in
+      $stdout.sync = old_stdout_sync
+      $stderr.sync = old_stderr_sync
+    end
  end
end

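For reference, the new command would be invoked roughly like this, assuming it sits alongside `exec_parse` under the existing `parser` subcommand (the scraper name and config path are placeholders):

    # hypothetical invocation; "my-scraper" and ./config.yaml are placeholders
    datahen parser batch my-scraper ./config.yaml --workers 4 --dequeue-scale 3 --keep-outputs

Unlike `exec_parse`, which takes an explicit list of GIDs, `batch` keeps dequeuing pending pages from the job's queue until the process is stopped.
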
@@ -4,10 +4,10 @@ require 'httparty'
  module Datahen
    module Client
      class BackblazeContent
-     include HTTParty
-
+       include HTTParty
+
        def get_content(url)
-         self.class.get(url, format: :plain)
+         self.class.get(url, format: :plain).response.body
        end

        def get_gunzipped_content(url)
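The `get_content` change is subtle: HTTParty's `get` returns an `HTTParty::Response` wrapper, so `.response.body` now hands callers the raw body `String` of the underlying `Net::HTTPResponse` instead of the wrapper. A minimal sketch, assuming `BackblazeContent` can be instantiated without arguments as the diff suggests:

    # before: an HTTParty::Response wrapper; after: a plain String body
    content = Datahen::Client::BackblazeContent.new.get_content("https://example.com/data.txt")
    content.class # => String
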
@@ -19,19 +19,23 @@ module Datahen
        sio = StringIO.new(string)
        gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
        _content = ""
-     begin
+       begin
          _content = gz.read
        rescue => e
          # if unexpected eof error, then readchar until error, and ignore it
          if e.to_s == 'unexpected end of file'
-           begin
-             while !gz.eof?
-               _content += gz.readchar
-             end
+           # heavily improve content read recovery by using "String#<<",
+           # reading all "good" lines and then concatenating the remaining chars
+           begin
+             gz.each_line{|line| _content << line}
            rescue => e
-             puts "Ignored Zlib error: #{e.to_s}"
+             begin
+               _content << gz.readchar while !gz.eof
+             rescue => e
+               puts "Ignored Zlib error: #{e.to_s}"
+             end
            end
-         else
+         else
            raise e
          end
        end
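A minimal sketch of the recovery path on a truncated gzip stream (the truncation and the simplified rescue are illustrative; the real method only retries when the error message is exactly 'unexpected end of file'):

    require 'zlib'
    require 'stringio'

    gzipped = Zlib.gzip("line 1\nline 2\nline 3\n") # Zlib.gzip needs Ruby >= 2.4, matching the new required_ruby_version
    gz = Zlib::GzipReader.new(StringIO.new(gzipped[0..-6]), encoding: Encoding::ASCII_8BIT)
    content = ""
    begin
      content = gz.read
    rescue => e
      # salvage whatever decompressed cleanly, as the patched method does
      gz.each_line { |line| content << line } rescue puts "Ignored Zlib error: #{e}"
    end

`String#<<` mutates the buffer in place rather than allocating a new string per concatenation, which is the improvement the new comment refers to.
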
@@ -42,6 +42,16 @@ module Datahen
        self.class.post("/jobs/#{job_id}/pages", params)
      end

+     def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+       body = {
+         limit: limit,
+         page_types: page_types,
+         parse_fetching_failed: parse_fetching_failed
+       }
+       params = @options.merge({body: body.to_json, timeout: 30})
+       self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+     end
+
      def parsing_update(job_id, gid, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
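A hedged sketch of how the new endpoint would be called (the job ID and page types are placeholders; base URI and auth come from the client's usual options):

    # reserve up to 10 pending pages of the given page types for parsing
    client = Datahen::Client::JobPage.new
    response = client.dequeue(12345, 10, ["products", "details"], false)
    pages = JSON.parse(response.body) # array of page hashes, each carrying a 'gid'
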
@@ -1,6 +1,7 @@
  require "datahen/error"
  require "datahen/plugin"
  require "datahen/scraper/parser"
+ require "datahen/scraper/batch_parser"
  require "datahen/scraper/seeder"
  require "datahen/scraper/finisher"
  require "datahen/scraper/executor"
@@ -0,0 +1,203 @@
+ require 'concurrent'
+ require 'parallel'
+
+ module Datahen
+   module Scraper
+     class BatchParser
+       NOT_FOUND_MSG = "No more pages to parse found"
+       NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+       NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+       attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
+       attr_reader :job_id, :worker_count, :pages, :max_garbage
+       attr_reader :dequeue_interval, :dequeue_scale
+       attr_reader :page_types, :parsers
+       attr_reader :config, :client, :garbage_mutex
+
+       def self.wait time_in_seconds
+         Kernel.sleep time_in_seconds
+       end
+
+       def initialize(job_id, config_file, opts = {})
+         opts = {
+           worker_count: 1,
+           max_garbage: 5,
+           dequeue_interval: 3,
+           dequeue_scale: 2,
+           client_options: {}
+         }.merge opts
+
+         @job_id = job_id
+         @worker_count = opts[:worker_count]
+         @dequeue_interval = opts[:dequeue_interval]
+         @dequeue_scale = opts[:dequeue_scale]
+         @max_garbage = opts[:max_garbage]
+         @pages = Concurrent::Hash.new
+         @garbage_mutex = Mutex.new
+         self.second_dequeue_count = 0
+         self.garbage_count = 0
+         self.config_file = config_file
+         self.load_config
+
+         @client = Datahen::Client::JobPage.new(opts[:client_options])
+       end
+
+       def recollect_garbage
+         self.garbage_mutex.synchronize do
+           self.garbage_count += 1
+           if self.garbage_count > self.max_garbage
+             puts "Recollect garbage"
+             GC.start
+             self.garbage_count = 0
+           end
+         end
+       end
+
+       def load_config
+         # build page type to script file map
+         @page_types = []
+         @parsers = Concurrent::Hash.new
+         @config = YAML.load_file(config_file)
+         self.config['parsers'].each do |v|
+           next if !v['disabled'].nil? && !!v['disabled']
+           @page_types << v['page_type']
+           self.parsers[v['page_type']] = v['file']
+         end
+         self.recollect_garbage
+       end
+
+       def repeat_puts message
+         puts message
+         self.last_message = ''
+       end
+
+       def no_repeat_puts message
+         return if message == self.last_message
+         puts message
+         self.last_message = message
+       end
+
+       def load_pages
+         # calculate dequeue size
+         max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+         current_size = self.pages.length
+         dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+         if dequeue_size < 1
+           return 0
+         end
+         dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+         # reserve and get pages to parse
+         response = nil
+         begin
+           response = client.dequeue self.job_id,
+             dequeue_size,
+             self.page_types,
+             config['parse_fetching_failed']
+         rescue Net::ReadTimeout, Net::OpenTimeout => e
+           self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+           return 0
+         rescue => e
+           raise e
+         end
+
+         # ensure a valid response or try again
+         if response.nil? || response.response.code.to_i != 200
+           self.repeat_puts(response.nil? ? 'null' : response.body)
+           self.recollect_garbage
+           return 0
+         end
+
+         # add pages
+         count = 0
+         (JSON.parse(response.body) || []).each do |page|
+           count += 1
+           next if self.pages.has_key? page['gid']
+           self.pages[page['gid']] = page
+         end
+         response = nil
+
+         # recollect garbage to free some memory before parsing
+         if count > 0
+           self.recollect_garbage
+           self.repeat_puts "Found #{count} page(s) to parse"
+           self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+         else
+           self.no_repeat_puts NOT_FOUND_MSG
+         end
+
+         # return how many pages were loaded
+         count
+       end
+
+       def dequeue_pages
+         # collect garbage
+         self.recollect_garbage
+
+         # return a page if there are loaded pages
+         is_waiting = false
+         while true do
+           key_value = self.pages.shift
+           unless key_value.nil?
+             puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+             return key_value[1]
+           end
+
+           # be more verbose on worker waiting
+           unless is_waiting
+             is_waiting = true
+             puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+             if self.second_dequeue_count > 1
+               puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+             end
+           end
+           self.class.wait 1
+         end
+       end
+
+       def exec_parse save = false, keep_outputs = false
+         if self.worker_count < 1
+           self.no_repeat_puts NO_WORKERS_MSG
+           return
+         else
+           self.no_repeat_puts "Spawning #{self.worker_count} workers"
+         end
+
+         # dequeue in parallel (the ride never ends :D)
+         Thread.new do
+           while true
+             begin
+               self.load_pages
+               self.class.wait self.dequeue_interval
+             rescue => e
+               puts [e.message] + e.backtrace rescue 'error'
+             end
+           end
+           puts "Error: dequeuer died! D:"
+         end
+
+         # process the pages
+         dequeue = lambda{ self.dequeue_pages }
+         Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+           parser_file = self.parsers[page['page_type']]
+           begin
+             puts Datahen::Scraper::Parser.exec_parser_by_page(
+               parser_file,
+               page,
+               job_id,
+               save,
+               nil,
+               keep_outputs
+             )
+           rescue Parallel::Kill => e
+             puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+           rescue Parallel::Break => e
+             puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+           rescue => e
+             puts [e.message] + e.backtrace rescue 'error'
+           end
+         end
+       end
+     end
+   end
+ end
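
A minimal sketch of driving `BatchParser` directly, mirroring what the new CLI command does (job ID and config path are placeholders):

    batch = Datahen::Scraper::BatchParser.new(12345, "./config.yaml",
      worker_count: 2,
      dequeue_interval: 3,
      dequeue_scale: 2)
    batch.exec_parse(true, false) # save outputs; blocks, dequeuing until the process is killed

Note the producer/consumer split: a background Thread feeds `pages` (a `Concurrent::Hash`) via `load_pages`, while `Parallel.each` with a lambda producer pulls work from `dequeue_pages` across `worker_count` threads.
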
@@ -6,7 +6,7 @@ module Datahen
      # Max allowed page size when querying outputs (see #find_outputs).
      MAX_FIND_OUTPUTS_PER_PAGE = 500

-     attr_accessor :filename, :gid, :job_id
+     attr_accessor :filename, :page, :gid, :job_id

      include Datahen::Plugin::ContextExposer

@@ -15,6 +15,9 @@ module Datahen
      end

      def init_page()
+       # skip whenever a page is provided
+       return self.page unless self.page.nil?
+
        if job_id
          puts "getting Job Page"
          init_job_page
@@ -374,6 +377,11 @@ module Datahen
      def eval_with_context file_path, context
        eval(File.read(file_path), context, file_path)
      end
+
+     # Finish the executor execution
+     def finish
+       raise Error::SafeTerminateError
+     end
    end
  end
end
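`finish` gives user scripts a sanctioned early exit: it raises `Error::SafeTerminateError`, which the executors (see the rescue hunks further down) now catch and silently ignore. A hypothetical seeder script using it:

    # seed at most 1,000 pages, then stop without surfacing an error
    1.upto(10_000) do |i|
      pages << { 'url' => "https://example.com/items/#{i}" }
      finish if pages.length >= 1_000 # raises SafeTerminateError; the executor swallows it
    end
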
@@ -18,6 +18,22 @@ module Datahen
        end
      end

+     def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+       extname = File.extname(filename)
+       case extname
+       when '.rb'
+         executor = RubyParserExecutor.new(
+           filename: filename,
+           page: page,
+           job_id: job_id,
+           vars: vars,
+           keep_outputs: keep_outputs
+         )
+         executor.exec_parser(save)
+       else
+         puts "Unable to find a parser executor for file type \"#{extname}\""
+       end
+     end

    end
  end
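Unlike `exec_parser_page`, which looks a page up by GID, this variant receives the already-dequeued page hash, so batch workers skip one API round trip per page. A hedged example (page contents are illustrative):

    page = { 'gid' => 'www.example.com-abcd1234', 'page_type' => 'products', 'vars' => {} }
    puts Datahen::Scraper::Parser.exec_parser_by_page(
      'parsers/products.rb', page, 12345, false, nil, true)
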
@@ -40,6 +40,8 @@ module Datahen
          job_id: job_id
        })
        eval_with_context filename, context
+     rescue Error::SafeTerminateError => e
+       # do nothing, this is fine
      rescue SyntaxError => e
        handle_error(e) if save
        raise e
@@ -55,7 +57,7 @@ module Datahen
        handle_error(e) if save
        raise e
      end
-
+
      update_finisher_done_status
    end
    proc.call
@@ -12,7 +12,8 @@ module Datahen
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
-       @gid = options.fetch(:gid) { raise "GID is required"}
+       @page = options.fetch(:page) { nil }
+       @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
        @job_id = options.fetch(:job_id)
        @page_vars = options.fetch(:vars) { {} }
        @keep_outputs = !!(options.fetch(:keep_outputs) { false })
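With this change the executor accepts either a full page hash or a bare GID; when a page is given, its 'gid' key wins. A hypothetical construction of each form:

    # page form: gid is derived from the page hash itself
    Datahen::Scraper::RubyParserExecutor.new(
      filename: 'parsers/products.rb',
      page: { 'gid' => 'www.example.com-abcd1234' },
      job_id: 12345)

    # gid form: unchanged behavior from 0.14.x
    Datahen::Scraper::RubyParserExecutor.new(
      filename: 'parsers/products.rb',
      gid: 'www.example.com-abcd1234',
      job_id: 12345)
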
@@ -46,6 +47,8 @@ module Datahen
      end

      def init_page_vars(page)
+       return self.page unless self.page.nil?
+
        if !@page_vars.nil? && !@page_vars.empty?
          page['vars'] = @page_vars
        end
@@ -44,6 +44,8 @@ module Datahen
          pages: pages
        })
        eval_with_context filename, context
+     rescue Error::SafeTerminateError => e
+       # do nothing, this is fine
      rescue SyntaxError => e
        handle_error(e) if save
        raise e
@@ -1,3 +1,3 @@
  module Datahen
-   VERSION = "0.14.22"
+   VERSION = "0.15.10"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: datahen
  version: !ruby/object:Gem::Version
-   version: 0.14.22
+   version: 0.15.10
  platform: ruby
  authors:
  - Parama Danoesubroto
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-12-14 00:00:00.000000000 Z
+ date: 2021-05-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: thor
@@ -45,9 +45,6 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '1.6'
-     - - "<"
-       - !ruby/object:Gem::Version
-         version: '1.10'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '1.6'
-     - - "<"
+ - !ruby/object:Gem::Dependency
+   name: concurrent-ruby
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+ - !ruby/object:Gem::Dependency
+   name: parallel
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.20'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.10'
+         version: '1.20'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
  - lib/datahen/plugin.rb
  - lib/datahen/plugin/context_exposer.rb
  - lib/datahen/scraper.rb
+ - lib/datahen/scraper/batch_parser.rb
  - lib/datahen/scraper/executor.rb
  - lib/datahen/scraper/finisher.rb
  - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
-     version: 2.2.2
+     version: 2.4.4
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="