datahen 0.14.22 → 0.15.10

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 25eb02598ac32462a53995a4b9e72b3bc466b54c2d74be02516f8d04f178a7b8
-  data.tar.gz: 5f6fcedfa7f4a477e18fc1a0ee80126b1a646a3ecefdd8258d1982bf7d7fe06f
+  metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
+  data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
 SHA512:
-  metadata.gz: 69a71d740f9078a5a4c2a77211587c0099a4064cabda690cd6fb4803c153975e3e91f1c05f98278f2852a0bacf8cb444bba8f29f56c3cfbd0fba12cece39b9cd
-  data.tar.gz: df131c11592d2b6192fa74d26fc0e8d823b99f8073b907c82c8e9f04622c7d28aa5e1145419ac0377a99f6efcd3f46ff8fcef88fc436e802d51afc014fd4383a
+  metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
+  data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.2.2'
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
-  spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
+  spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
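
The Ruby floor moves from 2.2.2 to 2.4.4, and RubyGems enforces required_ruby_version at install time. A quick sanity check one can run before upgrading (a sketch, not part of the gem):

    require 'rubygems'

    # Does the running interpreter satisfy the new gemspec constraint?
    requirement = Gem::Requirement.new('>= 2.4.4')
    puts requirement.satisfied_by?(Gem::Version.new(RUBY_VERSION))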
data/lib/datahen/cli/parser.rb CHANGED
@@ -14,20 +14,19 @@ module Datahen
     def try_parse(scraper_name, parser_file, gid)
       begin
 
-      if options[:job]
-        job_id = options[:job]
-      elsif options[:global]
-        job_id = nil
-      else
-        job = Client::ScraperJob.new(options).find(scraper_name)
-        job_id = job['id']
-      end
-
+        if options[:job]
+          job_id = options[:job]
+        elsif options[:global]
+          job_id = nil
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end
 
         vars = JSON.parse(options[:vars]) if options[:vars]
         puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
 
-      rescue JSON::ParserError
+      rescue JSON::ParserError
         if options[:vars]
           puts "Error: #{options[:vars]} on vars is not a valid JSON"
         end
@@ -44,17 +43,17 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def exec_parse(scraper_name, parser_file, *gids)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
       gids.each do |gid|
         begin
           puts "Parsing #{gid}"
 
-          if options[:job]
-            job_id = options[:job]
-          else
-            job = Client::ScraperJob.new(options).find(scraper_name)
-            job_id = job['id']
-          end
-
           vars = JSON.parse(options[:vars]) if options[:vars]
           puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
         rescue => e
@@ -62,6 +61,46 @@ module Datahen
         end
       end
     end
+
+    desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+    long_desc <<-LONGDESC
+      Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+    LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+    option :"workers", type: :numeric, default: 1, desc: "Worker count"
+    option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+    option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+    option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+    def batch_exec_parse(scraper_name, config_file)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
+      # make the stdout and stderr sync to prevent buffering
+      old_stdout_sync = $stdout.sync
+      old_stderr_sync = $stderr.sync
+      $stdout.sync = true
+      $stderr.sync = true
+
+      begin
+        batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+          worker_count: options[:"workers"],
+          max_garbage: options[:"max-garbage"],
+          dequeue_interval: options[:"dequeue-interval"],
+          dequeue_scale: options[:"dequeue-scale"]
+        batch.exec_parse true, options[:"keep-outputs"]
+      rescue => e
+        puts [e.message] + e.backtrace
+      end
+
+      # resume whatever state the stdout and stderr sync were
+      $stdout.sync = old_stdout_sync
+      $stderr.sync = old_stderr_sync
+    end
   end
 end
 
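The new Thor task boils down to constructing a BatchParser from the CLI flags. A minimal sketch of the equivalent Ruby, with the job ID and config path hypothetical (the executable and subcommand mounting are not shown in this diff):

    # Roughly what batch_exec_parse does with its options:
    job_id = 12345                     # hypothetical; normally resolved from the scraper
    batch = Datahen::Scraper::BatchParser.new job_id, './config.yaml',
      worker_count: 4,                 # --workers
      max_garbage: 5,                  # --max-garbage
      dequeue_interval: 3,             # --dequeue-interval
      dequeue_scale: 2                 # --dequeue-scale
    batch.exec_parse true, false       # save outputs; don't keep existing ones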
data/lib/datahen/client/backblaze_content.rb CHANGED
@@ -4,10 +4,10 @@ require 'httparty'
 module Datahen
   module Client
     class BackblazeContent
-      include HTTParty
-
+      include HTTParty
+
       def get_content(url)
-        self.class.get(url, format: :plain)
+        self.class.get(url, format: :plain).response.body
       end
 
       def get_gunzipped_content(url)
@@ -19,19 +19,23 @@ module Datahen
       sio = StringIO.new(string)
       gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
       _content = ""
-      begin
+      begin
         _content = gz.read
       rescue => e
         # if unexpected eof error, then readchar until error, and ignore it
         if e.to_s == 'unexpected end of file'
-          begin
-            while !gz.eof?
-              _content += gz.readchar
-            end
+          # heavily improve content read recovery by using "String#<<",
+          # reading all "good" lines and then concat the remaining chars
+          begin
+            gz.each_line{|line| _content << line}
           rescue => e
-            puts "Ignored Zlib error: #{e.to_s}"
+            begin
+              _content << gz.readchar while !gz.eof
+            rescue => e
+              puts "Ignored Zlib error: #{e.to_s}"
+            end
           end
-        else
+        else
           raise e
         end
       end
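The recovery rewrite leans on String#<< (in-place append) instead of += (which allocates a fresh string on every character), and reads whole lines before falling back to single characters. A self-contained sketch of the same idea, using synthetic truncated gzip data; how much of the payload survives depends on where the stream breaks:

    require 'zlib'
    require 'stringio'

    blob = Zlib.gzip("line 1\nline 2\nline 3\n")
    truncated = blob.byteslice(0, blob.bytesize - 10) # simulate a cut-off download

    gz = Zlib::GzipReader.new(StringIO.new(truncated), encoding: Encoding::ASCII_8BIT)
    content = ""
    begin
      gz.each_line { |line| content << line }  # bulk-recover the "good" lines in place
    rescue Zlib::Error => e
      puts "Ignored Zlib error: #{e}"           # e.g. "unexpected end of file"
    end
    puts content  # whatever decompressed before the stream broke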
data/lib/datahen/client/job_page.rb CHANGED
@@ -42,6 +42,16 @@ module Datahen
       self.class.post("/jobs/#{job_id}/pages", params)
     end
 
+    def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+      body = {
+        limit: limit,
+        page_types: page_types,
+        parse_fetching_failed: parse_fetching_failed
+      }
+      params = @options.merge({body: body.to_json, timeout: 30})
+      self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+    end
+
     def parsing_update(job_id, gid, opts={})
       body = {}
       body[:outputs] = opts.fetch(:outputs) {[]}
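For context, a hedged usage sketch of the new wrapper; the job ID and page types are invented, and it assumes the client is configured with valid API credentials:

    client = Datahen::Client::JobPage.new
    # Reserve up to 10 pending pages of the given types for parsing:
    response = client.dequeue(12345, 10, ['listings', 'details'], false)
    pages = JSON.parse(response.body) if response.code == 200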
data/lib/datahen/scraper.rb CHANGED
@@ -1,6 +1,7 @@
 require "datahen/error"
 require "datahen/plugin"
 require "datahen/scraper/parser"
+require "datahen/scraper/batch_parser"
 require "datahen/scraper/seeder"
 require "datahen/scraper/finisher"
 require "datahen/scraper/executor"
data/lib/datahen/scraper/batch_parser.rb ADDED
@@ -0,0 +1,203 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
+      attr_reader :job_id, :worker_count, :pages, :max_garbage
+      attr_reader :dequeue_interval, :dequeue_scale
+      attr_reader :page_types, :parsers
+      attr_reader :config, :client, :garbage_mutex
+
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+      end
+
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+      end
+
+      def load_config
+        # build page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+      end
+
+      def repeat_puts message
+        puts message
+        self.last_message = ''
+      end
+
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+      end
+
+      def load_pages
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get to pages parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed']
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          return 0
+        rescue => e
+          raise e
+        end
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.pages.has_key? page['gid']
+          self.pages[page['gid']] = page
+        end
+        response = nil
+
+        # recolect garbage to free some memory before parsing
+        if count > 0
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return page if there are loeaded pages
+        is_waiting = false
+        while true do
+          key_value = self.pages.shift
+          unless key_value.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            return key_value[1]
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+        end
+      end
+
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawing #{self.worker_count} workers"
+        end
+
+        # dequeuing on parallel (the ride never ends :D)
+        Thread.new do
+          while true
+            begin
+              self.load_pages
+              self.class.wait self.dequeue_interval
+            rescue => e
+              puts [e.message] + e.backtrace rescue 'error'
+            end
+          end
+          puts "Error: dequeuer died! D:"
+        end
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+      end
+    end
+  end
+end
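
Two reading aids for the class above. First, the shape of the parsed config that load_config consumes, expressed as the equivalent Ruby hash (parser file paths hypothetical). Second, the dequeue sizing from load_pages worked through once:

    # What YAML.load_file must yield for load_config above:
    config = {
      'parsers' => [
        { 'page_type' => 'listings', 'file' => 'parsers/listings.rb' },
        { 'page_type' => 'details',  'file' => 'parsers/details.rb' }
      ]
    }

    # load_pages sizing, worked through: with worker_count = 4 and
    # dequeue_scale = 2, max_dequeue_size = (4 * 2).ceil = 8; if 3 pages
    # are already buffered, dequeue_size = (2 * (8 - 3)).ceil = 10,
    # which is then capped back down to max_dequeue_size = 8.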
data/lib/datahen/scraper/executor.rb CHANGED
@@ -6,7 +6,7 @@ module Datahen
     # Max allowed page size when query outputs (see #find_outputs).
     MAX_FIND_OUTPUTS_PER_PAGE = 500
 
-    attr_accessor :filename, :gid, :job_id
+    attr_accessor :filename, :page, :gid, :job_id
 
     include Datahen::Plugin::ContextExposer
 
@@ -15,6 +15,9 @@ module Datahen
     end
 
     def init_page()
+      # skip whenever a page is provided
+      return self.page unless self.page.nil?
+
       if job_id
         puts "getting Job Page"
         init_job_page
@@ -374,6 +377,11 @@ module Datahen
     def eval_with_context file_path, context
       eval(File.read(file_path), context, file_path)
     end
+
+    # Finish the executor execution
+    def finish
+      raise Error::SafeTerminateError
+    end
   end
 end
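
The new finish hook gives user scripts a clean early exit: it raises Error::SafeTerminateError, which the executor hunks further down now rescue and treat as a normal stop rather than a failure. A hedged sketch of how a seeder or finisher script might use it (the condition is invented):

    # Inside a script evaluated by eval_with_context:
    finish if pages.empty?   # stops the script here without recording an error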
data/lib/datahen/scraper/parser.rb CHANGED
@@ -18,6 +18,22 @@ module Datahen
       end
     end
 
+    def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+      extname = File.extname(filename)
+      case extname
+      when '.rb'
+        executor = RubyParserExecutor.new(
+          filename: filename,
+          page: page,
+          job_id: job_id,
+          vars: vars,
+          keep_outputs: keep_outputs
+        )
+        executor.exec_parser(save)
+      else
+        puts "Unable to find a parser executor for file type \"#{extname}\""
+      end
+    end
 
   end
 end
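
A hedged sketch of calling the new entry point directly; the page hash mirrors what the dequeue endpoint returns, at minimum a 'gid' and a 'page_type' (all values invented):

    page = { 'gid' => 'www.example.com-abc123', 'page_type' => 'listings' }
    puts Datahen::Scraper::Parser.exec_parser_by_page(
      'parsers/listings.rb',   # resolved from the config's page_type => file map
      page,
      12345,                   # job_id
      false,                   # save
      nil,                     # vars (BatchParser passes nil here)
      true                     # keep_outputs
    )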
data/lib/datahen/scraper/ruby_finisher_executor.rb CHANGED
@@ -40,6 +40,8 @@ module Datahen
         job_id: job_id
       })
       eval_with_context filename, context
+    rescue Error::SafeTerminateError => e
+      # do nothing, this is fine
     rescue SyntaxError => e
       handle_error(e) if save
       raise e
@@ -55,7 +57,7 @@ module Datahen
         handle_error(e) if save
         raise e
       end
-
+
       update_finisher_done_status
     end
     proc.call
data/lib/datahen/scraper/ruby_parser_executor.rb CHANGED
@@ -12,7 +12,8 @@ module Datahen
 
     def initialize(options={})
       @filename = options.fetch(:filename) { raise "Filename is required"}
-      @gid = options.fetch(:gid) { raise "GID is required"}
+      @page = options.fetch(:page) { nil }
+      @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
       @job_id = options.fetch(:job_id)
       @page_vars = options.fetch(:vars) { {} }
       @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
     end
 
     def init_page_vars(page)
+      return self.page unless self.page.nil?
+
       if !@page_vars.nil? && !@page_vars.empty?
         page['vars'] = @page_vars
       end
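
With the new :page option the executor derives the GID from the page hash itself, so exec_parser_by_page never has to pass :gid; omitting both still raises. A sketch (values invented):

    executor = Datahen::Scraper::RubyParserExecutor.new(
      filename: 'parsers/listings.rb',
      page: { 'gid' => 'www.example.com-abc123' },
      job_id: 12345
    )
    executor.gid   # => "www.example.com-abc123"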
data/lib/datahen/scraper/ruby_seeder_executor.rb CHANGED
@@ -44,6 +44,8 @@ module Datahen
         pages: pages
       })
       eval_with_context filename, context
+    rescue Error::SafeTerminateError => e
+      # do nothing, this is fine
     rescue SyntaxError => e
       handle_error(e) if save
       raise e
data/lib/datahen/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.14.22"
+  VERSION = "0.15.10"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.22
+  version: 0.15.10
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-12-14 00:00:00.000000000 Z
+date: 2021-05-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -45,9 +45,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
-      - !ruby/object:Gem::Version
-        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.10'
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.2.2
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="