datahen 0.14.26 → 0.15.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dac57d98132102aa9ae8244b6528394473b2bdeb9992c7ea15d6979eaf87d4af
4
- data.tar.gz: e68858d2f088b2d7b8538411dd59cf2ae2de7866416fc213c6a6fa009d93c556
3
+ metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
4
+ data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
5
5
  SHA512:
6
- metadata.gz: 857126b2f7ec4fa058aaa8d5b4a7095108224bdf3f6ece690dbfc930e0527a294853705227f0e63be5af3524982fff21f7d3c9d940c22b31caade5139a3d607b
7
- data.tar.gz: 81ecf95378e6f4aa31a87e39a82bc815216fce1b84aa65d8f7f2aa8ee8b19b871f08eb8c86025d9dc8d84617f20864f5f39c21d7b8ac4900a739599c0aa6283c
6
+ metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
7
+ data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
33
33
  spec.bindir = "exe"
34
34
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
35
35
  spec.require_paths = ["lib"]
36
- spec.required_ruby_version = '>= 2.2.2'
36
+ spec.required_ruby_version = '>= 2.4.4'
37
37
  spec.add_dependency "thor", "~> 0.20.3"
38
38
  spec.add_dependency 'httparty', '~> 0.16.2'
39
39
  spec.add_dependency 'nokogiri', '~> 1.6'
40
+ spec.add_dependency 'concurrent-ruby', '~> 1.1'
41
+ spec.add_dependency 'parallel', '~> 1.20'
40
42
  spec.add_development_dependency 'bundler', '>= 1.16'
41
43
  spec.add_development_dependency 'rake', '>= 10.0'
42
44
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -43,17 +43,17 @@ module Datahen
43
43
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
44
44
  option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
45
45
  def exec_parse(scraper_name, parser_file, *gids)
46
+ if options[:job]
47
+ job_id = options[:job]
48
+ else
49
+ job = Client::ScraperJob.new(options).find(scraper_name)
50
+ job_id = job['id']
51
+ end
52
+
46
53
  gids.each do |gid|
47
54
  begin
48
55
  puts "Parsing #{gid}"
49
56
 
50
- if options[:job]
51
- job_id = options[:job]
52
- else
53
- job = Client::ScraperJob.new(options).find(scraper_name)
54
- job_id = job['id']
55
- end
56
-
57
57
  vars = JSON.parse(options[:vars]) if options[:vars]
58
58
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
59
59
  rescue => e
@@ -61,6 +61,46 @@ module Datahen
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
66
+ long_desc <<-LONGDESC
67
+ Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
68
+ LONGDESC
69
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
70
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
71
+ option :"workers", type: :numeric, default: 1, desc: "Worker count"
72
+ option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
73
+ option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
74
+ option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
75
# Thor command body for `batch`: runs a BatchParser against a scraper job.
#
# The job ID comes from the `--job` option when given; otherwise the job is
# looked up by scraper name through Client::ScraperJob.
#
# @param scraper_name [String] scraper whose job is looked up when `--job` is absent
# @param config_file [String] path to the config file handed to BatchParser
def batch_exec_parse(scraper_name, config_file)
  job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']

  # make the stdout and stderr sync to prevent buffering
  old_stdout_sync = $stdout.sync
  old_stderr_sync = $stderr.sync
  $stdout.sync = true
  $stderr.sync = true

  begin
    batch = Datahen::Scraper::BatchParser.new job_id, config_file,
      worker_count: options[:"workers"],
      max_garbage: options[:"max-garbage"],
      dequeue_interval: options[:"dequeue-interval"],
      dequeue_scale: options[:"dequeue-scale"]
    batch.exec_parse true, options[:"keep-outputs"]
  rescue => e
    # Array() guards against an exception that carries no backtrace
    puts [e.message] + Array(e.backtrace)
  ensure
    # resume whatever state the stdout and stderr sync were, even when a
    # non-StandardError (Interrupt, SystemExit, ...) unwinds through here
    $stdout.sync = old_stdout_sync
    $stderr.sync = old_stderr_sync
  end
end
64
104
  end
65
105
  end
66
106
 
@@ -42,6 +42,16 @@ module Datahen
42
42
  self.class.post("/jobs/#{job_id}/pages", params)
43
43
  end
44
44
 
45
# Sends a PUT to the job's `parse_dequeue` endpoint with a JSON body made of
# `limit`, `page_types` and `parse_fetching_failed`, merged over @options.
#
# NOTE(review): `opts` is accepted but never read here - confirm whether it
# is kept only for signature parity with the sibling methods.
def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
  payload = {
    limit: limit,
    page_types: page_types,
    parse_fetching_failed: parse_fetching_failed
  }.to_json
  request = @options.merge({body: payload})
  self.class.put("/jobs/#{job_id}/pages/parse_dequeue", request)
end
54
+
45
55
  def parsing_update(job_id, gid, opts={})
46
56
  body = {}
47
57
  body[:outputs] = opts.fetch(:outputs) {[]}
@@ -0,0 +1,177 @@
1
+ require 'concurrent'
2
+ require 'parallel'
3
+
4
module Datahen
  module Scraper
    # Parses job pages in batch: a background thread keeps dequeuing pages
    # from the API into an in-memory buffer while a pool of worker threads
    # takes pages from that buffer and runs the parser script configured for
    # each page type.
    class BatchParser
      NOT_FOUND_MSG = "No more pages to parse found"
      NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
      NO_WORKERS_MSG = "Warning: There are no parser workers"

      attr_accessor :config_file, :garbage_count, :last_message
      attr_reader :job_id, :worker_count, :pages, :max_garbage
      attr_reader :dequeue_interval, :dequeue_scale
      attr_reader :page_types, :parsers
      attr_reader :config, :client, :garbage_mutex

      # Sleeps for the given number of seconds. Kept as a class method so
      # callers go through a single waiting entry point.
      def self.wait time_in_seconds
        Kernel.sleep time_in_seconds
      end

      # @param job_id [Integer] job whose pages will be dequeued
      # @param config_file [String] path to the YAML config listing the parsers
      # @param opts [Hash] optional settings
      # @option opts [Integer] :worker_count (1) number of parser threads
      # @option opts [Integer] :max_garbage (5) dequeued pages between forced GC runs
      # @option opts [Numeric] :dequeue_interval (3) seconds to wait between dequeues
      # @option opts [Numeric] :dequeue_scale (2) buffer size relative to worker count
      # @option opts [Hash] :client_options ({}) options passed to Client::JobPage
      def initialize(job_id, config_file, opts = {})
        opts = {
          worker_count: 1,
          max_garbage: 5,
          dequeue_interval: 3,
          dequeue_scale: 2,
          client_options: {}
        }.merge opts

        @job_id = job_id
        @worker_count = opts[:worker_count]
        @dequeue_interval = opts[:dequeue_interval]
        @dequeue_scale = opts[:dequeue_scale]
        @max_garbage = opts[:max_garbage]
        @pages = Concurrent::Hash.new
        @garbage_mutex = Mutex.new
        self.garbage_count = 0
        self.config_file = config_file
        self.load_config

        @client = Datahen::Client::JobPage.new(opts[:client_options])
      end

      # Runs the garbage collector and resets the garbage counter, one caller
      # at a time.
      def recollect_garbage
        garbage_mutex.synchronize do
          puts "Recollect garbage"
          GC.start
          self.garbage_count = 0
        end
      end

      # Loads the YAML config and builds the page type => parser file map,
      # skipping parsers flagged as disabled. An empty file or a config
      # without a `parsers` list yields no page types instead of raising.
      def load_config
        @page_types = []
        @parsers = Concurrent::Hash.new
        # YAML.load_file gives nil/false for an empty document
        @config = YAML.load_file(config_file) || {}
        (config['parsers'] || []).each do |parser|
          next if parser['disabled']
          @page_types << parser['page_type']
          parsers[parser['page_type']] = parser['file']
        end
        recollect_garbage
      end

      # Prints a message and clears the last message, so a following
      # no_repeat_puts with any non-empty text is printed.
      def repeat_puts message
        puts message
        self.last_message = ''
      end

      # Prints a message unless it equals the last one printed through here.
      def no_repeat_puts message
        return if message == last_message
        puts message
        self.last_message = message
      end

      # Dequeues pages from the API into the in-memory buffer.
      #
      # @return [Integer] number of pages in the response; 0 when the buffer
      #   needs no refill or the request did not succeed
      def load_pages
        # calculate dequeue size
        max_dequeue_size = (worker_count * dequeue_scale).ceil
        current_size = pages.length
        dequeue_size = (dequeue_scale * (max_dequeue_size - current_size)).ceil
        return 0 if dequeue_size < 1
        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size

        # reserve and get the pages to parse
        response = client.dequeue job_id,
          dequeue_size,
          page_types,
          config['parse_fetching_failed']

        # ensure a valid response or try again
        if response.nil? || response.response.code.to_i != 200
          repeat_puts(response.nil? ? 'null' : response.body)
          recollect_garbage
          return 0
        end

        # add pages; the count includes pages that were already buffered
        count = 0
        (JSON.parse(response.body) || []).each do |page|
          count += 1
          next if pages.has_key? page['gid']
          pages[page['gid']] = page
        end
        response = nil

        # recollect garbage to free some memory before parsing
        if count > 0
          recollect_garbage
          repeat_puts "Found #{count} page(s) to parse"
        else
          no_repeat_puts NOT_FOUND_MSG
        end

        # return how many pages were loaded
        count
      end

      # Takes the next buffered page, polling once per second until one is
      # available. Also triggers a GC run once more than `max_garbage` calls
      # have been made since the last collection.
      #
      # @return [Hash] the dequeued page
      def dequeue_pages
        # collect garbage
        self.garbage_count += 1
        recollect_garbage if garbage_count > max_garbage

        # return a page as soon as there are loaded pages
        loop do
          key_value = pages.shift
          return key_value[1] unless key_value.nil?
          self.class.wait 1
        end
      end

      # Starts the background dequeuer and parses pages on `worker_count`
      # threads. Returns right away (after a warning) when there are no workers.
      #
      # @param save [Boolean] forwarded to Parser.exec_parser_by_page
      # @param keep_outputs [Boolean] forwarded to Parser.exec_parser_by_page
      def exec_parse save = false, keep_outputs = false
        if worker_count < 1
          no_repeat_puts NO_WORKERS_MSG
          return
        else
          no_repeat_puts "Spawing #{worker_count} workers"
        end

        # dequeuing on parallel
        keep_dequeue = Concurrent::Array.new
        keep_dequeue[0] = true
        Thread.new do
          while keep_dequeue[0]
            begin
              begin
                load_pages
              ensure
                # always pause between attempts, so a failing dequeue is
                # retried at the configured interval instead of in a tight loop
                self.class.wait dequeue_interval
              end
            rescue => e
              report_error e
            end
          end
        end

        begin
          dequeue = lambda{ dequeue_pages }
          Parallel.each(dequeue, in_threads: (worker_count)) do |page|
            parser_file = parsers[page['page_type']]
            begin
              puts Datahen::Scraper::Parser.exec_parser_by_page(
                parser_file,
                page,
                job_id,
                save,
                nil,
                keep_outputs
              )
            rescue => e
              report_error e
            end
          end
        ensure
          # stop the dequeuer even when the workers are torn down by an error
          keep_dequeue[0] = false
        end
      end

      private

      # Prints an error message followed by its backtrace (if any). Reporting
      # is best effort: a failure while printing is ignored on purpose so it
      # cannot take down the dequeuer or a worker.
      def report_error error
        puts [error.message] + Array(error.backtrace)
      rescue StandardError
        nil
      end
    end
  end
end
@@ -6,7 +6,7 @@ module Datahen
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
8
 
9
- attr_accessor :filename, :gid, :job_id
9
+ attr_accessor :filename, :page, :gid, :job_id
10
10
 
11
11
  include Datahen::Plugin::ContextExposer
12
12
 
@@ -15,6 +15,9 @@ module Datahen
15
15
  end
16
16
 
17
17
  def init_page()
18
+ # skip whenever a page is provided
19
+ return self.page unless self.page.nil?
20
+
18
21
  if job_id
19
22
  puts "getting Job Page"
20
23
  init_job_page
@@ -18,6 +18,22 @@ module Datahen
18
18
  end
19
19
  end
20
20
 
21
# Runs the parser script `filename` against a page object supplied by the
# caller, choosing the executor from the file extension. Only `.rb` scripts
# are handled; any other extension prints a notice and returns nil.
#
# @param filename [String] parser script path
# @param page page handed to the executor as `:page` (presumably a Hash with a 'gid' - confirm)
# @param job_id [Integer, nil] handed to the executor as `:job_id`
# @param save [Boolean] forwarded to the executor's `exec_parser`
# @param vars [Hash, nil] handed to the executor as `:vars`
# @param keep_outputs [Boolean] handed to the executor as `:keep_outputs`
def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
  extname = File.extname(filename)
  unless extname == '.rb'
    puts "Unable to find a parser executor for file type \"#{extname}\""
    return
  end

  ruby_executor = RubyParserExecutor.new(
    filename: filename,
    page: page,
    job_id: job_id,
    vars: vars,
    keep_outputs: keep_outputs
  )
  ruby_executor.exec_parser(save)
end
21
37
 
22
38
  end
23
39
  end
@@ -12,7 +12,8 @@ module Datahen
12
12
 
13
13
  def initialize(options={})
14
14
  @filename = options.fetch(:filename) { raise "Filename is required"}
15
- @gid = options.fetch(:gid) { raise "GID is required"}
15
+ @page = options.fetch(:page) { nil }
16
+ @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
16
17
  @job_id = options.fetch(:job_id)
17
18
  @page_vars = options.fetch(:vars) { {} }
18
19
  @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
46
47
  end
47
48
 
48
49
  def init_page_vars(page)
50
+ return self.page unless self.page.nil?
51
+
49
52
  if !@page_vars.nil? && !@page_vars.empty?
50
53
  page['vars'] = @page_vars
51
54
  end
@@ -1,6 +1,7 @@
1
1
  require "datahen/error"
2
2
  require "datahen/plugin"
3
3
  require "datahen/scraper/parser"
4
+ require "datahen/scraper/batch_parser"
4
5
  require "datahen/scraper/seeder"
5
6
  require "datahen/scraper/finisher"
6
7
  require "datahen/scraper/executor"
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.26"
2
+ VERSION = "0.15.9"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.26
4
+ version: 0.15.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-04-20 00:00:00.000000000 Z
11
+ date: 2021-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: concurrent-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.20'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.20'
55
83
  - !ruby/object:Gem::Dependency
56
84
  name: bundler
57
85
  requirement: !ruby/object:Gem::Requirement
@@ -232,6 +260,7 @@ files:
232
260
  - lib/datahen/plugin.rb
233
261
  - lib/datahen/plugin/context_exposer.rb
234
262
  - lib/datahen/scraper.rb
263
+ - lib/datahen/scraper/batch_parser.rb
235
264
  - lib/datahen/scraper/executor.rb
236
265
  - lib/datahen/scraper/finisher.rb
237
266
  - lib/datahen/scraper/parser.rb
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
255
284
  requirements:
256
285
  - - ">="
257
286
  - !ruby/object:Gem::Version
258
- version: 2.2.2
287
+ version: 2.4.4
259
288
  required_rubygems_version: !ruby/object:Gem::Requirement
260
289
  requirements:
261
290
  - - ">="