datahen 0.14.26 → 0.15.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dac57d98132102aa9ae8244b6528394473b2bdeb9992c7ea15d6979eaf87d4af
4
- data.tar.gz: e68858d2f088b2d7b8538411dd59cf2ae2de7866416fc213c6a6fa009d93c556
3
+ metadata.gz: 7ad617ad097cf773fe496c3ccf801c49926524017784e5a51764ec88992b11ab
4
+ data.tar.gz: 8292cf447a3fedcad565bc1aa98041d68a51f0d93ff1598b8c31980170d1f66c
5
5
  SHA512:
6
- metadata.gz: 857126b2f7ec4fa058aaa8d5b4a7095108224bdf3f6ece690dbfc930e0527a294853705227f0e63be5af3524982fff21f7d3c9d940c22b31caade5139a3d607b
7
- data.tar.gz: 81ecf95378e6f4aa31a87e39a82bc815216fce1b84aa65d8f7f2aa8ee8b19b871f08eb8c86025d9dc8d84617f20864f5f39c21d7b8ac4900a739599c0aa6283c
6
+ metadata.gz: 3e883e9a53339e342446ed543bab41bf0ab60de6e1f5f7aaaad54125ec67275803bf1e7676e3764c59f0ba7b75e8fdf4adfe98733ab376c350b3a3247a678895
7
+ data.tar.gz: eaa4d12e93f31c7e516bb94f1222ecad2d6581609f77372de9078ac91a716438f6e998ec9b00ba4d3da76d02ccab5639fe06393c67346ab896e69d62d3fcc1e5
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
33
33
  spec.bindir = "exe"
34
34
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
35
35
  spec.require_paths = ["lib"]
36
- spec.required_ruby_version = '>= 2.2.2'
36
+ spec.required_ruby_version = '>= 2.4.4'
37
37
  spec.add_dependency "thor", "~> 0.20.3"
38
38
  spec.add_dependency 'httparty', '~> 0.16.2'
39
39
  spec.add_dependency 'nokogiri', '~> 1.6'
40
+ spec.add_dependency 'concurrent-ruby', '~> 1.1'
41
+ spec.add_dependency 'parallel', '~> 1.20'
40
42
  spec.add_development_dependency 'bundler', '>= 1.16'
41
43
  spec.add_development_dependency 'rake', '>= 10.0'
42
44
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -43,17 +43,17 @@ module Datahen
43
43
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
44
44
  option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
45
45
  def exec_parse(scraper_name, parser_file, *gids)
46
+ if options[:job]
47
+ job_id = options[:job]
48
+ else
49
+ job = Client::ScraperJob.new(options).find(scraper_name)
50
+ job_id = job['id']
51
+ end
52
+
46
53
  gids.each do |gid|
47
54
  begin
48
55
  puts "Parsing #{gid}"
49
56
 
50
- if options[:job]
51
- job_id = options[:job]
52
- else
53
- job = Client::ScraperJob.new(options).find(scraper_name)
54
- job_id = job['id']
55
- end
56
-
57
57
  vars = JSON.parse(options[:vars]) if options[:vars]
58
58
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
59
59
  rescue => e
@@ -61,6 +61,46 @@ module Datahen
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
66
+ long_desc <<-LONGDESC
67
+ Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
68
+ LONGDESC
69
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
70
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
71
+ option :"workers", type: :numeric, default: 1, desc: "Worker count"
72
+ option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
73
+ option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
74
+ option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
75
+ def batch_exec_parse(scraper_name, config_file)
76
+ if options[:job]
77
+ job_id = options[:job]
78
+ else
79
+ job = Client::ScraperJob.new(options).find(scraper_name)
80
+ job_id = job['id']
81
+ end
82
+
83
+ # make the stdout and stderr sync to prevent buffering
84
+ old_stdout_sync = $stdout.sync
85
+ old_stderr_sync = $stderr.sync
86
+ $stdout.sync = true
87
+ $stderr.sync = true
88
+
89
+ begin
90
+ batch = Datahen::Scraper::BatchParser.new job_id, config_file,
91
+ worker_count: options[:"workers"],
92
+ max_garbage: options[:"max-garbage"],
93
+ dequeue_interval: options[:"dequeue-interval"],
94
+ dequeue_scale: options[:"dequeue-scale"]
95
+ batch.exec_parse true, options[:"keep-outputs"]
96
+ rescue => e
97
+ puts [e.message] + e.backtrace
98
+ end
99
+
100
+ # restore the original stdout and stderr sync state
101
+ $stdout.sync = old_stdout_sync
102
+ $stderr.sync = old_stderr_sync
103
+ end
64
104
  end
65
105
  end
66
106
 
@@ -42,6 +42,16 @@ module Datahen
42
42
  self.class.post("/jobs/#{job_id}/pages", params)
43
43
  end
44
44
 
45
+ def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
46
+ body = {
47
+ limit: limit,
48
+ page_types: page_types,
49
+ parse_fetching_failed: parse_fetching_failed
50
+ }
51
+ params = @options.merge({body: body.to_json})
52
+ self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
53
+ end
54
+
45
55
  def parsing_update(job_id, gid, opts={})
46
56
  body = {}
47
57
  body[:outputs] = opts.fetch(:outputs) {[]}
@@ -0,0 +1,177 @@
1
+ require 'concurrent'
2
+ require 'parallel'
3
+
4
+ module Datahen
5
+ module Scraper
6
+ class BatchParser
7
+ NOT_FOUND_MSG = "No more pages to parse found"
8
+ NO_DEQUEUE_COUNT_MSG = "Warning: Max page to parse dequeue count is 0, check pages to parse scale"
9
+ NO_WORKERS_MSG = "Warning: There are no parser workers"
10
+
11
+ attr_accessor :config_file, :garbage_count, :last_message
12
+ attr_reader :job_id, :worker_count, :pages, :max_garbage
13
+ attr_reader :dequeue_interval, :dequeue_scale
14
+ attr_reader :page_types, :parsers
15
+ attr_reader :config, :client, :garbage_mutex
16
+
17
+ def self.wait time_in_seconds
18
+ Kernel.sleep time_in_seconds
19
+ end
20
+
21
+ def initialize(job_id, config_file, opts = {})
22
+ opts = {
23
+ worker_count: 1,
24
+ max_garbage: 5,
25
+ dequeue_interval: 3,
26
+ dequeue_scale: 2,
27
+ client_options: {}
28
+ }.merge opts
29
+
30
+ @job_id = job_id
31
+ @worker_count = opts[:worker_count]
32
+ @dequeue_interval = opts[:dequeue_interval]
33
+ @dequeue_scale = opts[:dequeue_scale]
34
+ @max_garbage = opts[:max_garbage]
35
+ @pages = Concurrent::Hash.new
36
+ @garbage_mutex = Mutex.new
37
+ self.garbage_count = 0
38
+ self.config_file = config_file
39
+ self.load_config
40
+
41
+ @client = Datahen::Client::JobPage.new(opts[:client_options])
42
+ end
43
+
44
+ def recollect_garbage
45
+ self.garbage_mutex.synchronize do
46
+ puts "Recollect garbage"
47
+ GC.start
48
+ self.garbage_count = 0
49
+ end
50
+ end
51
+
52
+ def load_config
53
+ # build page type to script file map
54
+ @page_types = []
55
+ @parsers = Concurrent::Hash.new
56
+ @config = YAML.load_file(config_file)
57
+ self.config['parsers'].each do |v|
58
+ next if !v['disabled'].nil? && !!v['disabled']
59
+ @page_types << v['page_type']
60
+ self.parsers[v['page_type']] = v['file']
61
+ end
62
+ self.recollect_garbage
63
+ end
64
+
65
+ def repeat_puts message
66
+ puts message
67
+ self.last_message = ''
68
+ end
69
+
70
+ def no_repeat_puts message
71
+ return if message == self.last_message
72
+ puts message
73
+ self.last_message = message
74
+ end
75
+
76
+ def load_pages
77
+ # calculate dequeue size
78
+ max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
79
+ current_size = self.pages.length
80
+ dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
81
+ if dequeue_size < 1
82
+ return 0
83
+ end
84
+ dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
85
+
86
+ # reserve and get pages to parse
87
+ response = client.dequeue self.job_id,
88
+ dequeue_size,
89
+ self.page_types,
90
+ config['parse_fetching_failed']
91
+
92
+ # ensure a valid response or try again
93
+ if response.nil? || response.response.code.to_i != 200
94
+ self.repeat_puts(response.nil? ? 'null' : response.body)
95
+ self.recollect_garbage
96
+ return 0
97
+ end
98
+
99
+ # add pages
100
+ count = 0
101
+ (JSON.parse(response.body) || []).each do |page|
102
+ count += 1
103
+ next if self.pages.has_key? page['gid']
104
+ self.pages[page['gid']] = page
105
+ end
106
+ response = nil
107
+
108
+ # recollect garbage to free some memory before parsing
109
+ if count > 0
110
+ self.recollect_garbage
111
+ self.repeat_puts "Found #{count} page(s) to parse"
112
+ else
113
+ self.no_repeat_puts NOT_FOUND_MSG
114
+ end
115
+
116
+ # return how many pages were loaded
117
+ count
118
+ end
119
+
120
+ def dequeue_pages
121
+ # collect garbage
122
+ self.garbage_count += 1
123
+ if self.garbage_count > self.max_garbage
124
+ self.recollect_garbage
125
+ end
126
+
127
+ # return a page if there are loaded pages
128
+ while true do
129
+ key_value = self.pages.shift
130
+ return key_value[1] unless key_value.nil?
131
+ self.class.wait 1
132
+ end
133
+ end
134
+
135
+ def exec_parse save = false, keep_outputs = false
136
+ if self.worker_count < 1
137
+ self.no_repeat_puts NO_WORKERS_MSG
138
+ return
139
+ else
140
+ self.no_repeat_puts "Spawning #{self.worker_count} workers"
141
+ end
142
+
143
+ # dequeuing on parallel
144
+ keep_dequeue = Concurrent::Array.new
145
+ keep_dequeue[0] = true
146
+ Thread.new do
147
+ while keep_dequeue[0]
148
+ begin
149
+ self.load_pages
150
+ self.class.wait self.dequeue_interval
151
+ rescue => e
152
+ puts [e.message] + e.backtrace rescue 'error'
153
+ end
154
+ end
155
+ end
156
+
157
+ dequeue = lambda{ self.dequeue_pages }
158
+ Parallel.each(dequeue, in_threads: (worker_count)) do |page|
159
+ parser_file = self.parsers[page['page_type']]
160
+ begin
161
+ puts Datahen::Scraper::Parser.exec_parser_by_page(
162
+ parser_file,
163
+ page,
164
+ job_id,
165
+ save,
166
+ nil,
167
+ keep_outputs
168
+ )
169
+ rescue => e
170
+ puts [e.message] + e.backtrace rescue 'error'
171
+ end
172
+ end
173
+ keep_dequeue[0] = false
174
+ end
175
+ end
176
+ end
177
+ end
@@ -6,7 +6,7 @@ module Datahen
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
8
 
9
- attr_accessor :filename, :gid, :job_id
9
+ attr_accessor :filename, :page, :gid, :job_id
10
10
 
11
11
  include Datahen::Plugin::ContextExposer
12
12
 
@@ -15,6 +15,9 @@ module Datahen
15
15
  end
16
16
 
17
17
  def init_page()
18
+ # skip whenever a page is provided
19
+ return self.page unless self.page.nil?
20
+
18
21
  if job_id
19
22
  puts "getting Job Page"
20
23
  init_job_page
@@ -18,6 +18,22 @@ module Datahen
18
18
  end
19
19
  end
20
20
 
21
+ def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
22
+ extname = File.extname(filename)
23
+ case extname
24
+ when '.rb'
25
+ executor = RubyParserExecutor.new(
26
+ filename: filename,
27
+ page: page,
28
+ job_id: job_id,
29
+ vars: vars,
30
+ keep_outputs: keep_outputs
31
+ )
32
+ executor.exec_parser(save)
33
+ else
34
+ puts "Unable to find a parser executor for file type \"#{extname}\""
35
+ end
36
+ end
21
37
 
22
38
  end
23
39
  end
@@ -12,7 +12,8 @@ module Datahen
12
12
 
13
13
  def initialize(options={})
14
14
  @filename = options.fetch(:filename) { raise "Filename is required"}
15
- @gid = options.fetch(:gid) { raise "GID is required"}
15
+ @page = options.fetch(:page) { nil }
16
+ @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
16
17
  @job_id = options.fetch(:job_id)
17
18
  @page_vars = options.fetch(:vars) { {} }
18
19
  @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
46
47
  end
47
48
 
48
49
  def init_page_vars(page)
50
+ return self.page unless self.page.nil?
51
+
49
52
  if !@page_vars.nil? && !@page_vars.empty?
50
53
  page['vars'] = @page_vars
51
54
  end
@@ -1,6 +1,7 @@
1
1
  require "datahen/error"
2
2
  require "datahen/plugin"
3
3
  require "datahen/scraper/parser"
4
+ require "datahen/scraper/batch_parser"
4
5
  require "datahen/scraper/seeder"
5
6
  require "datahen/scraper/finisher"
6
7
  require "datahen/scraper/executor"
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.26"
2
+ VERSION = "0.15.9"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.26
4
+ version: 0.15.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-04-20 00:00:00.000000000 Z
11
+ date: 2021-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: concurrent-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.20'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.20'
55
83
  - !ruby/object:Gem::Dependency
56
84
  name: bundler
57
85
  requirement: !ruby/object:Gem::Requirement
@@ -232,6 +260,7 @@ files:
232
260
  - lib/datahen/plugin.rb
233
261
  - lib/datahen/plugin/context_exposer.rb
234
262
  - lib/datahen/scraper.rb
263
+ - lib/datahen/scraper/batch_parser.rb
235
264
  - lib/datahen/scraper/executor.rb
236
265
  - lib/datahen/scraper/finisher.rb
237
266
  - lib/datahen/scraper/parser.rb
@@ -255,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
255
284
  requirements:
256
285
  - - ">="
257
286
  - !ruby/object:Gem::Version
258
- version: 2.2.2
287
+ version: 2.4.4
259
288
  required_rubygems_version: !ruby/object:Gem::Requirement
260
289
  requirements:
261
290
  - - ">="