datahen 0.14.24 → 0.15.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
-  data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
+  metadata.gz: f77e31da8e2a7ff08086c4aa9d174608a9c3f186679d456b22310b48384d3572
+  data.tar.gz: 0bf53ae0886b16bf6fe08b0db07b1a631b69f31d8e3a6868a4d483549049e4ed
 SHA512:
-  metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
-  data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
+  metadata.gz: a491874347ed6ac97c0a0e4f0d2c5830140b9367c1e01b4c95e7a447b071df643d12793ed7d6e5a0224b8876905cf74bc13b987e6e3e03e937d1f821557b8ec3
+  data.tar.gz: c553a372790654726f2921b6d9d582ca90b314b3fe8d78625cf2442e01cf2ce96ae556e78812d5bd6a03f350beb32aa3158981554179410c12926d480c911887
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.2.2'
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
-  spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
+  spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
@@ -43,17 +43,17 @@ module Datahen
      option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
      def exec_parse(scraper_name, parser_file, *gids)
+       if options[:job]
+         job_id = options[:job]
+       else
+         job = Client::ScraperJob.new(options).find(scraper_name)
+         job_id = job['id']
+       end
+
        gids.each do |gid|
          begin
            puts "Parsing #{gid}"

-           if options[:job]
-             job_id = options[:job]
-           else
-             job = Client::ScraperJob.new(options).find(scraper_name)
-             job_id = job['id']
-           end
-
            vars = JSON.parse(options[:vars]) if options[:vars]
            puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
          rescue => e
@@ -61,6 +61,47 @@ module Datahen
          end
        end
      end
+
+     desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+     long_desc <<-LONGDESC
+       Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+     LONGDESC
+     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+     option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+     option :"workers", type: :numeric, default: 1, desc: "Worker count"
+     option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+     option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+     option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+     option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
+     def batch_exec_parse(scraper_name, config_file)
+       if options[:job]
+         job_id = options[:job]
+       else
+         job = Client::ScraperJob.new(options).find(scraper_name)
+         job_id = job['id']
+       end
+
+       # make stdout and stderr sync to prevent buffering
+       old_stdout_sync = $stdout.sync
+       old_stderr_sync = $stderr.sync
+       $stdout.sync = true
+       $stderr.sync = true
+
+       begin
+         batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+           worker_count: options[:"workers"],
+           max_garbage: options[:"max-garbage"],
+           dequeue_interval: options[:"dequeue-interval"],
+           dequeue_scale: options[:"dequeue-scale"]
+         batch.exec_parse true, options[:"keep-outputs"]
+       rescue => e
+         puts [e.message] + e.backtrace
+       end
+
+       # restore the previous stdout and stderr sync state
+       $stdout.sync = old_stdout_sync
+       $stderr.sync = old_stderr_sync
+     end
   end
 end

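The new `batch_exec_parse` Thor command is the CLI entry point for the `BatchParser` class added below: it resolves the job id, unbuffers stdout/stderr so worker logs stream immediately, and delegates to `Datahen::Scraper::BatchParser`. Assuming the command is registered under the CLI's parser namespace (the Thor class that mounts it is not shown in this diff), an invocation would look like `datahen parser batch my-scraper datahen.yml --workers 3 --dequeue-scale 2`.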
@@ -7,7 +7,7 @@ module Datahen
       include HTTParty

       def get_content(url)
-        self.class.get(url, format: :plain)
+        self.class.get(url, format: :plain).response.body
       end

       def get_gunzipped_content(url)
@@ -42,6 +42,16 @@ module Datahen
         self.class.post("/jobs/#{job_id}/pages", params)
       end

+      def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+        body = {
+          limit: limit,
+          page_types: page_types,
+          parse_fetching_failed: parse_fetching_failed
+        }
+        params = @options.merge(opts).merge({body: body.to_json})
+        self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+      end
+
       def parsing_update(job_id, gid, opts={})
         body = {}
         body[:outputs] = opts.fetch(:outputs) {[]}
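`JobPage#dequeue` is the client half of the new batch flow: it reserves up to `limit` pending pages of the given page types via the `parse_dequeue` endpoint. A minimal sketch of a direct call, assuming API credentials are already configured and using illustrative values for the job id, limit, and page type:

    require 'datahen'
    require 'json'

    client = Datahen::Client::JobPage.new
    # Reserve up to 4 pending "listings" pages on job 123, skipping pages
    # whose fetch failed; give up on the request after 30 seconds.
    response = client.dequeue(123, 4, ['listings'], false, timeout: 30)
    pages = JSON.parse(response.body) || []
    pages.each { |page| puts page['gid'] }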
@@ -1,6 +1,7 @@
 require "datahen/error"
 require "datahen/plugin"
 require "datahen/scraper/parser"
+require "datahen/scraper/batch_parser"
 require "datahen/scraper/seeder"
 require "datahen/scraper/finisher"
 require "datahen/scraper/executor"
@@ -0,0 +1,358 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      # Configuration file path.
+      # @return [String] config file path
+      attr_accessor :config_file
+      # Garbage collector request counter.
+      # @return [Integer] garbage collector counter
+      attr_accessor :garbage_count
+      # Last printed message, useful to prevent duplicated log messages.
+      # @return [String] last printed message
+      attr_accessor :last_message
+      # Second dequeue counter used to prevent false negative warning messages.
+      # @return [Integer] second dequeue counter
+      attr_accessor :second_dequeue_count
+      # Dequeue API request timeout in seconds.
+      # @return [Integer] dequeue API request timeout in seconds
+      attr_accessor :dequeue_timeout
+      # Job id to be executed.
+      # @return [Integer] job id
+      attr_reader :job_id
+      # Parallel worker quantity.
+      # @return [Integer] parallel worker quantity
+      attr_reader :worker_count
+      # Loaded pages array.
+      # @return [Concurrent::Array<Hash>] loaded pages as an array
+      attr_reader :pages
+      # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
+      # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
+      attr_reader :loaded_pages
+      # Max garbage collector requests before actually executing the garbage
+      #   collector.
+      # @return [Integer] max garbage request quantity before actually executing
+      #   it
+      attr_reader :max_garbage
+      # Dequeue interval in seconds.
+      # @return [Integer] dequeue interval in seconds
+      attr_reader :dequeue_interval
+      # Dequeue scale used to calculate the ideal dequeue size.
+      # @return [Numeric] dequeue scale
+      attr_reader :dequeue_scale
+      # Known page types extracted from the config file.
+      # @return [Array<String>] known page types
+      attr_reader :page_types
+      # Known parsers extracted from the config file.
+      # @return [Concurrent::Hash<String, String>] known parsers
+      attr_reader :parsers
+      # Current config file loaded.
+      # @return [Hash] current loaded configuration
+      attr_reader :config
+      # Datahen job pages client used for API pages dequeuing.
+      # @return [Datahen::Client::JobPage] datahen job pages API client
+      attr_reader :client
+      # Garbage collector mutex used to synchronize garbage collector requests.
+      # @return [Mutex] garbage collector mutex
+      attr_reader :garbage_mutex
+      # Current dequeuer thread.
+      # @return [Thread] dequeuer thread
+      attr_reader :dequeuer_thread
+      # Dequeuer mutex used to synchronize page dequeuing.
+      # @return [Mutex] dequeuer mutex
+      attr_reader :dequeue_mutex
+      # Dequeuer last run unix timestamp.
+      # @return [Integer] dequeuer last run unix timestamp
+      attr_reader :dequeuer_still_alive
+      # Indicates whether the current wait is because there are no more pages.
+      # @return [Boolean] `true` when wait time is due to no more pages,
+      #   else `false`
+      attr_reader :not_found
+
+      # Wait a specific amount of seconds.
+      # @param [Integer] time_in_seconds Seconds to wait.
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      # Get a unix timestamp.
+      # @return [Integer] unix timestamp
+      def self.timestamp
+        Time.new.utc.to_i
+      end
+
+      # Initialize a batch parser object.
+      # @param [Integer] job_id Job id.
+      # @param [String] config_file Config file path.
+      # @param [Hash] opts ({}) Configuration options
+      # @option opts [Integer] :worker_count (1) Parallel worker quantity.
+      # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
+      #   collector can be requested before actually executing.
+      # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
+      #   between page dequeuing.
+      # @option opts [Numeric] :dequeue_scale (2) Scaling factor used to
+      #   calculate page dequeue size.
+      # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
+      #   timeout in seconds.
+      # @option opts [Hash] :client_options ({}) Datahen client gem additional
+      #   options (see Datahen::Client::Base#initialize method).
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          dequeue_timeout: 30,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Array.new
+        @loaded_pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        @dequeue_mutex = Mutex.new
+        @not_found = false
+        self.dequeue_timeout = opts[:dequeue_timeout]
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+        nil
+      end
+
+      # Execute the garbage collector after it has been requested as many
+      #   times as described by #max_garbage.
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+        nil
+      end
+
+      # Loads the config file into a Hash.
+      def load_config
+        # build page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+        nil
+      end
+
+      # Print the message regardless of it being the same as the last message.
+      # @param [String] message Message to display.
+      def repeat_puts message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Print the message only when it is different from the last recorded
+      #   message.
+      # @param [String] message Message to display.
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+        nil
+      end
+
+      # Refresh the dequeuer's still-alive timestamp.
+      def dequeuer_is_alive!
+        self.dequeue_mutex.synchronize do
+          @dequeuer_still_alive = self.class.timestamp
+        end
+        nil
+      end
+
+      # Load new pages by dequeuing from the API.
+      # @return [Integer] amount of pages loaded
+      def load_pages
+        self.dequeuer_is_alive!
+
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get the pages to parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed'],
+            timeout: self.dequeue_timeout
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          self.dequeuer_is_alive!
+          return 0
+        rescue => e
+          raise e
+        end
+        self.dequeuer_is_alive!
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.loaded_pages.has_key? page['gid']
+          self.pages << (self.loaded_pages[page['gid']] = page)
+        end
+        response = nil
+        self.dequeuer_is_alive!
+
+        # recollect garbage to free some memory before parsing
+        if count > 0
+          @not_found = false
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          @not_found = true
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
+      # Ensures that the dequeuer thread exists and is running.
+      # @return [Boolean] `true` if the thread was alive, or `false` if a new
+      #   thread had to be created
+      def ensure_dequeuer_thread
+        self.dequeue_mutex.synchronize do
+          # check if dequeuer thread is alive and healthy
+          if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
+            still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
+            return true if self.class.timestamp < still_alive_timeout
+
+            # kill dequeuer thread
+            self.repeat_puts "Dequeuer isn't healthy, will restart it..."
+            self.dequeuer_thread.kill
+            @dequeuer_thread = nil
+            self.recollect_garbage
+            self.no_repeat_puts "Dequeuer thread was killed!"
+          end
+
+          # dequeuing in parallel (the ride never ends :D)
+          @dequeuer_thread = Thread.new do
+            while true
+              begin
+                self.load_pages
+                self.class.wait self.dequeue_interval
+              rescue => e
+                puts [e.message] + e.backtrace rescue 'error'
+              end
+            end
+            puts "Error: dequeuer died! D:"
+          end
+          self.repeat_puts "Dequeuer thread was started!"
+        end
+        false
+      end
+
+      # Dequeue one page from the previously loaded pages, waiting for new
+      #   pages whenever there are no loaded pages left.
+      # @return [Hash] dequeued page
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return a page if there are loaded pages
+        is_waiting = false
+        while true do
+          page = self.pages.shift
+          unless page.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            loaded_pages.delete(page['gid'])
+            return page
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1 && !self.not_found
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+
+          # ensure the dequeuer thread is alive and healthy
+          self.ensure_dequeuer_thread
+        end
+      end
+
+      # Dequeue pages and execute the parsers associated with them in parallel.
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawning #{self.worker_count} workers"
+        end
+
+        # start dequeuer
+        self.ensure_dequeuer_thread
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+
+        nil
+      end
+    end
+  end
+end
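That file — `lib/datahen/scraper/batch_parser.rb`, also added to the gem's file list in the metadata below — is what `batch_exec_parse` drives. A usage sketch with illustrative values (job id 123, a local `datahen.yml`); the comments work through the dequeue sizing that `load_pages` performs for these numbers:

    require 'datahen'

    # With worker_count 3 and dequeue_scale 2, load_pages computes
    #   max_dequeue_size = (3 * 2).ceil           # => 6
    # and, when 2 pages are already loaded,
    #   dequeue_size = (2 * (6 - 2)).ceil         # => 8, capped at 6
    # so each poll asks the API for at most 6 pages.
    batch = Datahen::Scraper::BatchParser.new 123, 'datahen.yml',
      worker_count: 3,
      max_garbage: 5,
      dequeue_interval: 3,
      dequeue_scale: 2,
      dequeue_timeout: 30

    # Save outputs (first argument) without keeping pre-existing ones.
    batch.exec_parse true, false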
@@ -6,7 +6,7 @@ module Datahen
       # Max allowed page size when querying outputs (see #find_outputs).
       MAX_FIND_OUTPUTS_PER_PAGE = 500

-      attr_accessor :filename, :gid, :job_id
+      attr_accessor :filename, :page, :gid, :job_id

       include Datahen::Plugin::ContextExposer

@@ -15,6 +15,9 @@ module Datahen
       end

       def init_page()
+        # skip whenever a page is provided
+        return self.page unless self.page.nil?
+
         if job_id
           puts "getting Job Page"
           init_job_page
@@ -18,6 +18,22 @@ module Datahen
        end
      end

+      def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+        extname = File.extname(filename)
+        case extname
+        when '.rb'
+          executor = RubyParserExecutor.new(
+            filename: filename,
+            page: page,
+            job_id: job_id,
+            vars: vars,
+            keep_outputs: keep_outputs
+          )
+          executor.exec_parser(save)
+        else
+          puts "Unable to find a parser executor for file type \"#{extname}\""
+        end
+      end

    end
  end
@@ -12,7 +12,8 @@ module Datahen

       def initialize(options={})
         @filename = options.fetch(:filename) { raise "Filename is required"}
-        @gid = options.fetch(:gid) { raise "GID is required"}
+        @page = options.fetch(:page) { nil }
+        @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
         @job_id = options.fetch(:job_id)
         @page_vars = options.fetch(:vars) { {} }
         @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
       end

       def init_page_vars(page)
+        return self.page unless self.page.nil?
+
         if !@page_vars.nil? && !@page_vars.empty?
           page['vars'] = @page_vars
         end
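Together with the new `page:` option in `initialize` above, this early return lets the executor run against an in-memory page handed over by the batch parser instead of re-fetching the page by GID.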
@@ -1,3 +1,3 @@
 module Datahen
-  VERSION = "0.14.24"
+  VERSION = "0.15.11"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.24
+  version: 0.15.11
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-03-16 00:00:00.000000000 Z
+date: 2021-05-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -45,9 +45,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
-      - !ruby/object:Gem::Version
-        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.10'
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.2.2
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="