datahen 0.14.24 → 0.15.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
4
- data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
3
+ metadata.gz: f77e31da8e2a7ff08086c4aa9d174608a9c3f186679d456b22310b48384d3572
4
+ data.tar.gz: 0bf53ae0886b16bf6fe08b0db07b1a631b69f31d8e3a6868a4d483549049e4ed
5
5
  SHA512:
6
- metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
7
- data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
6
+ metadata.gz: a491874347ed6ac97c0a0e4f0d2c5830140b9367c1e01b4c95e7a447b071df643d12793ed7d6e5a0224b8876905cf74bc13b987e6e3e03e937d1f821557b8ec3
7
+ data.tar.gz: c553a372790654726f2921b6d9d582ca90b314b3fe8d78625cf2442e01cf2ce96ae556e78812d5bd6a03f350beb32aa3158981554179410c12926d480c911887
data/datahen.gemspec CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
33
33
  spec.bindir = "exe"
34
34
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
35
35
  spec.require_paths = ["lib"]
36
- spec.required_ruby_version = '>= 2.2.2'
36
+ spec.required_ruby_version = '>= 2.4.4'
37
37
  spec.add_dependency "thor", "~> 0.20.3"
38
38
  spec.add_dependency 'httparty', '~> 0.16.2'
39
- spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
39
+ spec.add_dependency 'nokogiri', '~> 1.6'
40
+ spec.add_dependency 'concurrent-ruby', '~> 1.1'
41
+ spec.add_dependency 'parallel', '~> 1.20'
40
42
  spec.add_development_dependency 'bundler', '>= 1.16'
41
43
  spec.add_development_dependency 'rake', '>= 10.0'
42
44
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -43,17 +43,17 @@ module Datahen
43
43
  option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
44
44
  option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
45
45
  def exec_parse(scraper_name, parser_file, *gids)
46
+ if options[:job]
47
+ job_id = options[:job]
48
+ else
49
+ job = Client::ScraperJob.new(options).find(scraper_name)
50
+ job_id = job['id']
51
+ end
52
+
46
53
  gids.each do |gid|
47
54
  begin
48
55
  puts "Parsing #{gid}"
49
56
 
50
- if options[:job]
51
- job_id = options[:job]
52
- else
53
- job = Client::ScraperJob.new(options).find(scraper_name)
54
- job_id = job['id']
55
- end
56
-
57
57
  vars = JSON.parse(options[:vars]) if options[:vars]
58
58
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
59
59
  rescue => e
@@ -61,6 +61,47 @@ module Datahen
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
66
+ long_desc <<-LONGDESC
67
+ Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
68
+ LONGDESC
69
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
70
+ option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
71
+ option :"workers", type: :numeric, default: 1, desc: "Worker count"
72
+ option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
73
+ option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
74
+ option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
75
+ option :"dequeue-timeout", type: :numeric, default: 30, desc: "Dequeue pages API request timeout"
76
+ def batch_exec_parse(scraper_name, config_file)
77
+ if options[:job]
78
+ job_id = options[:job]
79
+ else
80
+ job = Client::ScraperJob.new(options).find(scraper_name)
81
+ job_id = job['id']
82
+ end
83
+
84
+ # make the stdout and stderr sync to prevent buffering
85
+ old_stdout_sync = $stdout.sync
86
+ old_stderr_sync = $stderr.sync
87
+ $stdout.sync = true
88
+ $stderr.sync = true
89
+
90
+ begin
91
+ batch = Datahen::Scraper::BatchParser.new job_id, config_file,
92
+ worker_count: options[:"workers"],
93
+ max_garbage: options[:"max-garbage"],
94
+ dequeue_interval: options[:"dequeue-interval"],
95
+ dequeue_scale: options[:"dequeue-scale"]
96
+ batch.exec_parse true, options[:"keep-outputs"]
97
+ rescue => e
98
+ puts [e.message] + e.backtrace
99
+ end
100
+
101
+ # resume whatever state the stdout and stderr sync were
102
+ $stdout.sync = old_stdout_sync
103
+ $stderr.sync = old_stderr_sync
104
+ end
64
105
  end
65
106
  end
66
107
 
@@ -7,7 +7,7 @@ module Datahen
7
7
  include HTTParty
8
8
 
9
9
  def get_content(url)
10
- self.class.get(url, format: :plain)
10
+ self.class.get(url, format: :plain).response.body
11
11
  end
12
12
 
13
13
  def get_gunzipped_content(url)
@@ -42,6 +42,16 @@ module Datahen
42
42
  self.class.post("/jobs/#{job_id}/pages", params)
43
43
  end
44
44
 
45
+ def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
46
+ body = {
47
+ limit: limit,
48
+ page_types: page_types,
49
+ parse_fetching_failed: parse_fetching_failed
50
+ }
51
+ params = @options.merge(opts).merge({body: body.to_json})
52
+ self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
53
+ end
54
+
45
55
  def parsing_update(job_id, gid, opts={})
46
56
  body = {}
47
57
  body[:outputs] = opts.fetch(:outputs) {[]}
@@ -1,6 +1,7 @@
1
1
  require "datahen/error"
2
2
  require "datahen/plugin"
3
3
  require "datahen/scraper/parser"
4
+ require "datahen/scraper/batch_parser"
4
5
  require "datahen/scraper/seeder"
5
6
  require "datahen/scraper/finisher"
6
7
  require "datahen/scraper/executor"
@@ -0,0 +1,358 @@
1
+ require 'concurrent'
2
+ require 'parallel'
3
+
4
+ module Datahen
5
+ module Scraper
6
+ class BatchParser
7
+ NOT_FOUND_MSG = "No more pages to parse found"
8
+ NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
9
+ NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
10
+
11
+ # Configuration file path.
12
+ # @return [String] config file path
13
+ attr_accessor :config_file
14
+ # Garbage collector request counter.
15
+ # @return [Integer] garbage collector counter
16
+ attr_accessor :garbage_count
17
+ # Last printed message, useful to prevent duplicated log messages.
18
+ # @return [String] last printed message
19
+ attr_accessor :last_message
20
+ # Second dequeue counter used to prevent false negative warning messages.
21
+ # @return [Integer] second dequeue counter
22
+ attr_accessor :second_dequeue_count
23
+ # Dequeue API request timeout in seconds.
24
+ # @return [Integer] dequeue API request timeout in seconds
25
+ attr_accessor :dequeue_timeout
26
+ # Job id to be executed.
27
+ # @return [Integer] job id
28
+ attr_reader :job_id
29
+ # Parallel worker quantity.
30
+ # @return [Integer] parallel worker quantity
31
+ attr_reader :worker_count
32
+ # Loaded pages array.
33
+ # @return [Concurrent::Array<Hash>] loaded pages as an array
34
+ attr_reader :pages
35
+ # Loaded pages hash, useful to avoid duplicates on the loaded pages array.
36
+ # @return [Concurrent::Hash<String, Hash>] loaded pages as a concurrent hash
37
+ attr_reader :loaded_pages
38
+ # Max garbage collector requests before actually executing the garbage
39
+ # collector.
40
+ # @return [Integer] max garbage request quantity before actually executing
41
+ # it
42
+ attr_reader :max_garbage
43
+ # Dequeue interval in seconds.
44
+ # @return [Integer] dequeue interval in seconds
45
+ attr_reader :dequeue_interval
46
+ # Dequeue scale used to calculate the ideal dequeue size.
47
+ # @return [Numeric] dequeue scale
48
+ attr_reader :dequeue_scale
49
+ # Known page types extracted from the config file.
50
+ # @return [Array<String>] known page types
51
+ attr_reader :page_types
52
+ # Known parsers extracted from the config file.
53
+ # @return [Concurrent::Hash<String, String>] known parsers
54
+ attr_reader :parsers
55
+ # Current config file loaded.
56
+ # @return [Hash] current loaded configuration
57
+ attr_reader :config
58
+ # Datahen job pages client used for API pages dequeuing.
59
+ # @return [Datahen::Client::JobPage] datahen job pages API client
60
+ attr_reader :client
61
+ # Garbage collector mutex used to synchronize garbage collector requests.
62
+ # @return [Mutex] garbage collector mutex
63
+ attr_reader :garbage_mutex
64
+ # Current dequeuer thread.
65
+ # @return [Thread] dequeuer thread
66
+ attr_reader :dequeuer_thread
67
+ # Dequeuer mutex used to synchronize page dequeuing.
68
+ # @return [Mutex] dequeuer mutex
69
+ attr_reader :dequeue_mutex
70
+ # Dequeuer last run unix timestamp.
71
+ # @return [Integer] dequeuer last run unix timestamp
72
+ attr_reader :dequeuer_still_alive
73
+ # Indicates whether the wait time is because there are no more pages.
74
+ # @return [Boolean] `true` when wait time is due to no more pages,
75
+ # else `false`
76
+ attr_reader :not_found
77
+
78
+ # Wait a specific amount of seconds.
79
+ # @param [Integer] time_in_seconds Seconds to wait.
80
+ def self.wait time_in_seconds
81
+ Kernel.sleep time_in_seconds
82
+ end
83
+
84
+ # Get a unix timestamp.
85
+ # @return [Integer] unix timestamp
86
+ def self.timestamp
87
+ Time.new.utc.to_i
88
+ end
89
+
90
+ # Initialize a batch parser object.
91
+ # @param [Integer] job_id Job id.
92
+ # @param [String] config_file Config file path.
93
+ # @param [Hash] opts ({}) Configuration options
94
+ # @option opts [Integer] :worker_count (1) Parallel worker quantity.
95
+ # @option opts [Integer] :max_garbage (5) Max amount of times the garbage
96
+ # collector can be requested before actually executing.
97
+ # @option opts [Integer] :dequeue_interval (3) Time in seconds to wait
98
+ # between page dequeuing.
99
+ # @option opts [Numeric] :dequeue_scale (2) Scaling factor to used to
100
+ # calculate page dequeue size.
101
+ # @option opts [Numeric] :dequeue_timeout (30) Page dequeue API request
102
+ # timeout in seconds.
103
+ # @option opts [Hash] :client_options ({}) Datahen client gem additional
104
+ # options (see Datahen::Client::Base#initialize method).
105
+ def initialize(job_id, config_file, opts = {})
106
+ opts = {
107
+ worker_count: 1,
108
+ max_garbage: 5,
109
+ dequeue_interval: 3,
110
+ dequeue_scale: 2,
111
+ dequeue_timeout: 30,
112
+ client_options: {}
113
+ }.merge opts
114
+
115
+ @job_id = job_id
116
+ @worker_count = opts[:worker_count]
117
+ @dequeue_interval = opts[:dequeue_interval]
118
+ @dequeue_scale = opts[:dequeue_scale]
119
+ @max_garbage = opts[:max_garbage]
120
+ @pages = Concurrent::Array.new
121
+ @loaded_pages = Concurrent::Hash.new
122
+ @garbage_mutex = Mutex.new
123
+ @dequeue_mutex = Mutex.new
124
+ @not_found = false
125
+ self.dequeue_timeout = opts[:dequeue_timeout]
126
+ self.second_dequeue_count = 0
127
+ self.garbage_count = 0
128
+ self.config_file = config_file
129
+ self.load_config
130
+
131
+ @client = Datahen::Client::JobPage.new(opts[:client_options])
132
+ nil
133
+ end
134
+
135
+ # Execute garbage collector after it is requested as many times as
136
+ # described by #max_garbage.
137
+ def recollect_garbage
138
+ self.garbage_mutex.synchronize do
139
+ self.garbage_count += 1
140
+ if self.garbage_count > self.max_garbage
141
+ puts "Recollect garbage"
142
+ GC.start
143
+ self.garbage_count = 0
144
+ end
145
+ end
146
+ nil
147
+ end
148
+
149
+ # Loads the config file into a Hash.
150
+ def load_config
151
+ # build page type to script file map
152
+ @page_types = []
153
+ @parsers = Concurrent::Hash.new
154
+ @config = YAML.load_file(config_file)
155
+ self.config['parsers'].each do |v|
156
+ next if !v['disabled'].nil? && !!v['disabled']
157
+ @page_types << v['page_type']
158
+ self.parsers[v['page_type']] = v['file']
159
+ end
160
+ self.recollect_garbage
161
+ nil
162
+ end
163
+
164
+ # Print the message regardless of it being the same as the last message.
165
+ # @param [String] message Message to display.
166
+ def repeat_puts message
167
+ puts message
168
+ self.last_message = message
169
+ nil
170
+ end
171
+
172
+ # Print the message only when it is different from the last recorded
173
+ # message.
174
+ # @param [String] message Message to display.
175
+ def no_repeat_puts message
176
+ return if message == self.last_message
177
+ puts message
178
+ self.last_message = message
179
+ nil
180
+ end
181
+
182
+ # Refresh dequeuer's still alive timestamp
183
+ def dequeuer_is_alive!
184
+ self.dequeue_mutex.synchronize do
185
+ @dequeuer_still_alive = self.class.timestamp
186
+ end
187
+ nil
188
+ end
189
+
190
+ # Load new pages by dequeuing from the API.
191
+ # @return [Integer] amount of pages loaded
192
+ def load_pages
193
+ self.dequeuer_is_alive!
194
+
195
+ # calculate dequeue size
196
+ max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
197
+ current_size = self.pages.length
198
+ dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
199
+ if dequeue_size < 1
200
+ return 0
201
+ end
202
+ dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
203
+
204
+ # reserve and get pages to parse
205
+ response = nil
206
+ begin
207
+ response = client.dequeue self.job_id,
208
+ dequeue_size,
209
+ self.page_types,
210
+ config['parse_fetching_failed'],
211
+ timeout: self.dequeue_timeout
212
+ rescue Net::ReadTimeout, Net::OpenTimeout => e
213
+ self.repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
214
+ self.dequeuer_is_alive!
215
+ return 0
216
+ rescue => e
217
+ raise e
218
+ end
219
+ self.dequeuer_is_alive!
220
+
221
+ # ensure a valid response or try again
222
+ if response.nil? || response.response.code.to_i != 200
223
+ self.repeat_puts(response.nil? ? 'null' : response.body)
224
+ self.recollect_garbage
225
+ return 0
226
+ end
227
+
228
+ # add pages
229
+ count = 0
230
+ (JSON.parse(response.body) || []).each do |page|
231
+ count += 1
232
+ next if self.loaded_pages.has_key? page['gid']
233
+ self.pages << (self.loaded_pages[page['gid']] = page)
234
+ end
235
+ response = nil
236
+ self.dequeuer_is_alive!
237
+
238
+ # recollect garbage to free some memory before parsing
239
+ if count > 0
240
+ @not_found = false
241
+ self.recollect_garbage
242
+ self.repeat_puts "Found #{count} page(s) to parse"
243
+ self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
244
+ else
245
+ @not_found = true
246
+ self.no_repeat_puts NOT_FOUND_MSG
247
+ end
248
+
249
+ # return how many pages were loaded
250
+ count
251
+ end
252
+
253
+ # Ensures that the dequeuer thread exists and is running.
254
+ # @return [Boolean] `true` if thread was alive, or `false` if had to
255
+ # create a new thread
256
+ def ensure_dequeuer_thread
257
+ self.dequeue_mutex.synchronize do
258
+ # check if dequeuer thread is alive and healthy
259
+ if !self.dequeuer_thread.nil? && self.dequeuer_thread.alive?
260
+ still_alive_timeout = (self.dequeue_timeout + self.dequeue_interval) * 2 + self.dequeuer_still_alive
261
+ return true if self.class.timestamp < still_alive_timeout
262
+
263
+ # kill dequeuer thread
264
+ self.repeat_puts "Dequeuer isn't healthy, will restart it..."
265
+ self.dequeuer_thread.kill
266
+ @dequeuer_thread = nil
267
+ self.recollect_garbage
268
+ self.no_repeat_puts "Dequeuer thread was killed!"
269
+ end
270
+
271
+ # dequeuing on parallel (the ride never ends :D)
272
+ @dequeuer_thread = Thread.new do
273
+ while true
274
+ begin
275
+ self.load_pages
276
+ self.class.wait self.dequeue_interval
277
+ rescue => e
278
+ puts [e.message] + e.backtrace rescue 'error'
279
+ end
280
+ end
281
+ puts "Error: dequeuer died! D:"
282
+ end
283
+ self.repeat_puts "Dequeuer thread was started!"
284
+ end
285
+ false
286
+ end
287
+
288
+ # Dequeue one page from the previously loaded pages, and waits until there
289
+ # are new pages whenever there are no loaded pages.
290
+ # @return [Hash] dequeued page
291
+ def dequeue_pages
292
+ # collect garbage
293
+ self.recollect_garbage
294
+
295
+ # return page if there are loaded pages
296
+ is_waiting = false
297
+ while true do
298
+ page = self.pages.shift
299
+ unless page.nil?
300
+ puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
301
+ loaded_pages.delete(page['gid'])
302
+ return page
303
+ end
304
+
305
+ # be more verbose on worker waiting
306
+ unless is_waiting
307
+ is_waiting = true
308
+ puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
309
+ if self.second_dequeue_count > 1 && !self.not_found
310
+ puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
311
+ end
312
+ end
313
+ self.class.wait 1
314
+
315
+ # ensure the dequeuer thread is alive and healthy
316
+ self.ensure_dequeuer_thread
317
+ end
318
+ end
319
+
320
+ # Dequeue pages and execute the parsers associated to them on parallel.
321
+ def exec_parse save = false, keep_outputs = false
322
+ if self.worker_count < 1
323
+ self.no_repeat_puts NO_WORKERS_MSG
324
+ return
325
+ else
326
+ self.no_repeat_puts "Spawing #{self.worker_count} workers"
327
+ end
328
+
329
+ # start dequeuer
330
+ self.ensure_dequeuer_thread
331
+
332
+ # process the pages
333
+ dequeue = lambda{ self.dequeue_pages }
334
+ Parallel.each(dequeue, in_threads: (worker_count)) do |page|
335
+ parser_file = self.parsers[page['page_type']]
336
+ begin
337
+ puts Datahen::Scraper::Parser.exec_parser_by_page(
338
+ parser_file,
339
+ page,
340
+ job_id,
341
+ save,
342
+ nil,
343
+ keep_outputs
344
+ )
345
+ rescue Parallel::Kill => e
346
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
347
+ rescue Parallel::Break => e
348
+ puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
349
+ rescue => e
350
+ puts [e.message] + e.backtrace rescue 'error'
351
+ end
352
+ end
353
+
354
+ nil
355
+ end
356
+ end
357
+ end
358
+ end
@@ -6,7 +6,7 @@ module Datahen
6
6
  # Max allowed page size when query outputs (see #find_outputs).
7
7
  MAX_FIND_OUTPUTS_PER_PAGE = 500
8
8
 
9
- attr_accessor :filename, :gid, :job_id
9
+ attr_accessor :filename, :page, :gid, :job_id
10
10
 
11
11
  include Datahen::Plugin::ContextExposer
12
12
 
@@ -15,6 +15,9 @@ module Datahen
15
15
  end
16
16
 
17
17
  def init_page()
18
+ # skip whenever a page is provided
19
+ return self.page unless self.page.nil?
20
+
18
21
  if job_id
19
22
  puts "getting Job Page"
20
23
  init_job_page
@@ -18,6 +18,22 @@ module Datahen
18
18
  end
19
19
  end
20
20
 
21
+ def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
22
+ extname = File.extname(filename)
23
+ case extname
24
+ when '.rb'
25
+ executor = RubyParserExecutor.new(
26
+ filename: filename,
27
+ page: page,
28
+ job_id: job_id,
29
+ vars: vars,
30
+ keep_outputs: keep_outputs
31
+ )
32
+ executor.exec_parser(save)
33
+ else
34
+ puts "Unable to find a parser executor for file type \"#{extname}\""
35
+ end
36
+ end
21
37
 
22
38
  end
23
39
  end
@@ -12,7 +12,8 @@ module Datahen
12
12
 
13
13
  def initialize(options={})
14
14
  @filename = options.fetch(:filename) { raise "Filename is required"}
15
- @gid = options.fetch(:gid) { raise "GID is required"}
15
+ @page = options.fetch(:page) { nil }
16
+ @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
16
17
  @job_id = options.fetch(:job_id)
17
18
  @page_vars = options.fetch(:vars) { {} }
18
19
  @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
46
47
  end
47
48
 
48
49
  def init_page_vars(page)
50
+ return self.page unless self.page.nil?
51
+
49
52
  if !@page_vars.nil? && !@page_vars.empty?
50
53
  page['vars'] = @page_vars
51
54
  end
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.24"
2
+ VERSION = "0.15.11"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.24
4
+ version: 0.15.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-03-16 00:00:00.000000000 Z
11
+ date: 2021-05-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -45,9 +45,6 @@ dependencies:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.6'
48
- - - "<"
49
- - !ruby/object:Gem::Version
50
- version: '1.10'
51
48
  type: :runtime
52
49
  prerelease: false
53
50
  version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
55
52
  - - "~>"
56
53
  - !ruby/object:Gem::Version
57
54
  version: '1.6'
58
- - - "<"
55
+ - !ruby/object:Gem::Dependency
56
+ name: concurrent-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.20'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
59
81
  - !ruby/object:Gem::Version
60
- version: '1.10'
82
+ version: '1.20'
61
83
  - !ruby/object:Gem::Dependency
62
84
  name: bundler
63
85
  requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
238
260
  - lib/datahen/plugin.rb
239
261
  - lib/datahen/plugin/context_exposer.rb
240
262
  - lib/datahen/scraper.rb
263
+ - lib/datahen/scraper/batch_parser.rb
241
264
  - lib/datahen/scraper/executor.rb
242
265
  - lib/datahen/scraper/finisher.rb
243
266
  - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
261
284
  requirements:
262
285
  - - ">="
263
286
  - !ruby/object:Gem::Version
264
- version: 2.2.2
287
+ version: 2.4.4
265
288
  required_rubygems_version: !ruby/object:Gem::Requirement
266
289
  requirements:
267
290
  - - ">="