datahen 0.14.22 → 0.15.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/datahen.gemspec +4 -2
- data/lib/datahen/cli/parser.rb +56 -17
- data/lib/datahen/client/backblaze_content.rb +14 -10
- data/lib/datahen/client/job_page.rb +10 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/batch_parser.rb +203 -0
- data/lib/datahen/scraper/executor.rb +9 -1
- data/lib/datahen/scraper/parser.rb +16 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_parser_executor.rb +4 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +31 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 84e5a734ca1b827228db1550d14f4e9b21e1369d88259a7067dc9209c9a5605b
+  data.tar.gz: 629b472f401b88dc4caabdb87264f9ab1d39d492ff27ac673a79d8ec2c411928
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 320bd2aded5b02fa14a0e4f4cb1c1d4a7d64fe6f4fedcffc62bb04a4c71eab215174f874dc2408dc4416ebf357017f5e743e3f1120fd0e944daeca51deee5311
+  data.tar.gz: 8a86dbc795fae177e4bbd7bde44892ced937cd0fe74383ad962528f25d7d1ef9118897ae24219379299fdd7650fa433ffd436faca71af925c723679f8780db14
data/datahen.gemspec
CHANGED
@@ -33,10 +33,12 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.required_ruby_version = '>= 2.
+  spec.required_ruby_version = '>= 2.4.4'
   spec.add_dependency "thor", "~> 0.20.3"
   spec.add_dependency 'httparty', '~> 0.16.2'
-  spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
+  spec.add_dependency 'nokogiri', '~> 1.6'
+  spec.add_dependency 'concurrent-ruby', '~> 1.1'
+  spec.add_dependency 'parallel', '~> 1.20'
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'rake', '>= 10.0'
   spec.add_development_dependency 'minitest', '>= 5.11'
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
     def try_parse(scraper_name, parser_file, gid)
       begin

-
-
-
-
-
-
-
-
-
+        if options[:job]
+          job_id = options[:job]
+        elsif options[:global]
+          job_id = nil
+        else
+          job = Client::ScraperJob.new(options).find(scraper_name)
+          job_id = job['id']
+        end

         vars = JSON.parse(options[:vars]) if options[:vars]
         puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])

-
+      rescue JSON::ParserError
         if options[:vars]
           puts "Error: #{options[:vars]} on vars is not a valid JSON"
         end
@@ -44,17 +43,17 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
     def exec_parse(scraper_name, parser_file, *gids)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
       gids.each do |gid|
         begin
           puts "Parsing #{gid}"

-          if options[:job]
-            job_id = options[:job]
-          else
-            job = Client::ScraperJob.new(options).find(scraper_name)
-            job_id = job['id']
-          end
-
           vars = JSON.parse(options[:vars]) if options[:vars]
           puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, true, vars, options[:"keep-outputs"])
         rescue => e
@@ -62,6 +61,46 @@ module Datahen
         end
       end
    end
+
+    desc "batch <scraper_name> <config_file>", "Dequeue and execute Job Pages within a scraper's current job"
+    long_desc <<-LONGDESC
+      Dequeue pending job page(s) to execute their scripts and save the output to the scraper's current job\x5
+    LONGDESC
+    option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+    option :"keep-outputs", :aliases => :ko, type: :boolean, default: false, desc: "Don't delete existing outputs"
+    option :"workers", type: :numeric, default: 1, desc: "Worker count"
+    option :"max-garbage", type: :numeric, default: 5, desc: "Pages processed before calling the garbage collector"
+    option :"dequeue-interval", type: :numeric, default: 3, desc: "Seconds to wait between dequeueing"
+    option :"dequeue-scale", type: :numeric, default: 2, desc: "Scale vs worker count describing how many pages to dequeue"
+    def batch_exec_parse(scraper_name, config_file)
+      if options[:job]
+        job_id = options[:job]
+      else
+        job = Client::ScraperJob.new(options).find(scraper_name)
+        job_id = job['id']
+      end
+
+      # make the stdout and stderr sync to prevent buffering
+      old_stdout_sync = $stdout.sync
+      old_stderr_sync = $stderr.sync
+      $stdout.sync = true
+      $stderr.sync = true
+
+      begin
+        batch = Datahen::Scraper::BatchParser.new job_id, config_file,
+          worker_count: options[:"workers"],
+          max_garbage: options[:"max-garbage"],
+          dequeue_interval: options[:"dequeue-interval"],
+          dequeue_scale: options[:"dequeue-scale"]
+        batch.exec_parse true, options[:"keep-outputs"]
+      rescue => e
+        puts [e.message] + e.backtrace
+      end

+      # resume whatever state the stdout and stderr sync were
+      $stdout.sync = old_stdout_sync
+      $stderr.sync = old_stderr_sync
+    end
   end
 end

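Note: given the Thor definitions above, the new command would presumably be invoked as shown below; the `parser` namespace is an assumption based on this file's name, not something the diff itself shows.

# hypothetical invocations, not part of the diff:
#   datahen parser batch my-scraper datahen.yaml --workers 4 --dequeue-scale 3
#   datahen parser batch my-scraper datahen.yaml --job 123 --keep-outputs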
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,10 +4,10 @@ require 'httparty'
 module Datahen
   module Client
     class BackblazeContent
-      include HTTParty
-
+      include HTTParty
+
       def get_content(url)
-        self.class.get(url, format: :plain)
+        self.class.get(url, format: :plain).response.body
       end

       def get_gunzipped_content(url)
@@ -19,19 +19,23 @@ module Datahen
         sio = StringIO.new(string)
         gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
         _content = ""
-        begin
+        begin
           _content = gz.read
         rescue => e
           # if unexpected eof error, then readchar until error, and ignore it
           if e.to_s == 'unexpected end of file'
-
-
-
-
+            # heavily improve content read recovery by using "String#<<",
+            # reading all "good" lines and then concat the remaining chars
+            begin
+              gz.each_line{|line| _content << line}
             rescue => e
-
+              begin
+                _content << gz.readchar while !gz.eof
+              rescue => e
+                puts "Ignored Zlib error: #{e.to_s}"
+              end
             end
-          else
+          else
             raise e
           end
         end
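Note: the recovery path above tolerates truncated gzip payloads by salvaging complete lines and then single characters. A standalone sketch of the same idea (the method name is ours, not the gem's):

require 'zlib'
require 'stringio'

def read_gzip_tolerantly(bytes)
  gz = Zlib::GzipReader.new(StringIO.new(bytes), encoding: Encoding::ASCII_8BIT)
  content = ""
  begin
    content = gz.read
  rescue => e
    raise e unless e.to_s == 'unexpected end of file'
    begin
      gz.each_line { |line| content << line } # keep every fully readable line
    rescue
      begin
        content << gz.readchar until gz.eof # then grab the remaining chars
      rescue => e
        puts "Ignored Zlib error: #{e}" # truncated tail; ignore, as the gem does
      end
    end
  end
  content
end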
data/lib/datahen/client/job_page.rb
CHANGED
@@ -42,6 +42,16 @@ module Datahen
       self.class.post("/jobs/#{job_id}/pages", params)
     end

+    def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
+      body = {
+        limit: limit,
+        page_types: page_types,
+        parse_fetching_failed: parse_fetching_failed
+      }
+      params = @options.merge({body: body.to_json, timeout: 30})
+      self.class.put("/jobs/#{job_id}/pages/parse_dequeue", params)
+    end
+
     def parsing_update(job_id, gid, opts={})
       body = {}
       body[:outputs] = opts.fetch(:outputs) {[]}
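Note: a hedged sketch of calling the new endpoint, reserving up to 50 queued pages of the given types on job 123 for parsing. The job ID and page types are placeholders; the return value is the raw HTTParty response.

require 'json'
require 'datahen'

client = Datahen::Client::JobPage.new
response = client.dequeue(123, 50, ['products', 'listings'], false)
pages = response.code == 200 ? JSON.parse(response.body) : []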
data/lib/datahen/scraper.rb
CHANGED
data/lib/datahen/scraper/batch_parser.rb
ADDED
@@ -0,0 +1,203 @@
+require 'concurrent'
+require 'parallel'
+
+module Datahen
+  module Scraper
+    class BatchParser
+      NOT_FOUND_MSG = "No more pages to parse found"
+      NO_DEQUEUE_COUNT_MSG = "\nWarning: Max page to parse dequeue count is 0, check pages to parse scale\n"
+      NO_WORKERS_MSG = "\nWarning: There are no parser workers\n"
+
+      attr_accessor :config_file, :garbage_count, :last_message, :second_dequeue_count
+      attr_reader :job_id, :worker_count, :pages, :max_garbage
+      attr_reader :dequeue_interval, :dequeue_scale
+      attr_reader :page_types, :parsers
+      attr_reader :config, :client, :garbage_mutex
+
+      def self.wait time_in_seconds
+        Kernel.sleep time_in_seconds
+      end
+
+      def initialize(job_id, config_file, opts = {})
+        opts = {
+          worker_count: 1,
+          max_garbage: 5,
+          dequeue_interval: 3,
+          dequeue_scale: 2,
+          client_options: {}
+        }.merge opts
+
+        @job_id = job_id
+        @worker_count = opts[:worker_count]
+        @dequeue_interval = opts[:dequeue_interval]
+        @dequeue_scale = opts[:dequeue_scale]
+        @max_garbage = opts[:max_garbage]
+        @pages = Concurrent::Hash.new
+        @garbage_mutex = Mutex.new
+        self.second_dequeue_count = 0
+        self.garbage_count = 0
+        self.config_file = config_file
+        self.load_config
+
+        @client = Datahen::Client::JobPage.new(opts[:client_options])
+      end
+
+      def recollect_garbage
+        self.garbage_mutex.synchronize do
+          self.garbage_count += 1
+          if self.garbage_count > self.max_garbage
+            puts "Recollect garbage"
+            GC.start
+            self.garbage_count = 0
+          end
+        end
+      end
+
+      def load_config
+        # build page type to script file map
+        @page_types = []
+        @parsers = Concurrent::Hash.new
+        @config = YAML.load_file(config_file)
+        self.config['parsers'].each do |v|
+          next if !v['disabled'].nil? && !!v['disabled']
+          @page_types << v['page_type']
+          self.parsers[v['page_type']] = v['file']
+        end
+        self.recollect_garbage
+      end
+
+      def repeat_puts message
+        puts message
+        self.last_message = ''
+      end
+
+      def no_repeat_puts message
+        return if message == self.last_message
+        puts message
+        self.last_message = message
+      end
+
+      def load_pages
+        # calculate dequeue size
+        max_dequeue_size = (self.worker_count * self.dequeue_scale).ceil
+        current_size = self.pages.length
+        dequeue_size = (self.dequeue_scale * (max_dequeue_size - current_size)).ceil
+        if dequeue_size < 1
+          return 0
+        end
+        dequeue_size = max_dequeue_size if dequeue_size > max_dequeue_size
+
+        # reserve and get to pages parse
+        response = nil
+        begin
+          response = client.dequeue self.job_id,
+            dequeue_size,
+            self.page_types,
+            config['parse_fetching_failed']
+        rescue Net::ReadTimeout, Net::OpenTimeout => e
+          self.no_repeat_puts "Dequeue API call timeout! Contact infra team, your job needs a profile change"
+          return 0
+        rescue => e
+          raise e
+        end
+
+        # ensure a valid response or try again
+        if response.nil? || response.response.code.to_i != 200
+          self.repeat_puts(response.nil? ? 'null' : response.body)
+          self.recollect_garbage
+          return 0
+        end
+
+        # add pages
+        count = 0
+        (JSON.parse(response.body) || []).each do |page|
+          count += 1
+          next if self.pages.has_key? page['gid']
+          self.pages[page['gid']] = page
+        end
+        response = nil
+
+        # recolect garbage to free some memory before parsing
+        if count > 0
+          self.recollect_garbage
+          self.repeat_puts "Found #{count} page(s) to parse"
+          self.second_dequeue_count += 1 unless self.second_dequeue_count > 1
+        else
+          self.no_repeat_puts NOT_FOUND_MSG
+        end
+
+        # return how many pages were loaded
+        count
+      end
+
+      def dequeue_pages
+        # collect garbage
+        self.recollect_garbage
+
+        # return page if there are loeaded pages
+        is_waiting = false
+        while true do
+          key_value = self.pages.shift
+          unless key_value.nil?
+            puts "[Worker #{Parallel.worker_number}]: Finish waiting" if is_waiting
+            return key_value[1]
+          end
+
+          # be more verbose on worker waiting
+          unless is_waiting
+            is_waiting = true
+            puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
+            if self.second_dequeue_count > 1
+              puts "\nWARNING: Your job is not optimized, increase your job's \"parser_dequeue_scale\"\n"
+            end
+          end
+          self.class.wait 1
+        end
+      end
+
+      def exec_parse save = false, keep_outputs = false
+        if self.worker_count < 1
+          self.no_repeat_puts NO_WORKERS_MSG
+          return
+        else
+          self.no_repeat_puts "Spawing #{self.worker_count} workers"
+        end
+
+        # dequeuing on parallel (the ride never ends :D)
+        Thread.new do
+          while true
+            begin
+              self.load_pages
+              self.class.wait self.dequeue_interval
+            rescue => e
+              puts [e.message] + e.backtrace rescue 'error'
+            end
+          end
+          puts "Error: dequeuer died! D:"
+        end
+
+        # process the pages
+        dequeue = lambda{ self.dequeue_pages }
+        Parallel.each(dequeue, in_threads: (worker_count)) do |page|
+          parser_file = self.parsers[page['page_type']]
+          begin
+            puts Datahen::Scraper::Parser.exec_parser_by_page(
+              parser_file,
+              page,
+              job_id,
+              save,
+              nil,
+              keep_outputs
+            )
+          rescue Parallel::Kill => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to kill Parallel!!!"
+          rescue Parallel::Break => e
+            puts "[Worker #{Parallel.worker_number}]: Someone tried to break Parallel!!!"
+          rescue => e
+            puts [e.message] + e.backtrace rescue 'error'
+          end
+        end
+      end
+    end
+  end
+end
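To make the dequeue sizing in load_pages concrete: with worker_count 4 and dequeue_scale 2, max_dequeue_size is (4 * 2).ceil = 8; if 3 pages are still buffered, dequeue_size is (2 * (8 - 3)).ceil = 10, which is then capped back to 8. A minimal sketch of driving the class directly, mirroring the new CLI command; the job ID and config path are placeholders, and note that exec_parse loops indefinitely, dequeueing until the process is stopped:

require 'datahen'

# dequeues pages on a background thread and parses them on 4 worker threads;
# exec_parse(true, false) saves outputs and deletes pre-existing ones
batch = Datahen::Scraper::BatchParser.new 123, 'datahen.yaml',
  worker_count: 4,     # parser threads
  dequeue_scale: 2,    # buffer up to 4 * 2 = 8 pages
  dequeue_interval: 3, # seconds between dequeue API calls
  max_garbage: 5       # pages parsed between forced GC runs
batch.exec_parse true, false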
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -6,7 +6,7 @@ module Datahen
     # Max allowed page size when query outputs (see #find_outputs).
     MAX_FIND_OUTPUTS_PER_PAGE = 500

-    attr_accessor :filename, :gid, :job_id
+    attr_accessor :filename, :page, :gid, :job_id

     include Datahen::Plugin::ContextExposer

@@ -15,6 +15,9 @@ module Datahen
     end

     def init_page()
+      # skip whenever a page is provided
+      return self.page unless self.page.nil?
+
       if job_id
         puts "getting Job Page"
         init_job_page
@@ -374,6 +377,11 @@ module Datahen
     def eval_with_context file_path, context
       eval(File.read(file_path), context, file_path)
     end
+
+    # Finish the executor execution
+    def finish
+      raise Error::SafeTerminateError
+    end
   end
 end
 end
data/lib/datahen/scraper/parser.rb
CHANGED
@@ -18,6 +18,22 @@ module Datahen
       end
     end

+    def self.exec_parser_by_page(filename, page, job_id=nil, save=false, vars = {}, keep_outputs=false)
+      extname = File.extname(filename)
+      case extname
+      when '.rb'
+        executor = RubyParserExecutor.new(
+          filename: filename,
+          page: page,
+          job_id: job_id,
+          vars: vars,
+          keep_outputs: keep_outputs
+        )
+        executor.exec_parser(save)
+      else
+        puts "Unable to find a parser executor for file type \"#{extname}\""
+      end
+    end

   end
 end
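Note: a hedged sketch of the new entry point, mirroring how BatchParser calls it per dequeued page; the page hash carries its own gid and vars, so the vars argument is nil, and all values are placeholders.

require 'datahen'

page = {
  'gid' => 'example.com-abc123', # placeholder gid
  'page_type' => 'products',
  'vars' => {}
}
puts Datahen::Scraper::Parser.exec_parser_by_page(
  'parsers/products.rb', # parser script path from the scraper config
  page,
  123,  # job_id
  true, # save outputs through the API
  nil,  # per-page vars come from the page itself
  false # delete existing outputs first
)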
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
           job_id: job_id
         })
         eval_with_context filename, context
+      rescue Error::SafeTerminateError => e
+        # do nothing, this is fine
       rescue SyntaxError => e
         handle_error(e) if save
         raise e
@@ -55,7 +57,7 @@ module Datahen
         handle_error(e) if save
         raise e
       end
-
+
       update_finisher_done_status
     end
     proc.call
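Note: together with the finish helper added to executor.rb above, this rescue lets a script stop early without reporting an error. A hedged sketch of a finisher script using it, assuming save_outputs is available in the finisher context and with a hypothetical guard variable:

# job_outputs_missing is a hypothetical flag computed earlier in the script
if job_outputs_missing
  save_outputs [{ '_collection' => 'summary', 'status' => 'incomplete' }]
  finish # raises Error::SafeTerminateError, swallowed by the rescue above
end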
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -12,7 +12,8 @@ module Datahen

     def initialize(options={})
       @filename = options.fetch(:filename) { raise "Filename is required"}
-      @
+      @page = options.fetch(:page) { nil }
+      @gid = (self.page || {})['gid'] || options.fetch(:gid) { raise "GID or a page with a GID is required"}
       @job_id = options.fetch(:job_id)
       @page_vars = options.fetch(:vars) { {} }
       @keep_outputs = !!(options.fetch(:keep_outputs) { false })
@@ -46,6 +47,8 @@ module Datahen
     end

     def init_page_vars(page)
+      return self.page unless self.page.nil?
+
       if !@page_vars.nil? && !@page_vars.empty?
         page['vars'] = @page_vars
       end
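Note: with the page-aware initializer above, no separate gid is needed when the page hash carries one. A sketch with placeholder values:

require 'datahen'

executor = Datahen::Scraper::RubyParserExecutor.new(
  filename: 'parsers/products.rb',
  page: { 'gid' => 'example.com-abc123', 'page_type' => 'products' }, # supplies the gid
  job_id: 123
)
executor.exec_parser(true) # parse and save, as exec_parser_by_page does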
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 0.14.22
+  version: 0.15.10
 platform: ruby
 authors:
 - Parama Danoesubroto
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2021-05-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -45,9 +45,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
-      - !ruby/object:Gem::Version
-        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
-    - - "<"
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.1'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.20'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.10'
+        version: '1.20'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -238,6 +260,7 @@ files:
 - lib/datahen/plugin.rb
 - lib/datahen/plugin/context_exposer.rb
 - lib/datahen/scraper.rb
+- lib/datahen/scraper/batch_parser.rb
 - lib/datahen/scraper/executor.rb
 - lib/datahen/scraper/finisher.rb
 - lib/datahen/scraper/parser.rb
@@ -261,7 +284,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.
+      version: 2.4.4
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="