datahen 0.14.19 → 0.14.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/datahen.gemspec +1 -1
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -11
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 566f3019f38a812f11eb5475e787a2ee0f4475dc0f8ee342accb78897974308f
|
|
4
|
+
data.tar.gz: d050afb771fd6f6d95054e587540c0dc5da0769a7369b55f407c3f727633fff2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5bd6b82b1982ea466a3b12f2d49daddd865d8aa45b30a6e5d5c8db6d921c7a9eef2af67c8fe9c79c7460d1df348c88572e8e8cfb7df9ba627e6cda3f5f37af5
|
|
7
|
+
data.tar.gz: e19ee95ad123ed2c345a1f90fe01421f7be9819f58a67bdf5b498eb93e512793b0b3e8d32c4d5ec9acd1751e7108f9e3f189d9eb7a0beefbb5ce6ba798296f66
|
data/datahen.gemspec
CHANGED
|
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
|
|
|
36
36
|
spec.required_ruby_version = '>= 2.2.2'
|
|
37
37
|
spec.add_dependency "thor", "~> 0.20.3"
|
|
38
38
|
spec.add_dependency 'httparty', '~> 0.16.2'
|
|
39
|
-
spec.add_dependency 'nokogiri', '~> 1.6'
|
|
39
|
+
spec.add_dependency 'nokogiri', '~> 1.6'
|
|
40
40
|
spec.add_development_dependency 'bundler', '>= 1.16'
|
|
41
41
|
spec.add_development_dependency 'rake', '>= 10.0'
|
|
42
42
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/lib/datahen/cli/parser.rb
CHANGED
|
@@ -14,20 +14,19 @@ module Datahen
|
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
|
15
15
|
begin
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
17
|
+
if options[:job]
|
|
18
|
+
job_id = options[:job]
|
|
19
|
+
elsif options[:global]
|
|
20
|
+
job_id = nil
|
|
21
|
+
else
|
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
|
23
|
+
job_id = job['id']
|
|
24
|
+
end
|
|
26
25
|
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
|
29
28
|
|
|
30
|
-
|
|
29
|
+
rescue JSON::ParserError
|
|
31
30
|
if options[:vars]
|
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
|
33
32
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
|
@@ -93,6 +93,7 @@ module Datahen
|
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
|
96
97
|
def start(scraper_name)
|
|
97
98
|
client = Client::ScraperJob.new(options)
|
|
98
99
|
puts "Starting a scrape job..."
|
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
|
@@ -4,8 +4,8 @@ require 'httparty'
|
|
|
4
4
|
module Datahen
|
|
5
5
|
module Client
|
|
6
6
|
class BackblazeContent
|
|
7
|
-
include HTTParty
|
|
8
|
-
|
|
7
|
+
include HTTParty
|
|
8
|
+
|
|
9
9
|
def get_content(url)
|
|
10
10
|
self.class.get(url, format: :plain)
|
|
11
11
|
end
|
|
@@ -19,19 +19,23 @@ module Datahen
|
|
|
19
19
|
sio = StringIO.new(string)
|
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
|
21
21
|
_content = ""
|
|
22
|
-
begin
|
|
22
|
+
begin
|
|
23
23
|
_content = gz.read
|
|
24
24
|
rescue => e
|
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
|
26
26
|
if e.to_s == 'unexpected end of file'
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
|
29
|
+
begin
|
|
30
|
+
gz.each_line{|line| _content << line}
|
|
31
31
|
rescue => e
|
|
32
|
-
|
|
32
|
+
begin
|
|
33
|
+
_content << gz.readchar while !gz.eof
|
|
34
|
+
rescue => e
|
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
|
36
|
+
end
|
|
33
37
|
end
|
|
34
|
-
else
|
|
38
|
+
else
|
|
35
39
|
raise e
|
|
36
40
|
end
|
|
37
41
|
end
|
|
data/lib/datahen/client/scraper_job.rb
CHANGED
|
@@ -11,6 +11,13 @@ module Datahen
|
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
|
14
|
+
if opts[:vars]
|
|
15
|
+
if opts[:vars].is_a?(Array)
|
|
16
|
+
body[:vars] = opts[:vars]
|
|
17
|
+
elsif opts[:vars].is_a?(String)
|
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
|
19
|
+
end
|
|
20
|
+
end
|
|
14
21
|
params = @options.merge({body: body.to_json})
|
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
|
16
23
|
end
|
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
|
@@ -40,6 +40,8 @@ module Datahen
|
|
|
40
40
|
job_id: job_id
|
|
41
41
|
})
|
|
42
42
|
eval_with_context filename, context
|
|
43
|
+
rescue Error::SafeTerminateError => e
|
|
44
|
+
# do nothing, this is fine
|
|
43
45
|
rescue SyntaxError => e
|
|
44
46
|
handle_error(e) if save
|
|
45
47
|
raise e
|
|
@@ -55,7 +57,7 @@ module Datahen
|
|
|
55
57
|
handle_error(e) if save
|
|
56
58
|
raise e
|
|
57
59
|
end
|
|
58
|
-
|
|
60
|
+
|
|
59
61
|
update_finisher_done_status
|
|
60
62
|
end
|
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: datahen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.14.
|
|
4
|
+
version: 0.14.25
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Parama Danoesubroto
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -45,9 +45,6 @@ dependencies:
|
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
47
|
version: '1.6'
|
|
48
|
-
- - "<"
|
|
49
|
-
- !ruby/object:Gem::Version
|
|
50
|
-
version: '1.10'
|
|
51
48
|
type: :runtime
|
|
52
49
|
prerelease: false
|
|
53
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -55,9 +52,6 @@ dependencies:
|
|
|
55
52
|
- - "~>"
|
|
56
53
|
- !ruby/object:Gem::Version
|
|
57
54
|
version: '1.6'
|
|
58
|
-
- - "<"
|
|
59
|
-
- !ruby/object:Gem::Version
|
|
60
|
-
version: '1.10'
|
|
61
55
|
- !ruby/object:Gem::Dependency
|
|
62
56
|
name: bundler
|
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -253,7 +247,7 @@ metadata:
|
|
|
253
247
|
allowed_push_host: https://rubygems.org
|
|
254
248
|
homepage_uri: https://datahen.com
|
|
255
249
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
|
256
|
-
post_install_message:
|
|
250
|
+
post_install_message:
|
|
257
251
|
rdoc_options: []
|
|
258
252
|
require_paths:
|
|
259
253
|
- lib
|
|
@@ -269,7 +263,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
269
263
|
version: '0'
|
|
270
264
|
requirements: []
|
|
271
265
|
rubygems_version: 3.0.3
|
|
272
|
-
signing_key:
|
|
266
|
+
signing_key:
|
|
273
267
|
specification_version: 4
|
|
274
268
|
summary: DataHen toolbelt for developers
|
|
275
269
|
test_files: []
|