datahen 0.14.19 → 0.14.25
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/datahen.gemspec +1 -1
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 566f3019f38a812f11eb5475e787a2ee0f4475dc0f8ee342accb78897974308f
|
4
|
+
data.tar.gz: d050afb771fd6f6d95054e587540c0dc5da0769a7369b55f407c3f727633fff2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5bd6b82b1982ea466a3b12f2d49daddd865d8aa45b30a6e5d5c8db6d921c7a9eef2af67c8fe9c79c7460d1df348c88572e8e8cfb7df9ba627e6cda3f5f37af5
|
7
|
+
data.tar.gz: e19ee95ad123ed2c345a1f90fe01421f7be9819f58a67bdf5b498eb93e512793b0b3e8d32c4d5ec9acd1751e7108f9e3f189d9eb7a0beefbb5ce6ba798296f66
|
data/datahen.gemspec
CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.required_ruby_version = '>= 2.2.2'
|
37
37
|
spec.add_dependency "thor", "~> 0.20.3"
|
38
38
|
spec.add_dependency 'httparty', '~> 0.16.2'
|
39
|
-
spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
|
39
|
+
spec.add_dependency 'nokogiri', '~> 1.6'
|
40
40
|
spec.add_development_dependency 'bundler', '>= 1.16'
|
41
41
|
spec.add_development_dependency 'rake', '>= 10.0'
|
42
42
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
15
15
|
begin
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
if options[:job]
|
18
|
+
job_id = options[:job]
|
19
|
+
elsif options[:global]
|
20
|
+
job_id = nil
|
21
|
+
else
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
23
|
+
job_id = job['id']
|
24
|
+
end
|
26
25
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
29
28
|
|
30
|
-
|
29
|
+
rescue JSON::ParserError
|
31
30
|
if options[:vars]
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
33
32
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,6 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
96
97
|
def start(scraper_name)
|
97
98
|
client = Client::ScraperJob.new(options)
|
98
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,8 +4,8 @@ require 'httparty'
|
|
4
4
|
module Datahen
|
5
5
|
module Client
|
6
6
|
class BackblazeContent
|
7
|
-
include HTTParty
|
8
|
-
|
7
|
+
include HTTParty
|
8
|
+
|
9
9
|
def get_content(url)
|
10
10
|
self.class.get(url, format: :plain)
|
11
11
|
end
|
@@ -19,19 +19,23 @@ module Datahen
|
|
19
19
|
sio = StringIO.new(string)
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
21
21
|
_content = ""
|
22
|
-
begin
|
22
|
+
begin
|
23
23
|
_content = gz.read
|
24
24
|
rescue => e
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
26
26
|
if e.to_s == 'unexpected end of file'
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
29
|
+
begin
|
30
|
+
gz.each_line{|line| _content << line}
|
31
31
|
rescue => e
|
32
|
-
|
32
|
+
begin
|
33
|
+
_content << gz.readchar while !gz.eof
|
34
|
+
rescue => e
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
36
|
+
end
|
33
37
|
end
|
34
|
-
else
|
38
|
+
else
|
35
39
|
raise e
|
36
40
|
end
|
37
41
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,13 @@ module Datahen
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
|
+
if opts[:vars]
|
15
|
+
if opts[:vars].is_a?(Array)
|
16
|
+
body[:vars] = opts[:vars]
|
17
|
+
elsif opts[:vars].is_a?(String)
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
19
|
+
end
|
20
|
+
end
|
14
21
|
params = @options.merge({body: body.to_json})
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
23
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
|
|
40
40
|
job_id: job_id
|
41
41
|
})
|
42
42
|
eval_with_context filename, context
|
43
|
+
rescue Error::SafeTerminateError => e
|
44
|
+
# do nothing, this is fine
|
43
45
|
rescue SyntaxError => e
|
44
46
|
handle_error(e) if save
|
45
47
|
raise e
|
@@ -55,7 +57,7 @@ module Datahen
|
|
55
57
|
handle_error(e) if save
|
56
58
|
raise e
|
57
59
|
end
|
58
|
-
|
60
|
+
|
59
61
|
update_finisher_done_status
|
60
62
|
end
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.19
|
4
|
+
version: 0.14.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -45,9 +45,6 @@ dependencies:
|
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.6'
|
48
|
-
- - "<"
|
49
|
-
- !ruby/object:Gem::Version
|
50
|
-
version: '1.10'
|
51
48
|
type: :runtime
|
52
49
|
prerelease: false
|
53
50
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -55,9 +52,6 @@ dependencies:
|
|
55
52
|
- - "~>"
|
56
53
|
- !ruby/object:Gem::Version
|
57
54
|
version: '1.6'
|
58
|
-
- - "<"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '1.10'
|
61
55
|
- !ruby/object:Gem::Dependency
|
62
56
|
name: bundler
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -253,7 +247,7 @@ metadata:
|
|
253
247
|
allowed_push_host: https://rubygems.org
|
254
248
|
homepage_uri: https://datahen.com
|
255
249
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
256
|
-
post_install_message:
|
250
|
+
post_install_message:
|
257
251
|
rdoc_options: []
|
258
252
|
require_paths:
|
259
253
|
- lib
|
@@ -269,7 +263,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
269
263
|
version: '0'
|
270
264
|
requirements: []
|
271
265
|
rubygems_version: 3.0.3
|
272
|
-
signing_key:
|
266
|
+
signing_key:
|
273
267
|
specification_version: 4
|
274
268
|
summary: DataHen toolbelt for developers
|
275
269
|
test_files: []
|