datahen 0.14.19 → 0.14.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
|
4
|
+
data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
|
7
|
+
data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
15
15
|
begin
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
if options[:job]
|
18
|
+
job_id = options[:job]
|
19
|
+
elsif options[:global]
|
20
|
+
job_id = nil
|
21
|
+
else
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
23
|
+
job_id = job['id']
|
24
|
+
end
|
26
25
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
29
28
|
|
30
|
-
|
29
|
+
rescue JSON::ParserError
|
31
30
|
if options[:vars]
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
33
32
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,6 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
96
97
|
def start(scraper_name)
|
97
98
|
client = Client::ScraperJob.new(options)
|
98
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,8 +4,8 @@ require 'httparty'
|
|
4
4
|
module Datahen
|
5
5
|
module Client
|
6
6
|
class BackblazeContent
|
7
|
-
include HTTParty
|
8
|
-
|
7
|
+
include HTTParty
|
8
|
+
|
9
9
|
def get_content(url)
|
10
10
|
self.class.get(url, format: :plain)
|
11
11
|
end
|
@@ -19,19 +19,23 @@ module Datahen
|
|
19
19
|
sio = StringIO.new(string)
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
21
21
|
_content = ""
|
22
|
-
begin
|
22
|
+
begin
|
23
23
|
_content = gz.read
|
24
24
|
rescue => e
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
26
26
|
if e.to_s == 'unexpected end of file'
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
29
|
+
begin
|
30
|
+
gz.each_line{|line| _content << line}
|
31
31
|
rescue => e
|
32
|
-
|
32
|
+
begin
|
33
|
+
_content << gz.readchar while !gz.eof
|
34
|
+
rescue => e
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
36
|
+
end
|
33
37
|
end
|
34
|
-
else
|
38
|
+
else
|
35
39
|
raise e
|
36
40
|
end
|
37
41
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -11,6 +11,13 @@ module Datahen
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
14
|
+
if opts[:vars]
|
15
|
+
if opts[:vars].is_a?(Array)
|
16
|
+
body[:vars] = opts[:vars]
|
17
|
+
elsif opts[:vars].is_a?(String)
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
19
|
+
end
|
20
|
+
end
|
14
21
|
params = @options.merge({body: body.to_json})
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
23
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
|
|
40
40
|
job_id: job_id
|
41
41
|
})
|
42
42
|
eval_with_context filename, context
|
43
|
+
rescue Error::SafeTerminateError => e
|
44
|
+
# do nothing, this is fine
|
43
45
|
rescue SyntaxError => e
|
44
46
|
handle_error(e) if save
|
45
47
|
raise e
|
@@ -55,7 +57,7 @@ module Datahen
|
|
55
57
|
handle_error(e) if save
|
56
58
|
raise e
|
57
59
|
end
|
58
|
-
|
60
|
+
|
59
61
|
update_finisher_done_status
|
60
62
|
end
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.19
|
4
|
+
version: 0.14.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -253,7 +253,7 @@ metadata:
|
|
253
253
|
allowed_push_host: https://rubygems.org
|
254
254
|
homepage_uri: https://datahen.com
|
255
255
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
256
|
-
post_install_message:
|
256
|
+
post_install_message:
|
257
257
|
rdoc_options: []
|
258
258
|
require_paths:
|
259
259
|
- lib
|
@@ -269,7 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
269
269
|
version: '0'
|
270
270
|
requirements: []
|
271
271
|
rubygems_version: 3.0.3
|
272
|
-
signing_key:
|
272
|
+
signing_key:
|
273
273
|
specification_version: 4
|
274
274
|
summary: DataHen toolbelt for developers
|
275
275
|
test_files: []
|