datahen 0.14.19 → 0.14.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/cli/scraper.rb +1 -0
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/client/scraper_job.rb +7 -0
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
|
|
4
|
+
data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
|
|
7
|
+
data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
|
data/lib/datahen/cli/parser.rb
CHANGED
|
@@ -14,20 +14,19 @@ module Datahen
|
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
|
15
15
|
begin
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
17
|
+
if options[:job]
|
|
18
|
+
job_id = options[:job]
|
|
19
|
+
elsif options[:global]
|
|
20
|
+
job_id = nil
|
|
21
|
+
else
|
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
|
23
|
+
job_id = job['id']
|
|
24
|
+
end
|
|
26
25
|
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
|
29
28
|
|
|
30
|
-
|
|
29
|
+
rescue JSON::ParserError
|
|
31
30
|
if options[:vars]
|
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
|
33
32
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
|
@@ -93,6 +93,7 @@ module Datahen
|
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
|
96
97
|
def start(scraper_name)
|
|
97
98
|
client = Client::ScraperJob.new(options)
|
|
98
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
|
@@ -4,8 +4,8 @@ require 'httparty'
|
|
|
4
4
|
module Datahen
|
|
5
5
|
module Client
|
|
6
6
|
class BackblazeContent
|
|
7
|
-
include HTTParty
|
|
8
|
-
|
|
7
|
+
include HTTParty
|
|
8
|
+
|
|
9
9
|
def get_content(url)
|
|
10
10
|
self.class.get(url, format: :plain)
|
|
11
11
|
end
|
|
@@ -19,19 +19,23 @@ module Datahen
|
|
|
19
19
|
sio = StringIO.new(string)
|
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
|
21
21
|
_content = ""
|
|
22
|
-
begin
|
|
22
|
+
begin
|
|
23
23
|
_content = gz.read
|
|
24
24
|
rescue => e
|
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
|
26
26
|
if e.to_s == 'unexpected end of file'
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
|
29
|
+
begin
|
|
30
|
+
gz.each_line{|line| _content << line}
|
|
31
31
|
rescue => e
|
|
32
|
-
|
|
32
|
+
begin
|
|
33
|
+
_content << gz.readchar while !gz.eof
|
|
34
|
+
rescue => e
|
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
|
36
|
+
end
|
|
33
37
|
end
|
|
34
|
-
else
|
|
38
|
+
else
|
|
35
39
|
raise e
|
|
36
40
|
end
|
|
37
41
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
|
@@ -11,6 +11,13 @@ module Datahen
|
|
|
11
11
|
body[:standard_worker_count] = opts[:workers] if opts[:workers]
|
|
12
12
|
body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
|
|
13
13
|
body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
|
|
14
|
+
if opts[:vars]
|
|
15
|
+
if opts[:vars].is_a?(Array)
|
|
16
|
+
body[:vars] = opts[:vars]
|
|
17
|
+
elsif opts[:vars].is_a?(String)
|
|
18
|
+
body[:vars] = JSON.parse(opts[:vars])
|
|
19
|
+
end
|
|
20
|
+
end
|
|
14
21
|
params = @options.merge({body: body.to_json})
|
|
15
22
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
|
16
23
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
|
@@ -40,6 +40,8 @@ module Datahen
|
|
|
40
40
|
job_id: job_id
|
|
41
41
|
})
|
|
42
42
|
eval_with_context filename, context
|
|
43
|
+
rescue Error::SafeTerminateError => e
|
|
44
|
+
# do nothing, this is fine
|
|
43
45
|
rescue SyntaxError => e
|
|
44
46
|
handle_error(e) if save
|
|
45
47
|
raise e
|
|
@@ -55,7 +57,7 @@ module Datahen
|
|
|
55
57
|
handle_error(e) if save
|
|
56
58
|
raise e
|
|
57
59
|
end
|
|
58
|
-
|
|
60
|
+
|
|
59
61
|
update_finisher_done_status
|
|
60
62
|
end
|
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: datahen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.14.19
|
|
4
|
+
version: 0.14.24
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Parama Danoesubroto
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-03-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -253,7 +253,7 @@ metadata:
|
|
|
253
253
|
allowed_push_host: https://rubygems.org
|
|
254
254
|
homepage_uri: https://datahen.com
|
|
255
255
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
|
256
|
-
post_install_message:
|
|
256
|
+
post_install_message:
|
|
257
257
|
rdoc_options: []
|
|
258
258
|
require_paths:
|
|
259
259
|
- lib
|
|
@@ -269,7 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
269
269
|
version: '0'
|
|
270
270
|
requirements: []
|
|
271
271
|
rubygems_version: 3.0.3
|
|
272
|
-
signing_key:
|
|
272
|
+
signing_key:
|
|
273
273
|
specification_version: 4
|
|
274
274
|
summary: DataHen toolbelt for developers
|
|
275
275
|
test_files: []
|