datahen 0.14.20 → 0.14.26
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/datahen.gemspec +1 -1
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/cli/scraper.rb +1 -1
- data/lib/datahen/client/backblaze_content.rb +14 -10
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +2 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dac57d98132102aa9ae8244b6528394473b2bdeb9992c7ea15d6979eaf87d4af
|
4
|
+
data.tar.gz: e68858d2f088b2d7b8538411dd59cf2ae2de7866416fc213c6a6fa009d93c556
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 857126b2f7ec4fa058aaa8d5b4a7095108224bdf3f6ece690dbfc930e0527a294853705227f0e63be5af3524982fff21f7d3c9d940c22b31caade5139a3d607b
|
7
|
+
data.tar.gz: 81ecf95378e6f4aa31a87e39a82bc815216fce1b84aa65d8f7f2aa8ee8b19b871f08eb8c86025d9dc8d84617f20864f5f39c21d7b8ac4900a739599c0aa6283c
|
data/datahen.gemspec
CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.required_ruby_version = '>= 2.2.2'
|
37
37
|
spec.add_dependency "thor", "~> 0.20.3"
|
38
38
|
spec.add_dependency 'httparty', '~> 0.16.2'
|
39
|
-
spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
|
39
|
+
spec.add_dependency 'nokogiri', '~> 1.6'
|
40
40
|
spec.add_development_dependency 'bundler', '>= 1.16'
|
41
41
|
spec.add_development_dependency 'rake', '>= 10.0'
|
42
42
|
spec.add_development_dependency 'minitest', '>= 5.11'
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
15
15
|
begin
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
if options[:job]
|
18
|
+
job_id = options[:job]
|
19
|
+
elsif options[:global]
|
20
|
+
job_id = nil
|
21
|
+
else
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
23
|
+
job_id = job['id']
|
24
|
+
end
|
26
25
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
29
28
|
|
30
|
-
|
29
|
+
rescue JSON::ParserError
|
31
30
|
if options[:vars]
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
33
32
|
end
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -93,7 +93,7 @@ module Datahen
|
|
93
93
|
option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
|
94
94
|
option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
|
95
95
|
option :proxy_type, desc: 'Set the Proxy type. Default: standard'
|
96
|
-
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: {"name":"foo", "value":"bar", "secret":false} '
|
96
|
+
option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
|
97
97
|
def start(scraper_name)
|
98
98
|
client = Client::ScraperJob.new(options)
|
99
99
|
puts "Starting a scrape job..."
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,10 +4,10 @@ require 'httparty'
|
|
4
4
|
module Datahen
|
5
5
|
module Client
|
6
6
|
class BackblazeContent
|
7
|
-
include HTTParty
|
8
|
-
|
7
|
+
include HTTParty
|
8
|
+
|
9
9
|
def get_content(url)
|
10
|
-
self.class.get(url, format: :plain)
|
10
|
+
self.class.get(url, format: :plain).response.body
|
11
11
|
end
|
12
12
|
|
13
13
|
def get_gunzipped_content(url)
|
@@ -19,19 +19,23 @@ module Datahen
|
|
19
19
|
sio = StringIO.new(string)
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
21
21
|
_content = ""
|
22
|
-
begin
|
22
|
+
begin
|
23
23
|
_content = gz.read
|
24
24
|
rescue => e
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
26
26
|
if e.to_s == 'unexpected end of file'
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
29
|
+
begin
|
30
|
+
gz.each_line{|line| _content << line}
|
31
31
|
rescue => e
|
32
|
-
|
32
|
+
begin
|
33
|
+
_content << gz.readchar while !gz.eof
|
34
|
+
rescue => e
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
36
|
+
end
|
33
37
|
end
|
34
|
-
else
|
38
|
+
else
|
35
39
|
raise e
|
36
40
|
end
|
37
41
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
|
|
40
40
|
job_id: job_id
|
41
41
|
})
|
42
42
|
eval_with_context filename, context
|
43
|
+
rescue Error::SafeTerminateError => e
|
44
|
+
# do nothing, this is fine
|
43
45
|
rescue SyntaxError => e
|
44
46
|
handle_error(e) if save
|
45
47
|
raise e
|
@@ -55,7 +57,7 @@ module Datahen
|
|
55
57
|
handle_error(e) if save
|
56
58
|
raise e
|
57
59
|
end
|
58
|
-
|
60
|
+
|
59
61
|
update_finisher_done_status
|
60
62
|
end
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.20
|
4
|
+
version: 0.14.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -45,9 +45,6 @@ dependencies:
|
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.6'
|
48
|
-
- - "<"
|
49
|
-
- !ruby/object:Gem::Version
|
50
|
-
version: '1.10'
|
51
48
|
type: :runtime
|
52
49
|
prerelease: false
|
53
50
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -55,9 +52,6 @@ dependencies:
|
|
55
52
|
- - "~>"
|
56
53
|
- !ruby/object:Gem::Version
|
57
54
|
version: '1.6'
|
58
|
-
- - "<"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '1.10'
|
61
55
|
- !ruby/object:Gem::Dependency
|
62
56
|
name: bundler
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|