datahen 0.14.22 → 0.14.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
|
4
|
+
data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
|
7
|
+
data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
15
15
|
begin
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
if options[:job]
|
18
|
+
job_id = options[:job]
|
19
|
+
elsif options[:global]
|
20
|
+
job_id = nil
|
21
|
+
else
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
23
|
+
job_id = job['id']
|
24
|
+
end
|
26
25
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
29
28
|
|
30
|
-
|
29
|
+
rescue JSON::ParserError
|
31
30
|
if options[:vars]
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
33
32
|
end
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,8 +4,8 @@ require 'httparty'
|
|
4
4
|
module Datahen
|
5
5
|
module Client
|
6
6
|
class BackblazeContent
|
7
|
-
include HTTParty
|
8
|
-
|
7
|
+
include HTTParty
|
8
|
+
|
9
9
|
def get_content(url)
|
10
10
|
self.class.get(url, format: :plain)
|
11
11
|
end
|
@@ -19,19 +19,23 @@ module Datahen
|
|
19
19
|
sio = StringIO.new(string)
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
21
21
|
_content = ""
|
22
|
-
begin
|
22
|
+
begin
|
23
23
|
_content = gz.read
|
24
24
|
rescue => e
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
26
26
|
if e.to_s == 'unexpected end of file'
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
29
|
+
begin
|
30
|
+
gz.each_line{|line| _content << line}
|
31
31
|
rescue => e
|
32
|
-
|
32
|
+
begin
|
33
|
+
_content << gz.readchar while !gz.eof
|
34
|
+
rescue => e
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
36
|
+
end
|
33
37
|
end
|
34
|
-
else
|
38
|
+
else
|
35
39
|
raise e
|
36
40
|
end
|
37
41
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
|
|
40
40
|
job_id: job_id
|
41
41
|
})
|
42
42
|
eval_with_context filename, context
|
43
|
+
rescue Error::SafeTerminateError => e
|
44
|
+
# do nothing, this is fine
|
43
45
|
rescue SyntaxError => e
|
44
46
|
handle_error(e) if save
|
45
47
|
raise e
|
@@ -55,7 +57,7 @@ module Datahen
|
|
55
57
|
handle_error(e) if save
|
56
58
|
raise e
|
57
59
|
end
|
58
|
-
|
60
|
+
|
59
61
|
update_finisher_done_status
|
60
62
|
end
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.22
|
4
|
+
version: 0.14.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|