datahen 0.14.22 → 0.14.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/parser.rb +9 -10
- data/lib/datahen/client/backblaze_content.rb +13 -9
- data/lib/datahen/scraper/executor.rb +5 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +3 -1
- data/lib/datahen/scraper/ruby_seeder_executor.rb +2 -0
- data/lib/datahen/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
|
4
|
+
data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
|
7
|
+
data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
|
data/lib/datahen/cli/parser.rb
CHANGED
@@ -14,20 +14,19 @@ module Datahen
|
|
14
14
|
def try_parse(scraper_name, parser_file, gid)
|
15
15
|
begin
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
if options[:job]
|
18
|
+
job_id = options[:job]
|
19
|
+
elsif options[:global]
|
20
|
+
job_id = nil
|
21
|
+
else
|
22
|
+
job = Client::ScraperJob.new(options).find(scraper_name)
|
23
|
+
job_id = job['id']
|
24
|
+
end
|
26
25
|
|
27
26
|
vars = JSON.parse(options[:vars]) if options[:vars]
|
28
27
|
puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
|
29
28
|
|
30
|
-
|
29
|
+
rescue JSON::ParserError
|
31
30
|
if options[:vars]
|
32
31
|
puts "Error: #{options[:vars]} on vars is not a valid JSON"
|
33
32
|
end
|
data/lib/datahen/client/backblaze_content.rb
CHANGED
@@ -4,8 +4,8 @@ require 'httparty'
|
|
4
4
|
module Datahen
|
5
5
|
module Client
|
6
6
|
class BackblazeContent
|
7
|
-
include HTTParty
|
8
|
-
|
7
|
+
include HTTParty
|
8
|
+
|
9
9
|
def get_content(url)
|
10
10
|
self.class.get(url, format: :plain)
|
11
11
|
end
|
@@ -19,19 +19,23 @@ module Datahen
|
|
19
19
|
sio = StringIO.new(string)
|
20
20
|
gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
|
21
21
|
_content = ""
|
22
|
-
begin
|
22
|
+
begin
|
23
23
|
_content = gz.read
|
24
24
|
rescue => e
|
25
25
|
# if unexpected eof error, then readchar until error, and ignore it
|
26
26
|
if e.to_s == 'unexpected end of file'
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
# heavily improve content read recovery by using "String#<<",
|
28
|
+
# reading all "good" lines and then concat the remaining chars
|
29
|
+
begin
|
30
|
+
gz.each_line{|line| _content << line}
|
31
31
|
rescue => e
|
32
|
-
|
32
|
+
begin
|
33
|
+
_content << gz.readchar while !gz.eof
|
34
|
+
rescue => e
|
35
|
+
puts "Ignored Zlib error: #{e.to_s}"
|
36
|
+
end
|
33
37
|
end
|
34
|
-
else
|
38
|
+
else
|
35
39
|
raise e
|
36
40
|
end
|
37
41
|
end
|
data/lib/datahen/scraper/ruby_finisher_executor.rb
CHANGED
@@ -40,6 +40,8 @@ module Datahen
|
|
40
40
|
job_id: job_id
|
41
41
|
})
|
42
42
|
eval_with_context filename, context
|
43
|
+
rescue Error::SafeTerminateError => e
|
44
|
+
# do nothing, this is fine
|
43
45
|
rescue SyntaxError => e
|
44
46
|
handle_error(e) if save
|
45
47
|
raise e
|
@@ -55,7 +57,7 @@ module Datahen
|
|
55
57
|
handle_error(e) if save
|
56
58
|
raise e
|
57
59
|
end
|
58
|
-
|
60
|
+
|
59
61
|
update_finisher_done_status
|
60
62
|
end
|
61
63
|
proc.call
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.22
|
4
|
+
version: 0.14.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|