datahen 0.14.20 → 0.14.26

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4a2f75a84728b7e8c228c578e4a4af6253dcd19db445f509ba44f9866847936c
4
- data.tar.gz: c37744322cc3e3035a31dd69bca340397ac906d27a3497604e3d98321187d972
3
+ metadata.gz: dac57d98132102aa9ae8244b6528394473b2bdeb9992c7ea15d6979eaf87d4af
4
+ data.tar.gz: e68858d2f088b2d7b8538411dd59cf2ae2de7866416fc213c6a6fa009d93c556
5
5
  SHA512:
6
- metadata.gz: fcf774827b35bf23048b47241da29b1a17e2b1d691a360d2afcb5b34c206c26b7ae316c457e26b946ae3144b4d9075dda5db90a2fd0fb7e56b9f53b0cd612d13
7
- data.tar.gz: 56fe24f850c0b695b87c61629ab6939f856ecf8f0ea6b6557d59646214b6ffd87557a6ade6bf59e03b993626663cc1af3071ca10858802272d75d47cf34de56a
6
+ metadata.gz: 857126b2f7ec4fa058aaa8d5b4a7095108224bdf3f6ece690dbfc930e0527a294853705227f0e63be5af3524982fff21f7d3c9d940c22b31caade5139a3d607b
7
+ data.tar.gz: 81ecf95378e6f4aa31a87e39a82bc815216fce1b84aa65d8f7f2aa8ee8b19b871f08eb8c86025d9dc8d84617f20864f5f39c21d7b8ac4900a739599c0aa6283c
data/datahen.gemspec CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
36
36
  spec.required_ruby_version = '>= 2.2.2'
37
37
  spec.add_dependency "thor", "~> 0.20.3"
38
38
  spec.add_dependency 'httparty', '~> 0.16.2'
39
- spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
39
+ spec.add_dependency 'nokogiri', '~> 1.6'
40
40
  spec.add_development_dependency 'bundler', '>= 1.16'
41
41
  spec.add_development_dependency 'rake', '>= 10.0'
42
42
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -14,20 +14,19 @@ module Datahen
14
14
  def try_parse(scraper_name, parser_file, gid)
15
15
  begin
16
16
 
17
- if options[:job]
18
- job_id = options[:job]
19
- elsif options[:global]
20
- job_id = nil
21
- else
22
- job = Client::ScraperJob.new(options).find(scraper_name)
23
- job_id = job['id']
24
- end
25
-
17
+ if options[:job]
18
+ job_id = options[:job]
19
+ elsif options[:global]
20
+ job_id = nil
21
+ else
22
+ job = Client::ScraperJob.new(options).find(scraper_name)
23
+ job_id = job['id']
24
+ end
26
25
 
27
26
  vars = JSON.parse(options[:vars]) if options[:vars]
28
27
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
29
28
 
30
- rescue JSON::ParserError
29
+ rescue JSON::ParserError
31
30
  if options[:vars]
32
31
  puts "Error: #{options[:vars]} on vars is not a valid JSON"
33
32
  end
@@ -93,7 +93,7 @@ module Datahen
93
93
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
94
94
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
95
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
- option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: {"name":"foo", "value":"bar", "secret":false} '
96
+ option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
97
97
  def start(scraper_name)
98
98
  client = Client::ScraperJob.new(options)
99
99
  puts "Starting a scrape job..."
@@ -4,10 +4,10 @@ require 'httparty'
4
4
  module Datahen
5
5
  module Client
6
6
  class BackblazeContent
7
- include HTTParty
8
-
7
+ include HTTParty
8
+
9
9
  def get_content(url)
10
- self.class.get(url, format: :plain)
10
+ self.class.get(url, format: :plain).response.body
11
11
  end
12
12
 
13
13
  def get_gunzipped_content(url)
@@ -19,19 +19,23 @@ module Datahen
19
19
  sio = StringIO.new(string)
20
20
  gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
21
21
  _content = ""
22
- begin
22
+ begin
23
23
  _content = gz.read
24
24
  rescue => e
25
25
  # on an unexpected EOF error, keep reading (readchar) until the error recurs, then ignore it
26
26
  if e.to_s == 'unexpected end of file'
27
- begin
28
- while !gz.eof?
29
- _content += gz.readchar
30
- end
27
+ # greatly improve content-read recovery by appending with "String#<<":
28
+ # read every "good" line first, then append the remaining chars one by one
29
+ begin
30
+ gz.each_line{|line| _content << line}
31
31
  rescue => e
32
- puts "Ignored Zlib error: #{e.to_s}"
32
+ begin
33
+ _content << gz.readchar while !gz.eof
34
+ rescue => e
35
+ puts "Ignored Zlib error: #{e.to_s}"
36
+ end
33
37
  end
34
- else
38
+ else
35
39
  raise e
36
40
  end
37
41
  end
@@ -374,6 +374,11 @@ module Datahen
374
374
  def eval_with_context file_path, context
375
375
  eval(File.read(file_path), context, file_path)
376
376
  end
377
+
378
+ # Finish the executor's execution early by raising SafeTerminateError
379
+ def finish
380
+ raise Error::SafeTerminateError
381
+ end
377
382
  end
378
383
  end
379
384
  end
@@ -40,6 +40,8 @@ module Datahen
40
40
  job_id: job_id
41
41
  })
42
42
  eval_with_context filename, context
43
+ rescue Error::SafeTerminateError => e
44
+ # do nothing: SafeTerminateError is a deliberate early exit (e.g. raised by `finish`), not a failure
43
45
  rescue SyntaxError => e
44
46
  handle_error(e) if save
45
47
  raise e
@@ -55,7 +57,7 @@ module Datahen
55
57
  handle_error(e) if save
56
58
  raise e
57
59
  end
58
-
60
+
59
61
  update_finisher_done_status
60
62
  end
61
63
  proc.call
@@ -44,6 +44,8 @@ module Datahen
44
44
  pages: pages
45
45
  })
46
46
  eval_with_context filename, context
47
+ rescue Error::SafeTerminateError => e
48
+ # do nothing: SafeTerminateError is a deliberate early exit (e.g. raised by `finish`), not a failure
47
49
  rescue SyntaxError => e
48
50
  handle_error(e) if save
49
51
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.20"
2
+ VERSION = "0.14.26"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.20
4
+ version: 0.14.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-12-14 00:00:00.000000000 Z
11
+ date: 2021-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -45,9 +45,6 @@ dependencies:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.6'
48
- - - "<"
49
- - !ruby/object:Gem::Version
50
- version: '1.10'
51
48
  type: :runtime
52
49
  prerelease: false
53
50
  version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,6 @@ dependencies:
55
52
  - - "~>"
56
53
  - !ruby/object:Gem::Version
57
54
  version: '1.6'
58
- - - "<"
59
- - !ruby/object:Gem::Version
60
- version: '1.10'
61
55
  - !ruby/object:Gem::Dependency
62
56
  name: bundler
63
57
  requirement: !ruby/object:Gem::Requirement