datahen 0.14.19 → 0.14.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b483263384d6a00e51fd345499ad020d69c9d07c1a77e3521e6b55a4528195de
4
- data.tar.gz: 167c0a417c402198f13151daf29bddb0f8429cac3876c8a84d6b7ba396b1ac90
3
+ metadata.gz: 566f3019f38a812f11eb5475e787a2ee0f4475dc0f8ee342accb78897974308f
4
+ data.tar.gz: d050afb771fd6f6d95054e587540c0dc5da0769a7369b55f407c3f727633fff2
5
5
  SHA512:
6
- metadata.gz: 344168b7a8a4cb746347aba1640d7b4e75a901cd5a1e2ab7acd00f6e8eee3d8c297795812259f84a21757b444e653331e44dd9e1efbec372472b6aeb98c56619
7
- data.tar.gz: 1ef00c32694830740ea477fb4291de9234957ee4b7f7ad02642aa9d5cb91d1b3b1cd6a56542db4cbf84d92d42039546f2e9bfb8a40cf22d103256e8d53b0ac28
6
+ metadata.gz: a5bd6b82b1982ea466a3b12f2d49daddd865d8aa45b30a6e5d5c8db6d921c7a9eef2af67c8fe9c79c7460d1df348c88572e8e8cfb7df9ba627e6cda3f5f37af5
7
+ data.tar.gz: e19ee95ad123ed2c345a1f90fe01421f7be9819f58a67bdf5b498eb93e512793b0b3e8d32c4d5ec9acd1751e7108f9e3f189d9eb7a0beefbb5ce6ba798296f66
data/datahen.gemspec CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
36
36
  spec.required_ruby_version = '>= 2.2.2'
37
37
  spec.add_dependency "thor", "~> 0.20.3"
38
38
  spec.add_dependency 'httparty', '~> 0.16.2'
39
- spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
39
+ spec.add_dependency 'nokogiri', '~> 1.6'
40
40
  spec.add_development_dependency 'bundler', '>= 1.16'
41
41
  spec.add_development_dependency 'rake', '>= 10.0'
42
42
  spec.add_development_dependency 'minitest', '>= 5.11'
@@ -14,20 +14,19 @@ module Datahen
14
14
  def try_parse(scraper_name, parser_file, gid)
15
15
  begin
16
16
 
17
- if options[:job]
18
- job_id = options[:job]
19
- elsif options[:global]
20
- job_id = nil
21
- else
22
- job = Client::ScraperJob.new(options).find(scraper_name)
23
- job_id = job['id']
24
- end
25
-
17
+ if options[:job]
18
+ job_id = options[:job]
19
+ elsif options[:global]
20
+ job_id = nil
21
+ else
22
+ job = Client::ScraperJob.new(options).find(scraper_name)
23
+ job_id = job['id']
24
+ end
26
25
 
27
26
  vars = JSON.parse(options[:vars]) if options[:vars]
28
27
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
29
28
 
30
- rescue JSON::ParserError
29
+ rescue JSON::ParserError
31
30
  if options[:vars]
32
31
  puts "Error: #{options[:vars]} on vars is not a valid JSON"
33
32
  end
@@ -93,6 +93,7 @@ module Datahen
93
93
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
94
94
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
95
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
+ option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
96
97
  def start(scraper_name)
97
98
  client = Client::ScraperJob.new(options)
98
99
  puts "Starting a scrape job..."
@@ -4,8 +4,8 @@ require 'httparty'
4
4
  module Datahen
5
5
  module Client
6
6
  class BackblazeContent
7
- include HTTParty
8
-
7
+ include HTTParty
8
+
9
9
  def get_content(url)
10
10
  self.class.get(url, format: :plain)
11
11
  end
@@ -19,19 +19,23 @@ module Datahen
19
19
  sio = StringIO.new(string)
20
20
  gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
21
21
  _content = ""
22
- begin
22
+ begin
23
23
  _content = gz.read
24
24
  rescue => e
25
25
  # if unexpected eof error, then readchar until error, and ignore it
26
26
  if e.to_s == 'unexpected end of file'
27
- begin
28
- while !gz.eof?
29
- _content += gz.readchar
30
- end
27
+ # heavily improve content read recovery by using "String#<<",
28
+ # reading all "good" lines and then concat the remaining chars
29
+ begin
30
+ gz.each_line{|line| _content << line}
31
31
  rescue => e
32
- puts "Ignored Zlib error: #{e.to_s}"
32
+ begin
33
+ _content << gz.readchar while !gz.eof
34
+ rescue => e
35
+ puts "Ignored Zlib error: #{e.to_s}"
36
+ end
33
37
  end
34
- else
38
+ else
35
39
  raise e
36
40
  end
37
41
  end
@@ -11,6 +11,13 @@ module Datahen
11
11
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
+ if opts[:vars]
15
+ if opts[:vars].is_a?(Array)
16
+ body[:vars] = opts[:vars]
17
+ elsif opts[:vars].is_a?(String)
18
+ body[:vars] = JSON.parse(opts[:vars])
19
+ end
20
+ end
14
21
  params = @options.merge({body: body.to_json})
15
22
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
23
  end
@@ -374,6 +374,11 @@ module Datahen
374
374
  def eval_with_context file_path, context
375
375
  eval(File.read(file_path), context, file_path)
376
376
  end
377
+
378
+ # Finish the executor execution
379
+ def finish
380
+ raise Error::SafeTerminateError
381
+ end
377
382
  end
378
383
  end
379
384
  end
@@ -40,6 +40,8 @@ module Datahen
40
40
  job_id: job_id
41
41
  })
42
42
  eval_with_context filename, context
43
+ rescue Error::SafeTerminateError => e
44
+ # do nothing, this is fine
43
45
  rescue SyntaxError => e
44
46
  handle_error(e) if save
45
47
  raise e
@@ -55,7 +57,7 @@ module Datahen
55
57
  handle_error(e) if save
56
58
  raise e
57
59
  end
58
-
60
+
59
61
  update_finisher_done_status
60
62
  end
61
63
  proc.call
@@ -44,6 +44,8 @@ module Datahen
44
44
  pages: pages
45
45
  })
46
46
  eval_with_context filename, context
47
+ rescue Error::SafeTerminateError => e
48
+ # do nothing, this is fine
47
49
  rescue SyntaxError => e
48
50
  handle_error(e) if save
49
51
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.19"
2
+ VERSION = "0.14.25"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.19
4
+ version: 0.14.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-27 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -45,9 +45,6 @@ dependencies:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.6'
48
- - - "<"
49
- - !ruby/object:Gem::Version
50
- version: '1.10'
51
48
  type: :runtime
52
49
  prerelease: false
53
50
  version_requirements: !ruby/object:Gem::Requirement
@@ -55,9 +52,6 @@ dependencies:
55
52
  - - "~>"
56
53
  - !ruby/object:Gem::Version
57
54
  version: '1.6'
58
- - - "<"
59
- - !ruby/object:Gem::Version
60
- version: '1.10'
61
55
  - !ruby/object:Gem::Dependency
62
56
  name: bundler
63
57
  requirement: !ruby/object:Gem::Requirement
@@ -253,7 +247,7 @@ metadata:
253
247
  allowed_push_host: https://rubygems.org
254
248
  homepage_uri: https://datahen.com
255
249
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
256
- post_install_message:
250
+ post_install_message:
257
251
  rdoc_options: []
258
252
  require_paths:
259
253
  - lib
@@ -269,7 +263,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
269
263
  version: '0'
270
264
  requirements: []
271
265
  rubygems_version: 3.0.3
272
- signing_key:
266
+ signing_key:
273
267
  specification_version: 4
274
268
  summary: DataHen toolbelt for developers
275
269
  test_files: []