datahen 0.14.19 → 0.14.24

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b483263384d6a00e51fd345499ad020d69c9d07c1a77e3521e6b55a4528195de
4
- data.tar.gz: 167c0a417c402198f13151daf29bddb0f8429cac3876c8a84d6b7ba396b1ac90
3
+ metadata.gz: 54064cf5656d253f683540fa3704cdcae3991bc07a91ab5339bf9083eb0792f2
4
+ data.tar.gz: 3c82cedd06937454a9af5d91eea9b32f3ad744394763ca9fadbcdc6068eda683
5
5
  SHA512:
6
- metadata.gz: 344168b7a8a4cb746347aba1640d7b4e75a901cd5a1e2ab7acd00f6e8eee3d8c297795812259f84a21757b444e653331e44dd9e1efbec372472b6aeb98c56619
7
- data.tar.gz: 1ef00c32694830740ea477fb4291de9234957ee4b7f7ad02642aa9d5cb91d1b3b1cd6a56542db4cbf84d92d42039546f2e9bfb8a40cf22d103256e8d53b0ac28
6
+ metadata.gz: fb17b046f9dbd15cf7a278f68f550e8b3b84d8c16a030d0ef4df100df28c8e4dce29fe74ffc70e02f79af26a2d1e3f66ccb6890e06a342a70fac09d824169431
7
+ data.tar.gz: 732256fc714635896f444bed50e2d6f49c2a03c4868fd23eb84e52701935bc2bfbef25811055983691c5e0ff39af53328dbeb67faf91f127c1d06af450d7d666
@@ -14,20 +14,19 @@ module Datahen
14
14
  def try_parse(scraper_name, parser_file, gid)
15
15
  begin
16
16
 
17
- if options[:job]
18
- job_id = options[:job]
19
- elsif options[:global]
20
- job_id = nil
21
- else
22
- job = Client::ScraperJob.new(options).find(scraper_name)
23
- job_id = job['id']
24
- end
25
-
17
+ if options[:job]
18
+ job_id = options[:job]
19
+ elsif options[:global]
20
+ job_id = nil
21
+ else
22
+ job = Client::ScraperJob.new(options).find(scraper_name)
23
+ job_id = job['id']
24
+ end
26
25
 
27
26
  vars = JSON.parse(options[:vars]) if options[:vars]
28
27
  puts Datahen::Scraper::Parser.exec_parser_page(parser_file, gid, job_id, false, vars, options[:"keep-outputs"])
29
28
 
30
- rescue JSON::ParserError
29
+ rescue JSON::ParserError
31
30
  if options[:vars]
32
31
  puts "Error: #{options[:vars]} on vars is not a valid JSON"
33
32
  end
@@ -93,6 +93,7 @@ module Datahen
93
93
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
94
94
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
95
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
+ option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
96
97
  def start(scraper_name)
97
98
  client = Client::ScraperJob.new(options)
98
99
  puts "Starting a scrape job..."
@@ -4,8 +4,8 @@ require 'httparty'
4
4
  module Datahen
5
5
  module Client
6
6
  class BackblazeContent
7
- include HTTParty
8
-
7
+ include HTTParty
8
+
9
9
  def get_content(url)
10
10
  self.class.get(url, format: :plain)
11
11
  end
@@ -19,19 +19,23 @@ module Datahen
19
19
  sio = StringIO.new(string)
20
20
  gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
21
21
  _content = ""
22
- begin
22
+ begin
23
23
  _content = gz.read
24
24
  rescue => e
25
25
  # if unexpected eof error, then readchar until error, and ignore it
26
26
  if e.to_s == 'unexpected end of file'
27
- begin
28
- while !gz.eof?
29
- _content += gz.readchar
30
- end
27
+ # heavily improve content read recovery by using "String#<<",
28
+ # reading all "good" lines and then concat the remaining chars
29
+ begin
30
+ gz.each_line{|line| _content << line}
31
31
  rescue => e
32
- puts "Ignored Zlib error: #{e.to_s}"
32
+ begin
33
+ _content << gz.readchar while !gz.eof
34
+ rescue => e
35
+ puts "Ignored Zlib error: #{e.to_s}"
36
+ end
33
37
  end
34
- else
38
+ else
35
39
  raise e
36
40
  end
37
41
  end
@@ -11,6 +11,13 @@ module Datahen
11
11
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
+ if opts[:vars]
15
+ if opts[:vars].is_a?(Array)
16
+ body[:vars] = opts[:vars]
17
+ elsif opts[:vars].is_a?(String)
18
+ body[:vars] = JSON.parse(opts[:vars])
19
+ end
20
+ end
14
21
  params = @options.merge({body: body.to_json})
15
22
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
23
  end
@@ -374,6 +374,11 @@ module Datahen
374
374
  def eval_with_context file_path, context
375
375
  eval(File.read(file_path), context, file_path)
376
376
  end
377
+
378
+ # Finish the executor execution
379
+ def finish
380
+ raise Error::SafeTerminateError
381
+ end
377
382
  end
378
383
  end
379
384
  end
@@ -40,6 +40,8 @@ module Datahen
40
40
  job_id: job_id
41
41
  })
42
42
  eval_with_context filename, context
43
+ rescue Error::SafeTerminateError => e
44
+ # do nothing, this is fine
43
45
  rescue SyntaxError => e
44
46
  handle_error(e) if save
45
47
  raise e
@@ -55,7 +57,7 @@ module Datahen
55
57
  handle_error(e) if save
56
58
  raise e
57
59
  end
58
-
60
+
59
61
  update_finisher_done_status
60
62
  end
61
63
  proc.call
@@ -44,6 +44,8 @@ module Datahen
44
44
  pages: pages
45
45
  })
46
46
  eval_with_context filename, context
47
+ rescue Error::SafeTerminateError => e
48
+ # do nothing, this is fine
47
49
  rescue SyntaxError => e
48
50
  handle_error(e) if save
49
51
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.19"
2
+ VERSION = "0.14.24"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.19
4
+ version: 0.14.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-27 00:00:00.000000000 Z
11
+ date: 2021-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -253,7 +253,7 @@ metadata:
253
253
  allowed_push_host: https://rubygems.org
254
254
  homepage_uri: https://datahen.com
255
255
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
256
- post_install_message:
256
+ post_install_message:
257
257
  rdoc_options: []
258
258
  require_paths:
259
259
  - lib
@@ -269,7 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
269
269
  version: '0'
270
270
  requirements: []
271
271
  rubygems_version: 3.0.3
272
- signing_key:
272
+ signing_key:
273
273
  specification_version: 4
274
274
  summary: DataHen toolbelt for developers
275
275
  test_files: []