datahen 0.14.17 → 0.14.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a3cf1bf50610ab9ba523ca8f9ae1fdb307831cb56018036076f31353b357edfa
4
- data.tar.gz: '0199ea98a2f171675168699adc523932418b9822ff156c71786458c6362b6cdb'
3
+ metadata.gz: 25eb02598ac32462a53995a4b9e72b3bc466b54c2d74be02516f8d04f178a7b8
4
+ data.tar.gz: 5f6fcedfa7f4a477e18fc1a0ee80126b1a646a3ecefdd8258d1982bf7d7fe06f
5
5
  SHA512:
6
- metadata.gz: 8d66226573dbd9bd3ef795ce021eb9ee202b21cce1d0b79211093bb9abda6f1982a721bf8ae1fbc9bf84aae6e50d1878bf9d389ee67d685e4d1c440fd88cceeb
7
- data.tar.gz: 6ea1a7748cf77ae1cb8f750da71df86120f68e671c84e535e8536ee3324ed71026034a042662d081108ff7b72ccb3f0ef3ab3f049bdb45c92fc0e86ba2e10d46
6
+ metadata.gz: 69a71d740f9078a5a4c2a77211587c0099a4064cabda690cd6fb4803c153975e3e91f1c05f98278f2852a0bacf8cb444bba8f29f56c3cfbd0fba12cece39b9cd
7
+ data.tar.gz: df131c11592d2b6192fa74d26fc0e8d823b99f8073b907c82c8e9f04622c7d28aa5e1145419ac0377a99f6efcd3f46ff8fcef88fc436e802d51afc014fd4383a
@@ -93,6 +93,7 @@ module Datahen
93
93
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
94
94
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
95
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
+ option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
96
97
  def start(scraper_name)
97
98
  client = Client::ScraperJob.new(options)
98
99
  puts "Starting a scrape job..."
@@ -99,16 +99,17 @@ module Datahen
99
99
 
100
100
  desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
101
101
  long_desc <<-LONGDESC
102
- Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
102
+ Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
103
103
  LONGDESC
104
104
  option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
105
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
106
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
107
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
108
109
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
109
110
  def refetch(scraper_name)
110
- if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
111
- puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
111
+ if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
112
+ puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
112
113
  return
113
114
  end
114
115
 
@@ -123,15 +124,16 @@ module Datahen
123
124
 
124
125
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
125
126
  long_desc <<-LONGDESC
126
- Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
127
+ Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
127
128
  LONGDESC
128
129
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
129
130
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
130
131
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
132
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
131
133
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
132
134
  def reparse(scraper_name)
133
- if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
134
- puts "Must specify either a --gid, --parse-fail or --status"
135
+ if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
136
+ puts "Must specify either a --gid, --parse-fail, --status or --page-type"
135
137
  return
136
138
  end
137
139
 
@@ -11,6 +11,13 @@ module Datahen
11
11
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
+ if opts[:vars]
15
+ if opts[:vars].is_a?(Array)
16
+ body[:vars] = opts[:vars]
17
+ elsif opts[:vars].is_a?(String)
18
+ body[:vars] = JSON.parse(opts[:vars])
19
+ end
20
+ end
14
21
  params = @options.merge({body: body.to_json})
15
22
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
23
  end
@@ -0,0 +1,6 @@
1
+ require 'datahen/error/safe_terminate_error'
2
+
3
+ module Datahen
4
+ module Error
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Datahen
2
+ module Error
3
+ class SafeTerminateError < Exception
4
+ end
5
+ end
6
+ end
@@ -1,3 +1,4 @@
1
+ require "datahen/error"
1
2
  require "datahen/plugin"
2
3
  require "datahen/scraper/parser"
3
4
  require "datahen/scraper/seeder"
@@ -112,7 +112,7 @@ module Datahen
112
112
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
113
113
  if page_gid == gid
114
114
  self.refetch_self = true
115
- return
115
+ raise Error::SafeTerminateError
116
116
  end
117
117
  refetch_page page_gid
118
118
  end
@@ -130,7 +130,7 @@ module Datahen
130
130
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
131
131
  if page_gid == gid
132
132
  self.reparse_self = true
133
- return
133
+ raise Error::SafeTerminateError
134
134
  end
135
135
  reparse_page page_gid
136
136
  end
@@ -153,6 +153,8 @@ module Datahen
153
153
  page: page
154
154
  })
155
155
  eval_with_context filename, context
156
+ rescue Error::SafeTerminateError => e
157
+ # do nothing, this is fine
156
158
  rescue SyntaxError => e
157
159
  handle_error(e) if save
158
160
  raise e
@@ -163,7 +165,7 @@ module Datahen
163
165
 
164
166
  puts "=========== Parsing Executed ==========="
165
167
  begin
166
- save_pages_and_outputs(pages, outputs, :parsing)
168
+ save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
167
169
  rescue => e
168
170
  handle_error(e) if save
169
171
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.17"
2
+ VERSION = "0.14.22"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.17
4
+ version: 0.14.22
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-20 00:00:00.000000000 Z
11
+ date: 2020-12-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -233,6 +233,8 @@ files:
233
233
  - lib/datahen/client/scraper_job_page.rb
234
234
  - lib/datahen/client/scraper_job_var.rb
235
235
  - lib/datahen/client/scraper_var.rb
236
+ - lib/datahen/error.rb
237
+ - lib/datahen/error/safe_terminate_error.rb
236
238
  - lib/datahen/plugin.rb
237
239
  - lib/datahen/plugin/context_exposer.rb
238
240
  - lib/datahen/scraper.rb