datahen 0.14.12 → 0.14.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 19a6cfd035a3cd5ba9bd10c1cf34e6baa0590cb2f07664e2bdd3ee2f8e032ebf
4
- data.tar.gz: b7cd0aa204baa98013c5853df68b93eb8562c1b4c202bf65b8a3c931ca2135d1
3
+ metadata.gz: b483263384d6a00e51fd345499ad020d69c9d07c1a77e3521e6b55a4528195de
4
+ data.tar.gz: 167c0a417c402198f13151daf29bddb0f8429cac3876c8a84d6b7ba396b1ac90
5
5
  SHA512:
6
- metadata.gz: 8ac98d022003ea6db219c1a07518f127bed088db30ebca3791569bd643acdf8ddc2ad01ddc553f42976a0a6983dfe2928fceadc2f91ed4464b4c2539939df28b
7
- data.tar.gz: 33a65a6ffe29cb8ab0e75283436909f683354365a07e11ee672fac4311367ec55c31d2db6e8bcc02de0569768169360fb2120d4e6149aae8e0151cabea8b6752
6
+ metadata.gz: 344168b7a8a4cb746347aba1640d7b4e75a901cd5a1e2ab7acd00f6e8eee3d8c297795812259f84a21757b444e653331e44dd9e1efbec372472b6aeb98c56619
7
+ data.tar.gz: 1ef00c32694830740ea477fb4291de9234957ee4b7f7ad02642aa9d5cb91d1b3b1cd6a56542db4cbf84d92d42039546f2e9bfb8a40cf22d103256e8d53b0ac28
@@ -15,7 +15,10 @@ module Datahen
15
15
 
16
16
  if result['available'] == true
17
17
  puts "Preview content url: \"#{result['preview_url']}\""
18
- `open "#{result['preview_url']}"`
18
+ begin
19
+ `open "#{result['preview_url']}"`
20
+ rescue
21
+ end
19
22
  else
20
23
  puts "Content does not exist"
21
24
  end
@@ -6,7 +6,6 @@ module Datahen
6
6
  "#{basename} #{@package_name} #{command.usage}"
7
7
  end
8
8
 
9
-
10
9
  desc "list", "gets a list of jobs"
11
10
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
12
11
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
@@ -35,8 +34,7 @@ module Datahen
35
34
  client = Client::JobStat.new(options)
36
35
  puts "#{client.job_current_stats(job_id, options)}"
37
36
  end
38
-
39
-
37
+
40
38
  end
41
39
  end
42
40
 
@@ -31,6 +31,7 @@ module Datahen
31
31
  option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
32
32
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
33
33
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
+ option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
34
35
  def create(scraper_name, git_repository)
35
36
  # puts "options #{options}"
36
37
  client = Client::Scraper.new(options)
@@ -55,6 +56,7 @@ module Datahen
55
56
  option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
56
57
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
57
58
  option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
59
+ option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
58
60
  def update(scraper_name)
59
61
  client = Client::Scraper.new(options)
60
62
  puts "#{client.update(scraper_name, options)}"
@@ -36,7 +36,10 @@ module Datahen
36
36
 
37
37
  if result['signed_url']
38
38
  puts "Download url: \"#{result['signed_url']}\""
39
- `open "#{result['signed_url']}"`
39
+ begin
40
+ `open "#{result['signed_url']}"`
41
+ rescue
42
+ end
40
43
  else
41
44
  puts "Exported file does not exist"
42
45
  end
@@ -48,6 +48,21 @@ module Datahen
48
48
  end
49
49
  end
50
50
 
51
+ desc "delete <scraper_name>", "delete a scraper's current job"
52
+ long_desc <<-LONGDESC
53
+ Delete a scraper's current job
54
+ LONGDESC
55
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
56
+ def delete(scraper_name)
57
+ if options[:job]
58
+ client = Client::Job.new(options)
59
+ puts "#{client.delete(options[:job])}"
60
+ else
61
+ client = Client::ScraperJob.new(options)
62
+ puts "#{client.delete(scraper_name)}"
63
+ end
64
+ end
65
+
51
66
  desc "resume <scraper_name>", "resumes a scraper's current job"
52
67
  long_desc <<-LONGDESC
53
68
  Resumes a scraper's current job
@@ -99,18 +99,20 @@ module Datahen
99
99
 
100
100
  desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
101
101
  long_desc <<-LONGDESC
102
- Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
102
+ Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
103
103
  LONGDESC
104
104
  option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
105
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
106
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
107
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
108
109
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
109
110
  def refetch(scraper_name)
110
- if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
111
- puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
111
+ if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
112
+ puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
112
113
  return
113
114
  end
115
+
114
116
  if options[:job]
115
117
  client = Client::JobPage.new(options)
116
118
  puts "#{client.refetch(options[:job])}"
@@ -122,33 +124,47 @@ module Datahen
122
124
 
123
125
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
124
126
  long_desc <<-LONGDESC
125
- Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
127
+ Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
126
128
  LONGDESC
127
129
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
128
130
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
129
131
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
132
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
130
133
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
131
134
  def reparse(scraper_name)
132
- begin
133
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
135
+ if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
136
+ puts "Must specify either a --gid, --parse-fail, --status or --page-type"
137
+ return
138
+ end
134
139
 
135
- if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
136
- puts "Must specify either a --gid, --parse-fail or --status"
137
- return
138
- end
140
+ if options[:job]
141
+ client = Client::JobPage.new(options)
142
+ puts "#{client.reparse(options[:job])}"
143
+ else
144
+ client = Client::ScraperJobPage.new(options)
145
+ puts "#{client.reparse(scraper_name)}"
146
+ end
147
+ end
139
148
 
140
- if options[:job]
141
- client = Client::JobPage.new(options)
142
- puts "#{client.reparse(options[:job])}"
143
- else
144
- client = Client::ScraperJobPage.new(options)
145
- puts "#{client.reparse(scraper_name)}"
146
- end
149
+ desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
150
+ long_desc <<-LONGDESC
151
+ Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
152
+ LONGDESC
153
+ option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
154
+ option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
155
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
156
+ def limbo(scraper_name)
157
+ if !options.key?(:gid) && !options.key?(:status)
158
+ puts "Must specify either a --gid or --status"
159
+ return
160
+ end
147
161
 
148
- rescue JSON::ParserError
149
- if options[:vars]
150
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
151
- end
162
+ if options[:job]
163
+ client = Client::JobPage.new(options)
164
+ puts "#{client.limbo(options[:job])}"
165
+ else
166
+ client = Client::ScraperJobPage.new(options)
167
+ puts "#{client.limbo(scraper_name)}"
152
168
  end
153
169
  end
154
170
 
@@ -224,7 +240,10 @@ module Datahen
224
240
 
225
241
  if result['available'] == true
226
242
  puts "Preview content url: \"#{result['preview_url']}\""
227
- `open "#{result['preview_url']}"`
243
+ begin
244
+ `open "#{result['preview_url']}"`
245
+ rescue
246
+ end
228
247
  else
229
248
  puts "Content does not exist"
230
249
  end
@@ -244,7 +263,10 @@ module Datahen
244
263
 
245
264
  if result['available'] == true
246
265
  puts "Preview failed content url: \"#{result['preview_url']}\""
247
- `open "#{result['preview_url']}"`
266
+ begin
267
+ `open "#{result['preview_url']}"`
268
+ rescue
269
+ end
248
270
  else
249
271
  puts "Failed Content does not exist"
250
272
  end
@@ -16,7 +16,7 @@ module Datahen
16
16
  role: role,
17
17
  description: description}
18
18
 
19
- params = @options.merge({body: body.to_json})
19
+ params = @options.merge({body: body.to_json}).merge(opts)
20
20
  self.class.post("/auth_tokens", params)
21
21
  end
22
22
 
@@ -71,6 +71,11 @@ module Datahen
71
71
  self.class.get("/jobs/#{job_id}/profile", params)
72
72
  end
73
73
 
74
+ def delete(job_id, opts={})
75
+ params = @options.merge(opts)
76
+ self.class.delete("/jobs/#{job_id}", params)
77
+ end
78
+
74
79
  end
75
80
 
76
81
  end
@@ -72,6 +72,11 @@ module Datahen
72
72
  params = @options.merge(opts)
73
73
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
74
  end
75
+
76
+ def limbo(job_id, opts={})
77
+ params = @options.merge(opts)
78
+ self.class.put("/jobs/#{job_id}/pages/limbo", params)
79
+ end
75
80
  end
76
81
  end
77
82
  end
@@ -27,6 +27,7 @@ module Datahen
27
27
  body[:timezone] = opts[:timezone] if opts[:timezone]
28
28
  body[:profile] = opts[:profile] if opts[:profile]
29
29
  body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
+ body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
30
31
  params = @options.merge({body: body.to_json})
31
32
  self.class.post("/scrapers", params)
32
33
  end
@@ -47,6 +48,7 @@ module Datahen
47
48
  body[:timezone] = opts[:timezone] if opts[:timezone]
48
49
  body[:profile] = opts[:profile] if opts[:profile]
49
50
  body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
51
+ body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
50
52
  params = @options.merge({body: body.to_json})
51
53
 
52
54
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -55,6 +55,11 @@ module Datahen
55
55
 
56
56
  self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
57
57
  end
58
+
59
+ def delete(scraper_name, opts={})
60
+ params = @options.merge(opts)
61
+ self.class.delete("/scrapers/#{scraper_name}/current_job", params)
62
+ end
58
63
  end
59
64
  end
60
65
  end
@@ -26,7 +26,7 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
- # Deprecated, please use Datahen::Client::JobVar#refetch instead.
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
30
  #
31
31
  # @note This method will be removed at some point in the future.
32
32
  def refetch_by_job(job_id, opts={})
@@ -39,6 +39,11 @@ module Datahen
39
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
40
40
  end
41
41
 
42
+ def limbo(scraper_name, opts={})
43
+ params = @options.merge(opts)
44
+ self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
45
+ end
46
+
42
47
  def enqueue(scraper_name, method, url, opts={})
43
48
  body = {}
44
49
  body[:method] = method != "" ? method : "GET"
@@ -0,0 +1,6 @@
1
+ require 'datahen/error/safe_terminate_error'
2
+
3
+ module Datahen
4
+ module Error
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Datahen
2
+ module Error
3
+ class SafeTerminateError < Exception
4
+ end
5
+ end
6
+ end
@@ -1,3 +1,4 @@
1
+ require "datahen/error"
1
2
  require "datahen/plugin"
2
3
  require "datahen/scraper/parser"
3
4
  require "datahen/scraper/seeder"
@@ -112,7 +112,7 @@ module Datahen
112
112
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
113
113
  if page_gid == gid
114
114
  self.refetch_self = true
115
- return
115
+ raise Error::SafeTerminateError
116
116
  end
117
117
  refetch_page page_gid
118
118
  end
@@ -130,7 +130,7 @@ module Datahen
130
130
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
131
131
  if page_gid == gid
132
132
  self.reparse_self = true
133
- return
133
+ raise Error::SafeTerminateError
134
134
  end
135
135
  reparse_page page_gid
136
136
  end
@@ -153,6 +153,8 @@ module Datahen
153
153
  page: page
154
154
  })
155
155
  eval_with_context filename, context
156
+ rescue Error::SafeTerminateError => e
157
+ # do nothing, this is fine
156
158
  rescue SyntaxError => e
157
159
  handle_error(e) if save
158
160
  raise e
@@ -163,7 +165,7 @@ module Datahen
163
165
 
164
166
  puts "=========== Parsing Executed ==========="
165
167
  begin
166
- save_pages_and_outputs(pages, outputs, :parsing)
168
+ save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
167
169
  rescue => e
168
170
  handle_error(e) if save
169
171
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.12"
2
+ VERSION = "0.14.19"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.12
4
+ version: 0.14.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-01 00:00:00.000000000 Z
11
+ date: 2020-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -233,6 +233,8 @@ files:
233
233
  - lib/datahen/client/scraper_job_page.rb
234
234
  - lib/datahen/client/scraper_job_var.rb
235
235
  - lib/datahen/client/scraper_var.rb
236
+ - lib/datahen/error.rb
237
+ - lib/datahen/error/safe_terminate_error.rb
236
238
  - lib/datahen/plugin.rb
237
239
  - lib/datahen/plugin/context_exposer.rb
238
240
  - lib/datahen/scraper.rb
@@ -251,7 +253,7 @@ metadata:
251
253
  allowed_push_host: https://rubygems.org
252
254
  homepage_uri: https://datahen.com
253
255
  source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
254
- post_install_message:
256
+ post_install_message:
255
257
  rdoc_options: []
256
258
  require_paths:
257
259
  - lib
@@ -267,7 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
267
269
  version: '0'
268
270
  requirements: []
269
271
  rubygems_version: 3.0.3
270
- signing_key:
272
+ signing_key:
271
273
  specification_version: 4
272
274
  summary: DataHen toolbelt for developers
273
275
  test_files: []