datahen 0.14.11 → 0.14.18

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 678f01c798cd52a29da298f48e65b1139f13c80a1214d2c1328f4a4c49abbc63
4
- data.tar.gz: 0f787dc429274bb1cd0521237883615e7f8c83316d36fcb319e542cd0fe22d7c
3
+ metadata.gz: 79ae69bd67fcd5158e782a4944cad2fef5f50a9ca896b67e62312778e4d3026c
4
+ data.tar.gz: b41c7c981d40dc1fb84bd6c3b32ee83d7a2f571f99a93d2c0253570bf4723e2e
5
5
  SHA512:
6
- metadata.gz: dcaf8fbd6dcfb04f74b8b65ffdcbc127c3e8f179bf0b6da3f2e55acbd2f8d0e425f09a79287bc3ff1791b3fd20e1503c31af3a3128af6eaaedf43e149f1bc995
7
- data.tar.gz: 577dda3f28fe7303c08aa290f2fa82150e6f1e679184fac739f839f9de6200be11df7111eddd79f7db7fcfa3aa03f925933db85bed382a1c5c0211534b384e67
6
+ metadata.gz: 176c36236e870b58eeb11ad008aeb7115ecf3a7ddcced3bddaef88238f638a1159ff0fd8f037143e670c8ed1713332bd6117ff4c90f363c4fa46f54277dc48fa
7
+ data.tar.gz: dd2b96e8dcd3a7cb136affb56050b4e61c5339d5e3dae527223b20067fe506089d12535e8b06252fea7bd20086bf056a9afcd3cbb8edfb4fb3fd3bbf9bbc65d2
@@ -15,7 +15,10 @@ module Datahen
15
15
 
16
16
  if result['available'] == true
17
17
  puts "Preview content url: \"#{result['preview_url']}\""
18
- `open "#{result['preview_url']}"`
18
+ begin
19
+ `open "#{result['preview_url']}"`
20
+ rescue
21
+ end
19
22
  else
20
23
  puts "Content does not exist"
21
24
  end
@@ -6,7 +6,6 @@ module Datahen
6
6
  "#{basename} #{@package_name} #{command.usage}"
7
7
  end
8
8
 
9
-
10
9
  desc "list", "gets a list of jobs"
11
10
  option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
12
11
  option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
@@ -35,8 +34,7 @@ module Datahen
35
34
  client = Client::JobStat.new(options)
36
35
  puts "#{client.job_current_stats(job_id, options)}"
37
36
  end
38
-
39
-
37
+
40
38
  end
41
39
  end
42
40
 
@@ -30,6 +30,8 @@ module Datahen
30
30
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
31
31
  option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
32
32
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
33
+ option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
34
+ option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
33
35
  def create(scraper_name, git_repository)
34
36
  # puts "options #{options}"
35
37
  client = Client::Scraper.new(options)
@@ -53,6 +55,8 @@ module Datahen
53
55
  option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
54
56
  option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
55
57
  option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
58
+ option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
59
+ option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
56
60
  def update(scraper_name)
57
61
  client = Client::Scraper.new(options)
58
62
  puts "#{client.update(scraper_name, options)}"
@@ -36,7 +36,10 @@ module Datahen
36
36
 
37
37
  if result['signed_url']
38
38
  puts "Download url: \"#{result['signed_url']}\""
39
- `open "#{result['signed_url']}"`
39
+ begin
40
+ `open "#{result['signed_url']}"`
41
+ rescue
42
+ end
40
43
  else
41
44
  puts "Exported file does not exist"
42
45
  end
@@ -48,6 +48,21 @@ module Datahen
48
48
  end
49
49
  end
50
50
 
51
+ desc "delete <scraper_name>", "delete a scraper's current job"
52
+ long_desc <<-LONGDESC
53
+ Delete a scraper's current job
54
+ LONGDESC
55
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
56
+ def delete(scraper_name)
57
+ if options[:job]
58
+ client = Client::Job.new(options)
59
+ puts "#{client.delete(options[:job])}"
60
+ else
61
+ client = Client::ScraperJob.new(options)
62
+ puts "#{client.delete(scraper_name)}"
63
+ end
64
+ end
65
+
51
66
  desc "resume <scraper_name>", "resumes a scraper's current job"
52
67
  long_desc <<-LONGDESC
53
68
  Resumes a scraper's current job
@@ -111,6 +111,7 @@ module Datahen
111
111
  puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
112
112
  return
113
113
  end
114
+
114
115
  if options[:job]
115
116
  client = Client::JobPage.new(options)
116
117
  puts "#{client.refetch(options[:job])}"
@@ -129,26 +130,39 @@ module Datahen
129
130
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
130
131
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
131
132
  def reparse(scraper_name)
132
- begin
133
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
133
+ if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
134
+ puts "Must specify either a --gid, --parse-fail or --status"
135
+ return
136
+ end
134
137
 
135
- if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
136
- puts "Must specify either a --gid, --parse-fail or --status"
137
- return
138
- end
138
+ if options[:job]
139
+ client = Client::JobPage.new(options)
140
+ puts "#{client.reparse(options[:job])}"
141
+ else
142
+ client = Client::ScraperJobPage.new(options)
143
+ puts "#{client.reparse(scraper_name)}"
144
+ end
145
+ end
139
146
 
140
- if options[:job]
141
- client = Client::JobPage.new(options)
142
- puts "#{client.reparse(options[:job])}"
143
- else
144
- client = Client::ScraperJobPage.new(options)
145
- puts "#{client.reparse(scraper_name)}"
146
- end
147
+ desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
148
+ long_desc <<-LONGDESC
149
+ Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
150
+ LONGDESC
151
+ option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
152
+ option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
153
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
154
+ def limbo(scraper_name)
155
+ if !options.key?(:gid) && !options.key?(:status)
156
+ puts "Must specify either a --gid or --status"
157
+ return
158
+ end
147
159
 
148
- rescue JSON::ParserError
149
- if options[:vars]
150
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
151
- end
160
+ if options[:job]
161
+ client = Client::JobPage.new(options)
162
+ puts "#{client.limbo(options[:job])}"
163
+ else
164
+ client = Client::ScraperJobPage.new(options)
165
+ puts "#{client.limbo(scraper_name)}"
152
166
  end
153
167
  end
154
168
 
@@ -224,7 +238,10 @@ module Datahen
224
238
 
225
239
  if result['available'] == true
226
240
  puts "Preview content url: \"#{result['preview_url']}\""
227
- `open "#{result['preview_url']}"`
241
+ begin
242
+ `open "#{result['preview_url']}"`
243
+ rescue
244
+ end
228
245
  else
229
246
  puts "Content does not exist"
230
247
  end
@@ -244,7 +261,10 @@ module Datahen
244
261
 
245
262
  if result['available'] == true
246
263
  puts "Preview failed content url: \"#{result['preview_url']}\""
247
- `open "#{result['preview_url']}"`
264
+ begin
265
+ `open "#{result['preview_url']}"`
266
+ rescue
267
+ end
248
268
  else
249
269
  puts "Failed Content does not exist"
250
270
  end
@@ -16,7 +16,7 @@ module Datahen
16
16
  role: role,
17
17
  description: description}
18
18
 
19
- params = @options.merge({body: body.to_json})
19
+ params = @options.merge({body: body.to_json}).merge(opts)
20
20
  self.class.post("/auth_tokens", params)
21
21
  end
22
22
 
@@ -71,6 +71,11 @@ module Datahen
71
71
  self.class.get("/jobs/#{job_id}/profile", params)
72
72
  end
73
73
 
74
+ def delete(job_id, opts={})
75
+ params = @options.merge(opts)
76
+ self.class.delete("/jobs/#{job_id}", params)
77
+ end
78
+
74
79
  end
75
80
 
76
81
  end
@@ -72,6 +72,11 @@ module Datahen
72
72
  params = @options.merge(opts)
73
73
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
74
  end
75
+
76
+ def limbo(job_id, opts={})
77
+ params = @options.merge(opts)
78
+ self.class.put("/jobs/#{job_id}/pages/limbo", params)
79
+ end
75
80
  end
76
81
  end
77
82
  end
@@ -26,6 +26,8 @@ module Datahen
26
26
  body[:schedule] = opts[:schedule] if opts[:schedule]
27
27
  body[:timezone] = opts[:timezone] if opts[:timezone]
28
28
  body[:profile] = opts[:profile] if opts[:profile]
29
+ body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
30
+ body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
29
31
  params = @options.merge({body: body.to_json})
30
32
  self.class.post("/scrapers", params)
31
33
  end
@@ -45,6 +47,8 @@ module Datahen
45
47
  body[:schedule] = opts[:schedule] if opts[:schedule]
46
48
  body[:timezone] = opts[:timezone] if opts[:timezone]
47
49
  body[:profile] = opts[:profile] if opts[:profile]
50
+ body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
51
+ body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
48
52
  params = @options.merge({body: body.to_json})
49
53
 
50
54
  self.class.put("/scrapers/#{scraper_name}", params)
@@ -55,6 +55,11 @@ module Datahen
55
55
 
56
56
  self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
57
57
  end
58
+
59
+ def delete(scraper_name, opts={})
60
+ params = @options.merge(opts)
61
+ self.class.delete("/scrapers/#{scraper_name}/current_job", params)
62
+ end
58
63
  end
59
64
  end
60
65
  end
@@ -26,7 +26,7 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
- # Deprecated, please use Datahen::Client::JobVar#refetch instead.
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
30
  #
31
31
  # @note This method will be removed at some point in the future.
32
32
  def refetch_by_job(job_id, opts={})
@@ -39,6 +39,11 @@ module Datahen
39
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
40
40
  end
41
41
 
42
+ def limbo(scraper_name, opts={})
43
+ params = @options.merge(opts)
44
+ self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
45
+ end
46
+
42
47
  def enqueue(scraper_name, method, url, opts={})
43
48
  body = {}
44
49
  body[:method] = method != "" ? method : "GET"
@@ -0,0 +1,6 @@
1
+ require 'datahen/error/safe_terminate_error'
2
+
3
+ module Datahen
4
+ module Error
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Datahen
2
+ module Error
3
+ class SafeTerminateError < Exception
4
+ end
5
+ end
6
+ end
@@ -1,3 +1,4 @@
1
+ require "datahen/error"
1
2
  require "datahen/plugin"
2
3
  require "datahen/scraper/parser"
3
4
  require "datahen/scraper/seeder"
@@ -112,7 +112,7 @@ module Datahen
112
112
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
113
113
  if page_gid == gid
114
114
  self.refetch_self = true
115
- return
115
+ raise Error::SafeTerminateError
116
116
  end
117
117
  refetch_page page_gid
118
118
  end
@@ -130,7 +130,7 @@ module Datahen
130
130
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
131
131
  if page_gid == gid
132
132
  self.reparse_self = true
133
- return
133
+ raise Error::SafeTerminateError
134
134
  end
135
135
  reparse_page page_gid
136
136
  end
@@ -153,6 +153,8 @@ module Datahen
153
153
  page: page
154
154
  })
155
155
  eval_with_context filename, context
156
+ rescue Error::SafeTerminateError => e
157
+ # do nothing, this is fine
156
158
  rescue SyntaxError => e
157
159
  handle_error(e) if save
158
160
  raise e
@@ -163,7 +165,7 @@ module Datahen
163
165
 
164
166
  puts "=========== Parsing Executed ==========="
165
167
  begin
166
- save_pages_and_outputs(pages, outputs, :parsing)
168
+ save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
167
169
  rescue => e
168
170
  handle_error(e) if save
169
171
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.11"
2
+ VERSION = "0.14.18"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.11
4
+ version: 0.14.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-24 00:00:00.000000000 Z
11
+ date: 2020-10-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -233,6 +233,8 @@ files:
233
233
  - lib/datahen/client/scraper_job_page.rb
234
234
  - lib/datahen/client/scraper_job_var.rb
235
235
  - lib/datahen/client/scraper_var.rb
236
+ - lib/datahen/error.rb
237
+ - lib/datahen/error/safe_terminate_error.rb
236
238
  - lib/datahen/plugin.rb
237
239
  - lib/datahen/plugin/context_exposer.rb
238
240
  - lib/datahen/scraper.rb