datahen 0.14.15 → 0.14.20

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b3dc2db62e50655c78bb70aceac56a3f1fb3f8ec87d155c65bbaf4e521d10fc5
4
- data.tar.gz: 57de5a7b0e9271dcfbb05d7ac22df37a127cd3c427e03d93bb448359b7d30edc
3
+ metadata.gz: 4a2f75a84728b7e8c228c578e4a4af6253dcd19db445f509ba44f9866847936c
4
+ data.tar.gz: c37744322cc3e3035a31dd69bca340397ac906d27a3497604e3d98321187d972
5
5
  SHA512:
6
- metadata.gz: 73888c3cabdb2c4f20371fff2f73294a71eb105a282ffff9e85ab675865fe1a7ffd8efb768cd8f0e82667158b6c249bbfef0f883088538dfc1c2c1fc4cdab527
7
- data.tar.gz: e15616500c7994d3c537b42c38dd4965035f7ef825181053ad05ece46a42194d8574c322ec3f3e24121ca0c7aac2c4a9351cd93dff64814855500b04a536d61d
6
+ metadata.gz: fcf774827b35bf23048b47241da29b1a17e2b1d691a360d2afcb5b34c206c26b7ae316c457e26b946ae3144b4d9075dda5db90a2fd0fb7e56b9f53b0cd612d13
7
+ data.tar.gz: 56fe24f850c0b695b87c61629ab6939f856ecf8f0ea6b6557d59646214b6ffd87557a6ade6bf59e03b993626663cc1af3071ca10858802272d75d47cf34de56a
@@ -15,7 +15,10 @@ module Datahen
15
15
 
16
16
  if result['available'] == true
17
17
  puts "Preview content url: \"#{result['preview_url']}\""
18
- `open "#{result['preview_url']}"`
18
+ begin
19
+ `open "#{result['preview_url']}"`
20
+ rescue
21
+ end
19
22
  else
20
23
  puts "Content does not exist"
21
24
  end
@@ -93,6 +93,7 @@ module Datahen
93
93
  option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Default: 1'
94
94
  option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Default: 0'
95
95
  option :proxy_type, desc: 'Set the Proxy type. Default: standard'
96
+ option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: {"name":"foo", "value":"bar", "secret":false} '
96
97
  def start(scraper_name)
97
98
  client = Client::ScraperJob.new(options)
98
99
  puts "Starting a scrape job..."
@@ -36,7 +36,10 @@ module Datahen
36
36
 
37
37
  if result['signed_url']
38
38
  puts "Download url: \"#{result['signed_url']}\""
39
- `open "#{result['signed_url']}"`
39
+ begin
40
+ `open "#{result['signed_url']}"`
41
+ rescue
42
+ end
40
43
  else
41
44
  puts "Exported file does not exist"
42
45
  end
@@ -99,18 +99,20 @@ module Datahen
99
99
 
100
100
  desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
101
101
  long_desc <<-LONGDESC
102
- Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
102
+ Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
103
103
  LONGDESC
104
104
  option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
105
105
  option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
106
106
  option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
107
107
  option :status, type: :string, desc: 'Refetches only pages with a specific status.'
108
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
108
109
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
109
110
  def refetch(scraper_name)
110
- if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
111
- puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
111
+ if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
112
+ puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
112
113
  return
113
114
  end
115
+
114
116
  if options[:job]
115
117
  client = Client::JobPage.new(options)
116
118
  puts "#{client.refetch(options[:job])}"
@@ -122,33 +124,47 @@ module Datahen
122
124
 
123
125
  desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
124
126
  long_desc <<-LONGDESC
125
- Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
127
+ Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
126
128
  LONGDESC
127
129
  option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
128
130
  option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
129
131
  option :status, type: :string, desc: 'Reparse only pages with a specific status.'
132
+ option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
130
133
  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
131
134
  def reparse(scraper_name)
132
- begin
133
- options[:vars] = JSON.parse(options[:vars]) if options[:vars]
135
+ if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
136
+ puts "Must specify either a --gid, --parse-fail, --status or --page-type"
137
+ return
138
+ end
134
139
 
135
- if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
136
- puts "Must specify either a --gid, --parse-fail or --status"
137
- return
138
- end
140
+ if options[:job]
141
+ client = Client::JobPage.new(options)
142
+ puts "#{client.reparse(options[:job])}"
143
+ else
144
+ client = Client::ScraperJobPage.new(options)
145
+ puts "#{client.reparse(scraper_name)}"
146
+ end
147
+ end
139
148
 
140
- if options[:job]
141
- client = Client::JobPage.new(options)
142
- puts "#{client.reparse(options[:job])}"
143
- else
144
- client = Client::ScraperJobPage.new(options)
145
- puts "#{client.reparse(scraper_name)}"
146
- end
149
+ desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
150
+ long_desc <<-LONGDESC
151
+ Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
152
+ LONGDESC
153
+ option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
154
+ option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
155
+ option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
156
+ def limbo(scraper_name)
157
+ if !options.key?(:gid) && !options.key?(:status)
158
+ puts "Must specify either a --gid or --status"
159
+ return
160
+ end
147
161
 
148
- rescue JSON::ParserError
149
- if options[:vars]
150
- puts "Error: #{options[:vars]} on vars is not a valid JSON"
151
- end
162
+ if options[:job]
163
+ client = Client::JobPage.new(options)
164
+ puts "#{client.limbo(options[:job])}"
165
+ else
166
+ client = Client::ScraperJobPage.new(options)
167
+ puts "#{client.limbo(scraper_name)}"
152
168
  end
153
169
  end
154
170
 
@@ -224,7 +240,10 @@ module Datahen
224
240
 
225
241
  if result['available'] == true
226
242
  puts "Preview content url: \"#{result['preview_url']}\""
227
- `open "#{result['preview_url']}"`
243
+ begin
244
+ `open "#{result['preview_url']}"`
245
+ rescue
246
+ end
228
247
  else
229
248
  puts "Content does not exist"
230
249
  end
@@ -244,7 +263,10 @@ module Datahen
244
263
 
245
264
  if result['available'] == true
246
265
  puts "Preview failed content url: \"#{result['preview_url']}\""
247
- `open "#{result['preview_url']}"`
266
+ begin
267
+ `open "#{result['preview_url']}"`
268
+ rescue
269
+ end
248
270
  else
249
271
  puts "Failed Content does not exist"
250
272
  end
@@ -72,6 +72,11 @@ module Datahen
72
72
  params = @options.merge(opts)
73
73
  self.class.put("/jobs/#{job_id}/pages/refetch", params)
74
74
  end
75
+
76
+ def limbo(job_id, opts={})
77
+ params = @options.merge(opts)
78
+ self.class.put("/jobs/#{job_id}/pages/limbo", params)
79
+ end
75
80
  end
76
81
  end
77
82
  end
@@ -11,6 +11,13 @@ module Datahen
11
11
  body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
12
  body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
13
  body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
+ if opts[:vars]
15
+ if opts[:vars].is_a?(Array)
16
+ body[:vars] = opts[:vars]
17
+ elsif opts[:vars].is_a?(String)
18
+ body[:vars] = JSON.parse(opts[:vars])
19
+ end
20
+ end
14
21
  params = @options.merge({body: body.to_json})
15
22
  self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
23
  end
@@ -26,7 +26,7 @@ module Datahen
26
26
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
27
  end
28
28
 
29
- # Deprecated, please use Datahen::Client::JobVar#refetch instead.
29
+ # Deprecated, please use Datahen::Client::JobPage#refetch instead.
30
30
  #
31
31
  # @note This method will be removed at some point in the future.
32
32
  def refetch_by_job(job_id, opts={})
@@ -39,6 +39,11 @@ module Datahen
39
39
  self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
40
40
  end
41
41
 
42
+ def limbo(scraper_name, opts={})
43
+ params = @options.merge(opts)
44
+ self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
45
+ end
46
+
42
47
  def enqueue(scraper_name, method, url, opts={})
43
48
  body = {}
44
49
  body[:method] = method != "" ? method : "GET"
@@ -0,0 +1,6 @@
1
+ require 'datahen/error/safe_terminate_error'
2
+
3
+ module Datahen
4
+ module Error
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Datahen
2
+ module Error
3
+ class SafeTerminateError < Exception
4
+ end
5
+ end
6
+ end
@@ -1,3 +1,4 @@
1
+ require "datahen/error"
1
2
  require "datahen/plugin"
2
3
  require "datahen/scraper/parser"
3
4
  require "datahen/scraper/seeder"
@@ -112,7 +112,7 @@ module Datahen
112
112
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
113
113
  if page_gid == gid
114
114
  self.refetch_self = true
115
- return
115
+ raise Error::SafeTerminateError
116
116
  end
117
117
  refetch_page page_gid
118
118
  end
@@ -130,7 +130,7 @@ module Datahen
130
130
  raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
131
131
  if page_gid == gid
132
132
  self.reparse_self = true
133
- return
133
+ raise Error::SafeTerminateError
134
134
  end
135
135
  reparse_page page_gid
136
136
  end
@@ -153,6 +153,8 @@ module Datahen
153
153
  page: page
154
154
  })
155
155
  eval_with_context filename, context
156
+ rescue Error::SafeTerminateError => e
157
+ # do nothing, this is fine
156
158
  rescue SyntaxError => e
157
159
  handle_error(e) if save
158
160
  raise e
@@ -163,7 +165,7 @@ module Datahen
163
165
 
164
166
  puts "=========== Parsing Executed ==========="
165
167
  begin
166
- save_pages_and_outputs(pages, outputs, :parsing)
168
+ save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
167
169
  rescue => e
168
170
  handle_error(e) if save
169
171
  raise e
@@ -1,3 +1,3 @@
1
1
  module Datahen
2
- VERSION = "0.14.15"
2
+ VERSION = "0.14.20"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datahen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.15
4
+ version: 0.14.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Parama Danoesubroto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-07 00:00:00.000000000 Z
11
+ date: 2020-12-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -233,6 +233,8 @@ files:
233
233
  - lib/datahen/client/scraper_job_page.rb
234
234
  - lib/datahen/client/scraper_job_var.rb
235
235
  - lib/datahen/client/scraper_var.rb
236
+ - lib/datahen/error.rb
237
+ - lib/datahen/error/safe_terminate_error.rb
236
238
  - lib/datahen/plugin.rb
237
239
  - lib/datahen/plugin/context_exposer.rb
238
240
  - lib/datahen/scraper.rb