datahen 0.14.11 → 0.14.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +4 -1
- data/lib/datahen/cli/job.rb +1 -3
- data/lib/datahen/cli/scraper.rb +4 -0
- data/lib/datahen/cli/scraper_export.rb +4 -1
- data/lib/datahen/cli/scraper_job.rb +15 -0
- data/lib/datahen/cli/scraper_page.rb +39 -19
- data/lib/datahen/client/auth_token.rb +1 -1
- data/lib/datahen/client/job.rb +5 -0
- data/lib/datahen/client/job_page.rb +5 -0
- data/lib/datahen/client/scraper.rb +4 -0
- data/lib/datahen/client/scraper_job.rb +5 -0
- data/lib/datahen/client/scraper_job_page.rb +6 -1
- data/lib/datahen/error.rb +6 -0
- data/lib/datahen/error/safe_terminate_error.rb +6 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +5 -3
- data/lib/datahen/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79ae69bd67fcd5158e782a4944cad2fef5f50a9ca896b67e62312778e4d3026c
|
4
|
+
data.tar.gz: b41c7c981d40dc1fb84bd6c3b32ee83d7a2f571f99a93d2c0253570bf4723e2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 176c36236e870b58eeb11ad008aeb7115ecf3a7ddcced3bddaef88238f638a1159ff0fd8f037143e670c8ed1713332bd6117ff4c90f363c4fa46f54277dc48fa
|
7
|
+
data.tar.gz: dd2b96e8dcd3a7cb136affb56050b4e61c5339d5e3dae527223b20067fe506089d12535e8b06252fea7bd20086bf056a9afcd3cbb8edfb4fb3fd3bbf9bbc65d2
|
data/lib/datahen/cli/global_page.rb
CHANGED
@@ -15,7 +15,10 @@ module Datahen
|
|
15
15
|
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
|
-
|
18
|
+
begin
|
19
|
+
`open "#{result['preview_url']}"`
|
20
|
+
rescue
|
21
|
+
end
|
19
22
|
else
|
20
23
|
puts "Content does not exist"
|
21
24
|
end
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -6,7 +6,6 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
|
10
9
|
desc "list", "gets a list of jobs"
|
11
10
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
12
11
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
@@ -35,8 +34,7 @@ module Datahen
|
|
35
34
|
client = Client::JobStat.new(options)
|
36
35
|
puts "#{client.job_current_stats(job_id, options)}"
|
37
36
|
end
|
38
|
-
|
39
|
-
|
37
|
+
|
40
38
|
end
|
41
39
|
end
|
42
40
|
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -30,6 +30,8 @@ module Datahen
|
|
30
30
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
31
31
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
32
32
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
33
|
+
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
34
|
+
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
33
35
|
def create(scraper_name, git_repository)
|
34
36
|
# puts "options #{options}"
|
35
37
|
client = Client::Scraper.new(options)
|
@@ -53,6 +55,8 @@ module Datahen
|
|
53
55
|
option :schedule, type: :string, desc: 'Set the schedule of the scraper to run. Must be in CRON format.'
|
54
56
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
55
57
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
58
|
+
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
59
|
+
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
56
60
|
def update(scraper_name)
|
57
61
|
client = Client::Scraper.new(options)
|
58
62
|
puts "#{client.update(scraper_name, options)}"
|
@@ -48,6 +48,21 @@ module Datahen
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
desc "delete <scraper_name>", "delete a scraper's current job"
|
52
|
+
long_desc <<-LONGDESC
|
53
|
+
Delete a scraper's current job
|
54
|
+
LONGDESC
|
55
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
56
|
+
def delete(scraper_name)
|
57
|
+
if options[:job]
|
58
|
+
client = Client::Job.new(options)
|
59
|
+
puts "#{client.delete(options[:job])}"
|
60
|
+
else
|
61
|
+
client = Client::ScraperJob.new(options)
|
62
|
+
puts "#{client.delete(scraper_name)}"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
51
66
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
52
67
|
long_desc <<-LONGDESC
|
53
68
|
Resumes a scraper's current job
|
@@ -111,6 +111,7 @@ module Datahen
|
|
111
111
|
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
112
112
|
return
|
113
113
|
end
|
114
|
+
|
114
115
|
if options[:job]
|
115
116
|
client = Client::JobPage.new(options)
|
116
117
|
puts "#{client.refetch(options[:job])}"
|
@@ -129,26 +130,39 @@ module Datahen
|
|
129
130
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
130
131
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
131
132
|
def reparse(scraper_name)
|
132
|
-
|
133
|
-
|
133
|
+
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status)
|
134
|
+
puts "Must specify either a --gid, --parse-fail or --status"
|
135
|
+
return
|
136
|
+
end
|
134
137
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
138
|
+
if options[:job]
|
139
|
+
client = Client::JobPage.new(options)
|
140
|
+
puts "#{client.reparse(options[:job])}"
|
141
|
+
else
|
142
|
+
client = Client::ScraperJobPage.new(options)
|
143
|
+
puts "#{client.reparse(scraper_name)}"
|
144
|
+
end
|
145
|
+
end
|
139
146
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
+
desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
|
148
|
+
long_desc <<-LONGDESC
|
149
|
+
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
150
|
+
LONGDESC
|
151
|
+
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
152
|
+
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
153
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
154
|
+
def limbo(scraper_name)
|
155
|
+
if !options.key?(:gid) && !options.key?(:status)
|
156
|
+
puts "Must specify either a --gid or --status"
|
157
|
+
return
|
158
|
+
end
|
147
159
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
160
|
+
if options[:job]
|
161
|
+
client = Client::JobPage.new(options)
|
162
|
+
puts "#{client.limbo(options[:job])}"
|
163
|
+
else
|
164
|
+
client = Client::ScraperJobPage.new(options)
|
165
|
+
puts "#{client.limbo(scraper_name)}"
|
152
166
|
end
|
153
167
|
end
|
154
168
|
|
@@ -224,7 +238,10 @@ module Datahen
|
|
224
238
|
|
225
239
|
if result['available'] == true
|
226
240
|
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
-
|
241
|
+
begin
|
242
|
+
`open "#{result['preview_url']}"`
|
243
|
+
rescue
|
244
|
+
end
|
228
245
|
else
|
229
246
|
puts "Content does not exist"
|
230
247
|
end
|
@@ -244,7 +261,10 @@ module Datahen
|
|
244
261
|
|
245
262
|
if result['available'] == true
|
246
263
|
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
-
|
264
|
+
begin
|
265
|
+
`open "#{result['preview_url']}"`
|
266
|
+
rescue
|
267
|
+
end
|
248
268
|
else
|
249
269
|
puts "Failed Content does not exist"
|
250
270
|
end
|
data/lib/datahen/client/job.rb
CHANGED
@@ -72,6 +72,11 @@ module Datahen
|
|
72
72
|
params = @options.merge(opts)
|
73
73
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
74
|
end
|
75
|
+
|
76
|
+
def limbo(job_id, opts={})
|
77
|
+
params = @options.merge(opts)
|
78
|
+
self.class.put("/jobs/#{job_id}/pages/limbo", params)
|
79
|
+
end
|
75
80
|
end
|
76
81
|
end
|
77
82
|
end
|
@@ -26,6 +26,8 @@ module Datahen
|
|
26
26
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
27
27
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
28
28
|
body[:profile] = opts[:profile] if opts[:profile]
|
29
|
+
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
|
30
|
+
body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
|
29
31
|
params = @options.merge({body: body.to_json})
|
30
32
|
self.class.post("/scrapers", params)
|
31
33
|
end
|
@@ -45,6 +47,8 @@ module Datahen
|
|
45
47
|
body[:schedule] = opts[:schedule] if opts[:schedule]
|
46
48
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
47
49
|
body[:profile] = opts[:profile] if opts[:profile]
|
50
|
+
body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
|
51
|
+
body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
|
48
52
|
params = @options.merge({body: body.to_json})
|
49
53
|
|
50
54
|
self.class.put("/scrapers/#{scraper_name}", params)
|
@@ -55,6 +55,11 @@ module Datahen
|
|
55
55
|
|
56
56
|
self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
|
57
57
|
end
|
58
|
+
|
59
|
+
def delete(scraper_name, opts={})
|
60
|
+
params = @options.merge(opts)
|
61
|
+
self.class.delete("/scrapers/#{scraper_name}/current_job", params)
|
62
|
+
end
|
58
63
|
end
|
59
64
|
end
|
60
65
|
end
|
@@ -26,7 +26,7 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
-
# Deprecated, please use Datahen::Client::
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
30
|
#
|
31
31
|
# @note This method will be removed at some point in the future.
|
32
32
|
def refetch_by_job(job_id, opts={})
|
@@ -39,6 +39,11 @@ module Datahen
|
|
39
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
40
40
|
end
|
41
41
|
|
42
|
+
def limbo(scraper_name, opts={})
|
43
|
+
params = @options.merge(opts)
|
44
|
+
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
45
|
+
end
|
46
|
+
|
42
47
|
def enqueue(scraper_name, method, url, opts={})
|
43
48
|
body = {}
|
44
49
|
body[:method] = method != "" ? method : "GET"
|
data/lib/datahen/scraper.rb
CHANGED
@@ -112,7 +112,7 @@ module Datahen
|
|
112
112
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
113
113
|
if page_gid == gid
|
114
114
|
self.refetch_self = true
|
115
|
-
|
115
|
+
raise Error::SafeTerminateError
|
116
116
|
end
|
117
117
|
refetch_page page_gid
|
118
118
|
end
|
@@ -130,7 +130,7 @@ module Datahen
|
|
130
130
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
131
131
|
if page_gid == gid
|
132
132
|
self.reparse_self = true
|
133
|
-
|
133
|
+
raise Error::SafeTerminateError
|
134
134
|
end
|
135
135
|
reparse_page page_gid
|
136
136
|
end
|
@@ -153,6 +153,8 @@ module Datahen
|
|
153
153
|
page: page
|
154
154
|
})
|
155
155
|
eval_with_context filename, context
|
156
|
+
rescue Error::SafeTerminateError => e
|
157
|
+
# do nothing, this is fine
|
156
158
|
rescue SyntaxError => e
|
157
159
|
handle_error(e) if save
|
158
160
|
raise e
|
@@ -163,7 +165,7 @@ module Datahen
|
|
163
165
|
|
164
166
|
puts "=========== Parsing Executed ==========="
|
165
167
|
begin
|
166
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
168
|
+
save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
|
167
169
|
rescue => e
|
168
170
|
handle_error(e) if save
|
169
171
|
raise e
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.11
|
4
|
+
version: 0.14.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -233,6 +233,8 @@ files:
|
|
233
233
|
- lib/datahen/client/scraper_job_page.rb
|
234
234
|
- lib/datahen/client/scraper_job_var.rb
|
235
235
|
- lib/datahen/client/scraper_var.rb
|
236
|
+
- lib/datahen/error.rb
|
237
|
+
- lib/datahen/error/safe_terminate_error.rb
|
236
238
|
- lib/datahen/plugin.rb
|
237
239
|
- lib/datahen/plugin/context_exposer.rb
|
238
240
|
- lib/datahen/scraper.rb
|