datahen 0.14.12 → 0.14.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/global_page.rb +4 -1
- data/lib/datahen/cli/job.rb +1 -3
- data/lib/datahen/cli/scraper.rb +2 -0
- data/lib/datahen/cli/scraper_export.rb +4 -1
- data/lib/datahen/cli/scraper_job.rb +15 -0
- data/lib/datahen/cli/scraper_page.rb +45 -23
- data/lib/datahen/client/auth_token.rb +1 -1
- data/lib/datahen/client/job.rb +5 -0
- data/lib/datahen/client/job_page.rb +5 -0
- data/lib/datahen/client/scraper.rb +2 -0
- data/lib/datahen/client/scraper_job.rb +5 -0
- data/lib/datahen/client/scraper_job_page.rb +6 -1
- data/lib/datahen/error.rb +6 -0
- data/lib/datahen/error/safe_terminate_error.rb +6 -0
- data/lib/datahen/scraper.rb +1 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +5 -3
- data/lib/datahen/version.rb +1 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b483263384d6a00e51fd345499ad020d69c9d07c1a77e3521e6b55a4528195de
|
4
|
+
data.tar.gz: 167c0a417c402198f13151daf29bddb0f8429cac3876c8a84d6b7ba396b1ac90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 344168b7a8a4cb746347aba1640d7b4e75a901cd5a1e2ab7acd00f6e8eee3d8c297795812259f84a21757b444e653331e44dd9e1efbec372472b6aeb98c56619
|
7
|
+
data.tar.gz: 1ef00c32694830740ea477fb4291de9234957ee4b7f7ad02642aa9d5cb91d1b3b1cd6a56542db4cbf84d92d42039546f2e9bfb8a40cf22d103256e8d53b0ac28
|
@@ -15,7 +15,10 @@ module Datahen
|
|
15
15
|
|
16
16
|
if result['available'] == true
|
17
17
|
puts "Preview content url: \"#{result['preview_url']}\""
|
18
|
-
|
18
|
+
begin
|
19
|
+
`open "#{result['preview_url']}"`
|
20
|
+
rescue
|
21
|
+
end
|
19
22
|
else
|
20
23
|
puts "Content does not exist"
|
21
24
|
end
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -6,7 +6,6 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
|
10
9
|
desc "list", "gets a list of jobs"
|
11
10
|
option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
|
12
11
|
option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
|
@@ -35,8 +34,7 @@ module Datahen
|
|
35
34
|
client = Client::JobStat.new(options)
|
36
35
|
puts "#{client.job_current_stats(job_id, options)}"
|
37
36
|
end
|
38
|
-
|
39
|
-
|
37
|
+
|
40
38
|
end
|
41
39
|
end
|
42
40
|
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -31,6 +31,7 @@ module Datahen
|
|
31
31
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
32
32
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
33
33
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
34
|
+
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
34
35
|
def create(scraper_name, git_repository)
|
35
36
|
# puts "options #{options}"
|
36
37
|
client = Client::Scraper.new(options)
|
@@ -55,6 +56,7 @@ module Datahen
|
|
55
56
|
option :timezone, type: :string, desc: "Set the scheduler's timezone. Must be in IANA Timezone format. Defaults to \"America/Toronto\""
|
56
57
|
option :profile, type: :string, desc: 'Set the profiles (comma separated) to apply to the job. Default: default'
|
57
58
|
option :multiple_jobs, type: :boolean, desc: 'Set true to enable multiple jobs. Default: false'
|
59
|
+
option :max_job_count, type: :numeric, desc: 'Set a value to set max number of jobs available. Set -1 for unlimited. Default: 3'
|
58
60
|
def update(scraper_name)
|
59
61
|
client = Client::Scraper.new(options)
|
60
62
|
puts "#{client.update(scraper_name, options)}"
|
@@ -48,6 +48,21 @@ module Datahen
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
desc "delete <scraper_name>", "delete a scraper's current job"
|
52
|
+
long_desc <<-LONGDESC
|
53
|
+
Delete a scraper's current job
|
54
|
+
LONGDESC
|
55
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
56
|
+
def delete(scraper_name)
|
57
|
+
if options[:job]
|
58
|
+
client = Client::Job.new(options)
|
59
|
+
puts "#{client.delete(options[:job])}"
|
60
|
+
else
|
61
|
+
client = Client::ScraperJob.new(options)
|
62
|
+
puts "#{client.delete(scraper_name)}"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
51
66
|
desc "resume <scraper_name>", "resumes a scraper's current job"
|
52
67
|
long_desc <<-LONGDESC
|
53
68
|
Resumes a scraper's current job
|
@@ -99,18 +99,20 @@ module Datahen
|
|
99
99
|
|
100
100
|
desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
|
101
101
|
long_desc <<-LONGDESC
|
102
|
-
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status.\x5
|
102
|
+
Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail or --status or --page-type.\x5
|
103
103
|
LONGDESC
|
104
104
|
option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
|
105
105
|
option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
|
106
106
|
option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
|
107
107
|
option :status, type: :string, desc: 'Refetches only pages with a specific status.'
|
108
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
108
109
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
109
110
|
def refetch(scraper_name)
|
110
|
-
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status)
|
111
|
-
puts "Must specify either a --gid, --fetch-fail, --parse-fail or --status"
|
111
|
+
if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
112
|
+
puts "Must specify either a --gid, --fetch-fail, --parse-fail, --status or --page-type"
|
112
113
|
return
|
113
114
|
end
|
115
|
+
|
114
116
|
if options[:job]
|
115
117
|
client = Client::JobPage.new(options)
|
116
118
|
puts "#{client.refetch(options[:job])}"
|
@@ -122,33 +124,47 @@ module Datahen
|
|
122
124
|
|
123
125
|
desc "reparse <scraper_name>", "Reparse Pages on a scraper's current job"
|
124
126
|
long_desc <<-LONGDESC
|
125
|
-
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status.\x5
|
127
|
+
Reparse pages in a scraper's current job. You need to specify either a --gid or --parse-fail or --status or --page-type.\x5
|
126
128
|
LONGDESC
|
127
129
|
option :gid, :aliases => :g, type: :string, desc: 'Reparse a specific GID'
|
128
130
|
option :parse_fail, type: :boolean, desc: 'Reparse only pages that fails parsing.'
|
129
131
|
option :status, type: :string, desc: 'Reparse only pages with a specific status.'
|
132
|
+
option :page_type, type: :string, desc: 'Refetches only pages with a specific page type.'
|
130
133
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
131
134
|
def reparse(scraper_name)
|
132
|
-
|
133
|
-
|
135
|
+
if !options.key?(:gid) && !options.key?(:parse_fail) && !options.key?(:status) && !options.key?(:page_type)
|
136
|
+
puts "Must specify either a --gid, --parse-fail, --status or --page-type"
|
137
|
+
return
|
138
|
+
end
|
134
139
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
140
|
+
if options[:job]
|
141
|
+
client = Client::JobPage.new(options)
|
142
|
+
puts "#{client.reparse(options[:job])}"
|
143
|
+
else
|
144
|
+
client = Client::ScraperJobPage.new(options)
|
145
|
+
puts "#{client.reparse(scraper_name)}"
|
146
|
+
end
|
147
|
+
end
|
139
148
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
149
|
+
desc "limbo <scraper_name>", "Move pages on a scraper's current job to limbo"
|
150
|
+
long_desc <<-LONGDESC
|
151
|
+
Move pages in a scraper's current job to limbo. You need to specify either a --gid or --status.\x5
|
152
|
+
LONGDESC
|
153
|
+
option :gid, :aliases => :g, type: :string, desc: 'Move a specific GID to limbo'
|
154
|
+
option :status, type: :string, desc: 'Move pages with a specific status to limbo.'
|
155
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
156
|
+
def limbo(scraper_name)
|
157
|
+
if !options.key?(:gid) && !options.key?(:status)
|
158
|
+
puts "Must specify either a --gid or --status"
|
159
|
+
return
|
160
|
+
end
|
147
161
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
162
|
+
if options[:job]
|
163
|
+
client = Client::JobPage.new(options)
|
164
|
+
puts "#{client.limbo(options[:job])}"
|
165
|
+
else
|
166
|
+
client = Client::ScraperJobPage.new(options)
|
167
|
+
puts "#{client.limbo(scraper_name)}"
|
152
168
|
end
|
153
169
|
end
|
154
170
|
|
@@ -224,7 +240,10 @@ module Datahen
|
|
224
240
|
|
225
241
|
if result['available'] == true
|
226
242
|
puts "Preview content url: \"#{result['preview_url']}\""
|
227
|
-
|
243
|
+
begin
|
244
|
+
`open "#{result['preview_url']}"`
|
245
|
+
rescue
|
246
|
+
end
|
228
247
|
else
|
229
248
|
puts "Content does not exist"
|
230
249
|
end
|
@@ -244,7 +263,10 @@ module Datahen
|
|
244
263
|
|
245
264
|
if result['available'] == true
|
246
265
|
puts "Preview failed content url: \"#{result['preview_url']}\""
|
247
|
-
|
266
|
+
begin
|
267
|
+
`open "#{result['preview_url']}"`
|
268
|
+
rescue
|
269
|
+
end
|
248
270
|
else
|
249
271
|
puts "Failed Content does not exist"
|
250
272
|
end
|
data/lib/datahen/client/job.rb
CHANGED
@@ -72,6 +72,11 @@ module Datahen
|
|
72
72
|
params = @options.merge(opts)
|
73
73
|
self.class.put("/jobs/#{job_id}/pages/refetch", params)
|
74
74
|
end
|
75
|
+
|
76
|
+
def limbo(job_id, opts={})
|
77
|
+
params = @options.merge(opts)
|
78
|
+
self.class.put("/jobs/#{job_id}/pages/limbo", params)
|
79
|
+
end
|
75
80
|
end
|
76
81
|
end
|
77
82
|
end
|
@@ -27,6 +27,7 @@ module Datahen
|
|
27
27
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
28
28
|
body[:profile] = opts[:profile] if opts[:profile]
|
29
29
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts[:multiple_jobs]
|
30
|
+
body[:max_job_count] = opts[:max_job_count] if opts[:max_job_count]
|
30
31
|
params = @options.merge({body: body.to_json})
|
31
32
|
self.class.post("/scrapers", params)
|
32
33
|
end
|
@@ -47,6 +48,7 @@ module Datahen
|
|
47
48
|
body[:timezone] = opts[:timezone] if opts[:timezone]
|
48
49
|
body[:profile] = opts[:profile] if opts[:profile]
|
49
50
|
body[:multiple_jobs] = opts[:multiple_jobs] if opts.has_key?("multiple_jobs") || opts.has_key?(:multiple_jobs)
|
51
|
+
body[:max_job_count] = opts[:max_job_count] if opts.has_key?("max_job_count") || opts.has_key?(:max_job_count)
|
50
52
|
params = @options.merge({body: body.to_json})
|
51
53
|
|
52
54
|
self.class.put("/scrapers/#{scraper_name}", params)
|
@@ -55,6 +55,11 @@ module Datahen
|
|
55
55
|
|
56
56
|
self.class.get("/scrapers/#{scraper_name}/current_job/profile", params)
|
57
57
|
end
|
58
|
+
|
59
|
+
def delete(scraper_name, opts={})
|
60
|
+
params = @options.merge(opts)
|
61
|
+
self.class.delete("/scrapers/#{scraper_name}/current_job", params)
|
62
|
+
end
|
58
63
|
end
|
59
64
|
end
|
60
65
|
end
|
@@ -26,7 +26,7 @@ module Datahen
|
|
26
26
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
|
27
27
|
end
|
28
28
|
|
29
|
-
# Deprecated, please use Datahen::Client::
|
29
|
+
# Deprecated, please use Datahen::Client::JobPage#refetch instead.
|
30
30
|
#
|
31
31
|
# @note This method will be removed at some point in the future.
|
32
32
|
def refetch_by_job(job_id, opts={})
|
@@ -39,6 +39,11 @@ module Datahen
|
|
39
39
|
self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
|
40
40
|
end
|
41
41
|
|
42
|
+
def limbo(scraper_name, opts={})
|
43
|
+
params = @options.merge(opts)
|
44
|
+
self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
|
45
|
+
end
|
46
|
+
|
42
47
|
def enqueue(scraper_name, method, url, opts={})
|
43
48
|
body = {}
|
44
49
|
body[:method] = method != "" ? method : "GET"
|
data/lib/datahen/scraper.rb
CHANGED
@@ -112,7 +112,7 @@ module Datahen
|
|
112
112
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
113
113
|
if page_gid == gid
|
114
114
|
self.refetch_self = true
|
115
|
-
|
115
|
+
raise Error::SafeTerminateError
|
116
116
|
end
|
117
117
|
refetch_page page_gid
|
118
118
|
end
|
@@ -130,7 +130,7 @@ module Datahen
|
|
130
130
|
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
131
131
|
if page_gid == gid
|
132
132
|
self.reparse_self = true
|
133
|
-
|
133
|
+
raise Error::SafeTerminateError
|
134
134
|
end
|
135
135
|
reparse_page page_gid
|
136
136
|
end
|
@@ -153,6 +153,8 @@ module Datahen
|
|
153
153
|
page: page
|
154
154
|
})
|
155
155
|
eval_with_context filename, context
|
156
|
+
rescue Error::SafeTerminateError => e
|
157
|
+
# do nothing, this is fine
|
156
158
|
rescue SyntaxError => e
|
157
159
|
handle_error(e) if save
|
158
160
|
raise e
|
@@ -163,7 +165,7 @@ module Datahen
|
|
163
165
|
|
164
166
|
puts "=========== Parsing Executed ==========="
|
165
167
|
begin
|
166
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
168
|
+
save_pages_and_outputs(pages, outputs, :parsing) unless refetch_self
|
167
169
|
rescue => e
|
168
170
|
handle_error(e) if save
|
169
171
|
raise e
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.12
|
4
|
+
version: 0.14.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -233,6 +233,8 @@ files:
|
|
233
233
|
- lib/datahen/client/scraper_job_page.rb
|
234
234
|
- lib/datahen/client/scraper_job_var.rb
|
235
235
|
- lib/datahen/client/scraper_var.rb
|
236
|
+
- lib/datahen/error.rb
|
237
|
+
- lib/datahen/error/safe_terminate_error.rb
|
236
238
|
- lib/datahen/plugin.rb
|
237
239
|
- lib/datahen/plugin/context_exposer.rb
|
238
240
|
- lib/datahen/scraper.rb
|
@@ -251,7 +253,7 @@ metadata:
|
|
251
253
|
allowed_push_host: https://rubygems.org
|
252
254
|
homepage_uri: https://datahen.com
|
253
255
|
source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
|
254
|
-
post_install_message:
|
256
|
+
post_install_message:
|
255
257
|
rdoc_options: []
|
256
258
|
require_paths:
|
257
259
|
- lib
|
@@ -267,7 +269,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
267
269
|
version: '0'
|
268
270
|
requirements: []
|
269
271
|
rubygems_version: 3.0.3
|
270
|
-
signing_key:
|
272
|
+
signing_key:
|
271
273
|
specification_version: 4
|
272
274
|
summary: DataHen toolbelt for developers
|
273
275
|
test_files: []
|