datahen 0.13.7 → 0.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/datahen/cli/job.rb +14 -2
- data/lib/datahen/cli/scraper.rb +7 -5
- data/lib/datahen/cli/scraper_job.rb +10 -3
- data/lib/datahen/cli/scraper_page.rb +1 -1
- data/lib/datahen/client/base.rb +2 -0
- data/lib/datahen/client/job.rb +6 -2
- data/lib/datahen/client/job_stat.rb +24 -8
- data/lib/datahen/client/scraper_job.rb +6 -2
- data/lib/datahen/scraper/executor.rb +9 -3
- data/lib/datahen/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b5f93e47c85c172722c4517fe4af0805d81387a440fb06922fb5d6e941994f66
|
4
|
+
data.tar.gz: 1c895d1e6ec3e8415202d581ed9fe48e3c0443f53355b0ea34c0a3418b4e306d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0bc52173785501b7fe3ad3bd1de448d818033441eefb9a0526f68cb6a17595c4dda91e7f231501c4212892aaccda68e28ea2ac72c90ce5c89a7e78d06db5bf4b
|
7
|
+
data.tar.gz: d9fa5425007496e404258504127bedb1a7b786d609d9db15f6cdc485a2922aea5c7e8f25ca124142e0e41daadf22039d79c666f507029c3fc0c8388ca4a221df
|
data/lib/datahen/cli/job.rb
CHANGED
@@ -18,12 +18,24 @@ module Datahen
|
|
18
18
|
puts "#{client.all()}"
|
19
19
|
end
|
20
20
|
|
21
|
-
desc "show <job_id>", "Show a job"
|
21
|
+
desc "show <job_id>", "Show a job (Defaults to showing data from cached job)"
|
22
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
22
23
|
def show(job_id)
|
23
24
|
client = Client::Job.new(options)
|
24
|
-
puts "#{client.find(job_id)}"
|
25
|
+
puts "#{client.find(job_id, options)}"
|
25
26
|
end
|
26
27
|
|
28
|
+
desc "stats <job_id>", "Get the stat for a job (Defaults to showing data from cached stats)"
|
29
|
+
long_desc <<-LONGDESC
|
30
|
+
Get stats for a scraper's current job\n
|
31
|
+
LONGDESC
|
32
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
33
|
+
def stats(job_id)
|
34
|
+
client = Client::JobStat.new(options)
|
35
|
+
puts "#{client.job_current_stats(job_id, options)}"
|
36
|
+
end
|
37
|
+
|
38
|
+
|
27
39
|
end
|
28
40
|
end
|
29
41
|
|
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -140,17 +140,18 @@ module Datahen
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
-
desc "stats <scraper_name>", "Get the
|
143
|
+
desc "stats <scraper_name>", "Get the stat for a current job (Defaults to showing data from cached stats)"
|
144
144
|
long_desc <<-LONGDESC
|
145
145
|
Get stats for a scraper's current job\n
|
146
146
|
LONGDESC
|
147
147
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
148
|
+
option :live, type: :boolean, desc: 'Get data from the live stats, not cached stats.'
|
148
149
|
def stats(scraper_name)
|
149
150
|
client = Client::JobStat.new(options)
|
150
151
|
if options[:job]
|
151
|
-
puts "#{client.job_current_stats(options[:job])}"
|
152
|
+
puts "#{client.job_current_stats(options[:job], options)}"
|
152
153
|
else
|
153
|
-
puts "#{client.scraper_job_current_stats(scraper_name)}"
|
154
|
+
puts "#{client.scraper_job_current_stats(scraper_name, options)}"
|
154
155
|
end
|
155
156
|
end
|
156
157
|
|
@@ -163,12 +164,13 @@ module Datahen
|
|
163
164
|
option :"max-timestamp", type: :string, desc: 'Ending timestamp point in time to query historic stats (inclusive)'
|
164
165
|
option :"limit", type: :numeric, desc: 'Limit stats retrieved'
|
165
166
|
option :"order", type: :numeric, desc: 'Order stats by timestamp [DESC]'
|
167
|
+
option :live, type: :boolean, desc: 'Get data from the live stats history, not cached stats history.'
|
166
168
|
def history(scraper_name)
|
167
169
|
client = Client::JobStat.new(options)
|
168
170
|
if options[:job]
|
169
|
-
puts "#{client.job_stats_history(options[:job])}"
|
171
|
+
puts "#{client.job_stats_history(options[:job], options)}"
|
170
172
|
else
|
171
|
-
puts "#{client.scraper_job_stats_history(scraper_name)}"
|
173
|
+
puts "#{client.scraper_job_stats_history(scraper_name, options)}"
|
172
174
|
end
|
173
175
|
end
|
174
176
|
|
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -6,10 +6,17 @@ module Datahen
|
|
6
6
|
"#{basename} #{@package_name} #{command.usage}"
|
7
7
|
end
|
8
8
|
|
9
|
-
desc "show <scraper_name>", "Show a scraper's current job"
|
9
|
+
desc "show <scraper_name>", "Show a scraper's current job (Defaults to showing data from cached job)"
|
10
|
+
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
11
|
+
option :live, type: :boolean, desc: 'Get data from the live job, not cached job.'
|
10
12
|
def show(scraper_name)
|
11
|
-
|
12
|
-
|
13
|
+
if options[:job]
|
14
|
+
client = Client::Job.new(options)
|
15
|
+
puts "#{client.find(options[:job], options)}"
|
16
|
+
else
|
17
|
+
client = Client::ScraperJob.new(options)
|
18
|
+
puts "#{client.find(scraper_name, options)}"
|
19
|
+
end
|
13
20
|
end
|
14
21
|
|
15
22
|
|
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -230,7 +230,7 @@ module Datahen
|
|
230
230
|
end
|
231
231
|
end
|
232
232
|
|
233
|
-
desc "failedcontent <gid>", "Show a page's failed content in scraper's current job"
|
233
|
+
desc "failedcontent <scraper_name> <gid>", "Show a page's failed content in scraper's current job"
|
234
234
|
option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
|
235
235
|
def failedcontent(scraper_name, gid)
|
236
236
|
result = nil
|
data/lib/datahen/client/base.rb
CHANGED
data/lib/datahen/client/job.rb
CHANGED
@@ -6,8 +6,12 @@ module Datahen
|
|
6
6
|
self.class.get("/jobs", params)
|
7
7
|
end
|
8
8
|
|
9
|
-
def find(job_id)
|
10
|
-
|
9
|
+
def find(job_id, opts={})
|
10
|
+
if opts[:live]
|
11
|
+
self.class.get("/jobs/#{job_id}", @options)
|
12
|
+
else
|
13
|
+
self.class.get("/cached/jobs/#{job_id}", @options)
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def update(job_id, opts={})
|
data/lib/datahen/client/job_stat.rb
CHANGED
@@ -2,20 +2,36 @@ module Datahen
|
|
2
2
|
module Client
|
3
3
|
class JobStat < Datahen::Client::Base
|
4
4
|
|
5
|
-
def job_current_stats(job_id)
|
6
|
-
|
5
|
+
def job_current_stats(job_id, opts={})
|
6
|
+
if opts[:live]
|
7
|
+
self.class.get("/jobs/#{job_id}/stats/current", @options)
|
8
|
+
else
|
9
|
+
self.class.get("/cached/jobs/#{job_id}/stats/current", @options)
|
10
|
+
end
|
7
11
|
end
|
8
12
|
|
9
|
-
def scraper_job_current_stats(scraper_name)
|
10
|
-
|
13
|
+
def scraper_job_current_stats(scraper_name, opts={})
|
14
|
+
if opts[:live]
|
15
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
16
|
+
else
|
17
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/current", @options)
|
18
|
+
end
|
11
19
|
end
|
12
20
|
|
13
|
-
def job_stats_history(job_id)
|
14
|
-
|
21
|
+
def job_stats_history(job_id, opts={})
|
22
|
+
if opts[:live]
|
23
|
+
self.class.get("/jobs/#{job_id}/stats/history", @options)
|
24
|
+
else
|
25
|
+
self.class.get("/cached/jobs/#{job_id}/stats/history", @options)
|
26
|
+
end
|
15
27
|
end
|
16
28
|
|
17
|
-
def scraper_job_stats_history(scraper_name)
|
18
|
-
|
29
|
+
def scraper_job_stats_history(scraper_name, opts={})
|
30
|
+
if opts[:live]
|
31
|
+
self.class.get("/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
32
|
+
else
|
33
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job/stats/history", @options)
|
34
|
+
end
|
19
35
|
end
|
20
36
|
|
21
37
|
end
|
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,8 +15,12 @@ module Datahen
|
|
15
15
|
self.class.post("/scrapers/#{scraper_name}/jobs", params)
|
16
16
|
end
|
17
17
|
|
18
|
-
def find(scraper_name)
|
19
|
-
|
18
|
+
def find(scraper_name, opts={})
|
19
|
+
if opts[:live]
|
20
|
+
self.class.get("/scrapers/#{scraper_name}/current_job", @options)
|
21
|
+
else
|
22
|
+
self.class.get("/cached/scrapers/#{scraper_name}/current_job", @options)
|
23
|
+
end
|
20
24
|
end
|
21
25
|
|
22
26
|
def update(scraper_name, opts={})
|
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -60,7 +60,12 @@ module Datahen
|
|
60
60
|
|
61
61
|
def init_global_page()
|
62
62
|
client = Client::GlobalPage.new()
|
63
|
-
client.find(gid)
|
63
|
+
global_page = client.find(gid)
|
64
|
+
unless global_page.code == 200
|
65
|
+
raise "GID #{gid} not found. Aborting execution!"
|
66
|
+
else
|
67
|
+
global_page
|
68
|
+
end
|
64
69
|
end
|
65
70
|
|
66
71
|
def get_content(job_id, gid)
|
@@ -287,11 +292,12 @@ module Datahen
|
|
287
292
|
end
|
288
293
|
|
289
294
|
# behave differently if it is a real save
|
295
|
+
save_status = status
|
290
296
|
if save
|
291
297
|
log_msg = "Saving #{log_msgs.join(' and ')}."
|
292
298
|
puts "#{log_msg}"
|
293
299
|
else
|
294
|
-
|
300
|
+
save_status = "#{status}_try"
|
295
301
|
end
|
296
302
|
|
297
303
|
# saving to server
|
@@ -300,7 +306,7 @@ module Datahen
|
|
300
306
|
gid: gid,
|
301
307
|
pages: pages_slice,
|
302
308
|
outputs: outputs_slice,
|
303
|
-
status: status)
|
309
|
+
status: save_status)
|
304
310
|
|
305
311
|
if response.code == 200
|
306
312
|
if save
|
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datahen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.7
|
4
|
+
version: 0.14.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Parama Danoesubroto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -264,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
264
264
|
- !ruby/object:Gem::Version
|
265
265
|
version: '0'
|
266
266
|
requirements: []
|
267
|
-
rubygems_version: 3.
|
267
|
+
rubygems_version: 3.1.2
|
268
268
|
signing_key:
|
269
269
|
specification_version: 4
|
270
270
|
summary: DataHen toolbelt for developers
|