rbbt-util 5.29.1 → 5.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc/orchestrate.rb +95 -8
- data/lib/rbbt/hpc/slurm.rb +57 -24
- data/lib/rbbt/persist.rb +4 -0
- data/lib/rbbt/persist/tsv/adapter.rb +44 -13
- data/lib/rbbt/tsv.rb +6 -2
- data/lib/rbbt/util/cmd.rb +6 -1
- data/lib/rbbt/util/misc/options.rb +0 -42
- data/lib/rbbt/util/procpath.rb +49 -0
- data/lib/rbbt/workflow/step/accessor.rb +3 -4
- data/lib/rbbt/workflow/step/run.rb +2 -4
- data/lib/rbbt/workflow/usage.rb +1 -1
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +174 -95
- data/share/rbbt_commands/slurm/orchestrate +3 -2
- data/share/rbbt_commands/slurm/task +1 -0
- data/share/rbbt_commands/tsv/slice +3 -3
- data/share/rbbt_commands/workflow/info +1 -1
- data/share/rbbt_commands/workflow/task +27 -7
- data/share/rbbt_commands/workflow/write_info +52 -0
- data/test/rbbt/util/test_procpath.rb +23 -0
- metadata +7 -2
data/lib/rbbt/tsv.rb
CHANGED
@@ -113,8 +113,8 @@ module TSV
|
|
113
113
|
|
114
114
|
data.entity_options = entity_options
|
115
115
|
|
116
|
-
if Path === source
|
117
|
-
|
116
|
+
if Path === source && data.identifiers
|
117
|
+
Path.setup(data.identifiers, source.pkgdir, source.resource)
|
118
118
|
end
|
119
119
|
|
120
120
|
if data.respond_to? :persistence_path
|
@@ -124,6 +124,10 @@ module TSV
|
|
124
124
|
data.clear
|
125
125
|
data.annotate h
|
126
126
|
end
|
127
|
+
|
128
|
+
data.read if data.respond_to? :persistence_path
|
129
|
+
|
130
|
+
data
|
127
131
|
end
|
128
132
|
|
129
133
|
def self.parse_header(stream, options = {})
|
data/lib/rbbt/util/cmd.rb
CHANGED
@@ -217,7 +217,7 @@ module CMD
|
|
217
217
|
end
|
218
218
|
end
|
219
219
|
|
220
|
-
def self.
|
220
|
+
def self.cmd_pid(*args)
|
221
221
|
all_args = *args
|
222
222
|
|
223
223
|
all_args << {} unless Hash === all_args.last
|
@@ -248,4 +248,9 @@ module CMD
|
|
248
248
|
nil
|
249
249
|
end
|
250
250
|
|
251
|
+
# Runs a command through cmd_pid with the same arguments and discards
# whatever cmd_pid returns.
#
# @param args the argument list, forwarded unchanged to cmd_pid
# @return [nil] always
def self.cmd_log(*args)
  cmd_pid(*args); nil
end
|
255
|
+
|
251
256
|
end
|
@@ -242,48 +242,6 @@ module Misc
|
|
242
242
|
|
243
243
|
return options
|
244
244
|
|
245
|
-
options = {}
|
246
|
-
string.split(/#/).each do |str|
|
247
|
-
if str.match(/(.*)=(.*)/)
|
248
|
-
option, value = $1, $2
|
249
|
-
else
|
250
|
-
option, value = str, true
|
251
|
-
end
|
252
|
-
|
253
|
-
option = option.sub(":",'').to_sym if option.chars.first == ':'
|
254
|
-
value = value.sub(":",'').to_sym if String === value and value.chars.first == ':'
|
255
|
-
|
256
|
-
if value == true
|
257
|
-
options[option] = option.to_s.chars.first != '!'
|
258
|
-
else
|
259
|
-
options[option] = Thread.start do
|
260
|
-
$SAFE = 0;
|
261
|
-
case
|
262
|
-
when value =~ /^(?:true|T)$/i
|
263
|
-
true
|
264
|
-
when value =~ /^(?:false|F)$/i
|
265
|
-
false
|
266
|
-
when Symbol === value
|
267
|
-
value
|
268
|
-
when (String === value and value =~ /^\/(.*)\/$/)
|
269
|
-
Regexp.new /#{$1}/
|
270
|
-
else
|
271
|
-
begin
|
272
|
-
Kernel.const_get value
|
273
|
-
rescue
|
274
|
-
begin
|
275
|
-
raise if value =~ /[a-z]/ and defined? value
|
276
|
-
eval(value)
|
277
|
-
rescue Exception
|
278
|
-
value
|
279
|
-
end
|
280
|
-
end
|
281
|
-
end
|
282
|
-
end.value
|
283
|
-
end
|
284
|
-
end
|
285
|
-
|
286
|
-
options
|
287
245
|
end
|
288
246
|
|
289
247
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rbbt/util/cmd'
|
2
|
+
# Helpers around the `procpath` command-line tool: record metrics of a
# process into a database file and render plots from that database.
module ProcPath
  # Declares the :procpath tool to CMD; the block yields the install command.
  CMD.tool :procpath do
    'pip install procpath'
  end

  # Records metrics for process +pid+ into the database file +path+ and
  # blocks until +pid+ terminates.
  #
  # @param pid     process id to monitor (must be waitable by this process,
  #                since Process.wait is called on it)
  # @param path    database file given to `procpath record --database-file`
  # @param options recognised keys: "interval" (default 30), "recnum",
  #                "reevalnum"; other keys are ignored
  def self.record(pid, path, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "interval" => 30

    cmd_options = %w(interval recnum reevalnum).inject({}){|acc,k| acc[k] = options[k]; acc}

    Log.debug "ProcPath recording #{pid} in #{path} (#{Misc.fingerprint options})"
    procpath_thread = Thread.new do
      procpath_pid = nil
      begin
        procpath_pid = CMD.cmd_pid(:procpath, "record --database-file '#{path}' '$..children[?(@.stat.pid == #{pid})]'", cmd_options.merge(:nofail => true, :add_option_dashes => true))
      rescue Exception
        # Exception (not StandardError) is rescued on purpose: the thread is
        # stopped below with `raise Interrupt`, which is not a StandardError.
        Log.exception $!
        # procpath_pid is still nil when the interrupt arrives before
        # cmd_pid returns; Process.kill would raise TypeError on nil.
        Process.kill "INT", procpath_pid if procpath_pid
      end
    end

    procpath_thread.report_on_exception = false

    Process.wait pid.to_i
    procpath_thread.raise Interrupt
  end

  # Renders a plot from the database +path+ into the file +output+.
  #
  # @param path    database file given to `procpath plot --database-file`
  # @param output  destination given to `--plot-file`
  # @param options recognised keys: "query-name" (default 'rss'), "epsilon"
  #                (default 0.5), "moving-average-window" (default 10),
  #                "title", "logarithmic", "after", "before",
  #                "custom-query-file", "custom-value-expr"
  def self.plot(path, output, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "query-name" => 'rss', 'epsilon' => 0.5, "moving-average-window" => 10

    # The forwarded key must match the defaulted one ("moving-average-window");
    # otherwise the default above is never passed on to the command.
    cmd_options = %w(query-name epsilon moving-average-window title logarithmic after before custom-query-file custom-value-expr).inject({}){|acc,k| acc[k] = options[k]; acc}
    CMD.cmd_log(:procpath, "plot --database-file '#{path}' --plot-file '#{output}' ", cmd_options.merge(:nofail => true, :add_option_dashes => true))
  end

  # Records process +pid+ until it exits and then writes a CPU and an RSS
  # plot next to the database.
  #
  # @param pid  process id to monitor
  # @param path "<basename>" or "<basename>#<options>"; the part after the
  #             first '#' is handed to Misc.string2hash. Files produced are
  #             <basename>.sqlite3, <basename>.cpu.svg and <basename>.rss.svg
  def self.monitor(pid, path)
    # Split only on the first '#': everything after it is the options string
    # and may itself contain further '#' separators.
    database, options_str = path.split("#", 2)
    options = options_str.nil? ? {} : Misc.string2hash(options_str)

    database = File.expand_path database
    Log.low "ProcPath monitor #{pid} in #{database} (#{Misc.fingerprint options})"

    ProcPath.record(pid, database + '.sqlite3', options)
    ProcPath.plot(database + '.sqlite3', database + '.cpu.svg', options.merge("query-name" => 'cpu'))
    ProcPath.plot(database + '.sqlite3', database + '.rss.svg', options.merge("query-name" => 'rss'))
  end
end
|
49
|
+
|
@@ -505,8 +505,8 @@ class Step
|
|
505
505
|
|
506
506
|
def running?
|
507
507
|
return false if ! (started? || status == :ending)
|
508
|
-
|
509
|
-
|
508
|
+
return nil unless Open.exist?(self.pid_file)
|
509
|
+
pid = Open.read(self.pid_file).to_i
|
510
510
|
|
511
511
|
return false if done? or error? or aborted?
|
512
512
|
|
@@ -530,8 +530,7 @@ class Step
|
|
530
530
|
end
|
531
531
|
|
532
532
|
def nopid?
|
533
|
-
|
534
|
-
! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
533
|
+
! Open.exists?(pid_file) && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
535
534
|
end
|
536
535
|
|
537
536
|
def aborted?
|
@@ -373,7 +373,6 @@ class Step
|
|
373
373
|
Log.exception $!
|
374
374
|
ensure
|
375
375
|
Step.purge_stream_cache
|
376
|
-
set_info :pid, nil
|
377
376
|
Open.rm pid_file if Open.exist?(pid_file)
|
378
377
|
end
|
379
378
|
end
|
@@ -388,7 +387,6 @@ class Step
|
|
388
387
|
_clean_finished
|
389
388
|
rescue
|
390
389
|
stop_dependencies
|
391
|
-
set_info :pid, nil
|
392
390
|
Open.rm pid_file if Open.exist?(pid_file)
|
393
391
|
end
|
394
392
|
end
|
@@ -449,7 +447,7 @@ class Step
|
|
449
447
|
ensure
|
450
448
|
no_load = false unless IO === result
|
451
449
|
Open.rm pid_file if Open.exist?(pid_file) unless no_load
|
452
|
-
set_info :pid, nil unless no_load
|
450
|
+
#set_info :pid, nil unless no_load
|
453
451
|
end
|
454
452
|
end
|
455
453
|
|
@@ -559,7 +557,7 @@ class Step
|
|
559
557
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
560
558
|
Kernel.exit! -1
|
561
559
|
end
|
562
|
-
set_info :pid, nil
|
560
|
+
#set_info :pid, nil
|
563
561
|
ensure
|
564
562
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
565
563
|
end
|
data/lib/rbbt/workflow/usage.rb
CHANGED
@@ -57,7 +57,7 @@ module Task
|
|
57
57
|
puts Log.color(:magenta, "Input select options")
|
58
58
|
puts
|
59
59
|
selects.collect{|p| p}.uniq.each do |input,options|
|
60
|
-
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| o.to_s} * ", ") << "\n"
|
60
|
+
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| Array === o ? o.first.to_s : o.to_s} * ", ") << "\n"
|
61
61
|
puts unless Log.compact
|
62
62
|
end
|
63
63
|
puts
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt-util'
|
4
|
+
require 'rbbt/util/simpleopt'
|
5
|
+
|
6
|
+
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
|
+
|
8
|
+
# Erases the working directories of SLURM jobs found under ~/rbbt-slurm.
#
# Each job directory is located through its command.slurm file. A job is
# classified from the files in its directory:
#   * done    - exit.status exists and is 0
#   * error   - exit.status exists and is not 0
#   * aborted - no exit.status and the job id is not listed by squeue
# With no status flag, error and aborted jobs are cleaned. --job and --search
# narrow the selection further. --dry_run prints the report without erasing.
options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt mnl [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aboted jobs
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-SBP--sbatch_parameters show sbatch parameters
-dr--dry_run Do not erase anything
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

Log.severity = 4
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run

# Default selection (error + aborted) applies only when no status flag was
# given, so that --done alone really means "done jobs only".
aborted = error = true if done.nil? && error.nil? && aborted.nil?

workdir = File.expand_path('~/rbbt-slurm')
Path.setup(workdir)

# Job ids currently known to squeue (first column of each line, as strings).
running_jobs = begin
                 squeue_txt = CMD.cmd('squeue').read
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map job id => node list, taken from the first and last squeue columns.
# Stays empty (never nil) when squeue failed, so lookups below are safe.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  script = Open.read(fcmd)

  cmd = (m = script.match(/#CMD: (.*)/)) ? m[1] : nil
  exe = (m = script.match(/# Run command\n(.*?)\n/im)) ? m[1] : nil
  container_home = (m = script.match(/^CONTAINER_DIR=(.*)/)) ? m[1] : nil

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  fstatus = File.join(dir, 'job.status')
  if File.exist?(fstatus)
    nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
  elsif job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  # Seconds since the last write to std.out/std.err; nil when there is no
  # std.out. std.err is only consulted if it exists.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    mtimes = [fout, ferr].select{|f| File.exist?(f)}.collect{|f| File.mtime(f)}
    time_diff = Time.now - mtimes.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  selected = false
  selected = true if done && exit_status == 0
  selected = true if error && exit_status && exit_status != 0
  # When squeue could not be queried we cannot tell an aborted job from a
  # running one, so nothing is treated as aborted (and erased) in that case.
  selected = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
  selected &&= jobid.split(",").include?(id) if jobid
  # A job without a recorded #CMD line can never match a search.
  selected &&= (cmd && cmd.match(/#{search}/)) if search
  next unless selected


  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << ((id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")

  if sbatch_parameters
    puts Log.color(:magenta, "SBATCH parameters: ")
    puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => script).read.strip
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
|
165
|
+
|
@@ -20,15 +20,18 @@ $ rbbt mnl [options]
|
|
20
20
|
-j--job* Job ids
|
21
21
|
-s--search* Regular expression
|
22
22
|
-t--tail* Show the last lines of the STDERR
|
23
|
+
-SBP--sbatch_parameters show sbatch parameters
|
24
|
+
-PERF--procpath_performance show Procpath performance summary
|
25
|
+
-sacct--sacct_peformance show sacct performance summary
|
23
26
|
EOF
|
24
27
|
|
25
28
|
if options[:help]
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
if defined? rbbt_usage
|
30
|
+
rbbt_usage
|
31
|
+
else
|
32
|
+
puts SOPT.doc
|
33
|
+
end
|
34
|
+
exit 0
|
32
35
|
end
|
33
36
|
|
34
37
|
Log.severity = 4
|
@@ -38,101 +41,177 @@ workdir = File.expand_path('~/rbbt-slurm')
|
|
38
41
|
Path.setup(workdir)
|
39
42
|
|
40
43
|
running_jobs = begin
|
41
|
-
|
44
|
+
squeue_txt = CMD.cmd('squeue').read
|
45
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
42
46
|
rescue
|
43
|
-
|
44
|
-
|
45
|
-
|
47
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
48
|
+
squeue_txt = nil
|
49
|
+
$norunningjobs = true
|
50
|
+
[]
|
46
51
|
end
|
47
52
|
|
53
|
+
if squeue_txt
|
54
|
+
job_nodes = {}
|
55
|
+
squeue_txt.split("\n").each do |line|
|
56
|
+
parts = line.strip.split(/\s+/)
|
57
|
+
job_nodes[parts.first] = parts.last.split(",")
|
58
|
+
end
|
59
|
+
else
|
60
|
+
job_nodes = nil
|
61
|
+
end
|
62
|
+
|
48
63
|
count = 0
|
49
64
|
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
65
|
+
dir = File.dirname(fcmd)
|
66
|
+
|
67
|
+
if m = Open.read(fcmd).match(/#CMD: (.*)/)
|
68
|
+
cmd = m[1]
|
69
|
+
else
|
70
|
+
cmd = nil
|
71
|
+
end
|
72
|
+
|
73
|
+
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
74
|
+
exe = m[1].sub('step_path=$(','')
|
75
|
+
else
|
76
|
+
exe = nil
|
77
|
+
end
|
78
|
+
|
79
|
+
if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
|
80
|
+
container_home = m[1]
|
81
|
+
else
|
82
|
+
container_home = nil
|
83
|
+
end
|
84
|
+
|
85
|
+
|
86
|
+
if File.exists?(fid = File.join(dir, 'job.id'))
|
87
|
+
id = Open.read(fid).chomp
|
88
|
+
else
|
89
|
+
id = nil
|
90
|
+
end
|
91
|
+
|
92
|
+
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
93
|
+
exit_status = Open.read(fstatus).to_i
|
94
|
+
else
|
95
|
+
exit_status = nil
|
96
|
+
end
|
97
|
+
|
98
|
+
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
99
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
100
|
+
elsif job_nodes[id]
|
101
|
+
nodes = job_nodes[id]
|
102
|
+
else
|
103
|
+
nodes = []
|
104
|
+
end
|
105
|
+
|
106
|
+
if File.exists?(File.join(dir, 'std.out'))
|
107
|
+
outt = File.mtime File.join(dir, 'std.out')
|
108
|
+
errt = File.mtime File.join(dir, 'std.err')
|
109
|
+
time_diff = Time.now - [outt, errt].max
|
110
|
+
end
|
111
|
+
|
112
|
+
fdep = File.join(dir, 'dependencies.list')
|
113
|
+
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
114
|
+
|
115
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
116
|
+
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
117
|
+
|
118
|
+
if done || error || aborted || running || queued || jobid || search
|
119
|
+
select = false
|
120
|
+
select = true if done && exit_status == 0
|
121
|
+
select = true if error && exit_status && exit_status != 0
|
122
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
123
|
+
select = true if queued && deps && (running_jobs & deps).any?
|
124
|
+
select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
125
|
+
select = true if jobid && jobid.split(",").include?(id)
|
126
|
+
select = true if search && cmd.match(/#{search}/)
|
127
|
+
next unless select
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
puts Log.color :blue, dir
|
132
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
133
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
134
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
135
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
136
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
137
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
138
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
139
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
140
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
141
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
142
|
+
|
143
|
+
if options[:sbatch_parameters]
|
144
|
+
puts Log.color(:magenta, "SBATCH parameters: ")
|
145
|
+
text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
146
|
+
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
147
|
+
puts Log.color :yellow, lines * "\n"
|
148
|
+
end
|
149
|
+
|
150
|
+
fprocpath = File.join(dir, 'procpath.sqlite3')
|
151
|
+
if options[:procpath_performance] && Open.exists?(fprocpath)
|
152
|
+
puts Log.color(:magenta, "Procpath summary: ")
|
153
|
+
require 'rbbt/tsv/csv'
|
154
|
+
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
155
|
+
perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
|
156
|
+
|
157
|
+
page_size = meta["page_size"].first.to_f
|
158
|
+
clock_ticks = meta["clock_ticks"].first.to_f
|
159
|
+
|
160
|
+
cpu_average = {}
|
161
|
+
rss_average = {}
|
162
|
+
perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
|
163
|
+
time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
|
164
|
+
time = time.to_f
|
165
|
+
|
166
|
+
cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
|
167
|
+
cpu_average[stat_pid] ||= {}
|
168
|
+
cpu_average[stat_pid][time] ||= []
|
169
|
+
cpu_average[stat_pid][time] << cpu.to_f
|
170
|
+
rss_average[time] ||= []
|
171
|
+
rss_average[time] << rss.to_f * page_size
|
172
|
+
end
|
173
|
+
|
174
|
+
ticks = 0
|
175
|
+
cpu_average.each do |stat_pid, cpu_average_pid|
|
176
|
+
start = cpu_average_pid.keys.sort.first
|
177
|
+
eend = cpu_average_pid.keys.sort.last
|
178
|
+
ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
|
179
|
+
end
|
180
|
+
start = rss_average.keys.sort.first
|
181
|
+
eend = rss_average.keys.sort.last
|
182
|
+
time_elapsed = eend - start
|
183
|
+
puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
|
184
|
+
puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
|
185
|
+
|
186
|
+
end
|
187
|
+
|
188
|
+
if options[:sacct_peformance]
|
189
|
+
begin
|
190
|
+
tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
|
191
|
+
values = tsv[tsv.keys.first]
|
192
|
+
if values.compact.any?
|
193
|
+
puts Log.color(:magenta, "SACCT performance: ")
|
194
|
+
puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
|
56
195
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
78
|
-
exit_status = Open.read(fstatus).to_i
|
79
|
-
else
|
80
|
-
exit_status = nil
|
81
|
-
end
|
82
|
-
|
83
|
-
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
84
|
-
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
85
|
-
else
|
86
|
-
nodes = []
|
87
|
-
end
|
88
|
-
|
89
|
-
if File.exists?(File.join(dir, 'std.out'))
|
90
|
-
outt = File.mtime File.join(dir, 'std.out')
|
91
|
-
errt = File.mtime File.join(dir, 'std.err')
|
92
|
-
time_diff = Time.now - [outt, errt].max
|
93
|
-
end
|
94
|
-
|
95
|
-
fdep = File.join(dir, 'dependencies.list')
|
96
|
-
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
97
|
-
|
98
|
-
if done || error || aborted || running || queued || jobid || search
|
99
|
-
select = false
|
100
|
-
select = true if done && exit_status == 0
|
101
|
-
select = true if error && exit_status && exit_status != 0
|
102
|
-
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
103
|
-
select = true if queued && deps && (running_jobs & deps).any?
|
104
|
-
select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
105
|
-
select = true if jobid && jobid.split(",").include?(id)
|
106
|
-
select = true if search && cmd.match(/#{search}/)
|
107
|
-
next unless select
|
108
|
-
end
|
109
|
-
|
110
|
-
|
111
|
-
puts Log.color :blue, dir
|
112
|
-
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
113
|
-
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
114
|
-
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
115
|
-
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
116
|
-
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
117
|
-
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
118
|
-
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
119
|
-
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
120
|
-
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
121
|
-
|
122
|
-
if tail && File.exists?(File.join(dir, 'std.err'))
|
123
|
-
if exit_status && exit_status != 0
|
124
|
-
puts Log.color(:magenta, "First error or exception found: ")
|
125
|
-
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
126
|
-
elsif exit_status
|
127
|
-
puts Log.color(:magenta, "Completed jobs: ")
|
128
|
-
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
129
|
-
else
|
130
|
-
puts Log.color(:magenta, "Log tail: ")
|
131
|
-
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
count += 1
|
196
|
+
rescue
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
|
201
|
+
if tail && File.exists?(File.join(dir, 'std.err'))
|
202
|
+
if exit_status && exit_status != 0
|
203
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
204
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
205
|
+
elsif exit_status
|
206
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
207
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
208
|
+
else
|
209
|
+
puts Log.color(:magenta, "Log tail: ")
|
210
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
count += 1
|
136
215
|
|
137
216
|
end
|
138
217
|
|