rbbt-util 5.32.7 → 5.32.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/share/rbbt_commands/lsf/clean +212 -0
- data/share/rbbt_commands/lsf/list +311 -0
- data/share/rbbt_commands/lsf/orchestrate +56 -0
- data/share/rbbt_commands/lsf/tail +55 -0
- data/share/rbbt_commands/lsf/task +55 -0
- data/share/rbbt_commands/slurm/clean +212 -0
- data/share/rbbt_commands/slurm/list +311 -0
- data/share/rbbt_commands/slurm/orchestrate +56 -0
- data/share/rbbt_commands/slurm/tail +55 -0
- data/share/rbbt_commands/slurm/task +55 -0
- metadata +13 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3b9d236294c4bdcc32e517eadcffa4d103e5a99a220096497a50f8baa26f746d
|
|
4
|
+
data.tar.gz: 6370930ac76bd8b86666fa59556d2d8648789c3e3dda56bb71c71c2ff65b88d3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7dcd6fadf6424add27b6773d16ef0b794183c4b2f52616498d8c2ec21a77c5b5ad8f3fed65ca69ce5111c71e0942f84be79c107b40156c331919533f3fc5bea7
|
|
7
|
+
data.tar.gz: ea294d8aa5f8ff04a9e902fdd2d7d2287a34934f7288a11234996d06fb72dba4f333e655d591d9b78c5bc00104036356625088f9aa4c7e808ae2d42d0b253781
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'rbbt-util'
|
|
4
|
+
require 'rbbt/util/simpleopt'
|
|
5
|
+
require 'rbbt/hpc'
|
|
6
|
+
|
|
7
|
+
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
|
8
|
+
|
|
9
|
+
# Command-line interface of the clean command. The usage text below is handed
# to SOPT.setup, whose result is used as the options hash for the rest of the
# script (:help, :done, :error, :aborted, :queued, :job, :search, :tail,
# :batch_parameters, :batch_system and :dry_run are read from it).
#
# The batch_system option is declared here because the script reads
# options[:batch_system] right after parsing; without the declaration the
# value could never be supplied from the command line.
options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt slurm clean [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-BP--batch_parameters show batch parameters
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
-dr--dry_run Do not erase anything
EOF

# Show the usage and stop. rbbt_usage is preferred when it is defined;
# otherwise fall back to the documentation kept by SOPT.
if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end
|
|
35
|
+
|
|
36
|
+
# Pick the batch system: the one requested through the options, or 'auto'
# when none was given.
batch_system = options.delete :batch_system
batch_system ||= 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Replace the requested name (possibly 'auto') by the downcased last component
# of the resolved module name. Each job's recorded batch system is compared
# against this value later, and 'auto' would never equal it.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

Log.severity = 4
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run

# Directory scanned for command.batch files.
workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)
|
|
48
|
+
|
|
49
|
+
# Ask the batch module for its job status text and derive the list of job ids
# from the leading integer of every line. If anything fails, warn, remember
# the failure in $norunningjobs and continue with an empty list.
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map the first column of every status line to the comma-separated entries of
# its last column. The map is always a Hash (empty when the status text is not
# available) so that later lookups by job id return nil instead of raising.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty? # blank line: nothing to index
    job_nodes[parts.first] = parts.last.split(",")
  end
end
|
|
68
|
+
|
|
69
|
+
# Walk every job directory under workdir (oldest command.batch first), select
# the jobs that match the requested filters, print a summary of each and erase
# its directory unless dry_run is set.
count = 0

# With neither --error nor --aborted given, both are selected.
# NOTE(review): this also applies when only --done, --queued or --job was
# given, so those runs erase error and aborted jobs as well -- confirm intended.
aborted = error = true if aborted.nil? && error.nil?

# Compile the --search pattern once instead of once per job.
search_re = search ? Regexp.new(search) : nil

workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Fields embedded in the batch file; each is nil when its line is missing.
  cmd = command_txt[/#CMD: (.*)/, 1]
  exe = command_txt[/# Run command\n(.*?)\n/im, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]
  # Accept the assignment with or without a leading "export ".
  job_batch_system = command_txt[/^(?:export )?BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes: taken from the last line of job.status when that file exists
  # (field 5 for lsf, last field otherwise), else from the status table.
  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    begin
      fields = Open.read(fjob_status).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes && job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  # Seconds since the most recent write to std.out/std.err. Stays nil when
  # std.out does not exist; a missing std.err is simply ignored.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  if File.exist?(fout)
    last_write = [fout, ferr].select{|f| File.exist?(f)}.collect{|f| File.mtime(f)}.max
    time_diff = Time.now - last_write
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || queued || jobid
    select = false
    select = true if done && exit_status == 0
    select = true if error && exit_status && exit_status != 0
    select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
    is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
    select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
    select = true if jobid && jobid.split(",").include?(id)
    # cmd may be nil when the batch file has no #CMD line
    select = select && cmd.to_s.match(search_re) if search_re
    next unless select
  elsif search_re
    next unless cmd.to_s.match(search_re)
  end

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  # The "last update" note needs both a job id and a known log time.
  last_update = (id && time_diff) ? " (last update " + Misc.format_seconds(time_diff) + " ago)" : ""
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << last_update

  if options[:batch_parameters]
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
    when 'lsf'
      puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => command_txt).read.strip
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
|
|
212
|
+
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'rbbt-util'
|
|
4
|
+
require 'rbbt/util/simpleopt'
|
|
5
|
+
require 'rbbt/hpc'
|
|
6
|
+
|
|
7
|
+
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
|
8
|
+
|
|
9
|
+
# Command-line interface of the list command. The usage text below is handed
# to SOPT.setup, whose result is used as the options hash for the rest of the
# script.
# NOTE(review): the option name "sacct_peformance" is misspelled, but it is
# the key the script reads (options[:sacct_peformance]) and part of the
# command-line interface, so it is kept unchanged here.
options = SOPT.setup <<EOF

List jobs submitted to the batch system

$ rbbt slurm list [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-p--progress Report progress of job and the dependencies
-BP--batch_parameters show batch parameters
-BPP--batch_procpath show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

# Show the usage and stop. rbbt_usage is preferred when it is defined;
# otherwise fall back to the documentation kept by SOPT.
if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end
|
|
39
|
+
|
|
40
|
+
# Resolve the batch module from the requested system name ('auto' when the
# option was not given) and abort when nothing could be resolved.
requested_system = options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system(requested_system)

if HPC::BATCH_MODULE.nil?
  raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint requested_system}")
end

# From here on batch_system is the downcased last component of the resolved
# module name.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

# Filters and display switches, in the order they are unpacked below.
filter_values = options.values_at(:done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress)
done, error, running, queued, aborted, jobid, search, tail, progress = filter_values

# Directory scanned for command.batch files.
workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)
|
|
53
|
+
|
|
54
|
+
# Ask the batch module for its job status text and derive the list of job ids
# from the leading integer of every line. If anything fails, warn, remember
# the failure in $norunningjobs and continue with an empty list.
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map the first column of every status line to the comma-separated entries of
# its last column. The map is always a Hash (empty when the status text is not
# available) so that later lookups by job id return nil instead of raising.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty? # blank line: nothing to index
    job_nodes[parts.first] = parts.last.split(",")
  end
end
|
|
73
|
+
|
|
74
|
+
# Walk every job directory under workdir (oldest command.batch first), keep
# the jobs that match the requested filters and print a report for each one.
count = 0

# Compile the --search pattern once instead of once per job.
search_re = search ? Regexp.new(search) : nil

workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Fields embedded in the batch file; each is nil when its line is missing.
  cmd = command_txt[/#CMD: (.*)/, 1]
  manifest = command_txt[/#MANIFEST: (.*)/, 1]
  step_path = command_txt[/#STEP_PATH: (.*)/, 1]
  exe = command_txt[/#EXEC_CMD: (.*)/, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]
  # Accept the assignment with or without a leading "export ".
  job_batch_system = command_txt[/^(?:export )?BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes: taken from the last line of job.status when that file exists
  # (field 5 for lsf, last field otherwise), else from the status table
  # without the entries that contain "(".
  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    begin
      fields = Open.read(fjob_status).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes && job_nodes[id]
    nodes = job_nodes[id].reject{|n| n.include? "("}
  else
    nodes = []
  end

  # Reference time: ctime of exit.status when it exists, otherwise the
  # current time.
  now = File.exist?(fexit) ? File.ctime(fexit) : Time.now

  # time_diff: seconds between `now` and the latest mtime of std.out/std.err.
  # time_elapsed: seconds between `now` and their earliest ctime.
  # Both stay nil when std.out does not exist; a missing std.err is ignored.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  if File.exist?(fout)
    logs = [fout, ferr].select{|f| File.exist?(f)}
    time_diff = now - logs.collect{|f| File.mtime(f)}.max
    time_elapsed = now - logs.collect{|f| File.ctime(f)}.min
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || running || queued || jobid
    select = false
    select = true if done && exit_status == 0
    select = true if error && exit_status && exit_status != 0
    select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
    is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
    select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
    select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
    select = true if jobid && jobid.split(",").include?(id)
    # cmd may be nil when the batch file has no #CMD line
    select = select && cmd.to_s.match(search_re) if search_re
    next unless select
  elsif search_re
    next unless cmd.to_s.match(search_re)
  end

  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Started: ") << File.ctime(ferr).to_s if File.exist?(ferr)
  puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest)
  puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  if different_system
    puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
  else
    puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  end
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
  # The "last update" note is only shown for unfinished jobs that have an id
  # and a known log time.
  last_update = (id.nil? || File.exist?(fexit) || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)"
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << last_update

  if options[:batch_parameters]
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      text = CMD.cmd('grep "^#SBATCH" |tail -n +5', :in => command_txt).read.strip
    when 'lsf'
      text = CMD.cmd('grep "^#BSUB" |tail -n +5', :in => command_txt).read.strip
    else
      text = ""
    end
    lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
    puts Log.color :yellow, lines * "\n"
  end

  fprocpath = File.join(dir, 'procpath.sqlite3')
  if options[:batch_procpath] && Open.exists?(fprocpath)
    puts Log.color(:magenta, "Procpath summary: ")
    require 'rbbt/tsv/csv'
    meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
    perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))

    page_size = meta["page_size"].first.to_f
    clock_ticks = meta["clock_ticks"].first.to_f

    # cpu_average: pid => { timestamp => [utime + stime, ...] }
    # rss_average: timestamp => [rss * page_size, ...]
    cpu_average = {}
    rss_average = {}
    perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
      time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
      time = time.to_f

      cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
      cpu_average[stat_pid] ||= {}
      cpu_average[stat_pid][time] ||= []
      cpu_average[stat_pid][time] << cpu.to_f
      rss_average[time] ||= []
      rss_average[time] << rss.to_f * page_size
    end

    # Nothing to summarize when the record table yielded no rows.
    if rss_average.any?
      # Per pid, the increase of the summed CPU value between its first and
      # last sample.
      ticks = 0
      cpu_average.each do |stat_pid, cpu_average_pid|
        first_ts = cpu_average_pid.keys.min
        last_ts = cpu_average_pid.keys.max
        ticks += Misc.sum(cpu_average_pid[last_ts]) - Misc.sum(cpu_average_pid[first_ts])
      end
      start = rss_average.keys.min
      eend = rss_average.keys.max
      # Kept in its own variable so the job's time_elapsed is not overwritten.
      procpath_elapsed = eend - start
      ticks = 1 if ticks == 0
      procpath_elapsed = 1 if procpath_elapsed == 0
      puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / procpath_elapsed * 100)
      puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)})
      puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
    end
  end

  if options[:sacct_peformance]
    begin
      raise "sacct not supported for LSF" unless batch_system == 'slurm'
      # "\\." keeps the backslash in the shell command so grep matches a
      # literal dot before "batch".
      tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\\.batch'"), :header_hash => '', :sep => "|", :type => :list)
      values = tsv[tsv.keys.first]
      if values.compact.any?
        puts Log.color(:magenta, "SACCT performance: ")
        puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
      end
    rescue
      Log.warn $!.message
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd(" cat #{ferr} | grep -v '^[^\\s:]*\\[3.m' | tail -n #{tail.to_i} ").read
    end
  end

  if options[:progress]
    step_line = command_txt.split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
    if step_line
      require 'rbbt/workflow'
      step_path = step_line.split(": ").last.strip
      step = Step.new step_path
      step.load_dependencies_from_info
      # Report a progress line for every unfinished step that has a progress
      # file.
      (step.rec_dependencies + [step]).reverse.each do |j|
        next if j.done?
        next unless j.file(:progress).exists?
        bar = Log::ProgressBar.new
        bar.load(j.file(:progress).yaml)
        puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
      end
    end
  end

  count += 1

end

puts
puts "Found #{count} jobs"
|
|
311
|
+
|