rbbt-util 5.37.16 → 5.38.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,12 +31,12 @@ $ rbbt slurm list [options]
31
31
  EOF
32
32
 
33
33
  if options[:help]
34
- if defined? rbbt_usage
35
- rbbt_usage
36
- else
37
- puts SOPT.doc
38
- end
39
- exit 0
34
+ if defined? rbbt_usage
35
+ rbbt_usage
36
+ else
37
+ puts SOPT.doc
38
+ end
39
+ exit 0
40
40
  end
41
41
 
42
42
  batch_system = options.delete :batch_system
@@ -56,302 +56,302 @@ workdir = File.expand_path('~/rbbt-batch')
56
56
  Path.setup(workdir)
57
57
 
58
58
  running_jobs = begin
59
- squeue_txt = HPC::BATCH_MODULE.job_status
60
- squeue_txt.split("\n").collect{|l| l.to_i.to_s}
59
+ squeue_txt = HPC::BATCH_MODULE.job_status
60
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
61
61
  rescue
62
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
63
- squeue_txt = nil
64
- $norunningjobs = true
65
- []
62
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
63
+ squeue_txt = nil
64
+ $norunningjobs = true
65
+ []
66
66
  end
67
67
 
68
68
  if squeue_txt
69
- job_nodes = {}
70
- squeue_txt.split("\n").each do |line|
71
- parts = line.strip.split(/\s+/)
72
- job_nodes[parts.first] = parts.last.split(",")
73
- end
69
+ job_nodes = {}
70
+ squeue_txt.split("\n").each do |line|
71
+ parts = line.strip.split(/\s+/)
72
+ job_nodes[parts.first] = parts.last.split(",")
73
+ end
74
74
  else
75
- job_nodes = nil
75
+ job_nodes = nil
76
76
  end
77
77
 
78
78
  count = 0
79
79
  workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
80
- dir = File.dirname(fcmd)
81
- command_txt = Open.read(fcmd)
82
-
83
- if m = command_txt.match(/#CMD: (.*)/)
84
- cmd = m[1]
85
- else
86
- cmd = nil
87
- end
88
-
89
- if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
90
- job_batch_system = m[1].downcase
91
- else
92
- job_batch_system = nil
93
- end
94
-
95
- different_system = job_batch_system != batch_system
96
-
97
- if m = command_txt.match(/#MANIFEST: (.*)/)
98
- manifest = m[1]
99
- else
100
- manifest = nil
101
- end
102
-
103
- if m = command_txt.match(/#STEP_PATH: (.*)/)
104
- step_path = m[1]
105
- else
106
- step_path = nil
107
- end
108
-
109
- if m = command_txt.match(/#EXEC_CMD: (.*)/)
110
- exe = m[1]
111
- else
112
- exe = nil
113
- end
114
-
115
- if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
116
- container_home = m[1]
117
- else
118
- container_home = nil
119
- end
120
-
121
- if File.exist?(fid = File.join(dir, 'job.id'))
122
- id = Open.read(fid).chomp
123
- else
124
- id = nil
125
- end
126
-
127
- if File.exist?(fstatus = File.join(dir, 'exit.status'))
128
- exit_status = Open.read(fstatus).to_i
129
- else
130
- exit_status = nil
131
- end
132
-
133
- if File.exist?(fstatus = File.join(dir, 'job.status'))
134
- fstatus_txt = Open.read(fstatus)
135
- begin
136
- if job_batch_system == "lsf"
137
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
138
- else
139
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
140
- end
141
- rescue
142
- nodes = []
143
- end
144
- elsif job_nodes && job_nodes[id]
145
- nodes = job_nodes[id].reject{|n| n.include? "("}
146
- else
147
- nodes = []
148
- end
149
-
150
- if File.exist?(File.join(dir, 'exit.status'))
151
- now = File.ctime(File.join(dir, 'exit.status'))
152
- else
153
- now = Time.now
154
- end
155
-
156
- if File.exist?(File.join(dir, 'std.out'))
157
- cerrt = File.ctime File.join(dir, 'std.err')
158
- coutt = File.ctime File.join(dir, 'std.out')
159
- outt = File.mtime File.join(dir, 'std.out')
160
- errt = File.mtime File.join(dir, 'std.err')
161
- time_diff = now - [outt, errt].max
162
- time_elapsed = now - [cerrt, coutt].min
163
- end
164
-
165
- fdep = File.join(dir, 'dependencies.list')
166
- deps = Open.read(fdep).split("\n") if File.exist?(fdep)
167
-
168
- fcadep = File.join(dir, 'canfail_dependencies.list')
169
- cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
170
-
171
- if done || error || aborted || running || queued || jobid
172
- select = false
173
- select = true if done && exit_status == 0
174
- select = true if error && exit_status && exit_status != 0
175
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
176
- is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
177
- select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
178
- select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
179
- select = true if jobid && jobid.split(",").include?(id)
180
- select = select && step_path.match(/#{search}/) if search
181
- next unless select
182
- elsif search
183
- select = false
184
- select = true if search && cmd.match(/#{search}/)
185
- next unless select
186
- end
187
-
188
-
189
- count += 1
190
-
191
- if options[:compressed]
192
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
193
- if different_system
194
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
195
- else
196
- #status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
197
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
198
- (running_jobs.include?(id) || $norunningjobs ?
199
- (is_running ? Log.color(:green, id) : Log.color(:yellow, id) ) :
200
- Log.color(:red, id) )
201
- end
202
- prog_rep = []
203
- if options[:progress]
204
- step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
205
- if step_line
206
- require 'rbbt/workflow'
207
- step_path = step_line.split(": ").last.strip
208
- step = Step.new step_path
209
- step.load_dependencies_from_info
210
- has_bar = false
211
- [step].reverse.each do |j|
212
- next if j.done?
213
- if j.file(:progress).exists?
214
- bar = Log::ProgressBar.new
215
- bar.load(j.file(:progress).yaml)
216
- rep = bar.report_msg.split("·")[1]
217
- rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
218
- prog_rep << [rep]
219
- end
220
- end
221
- end
222
- end
223
- workflow, task, name = step_path.split("/")[-3..-1]
224
- job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
225
- puts [job_str, status, prog_rep ].flatten * " "
226
- next
227
- end
228
-
229
- puts Log.color :blue, dir
230
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
231
- puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
232
- puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
233
- puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
234
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
235
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
236
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
237
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
238
- if different_system
239
- puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
240
- else
241
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
242
- end
243
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
244
- puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
245
- puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
246
- puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
247
- puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
248
-
249
- if options[:batch_parameters]
250
- puts Log.color(:magenta, "BATCH parameters: ")
251
- case job_batch_system
252
- when 'slurm'
253
- text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
254
- when 'lsf'
255
- text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
256
- else
257
- text = ""
258
- end
259
- lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
260
- puts Log.color :yellow, lines * "\n"
261
- end
262
-
263
- fprocpath = File.join(dir, 'procpath.sqlite3')
264
- if options[:batch_procpath] && Open.exists?(fprocpath)
265
- puts Log.color(:magenta, "Procpath summary: ")
266
- require 'rbbt/tsv/csv'
267
- meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
268
- perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
269
-
270
- page_size = meta["page_size"].first.to_f
271
- clock_ticks = meta["clock_ticks"].first.to_f
272
-
273
- cpu_average = {}
274
- rss_average = {}
275
- perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
276
- time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
277
- time = time.to_f
278
-
279
- cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
280
- cpu_average[stat_pid] ||= {}
281
- cpu_average[stat_pid][time] ||= []
282
- cpu_average[stat_pid][time] << cpu.to_f
283
- rss_average[time] ||= []
284
- rss_average[time] << rss.to_f * page_size
285
- end
286
-
287
- ticks = 0
288
- cpu_average.each do |stat_pid, cpu_average_pid|
289
- start = cpu_average_pid.keys.sort.first
290
- eend = cpu_average_pid.keys.sort.last
291
- ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
292
- end
293
- start = rss_average.keys.sort.first
294
- eend = rss_average.keys.sort.last
295
- time_elapsed = eend - start
296
- ticks = 1 if ticks == 0
297
- time_elapsed = 1 if time_elapsed == 0
298
- puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
299
- puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
300
- puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
301
-
302
- end
303
-
304
- if options[:sacct_peformance]
305
- begin
306
- raise "sacct not supported for LSF" unless batch_system == 'slurm'
307
- tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
308
- values = tsv[tsv.keys.first]
309
- if values.compact.any?
310
- puts Log.color(:magenta, "SACCT performance: ")
311
- puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
312
- end
313
- rescue
314
- Log.warn $!.message
315
- end
316
- end
317
-
318
-
319
- if tail && File.exist?(File.join(dir, 'std.err'))
320
- if exit_status && exit_status != 0
321
- puts Log.color(:magenta, "First error or exception found: ")
322
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
323
- elsif exit_status
324
- puts Log.color(:magenta, "Completed jobs: ")
325
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
326
- else
327
- puts Log.color(:magenta, "Log tail: ")
328
- puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
329
- end
330
- end
331
-
332
- if options[:progress]
333
- step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
334
- if step_line
335
- require 'rbbt/workflow'
336
- step_path = step_line.split(": ").last.strip
337
- step = Step.new step_path
338
- step.load_dependencies_from_info
339
- has_bar = false
340
- (step.rec_dependencies + [step]).reverse.each do |j|
341
- next if j.done?
342
- if j.file(:progress).exists?
343
- bar = Log::ProgressBar.new
344
- bar.load(j.file(:progress).yaml)
345
- puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
346
- has_bar = true
347
- end
348
- end
349
- step_status = step.status
350
- step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
351
- step_status = Log.color :green, step_status if step_status.to_s == 'done'
352
- puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
353
- end
354
- end
80
+ dir = File.dirname(fcmd)
81
+ command_txt = Open.read(fcmd)
82
+
83
+ if m = command_txt.match(/#CMD: (.*)/)
84
+ cmd = m[1]
85
+ else
86
+ cmd = nil
87
+ end
88
+
89
+ if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
90
+ job_batch_system = m[1].downcase
91
+ else
92
+ job_batch_system = nil
93
+ end
94
+
95
+ different_system = job_batch_system != batch_system
96
+
97
+ if m = command_txt.match(/#MANIFEST: (.*)/)
98
+ manifest = m[1]
99
+ else
100
+ manifest = nil
101
+ end
102
+
103
+ if m = command_txt.match(/#STEP_PATH: (.*)/)
104
+ step_path = m[1]
105
+ else
106
+ step_path = nil
107
+ end
108
+
109
+ if m = command_txt.match(/#EXEC_CMD: (.*)/)
110
+ exe = m[1]
111
+ else
112
+ exe = nil
113
+ end
114
+
115
+ if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
116
+ container_home = m[1]
117
+ else
118
+ container_home = nil
119
+ end
120
+
121
+ if File.exist?(fid = File.join(dir, 'job.id'))
122
+ id = Open.read(fid).chomp
123
+ else
124
+ id = nil
125
+ end
126
+
127
+ if File.exist?(fstatus = File.join(dir, 'exit.status'))
128
+ exit_status = Open.read(fstatus).to_i
129
+ else
130
+ exit_status = nil
131
+ end
132
+
133
+ if File.exist?(fstatus = File.join(dir, 'job.status'))
134
+ fstatus_txt = Open.read(fstatus)
135
+ begin
136
+ if job_batch_system == "lsf"
137
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
138
+ else
139
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
140
+ end
141
+ rescue
142
+ nodes = []
143
+ end
144
+ elsif job_nodes && job_nodes[id]
145
+ nodes = job_nodes[id].reject{|n| n.include? "("}
146
+ else
147
+ nodes = []
148
+ end
149
+
150
+ if File.exist?(File.join(dir, 'exit.status'))
151
+ now = File.ctime(File.join(dir, 'exit.status'))
152
+ else
153
+ now = Time.now
154
+ end
155
+
156
+ if File.exist?(File.join(dir, 'std.out'))
157
+ cerrt = File.ctime File.join(dir, 'std.err')
158
+ coutt = File.ctime File.join(dir, 'std.out')
159
+ outt = File.mtime File.join(dir, 'std.out')
160
+ errt = File.mtime File.join(dir, 'std.err')
161
+ time_diff = now - [outt, errt].max
162
+ time_elapsed = now - [cerrt, coutt].min
163
+ end
164
+
165
+ fdep = File.join(dir, 'dependencies.list')
166
+ deps = Open.read(fdep).split("\n") if File.exist?(fdep)
167
+
168
+ fcadep = File.join(dir, 'canfail_dependencies.list')
169
+ cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
170
+
171
+ is_running = exit_status.nil? && ( (running_jobs.include?(id) && (deps.nil? || (running_jobs & deps).empty?)) || different_system )
172
+ if done || error || aborted || running || queued || jobid
173
+ select = false
174
+ select = true if done && exit_status == 0
175
+ select = true if error && exit_status && exit_status != 0
176
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
177
+ select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
178
+ select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
179
+ select = true if jobid && jobid.split(",").include?(id)
180
+ select = select && step_path.match(/#{search}/) if search
181
+ next unless select
182
+ elsif search
183
+ select = false
184
+ select = true if search && cmd.match(/#{search}/)
185
+ next unless select
186
+ end
187
+
188
+
189
+ count += 1
190
+
191
+ if options[:compressed]
192
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
193
+ if different_system
194
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
195
+ else
196
+ #status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
197
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
198
+ (running_jobs.include?(id) || $norunningjobs ?
199
+ (is_running ? Log.color(:cyan, id) : Log.color(:yellow, id) ) :
200
+ Log.color(:red, id) )
201
+ end
202
+ prog_rep = []
203
+ if options[:progress]
204
+ step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
205
+ if step_line
206
+ require 'rbbt/workflow'
207
+ step_path = step_line.split(": ").last.strip
208
+ step = Step.new step_path
209
+ step.load_dependencies_from_info
210
+ has_bar = false
211
+ [step].reverse.each do |j|
212
+ next if j.done?
213
+ if j.file(:progress).exists?
214
+ bar = Log::ProgressBar.new
215
+ bar.load(j.file(:progress).yaml)
216
+ rep = bar.report_msg.split("·")[1]
217
+ rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
218
+ prog_rep << [rep]
219
+ end
220
+ end
221
+ end
222
+ end
223
+ workflow, task, name = step_path.split("/")[-3..-1]
224
+ job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
225
+ puts [job_str, status, prog_rep ].flatten * " "
226
+ next
227
+ end
228
+
229
+ puts Log.color :blue, dir
230
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
231
+ puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
232
+ puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
233
+ puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
234
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
235
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
236
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
237
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
238
+ if different_system
239
+ puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
240
+ else
241
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
242
+ end
243
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
244
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
245
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
246
+ puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
247
+ puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
248
+
249
+ if options[:batch_parameters]
250
+ puts Log.color(:magenta, "BATCH parameters: ")
251
+ case job_batch_system
252
+ when 'slurm'
253
+ text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
254
+ when 'lsf'
255
+ text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
256
+ else
257
+ text = ""
258
+ end
259
+ lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
260
+ puts Log.color :yellow, lines * "\n"
261
+ end
262
+
263
+ fprocpath = File.join(dir, 'procpath.sqlite3')
264
+ if options[:batch_procpath] && Open.exists?(fprocpath)
265
+ puts Log.color(:magenta, "Procpath summary: ")
266
+ require 'rbbt/tsv/csv'
267
+ meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
268
+ perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
269
+
270
+ page_size = meta["page_size"].first.to_f
271
+ clock_ticks = meta["clock_ticks"].first.to_f
272
+
273
+ cpu_average = {}
274
+ rss_average = {}
275
+ perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
276
+ time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
277
+ time = time.to_f
278
+
279
+ cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
280
+ cpu_average[stat_pid] ||= {}
281
+ cpu_average[stat_pid][time] ||= []
282
+ cpu_average[stat_pid][time] << cpu.to_f
283
+ rss_average[time] ||= []
284
+ rss_average[time] << rss.to_f * page_size
285
+ end
286
+
287
+ ticks = 0
288
+ cpu_average.each do |stat_pid, cpu_average_pid|
289
+ start = cpu_average_pid.keys.sort.first
290
+ eend = cpu_average_pid.keys.sort.last
291
+ ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
292
+ end
293
+ start = rss_average.keys.sort.first
294
+ eend = rss_average.keys.sort.last
295
+ time_elapsed = eend - start
296
+ ticks = 1 if ticks == 0
297
+ time_elapsed = 1 if time_elapsed == 0
298
+ puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
299
+ puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
300
+ puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
301
+
302
+ end
303
+
304
+ if options[:sacct_peformance]
305
+ begin
306
+ raise "sacct not supported for LSF" unless batch_system == 'slurm'
307
+ tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
308
+ values = tsv[tsv.keys.first]
309
+ if values.compact.any?
310
+ puts Log.color(:magenta, "SACCT performance: ")
311
+ puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
312
+ end
313
+ rescue
314
+ Log.warn $!.message
315
+ end
316
+ end
317
+
318
+
319
+ if tail && File.exist?(File.join(dir, 'std.err'))
320
+ if exit_status && exit_status != 0
321
+ puts Log.color(:magenta, "First error or exception found: ")
322
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
323
+ elsif exit_status
324
+ puts Log.color(:magenta, "Completed jobs: ")
325
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
326
+ else
327
+ puts Log.color(:magenta, "Log tail: ")
328
+ puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
329
+ end
330
+ end
331
+
332
+ if options[:progress]
333
+ step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
334
+ if step_line
335
+ require 'rbbt/workflow'
336
+ step_path = step_line.split(": ").last.strip
337
+ step = Step.new step_path
338
+ step.load_dependencies_from_info
339
+ has_bar = false
340
+ (step.rec_dependencies + [step]).reverse.each do |j|
341
+ next if j.done?
342
+ if j.file(:progress).exists?
343
+ bar = Log::ProgressBar.new
344
+ bar.load(j.file(:progress).yaml)
345
+ puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
346
+ has_bar = true
347
+ end
348
+ end
349
+ step_status = step.status
350
+ step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
351
+ step_status = Log.color :green, step_status if step_status.to_s == 'done'
352
+ puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
353
+ end
354
+ end
355
355
 
356
356
  end
357
357