rbbt-util 5.37.16 → 5.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,12 +31,12 @@ $ rbbt slurm list [options]
31
31
  EOF
32
32
 
33
33
  if options[:help]
34
- if defined? rbbt_usage
35
- rbbt_usage
36
- else
37
- puts SOPT.doc
38
- end
39
- exit 0
34
+ if defined? rbbt_usage
35
+ rbbt_usage
36
+ else
37
+ puts SOPT.doc
38
+ end
39
+ exit 0
40
40
  end
41
41
 
42
42
  batch_system = options.delete :batch_system
@@ -56,302 +56,302 @@ workdir = File.expand_path('~/rbbt-batch')
56
56
  Path.setup(workdir)
57
57
 
58
58
  running_jobs = begin
59
- squeue_txt = HPC::BATCH_MODULE.job_status
60
- squeue_txt.split("\n").collect{|l| l.to_i.to_s}
59
+ squeue_txt = HPC::BATCH_MODULE.job_status
60
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
61
61
  rescue
62
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
63
- squeue_txt = nil
64
- $norunningjobs = true
65
- []
62
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
63
+ squeue_txt = nil
64
+ $norunningjobs = true
65
+ []
66
66
  end
67
67
 
68
68
  if squeue_txt
69
- job_nodes = {}
70
- squeue_txt.split("\n").each do |line|
71
- parts = line.strip.split(/\s+/)
72
- job_nodes[parts.first] = parts.last.split(",")
73
- end
69
+ job_nodes = {}
70
+ squeue_txt.split("\n").each do |line|
71
+ parts = line.strip.split(/\s+/)
72
+ job_nodes[parts.first] = parts.last.split(",")
73
+ end
74
74
  else
75
- job_nodes = nil
75
+ job_nodes = nil
76
76
  end
77
77
 
78
78
  count = 0
79
79
  workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
80
- dir = File.dirname(fcmd)
81
- command_txt = Open.read(fcmd)
82
-
83
- if m = command_txt.match(/#CMD: (.*)/)
84
- cmd = m[1]
85
- else
86
- cmd = nil
87
- end
88
-
89
- if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
90
- job_batch_system = m[1].downcase
91
- else
92
- job_batch_system = nil
93
- end
94
-
95
- different_system = job_batch_system != batch_system
96
-
97
- if m = command_txt.match(/#MANIFEST: (.*)/)
98
- manifest = m[1]
99
- else
100
- manifest = nil
101
- end
102
-
103
- if m = command_txt.match(/#STEP_PATH: (.*)/)
104
- step_path = m[1]
105
- else
106
- step_path = nil
107
- end
108
-
109
- if m = command_txt.match(/#EXEC_CMD: (.*)/)
110
- exe = m[1]
111
- else
112
- exe = nil
113
- end
114
-
115
- if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
116
- container_home = m[1]
117
- else
118
- container_home = nil
119
- end
120
-
121
- if File.exist?(fid = File.join(dir, 'job.id'))
122
- id = Open.read(fid).chomp
123
- else
124
- id = nil
125
- end
126
-
127
- if File.exist?(fstatus = File.join(dir, 'exit.status'))
128
- exit_status = Open.read(fstatus).to_i
129
- else
130
- exit_status = nil
131
- end
132
-
133
- if File.exist?(fstatus = File.join(dir, 'job.status'))
134
- fstatus_txt = Open.read(fstatus)
135
- begin
136
- if job_batch_system == "lsf"
137
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
138
- else
139
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
140
- end
141
- rescue
142
- nodes = []
143
- end
144
- elsif job_nodes && job_nodes[id]
145
- nodes = job_nodes[id].reject{|n| n.include? "("}
146
- else
147
- nodes = []
148
- end
149
-
150
- if File.exist?(File.join(dir, 'exit.status'))
151
- now = File.ctime(File.join(dir, 'exit.status'))
152
- else
153
- now = Time.now
154
- end
155
-
156
- if File.exist?(File.join(dir, 'std.out'))
157
- cerrt = File.ctime File.join(dir, 'std.err')
158
- coutt = File.ctime File.join(dir, 'std.out')
159
- outt = File.mtime File.join(dir, 'std.out')
160
- errt = File.mtime File.join(dir, 'std.err')
161
- time_diff = now - [outt, errt].max
162
- time_elapsed = now - [cerrt, coutt].min
163
- end
164
-
165
- fdep = File.join(dir, 'dependencies.list')
166
- deps = Open.read(fdep).split("\n") if File.exist?(fdep)
167
-
168
- fcadep = File.join(dir, 'canfail_dependencies.list')
169
- cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
170
-
171
- if done || error || aborted || running || queued || jobid
172
- select = false
173
- select = true if done && exit_status == 0
174
- select = true if error && exit_status && exit_status != 0
175
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
176
- is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
177
- select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
178
- select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
179
- select = true if jobid && jobid.split(",").include?(id)
180
- select = select && step_path.match(/#{search}/) if search
181
- next unless select
182
- elsif search
183
- select = false
184
- select = true if search && cmd.match(/#{search}/)
185
- next unless select
186
- end
187
-
188
-
189
- count += 1
190
-
191
- if options[:compressed]
192
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
193
- if different_system
194
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
195
- else
196
- #status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
197
- status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
198
- (running_jobs.include?(id) || $norunningjobs ?
199
- (is_running ? Log.color(:green, id) : Log.color(:yellow, id) ) :
200
- Log.color(:red, id) )
201
- end
202
- prog_rep = []
203
- if options[:progress]
204
- step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
205
- if step_line
206
- require 'rbbt/workflow'
207
- step_path = step_line.split(": ").last.strip
208
- step = Step.new step_path
209
- step.load_dependencies_from_info
210
- has_bar = false
211
- [step].reverse.each do |j|
212
- next if j.done?
213
- if j.file(:progress).exists?
214
- bar = Log::ProgressBar.new
215
- bar.load(j.file(:progress).yaml)
216
- rep = bar.report_msg.split("·")[1]
217
- rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
218
- prog_rep << [rep]
219
- end
220
- end
221
- end
222
- end
223
- workflow, task, name = step_path.split("/")[-3..-1]
224
- job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
225
- puts [job_str, status, prog_rep ].flatten * " "
226
- next
227
- end
228
-
229
- puts Log.color :blue, dir
230
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
231
- puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
232
- puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
233
- puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
234
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
235
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
236
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
237
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
238
- if different_system
239
- puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
240
- else
241
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
242
- end
243
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
244
- puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
245
- puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
246
- puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
247
- puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
248
-
249
- if options[:batch_parameters]
250
- puts Log.color(:magenta, "BATCH parameters: ")
251
- case job_batch_system
252
- when 'slurm'
253
- text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
254
- when 'lsf'
255
- text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
256
- else
257
- text = ""
258
- end
259
- lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
260
- puts Log.color :yellow, lines * "\n"
261
- end
262
-
263
- fprocpath = File.join(dir, 'procpath.sqlite3')
264
- if options[:batch_procpath] && Open.exists?(fprocpath)
265
- puts Log.color(:magenta, "Procpath summary: ")
266
- require 'rbbt/tsv/csv'
267
- meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
268
- perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
269
-
270
- page_size = meta["page_size"].first.to_f
271
- clock_ticks = meta["clock_ticks"].first.to_f
272
-
273
- cpu_average = {}
274
- rss_average = {}
275
- perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
276
- time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
277
- time = time.to_f
278
-
279
- cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
280
- cpu_average[stat_pid] ||= {}
281
- cpu_average[stat_pid][time] ||= []
282
- cpu_average[stat_pid][time] << cpu.to_f
283
- rss_average[time] ||= []
284
- rss_average[time] << rss.to_f * page_size
285
- end
286
-
287
- ticks = 0
288
- cpu_average.each do |stat_pid, cpu_average_pid|
289
- start = cpu_average_pid.keys.sort.first
290
- eend = cpu_average_pid.keys.sort.last
291
- ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
292
- end
293
- start = rss_average.keys.sort.first
294
- eend = rss_average.keys.sort.last
295
- time_elapsed = eend - start
296
- ticks = 1 if ticks == 0
297
- time_elapsed = 1 if time_elapsed == 0
298
- puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
299
- puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
300
- puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
301
-
302
- end
303
-
304
- if options[:sacct_peformance]
305
- begin
306
- raise "sacct not supported for LSF" unless batch_system == 'slurm'
307
- tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
308
- values = tsv[tsv.keys.first]
309
- if values.compact.any?
310
- puts Log.color(:magenta, "SACCT performance: ")
311
- puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
312
- end
313
- rescue
314
- Log.warn $!.message
315
- end
316
- end
317
-
318
-
319
- if tail && File.exist?(File.join(dir, 'std.err'))
320
- if exit_status && exit_status != 0
321
- puts Log.color(:magenta, "First error or exception found: ")
322
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
323
- elsif exit_status
324
- puts Log.color(:magenta, "Completed jobs: ")
325
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
326
- else
327
- puts Log.color(:magenta, "Log tail: ")
328
- puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
329
- end
330
- end
331
-
332
- if options[:progress]
333
- step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
334
- if step_line
335
- require 'rbbt/workflow'
336
- step_path = step_line.split(": ").last.strip
337
- step = Step.new step_path
338
- step.load_dependencies_from_info
339
- has_bar = false
340
- (step.rec_dependencies + [step]).reverse.each do |j|
341
- next if j.done?
342
- if j.file(:progress).exists?
343
- bar = Log::ProgressBar.new
344
- bar.load(j.file(:progress).yaml)
345
- puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
346
- has_bar = true
347
- end
348
- end
349
- step_status = step.status
350
- step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
351
- step_status = Log.color :green, step_status if step_status.to_s == 'done'
352
- puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
353
- end
354
- end
80
+ dir = File.dirname(fcmd)
81
+ command_txt = Open.read(fcmd)
82
+
83
+ if m = command_txt.match(/#CMD: (.*)/)
84
+ cmd = m[1]
85
+ else
86
+ cmd = nil
87
+ end
88
+
89
+ if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
90
+ job_batch_system = m[1].downcase
91
+ else
92
+ job_batch_system = nil
93
+ end
94
+
95
+ different_system = job_batch_system != batch_system
96
+
97
+ if m = command_txt.match(/#MANIFEST: (.*)/)
98
+ manifest = m[1]
99
+ else
100
+ manifest = nil
101
+ end
102
+
103
+ if m = command_txt.match(/#STEP_PATH: (.*)/)
104
+ step_path = m[1]
105
+ else
106
+ step_path = nil
107
+ end
108
+
109
+ if m = command_txt.match(/#EXEC_CMD: (.*)/)
110
+ exe = m[1]
111
+ else
112
+ exe = nil
113
+ end
114
+
115
+ if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
116
+ container_home = m[1]
117
+ else
118
+ container_home = nil
119
+ end
120
+
121
+ if File.exist?(fid = File.join(dir, 'job.id'))
122
+ id = Open.read(fid).chomp
123
+ else
124
+ id = nil
125
+ end
126
+
127
+ if File.exist?(fstatus = File.join(dir, 'exit.status'))
128
+ exit_status = Open.read(fstatus).to_i
129
+ else
130
+ exit_status = nil
131
+ end
132
+
133
+ if File.exist?(fstatus = File.join(dir, 'job.status'))
134
+ fstatus_txt = Open.read(fstatus)
135
+ begin
136
+ if job_batch_system == "lsf"
137
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
138
+ else
139
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
140
+ end
141
+ rescue
142
+ nodes = []
143
+ end
144
+ elsif job_nodes && job_nodes[id]
145
+ nodes = job_nodes[id].reject{|n| n.include? "("}
146
+ else
147
+ nodes = []
148
+ end
149
+
150
+ if File.exist?(File.join(dir, 'exit.status'))
151
+ now = File.ctime(File.join(dir, 'exit.status'))
152
+ else
153
+ now = Time.now
154
+ end
155
+
156
+ if File.exist?(File.join(dir, 'std.out'))
157
+ cerrt = File.ctime File.join(dir, 'std.err')
158
+ coutt = File.ctime File.join(dir, 'std.out')
159
+ outt = File.mtime File.join(dir, 'std.out')
160
+ errt = File.mtime File.join(dir, 'std.err')
161
+ time_diff = now - [outt, errt].max
162
+ time_elapsed = now - [cerrt, coutt].min
163
+ end
164
+
165
+ fdep = File.join(dir, 'dependencies.list')
166
+ deps = Open.read(fdep).split("\n") if File.exist?(fdep)
167
+
168
+ fcadep = File.join(dir, 'canfail_dependencies.list')
169
+ cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
170
+
171
+ is_running = exit_status.nil? && ( (running_jobs.include?(id) && (deps.nil? || (running_jobs & deps).empty?)) || different_system )
172
+ if done || error || aborted || running || queued || jobid
173
+ select = false
174
+ select = true if done && exit_status == 0
175
+ select = true if error && exit_status && exit_status != 0
176
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
177
+ select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
178
+ select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
179
+ select = true if jobid && jobid.split(",").include?(id)
180
+ select = select && step_path.match(/#{search}/) if search
181
+ next unless select
182
+ elsif search
183
+ select = false
184
+ select = true if search && cmd.match(/#{search}/)
185
+ next unless select
186
+ end
187
+
188
+
189
+ count += 1
190
+
191
+ if options[:compressed]
192
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
193
+ if different_system
194
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
195
+ else
196
+ #status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
197
+ status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
198
+ (running_jobs.include?(id) || $norunningjobs ?
199
+ (is_running ? Log.color(:cyan, id) : Log.color(:yellow, id) ) :
200
+ Log.color(:red, id) )
201
+ end
202
+ prog_rep = []
203
+ if options[:progress]
204
+ step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
205
+ if step_line
206
+ require 'rbbt/workflow'
207
+ step_path = step_line.split(": ").last.strip
208
+ step = Step.new step_path
209
+ step.load_dependencies_from_info
210
+ has_bar = false
211
+ [step].reverse.each do |j|
212
+ next if j.done?
213
+ if j.file(:progress).exists?
214
+ bar = Log::ProgressBar.new
215
+ bar.load(j.file(:progress).yaml)
216
+ rep = bar.report_msg.split("·")[1]
217
+ rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
218
+ prog_rep << [rep]
219
+ end
220
+ end
221
+ end
222
+ end
223
+ workflow, task, name = step_path.split("/")[-3..-1]
224
+ job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
225
+ puts [job_str, status, prog_rep ].flatten * " "
226
+ next
227
+ end
228
+
229
+ puts Log.color :blue, dir
230
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
231
+ puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
232
+ puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
233
+ puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
234
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
235
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
236
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
237
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
238
+ if different_system
239
+ puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
240
+ else
241
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
242
+ end
243
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
244
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
245
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
246
+ puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
247
+ puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
248
+
249
+ if options[:batch_parameters]
250
+ puts Log.color(:magenta, "BATCH parameters: ")
251
+ case job_batch_system
252
+ when 'slurm'
253
+ text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
254
+ when 'lsf'
255
+ text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
256
+ else
257
+ text = ""
258
+ end
259
+ lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
260
+ puts Log.color :yellow, lines * "\n"
261
+ end
262
+
263
+ fprocpath = File.join(dir, 'procpath.sqlite3')
264
+ if options[:batch_procpath] && Open.exists?(fprocpath)
265
+ puts Log.color(:magenta, "Procpath summary: ")
266
+ require 'rbbt/tsv/csv'
267
+ meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
268
+ perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
269
+
270
+ page_size = meta["page_size"].first.to_f
271
+ clock_ticks = meta["clock_ticks"].first.to_f
272
+
273
+ cpu_average = {}
274
+ rss_average = {}
275
+ perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
276
+ time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
277
+ time = time.to_f
278
+
279
+ cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
280
+ cpu_average[stat_pid] ||= {}
281
+ cpu_average[stat_pid][time] ||= []
282
+ cpu_average[stat_pid][time] << cpu.to_f
283
+ rss_average[time] ||= []
284
+ rss_average[time] << rss.to_f * page_size
285
+ end
286
+
287
+ ticks = 0
288
+ cpu_average.each do |stat_pid, cpu_average_pid|
289
+ start = cpu_average_pid.keys.sort.first
290
+ eend = cpu_average_pid.keys.sort.last
291
+ ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
292
+ end
293
+ start = rss_average.keys.sort.first
294
+ eend = rss_average.keys.sort.last
295
+ time_elapsed = eend - start
296
+ ticks = 1 if ticks == 0
297
+ time_elapsed = 1 if time_elapsed == 0
298
+ puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
299
+ puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
300
+ puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
301
+
302
+ end
303
+
304
+ if options[:sacct_peformance]
305
+ begin
306
+ raise "sacct not supported for LSF" unless batch_system == 'slurm'
307
+ tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
308
+ values = tsv[tsv.keys.first]
309
+ if values.compact.any?
310
+ puts Log.color(:magenta, "SACCT performance: ")
311
+ puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
312
+ end
313
+ rescue
314
+ Log.warn $!.message
315
+ end
316
+ end
317
+
318
+
319
+ if tail && File.exist?(File.join(dir, 'std.err'))
320
+ if exit_status && exit_status != 0
321
+ puts Log.color(:magenta, "First error or exception found: ")
322
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
323
+ elsif exit_status
324
+ puts Log.color(:magenta, "Completed jobs: ")
325
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
326
+ else
327
+ puts Log.color(:magenta, "Log tail: ")
328
+ puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
329
+ end
330
+ end
331
+
332
+ if options[:progress]
333
+ step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
334
+ if step_line
335
+ require 'rbbt/workflow'
336
+ step_path = step_line.split(": ").last.strip
337
+ step = Step.new step_path
338
+ step.load_dependencies_from_info
339
+ has_bar = false
340
+ (step.rec_dependencies + [step]).reverse.each do |j|
341
+ next if j.done?
342
+ if j.file(:progress).exists?
343
+ bar = Log::ProgressBar.new
344
+ bar.load(j.file(:progress).yaml)
345
+ puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
346
+ has_bar = true
347
+ end
348
+ end
349
+ step_status = step.status
350
+ step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
351
+ step_status = Log.color :green, step_status if step_status.to_s == 'done'
352
+ puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
353
+ end
354
+ end
355
355
 
356
356
  end
357
357