rbbt-util 5.32.7 → 5.32.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/bin/rbbt +1 -0
  3. data/lib/rbbt/hpc/batch.rb +23 -7
  4. data/lib/rbbt/hpc/slurm.rb +29 -10
  5. data/lib/rbbt/persist/tsv/adapter.rb +1 -5
  6. data/lib/rbbt/resource.rb +22 -9
  7. data/lib/rbbt/tsv/csv.rb +2 -2
  8. data/lib/rbbt/tsv/manipulate.rb +2 -0
  9. data/lib/rbbt/util/R.rb +2 -2
  10. data/lib/rbbt/util/cmd.rb +39 -18
  11. data/lib/rbbt/util/log/progress/report.rb +20 -17
  12. data/lib/rbbt/util/python.rb +24 -3
  13. data/lib/rbbt/util/simpleDSL.rb +4 -4
  14. data/lib/rbbt/workflow.rb +20 -2
  15. data/lib/rbbt/workflow/step.rb +37 -6
  16. data/lib/rbbt/workflow/step/accessor.rb +2 -2
  17. data/lib/rbbt/workflow/util/data.rb +31 -0
  18. data/lib/rbbt/workflow/util/trace.rb +2 -1
  19. data/python/rbbt.py +3 -0
  20. data/share/install/software/lib/install_helpers +1 -1
  21. data/share/rbbt_commands/hpc/list +11 -7
  22. data/share/rbbt_commands/hpc/orchestrate +6 -1
  23. data/share/rbbt_commands/hpc/task +6 -1
  24. data/share/rbbt_commands/lsf/clean +212 -0
  25. data/share/rbbt_commands/lsf/list +315 -0
  26. data/share/rbbt_commands/lsf/orchestrate +61 -0
  27. data/share/rbbt_commands/lsf/tail +55 -0
  28. data/share/rbbt_commands/lsf/task +60 -0
  29. data/share/rbbt_commands/slurm/clean +212 -0
  30. data/share/rbbt_commands/slurm/list +315 -0
  31. data/share/rbbt_commands/slurm/orchestrate +61 -0
  32. data/share/rbbt_commands/slurm/tail +55 -0
  33. data/share/rbbt_commands/slurm/task +60 -0
  34. data/share/rbbt_commands/workflow/forget_deps +5 -4
  35. data/test/rbbt/util/test_python.rb +3 -2
  36. data/test/rbbt/util/test_simpleDSL.rb +3 -3
  37. data/test/rbbt/workflow/util/test_data.rb +35 -0
  38. metadata +97 -84
@@ -0,0 +1,212 @@
#!/usr/bin/env ruby

# rbbt slurm clean
#
# Walks every `command.batch` file below ~/rbbt-batch, decides from the files
# stored next to it (job.id, exit.status, job.status, dependencies.list, ...)
# and from the scheduler's current job listing whether the job matches the
# requested selectors, prints a summary of each matching job and erases its
# directory (unless --dry_run is given).

require 'rbbt-util'
require 'rbbt/util/simpleopt'
require 'rbbt/hpc'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt slurm clean [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aboted jobs
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-BP--batch_parameters show batch parameters
-dr--dry_run Do not erase anything
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

batch_system = options.delete :batch_system
batch_system ||= 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Replace the requested name (possibly 'auto') by the name of the module that
# was actually selected, so it can be compared with the BATCH_SYSTEM recorded
# in each command.batch file.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

Log.severity = 4
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run

# With no explicit selector the command cleans error and aborted jobs. The
# default is only applied when no selector at all was given, so that e.g.
# --done alone does not also erase error and aborted jobs.
aborted = error = true unless done || error || aborted || queued || jobid

# Compile the search expression once instead of once per job.
search_re = search ? Regexp.new(search) : nil

workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)

# Ids of the jobs the scheduler currently reports (first column of each line
# of the status listing, normalized through to_i.to_s).
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map job id => nodes, taken from the first and last column of the listing.
# It stays empty (rather than nil) when the scheduler could not be queried.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Metadata embedded in the batch script; each value is nil when absent.
  cmd = command_txt[/#CMD: (.*)/, 1]
  exe = command_txt[/# Run command\n(.*?)\n/im, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]

  # Accept the variable with or without a leading `export`.
  job_batch_system = command_txt[/^(?:export )?BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes come from the last line of job.status when present (column 5 for
  # LSF, last column otherwise), else from the live scheduler listing.
  fstatus = File.join(dir, 'job.status')
  if File.exist?(fstatus)
    begin
      fields = Open.read(fstatus).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  # Seconds since the most recent write to std.out / std.err; nil when
  # std.out does not exist.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    stamps = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }
    time_diff = Time.now - stamps.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  # Selection: any matching selector marks the job; --search then narrows it.
  select = false
  select = true if done && exit_status == 0
  select = true if error && exit_status && exit_status != 0
  # If the scheduler could not be queried every job is treated as alive, so
  # nothing is erased as "aborted" on the basis of an empty job listing.
  select = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
  is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
  select = true if queued && ( (deps && (running_jobs & deps).any?) || (is_running && nodes.empty?) )
  select = true if jobid && jobid.split(",").include?(id)
  # A job without a #CMD line can never match a search expression.
  select = select && cmd && cmd.match(search_re) if search_re
  next unless select

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << (id.nil? || time_diff.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")

  if batch_parameters
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
    when 'lsf'
      puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => command_txt).read.strip
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
@@ -0,0 +1,315 @@
#!/usr/bin/env ruby

# rbbt slurm list
#
# Walks every `command.batch` file below ~/rbbt-batch and prints a summary of
# each job that matches the requested selectors: timestamps, command, job id
# and state, dependencies, nodes and, optionally, batch parameters, procpath
# and sacct performance summaries, a tail of std.err and workflow progress.
# Nothing is modified on disk.

require 'rbbt-util'
require 'rbbt/util/simpleopt'
require 'rbbt/hpc'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

List batch jobs and their status

$ rbbt slurm list [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aboted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-p--progress Report progress of job and the dependencies
-BP--batch_parameters show batch parameters
-BPP--batch_procpath show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

batch_system = options.delete :batch_system
batch_system ||= 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Replace the requested name (possibly 'auto') by the name of the module that
# was actually selected, so it can be compared with the BATCH_SYSTEM recorded
# in each command.batch file.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress

# Compile the search expression once instead of once per job.
search_re = search ? Regexp.new(search) : nil

workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)

# Ids of the jobs the scheduler currently reports (first column of each line
# of the status listing, normalized through to_i.to_s).
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map job id => nodes, taken from the first and last column of the listing.
# It stays empty (rather than nil) when the scheduler could not be queried.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Metadata embedded in the batch script; each value is nil when absent.
  cmd = command_txt[/#CMD: (.*)/, 1]
  manifest = command_txt[/#MANIFEST: (.*)/, 1]
  step_path = command_txt[/#STEP_PATH: (.*)/, 1]
  exe = command_txt[/#EXEC_CMD: (.*)/, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]

  # Accept the variable with or without a leading `export`.
  job_batch_system = command_txt[/^(?:export )?BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes come from the last line of job.status when present (column 5 for
  # LSF, last column otherwise), else from the live scheduler listing, where
  # entries containing "(" are dropped.
  fstatus = File.join(dir, 'job.status')
  if File.exist?(fstatus)
    begin
      fields = Open.read(fstatus).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes[id]
    nodes = job_nodes[id].reject{|n| n.include? "("}
  else
    nodes = []
  end

  # For finished jobs times are measured up to the ctime of exit.status,
  # otherwise up to the present.
  now = File.exist?(fexit) ? File.ctime(fexit) : Time.now

  # time_diff: seconds since the last write to std.out / std.err.
  # time_elapsed: seconds since the earliest ctime of those files.
  # Both stay nil when std.out does not exist.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = time_elapsed = nil
  if File.exist?(fout)
    logs = [fout, ferr].select{|f| File.exist?(f) }
    time_diff = now - logs.collect{|f| File.mtime(f) }.max
    time_elapsed = now - logs.collect{|f| File.ctime(f) }.min
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  # State selectors: when at least one is given the job must match one of them.
  if done || error || aborted || running || queued || jobid
    select = false
    select = true if done && exit_status == 0
    select = true if error && exit_status && exit_status != 0
    select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
    is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
    select = true if queued && ( (deps && (running_jobs & deps).any?) || (is_running && nodes.empty?) )
    select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
    select = true if jobid && jobid.split(",").include?(id)
    next unless select
  end

  # --search narrows the listing; it is matched against the step path and the
  # command line, whichever is available, regardless of other selectors.
  if search_re
    next unless [step_path, cmd].compact.any?{|text| text.match(search_re) }
  end

  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Started: ") << File.ctime(ferr).to_s if File.exist?(ferr)
  puts Log.color(:magenta, "Manifest: ") << (manifest ? Log.color(:yellow, manifest) : "Missing")
  puts Log.color(:magenta, "Step path: ") << (step_path ? Log.color(:yellow, step_path) : "Missing")
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  if different_system
    # The scheduler listing cannot tell us about jobs of another batch system,
    # so an unfinished job is always shown in green.
    puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system.to_s)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
  else
    puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  end
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << (id.nil? || time_diff.nil? || File.exist?(fexit) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")

  if options[:batch_parameters]
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      text = CMD.cmd('grep "^#SBATCH" ', :in => command_txt).read.strip
    when 'lsf'
      text = CMD.cmd('grep "^#BSUB" ', :in => command_txt).read.strip
    else
      text = ""
    end
    lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
    puts Log.color :yellow, lines * "\n"
  end

  fprocpath = File.join(dir, 'procpath.sqlite3')
  if options[:batch_procpath] && Open.exists?(fprocpath)
    puts Log.color(:magenta, "Procpath summary: ")
    require 'rbbt/tsv/csv'
    meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
    perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))

    page_size = meta["page_size"].first.to_f
    clock_ticks = meta["clock_ticks"].first.to_f

    # cpu_average: pid => timestamp => [user + system cpu values]
    # rss_average: timestamp => [rss * page_size values]
    cpu_average = {}
    rss_average = {}
    perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
      time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
      time = time.to_f

      cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
      cpu_average[stat_pid] ||= {}
      cpu_average[stat_pid][time] ||= []
      cpu_average[stat_pid][time] << cpu.to_f
      rss_average[time] ||= []
      rss_average[time] << rss.to_f * page_size
    end

    # Without any record there are no timestamps to compare.
    unless rss_average.empty?
      # Cpu values consumed per process between its first and last sample.
      ticks = 0
      cpu_average.each do |stat_pid, cpu_average_pid|
        first_ts, last_ts = cpu_average_pid.keys.minmax
        ticks += Misc.sum(cpu_average_pid[last_ts]) - Misc.sum(cpu_average_pid[first_ts])
      end

      start, eend = rss_average.keys.minmax
      sampled_time = eend - start
      # Avoid reporting with zero values when there is a single sample.
      ticks = 1 if ticks == 0
      sampled_time = 1 if sampled_time == 0
      puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / sampled_time * 100).to_s
      puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
      puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
    end
  end

  # NOTE: the option key is spelled `sacct_peformance` in the option
  # definition above; it is kept as is because it is part of the CLI.
  if options[:sacct_peformance]
    begin
      raise "sacct not supported for LSF" unless batch_system == 'slurm'
      tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
      values = tsv[tsv.keys.first]
      if values.compact.any?
        puts Log.color(:magenta, "SACCT performance: ")
        puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
      end
    rescue
      # Best effort: a failing sacct query only produces a warning.
      Log.warn $!.message
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd(" cat #{ferr} | grep -v '^[^\\s:]*\\[3.m' | tail -n #{tail.to_i} ").read
    end
  end

  if progress
    step_line = command_txt.split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
    if step_line
      require 'rbbt/workflow'
      step_path = step_line.split(": ").last.strip
      step = Step.new step_path
      step.load_dependencies_from_info
      has_bar = false
      # Report a progress bar for every unfinished step that has a progress file.
      (step.rec_dependencies + [step]).reverse.each do |j|
        next if j.done?
        if j.file(:progress).exists?
          bar = Log::ProgressBar.new
          bar.load(j.file(:progress).yaml)
          puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
          has_bar = true
        end
      end
      puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step.status}" unless has_bar
    end
  end

  count += 1

end

puts
puts "Found #{count} jobs"