rbbt-util 5.32.7 → 5.32.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b989d145b1a34baab93f351ea9944e73dde157e9973a9443717b84a43e7b41d5
4
- data.tar.gz: d6e8f5a7e8cb2d2f0e9a99f36a7594e4fc3655dd1d4f9b571f73f0f947b19fd3
3
+ metadata.gz: 3b9d236294c4bdcc32e517eadcffa4d103e5a99a220096497a50f8baa26f746d
4
+ data.tar.gz: 6370930ac76bd8b86666fa59556d2d8648789c3e3dda56bb71c71c2ff65b88d3
5
5
  SHA512:
6
- metadata.gz: da60531d5e35efd54b0c4de7e9ad6b034ef05e8d58d6a6b0a1b15422b324dd6eb5a4cb01b21ba60c091616d512106c278ef794088f87f844bd96e86d8f61d7da
7
- data.tar.gz: 2327575017d9dbb301b8c0ba36ee691bc8be00175712e2121da019251c3735d0348cae6d18e8112b00c952e233eb806a19f2275563310360ed745a9b868a6021
6
+ metadata.gz: 7dcd6fadf6424add27b6773d16ef0b794183c4b2f52616498d8c2ec21a77c5b5ad8f3fed65ca69ce5111c71e0942f84be79c107b40156c331919533f3fc5bea7
7
+ data.tar.gz: ea294d8aa5f8ff04a9e902fdd2d7d2287a34934f7288a11234996d06fb72dba4f333e655d591d9b78c5bc00104036356625088f9aa4c7e808ae2d42d0b253781
@@ -0,0 +1,212 @@
#!/usr/bin/env ruby

# Erases the working directories of batch jobs found under ~/rbbt-batch.
#
# Every directory that contains a `command.batch` file is treated as one job.
# A job is selected for removal according to the command line flags (done,
# error, aborted, queued, explicit job ids, and/or a regular expression on the
# job command). When neither --aborted nor --error is given, both are enabled.
# A summary of each selected job is printed and its directory is removed
# unless --dry_run is set.

require 'rbbt-util'
require 'rbbt/util/simpleopt'
require 'rbbt/hpc'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt slurm clean [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-BP--batch_parameters show batch parameters
-dr--dry_run Do not erase anything
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

batch_system = options.delete :batch_system
batch_system ||= 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Replace the requested name (possibly 'auto') by the name of the module that
# was actually selected, so it can be compared with the BATCH_SYSTEM recorded
# in each job's command file.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

Log.severity = 4
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run

workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)

# Ids of the jobs currently known to the batch system. If the query fails the
# list is empty and $norunningjobs flags that liveness is unknown.
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# First column of each status line => comma separated values of the last
# column. Kept as an empty Hash when the status could not be retrieved so
# lookups below never hit nil.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

# Default selection: error and aborted jobs.
# NOTE(review): this also applies when only --done, --queued or --job is
# given, so those flags add to the default instead of replacing it; confirm
# that this is intended.
aborted = error = true if aborted.nil? && error.nil?

count = 0
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Metadata embedded in the command file; each value is nil when absent
  cmd = command_txt[/#CMD: (.*)/, 1]
  exe = command_txt[/# Run command\n(.*?)\n/im, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]

  # Accept both "BATCH_SYSTEM=..." and "export BATCH_SYSTEM=..."
  job_batch_system = command_txt[/^(?:export )?BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    begin
      # Nodes are taken from the last line of the saved status: sixth column
      # for LSF, last column otherwise
      fields = Open.read(fjob_status).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  # Seconds since the last write to std.out / std.err; nil if there is no
  # std.out. std.err is only considered when it exists.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    last_update = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }.max
    time_diff = Time.now - last_update
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || queued || jobid
    selected = false
    selected = true if done && exit_status == 0
    selected = true if error && exit_status && exit_status != 0
    # When the batch system could not be queried nothing is known to be dead,
    # so no job is considered aborted (and none is erased for that reason)
    selected = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
    is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
    selected = true if (queued && deps && (running_jobs & deps).any?) || (queued && is_running && nodes.empty?)
    selected = true if jobid && jobid.split(",").include?(id)
    # A job without a #CMD line can never match the search expression
    selected = selected && cmd && cmd.match(/#{search}/) if search
    next unless selected
  elsif search
    next unless cmd && cmd.match(/#{search}/)
  end

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "

  output_msg = File.exist?(fout).to_s
  output_msg += " (last update " + Misc.format_seconds(time_diff) + " ago)" if id && time_diff
  puts Log.color(:magenta, "Output: ") << output_msg

  if batch_parameters
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
    when 'lsf'
      puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => command_txt).read.strip
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
@@ -0,0 +1,311 @@
#!/usr/bin/env ruby

# Lists the batch jobs found under ~/rbbt-batch.
#
# Every directory that contains a `command.batch` file is treated as one job.
# Jobs can be filtered by state (done, error, aborted, running, queued), by
# job id, and/or by a regular expression on the job command. For each listed
# job a summary is printed; optional sections show the batch parameters, a
# procpath performance summary, sacct figures, the tail of std.err and the
# progress of the job's steps.

require 'rbbt-util'
require 'rbbt/util/simpleopt'
require 'rbbt/hpc'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

# NOTE: the option name "sacct_peformance" is misspelled but kept as is,
# renaming it would change the command line interface.
options = SOPT.setup <<EOF

List batch jobs and their status

$ rbbt slurm list [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-p--progress Report progress of job and the dependencies
-BP--batch_parameters show batch parameters
-BPP--batch_procpath show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

batch_system = options.delete :batch_system
batch_system ||= 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Name of the module actually selected, comparable with the BATCH_SYSTEM
# recorded in each job's command file
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase

done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress

workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)

# Ids of the jobs currently known to the batch system. If the query fails the
# list is empty and $norunningjobs flags that liveness is unknown.
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# First column of each status line => comma separated values of the last
# column. Kept as an empty Hash when the status could not be retrieved so
# lookups below never hit nil.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Metadata embedded in the command file; each value is nil when absent
  cmd = command_txt[/#CMD: (.*)/, 1]

  job_batch_system = command_txt[/^export BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  different_system = job_batch_system != batch_system

  manifest = command_txt[/#MANIFEST: (.*)/, 1]
  step_path = command_txt[/#STEP_PATH: (.*)/, 1]
  exe = command_txt[/#EXEC_CMD: (.*)/, 1]
  container_home = command_txt[/^CONTAINER_DIR=(.*)/, 1]

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    begin
      # Nodes are taken from the last line of the saved status: sixth column
      # for LSF, last column otherwise
      fields = Open.read(fjob_status).split("\n").last.split(/\s+/)
      nodes = (job_batch_system == "lsf" ? fields[5] : fields.last).split(",")
    rescue
      nodes = []
    end
  elsif job_nodes[id]
    # Entries containing "(" are dropped
    # NOTE(review): presumably these are pending reasons such as "(Priority)"
    # reported in place of a node list - confirm
    nodes = job_nodes[id].reject{|n| n.include? "("}
  else
    nodes = []
  end

  # Reference time: when the job finished, or the present if still unfinished
  now = File.exist?(fexit) ? File.ctime(fexit) : Time.now

  # time_diff: seconds between the last write to std.out/std.err and `now`
  # time_elapsed: seconds between the earliest ctime of those files and `now`
  # Both stay nil without std.out; std.err is only considered when it exists.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = time_elapsed = nil
  if File.exist?(fout)
    log_files = [fout, ferr].select{|f| File.exist?(f) }
    time_diff = now - log_files.collect{|f| File.mtime(f) }.max
    time_elapsed = now - log_files.collect{|f| File.ctime(f) }.min
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || running || queued || jobid
    selected = false
    selected = true if done && exit_status == 0
    selected = true if error && exit_status && exit_status != 0
    # When the batch system could not be queried jobs are reported as alive,
    # so none of them is listed as aborted
    selected = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
    is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
    selected = true if (queued && deps && (running_jobs & deps).any?) || (queued && is_running && nodes.empty?)
    selected = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
    selected = true if jobid && jobid.split(",").include?(id)
    # A job without a #CMD line can never match the search expression
    selected = selected && cmd && cmd.match(/#{search}/) if search
    next unless selected
  elsif search
    next unless cmd && cmd.match(/#{search}/)
  end

  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Started: ") << File.ctime(ferr).to_s if File.exist?(ferr)
  puts Log.color(:magenta, "Manifest: ") << (manifest ? Log.color(:yellow, manifest) : "Missing")
  puts Log.color(:magenta, "Step path: ") << (step_path ? Log.color(:yellow, step_path) : "Missing")
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  if different_system
    puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
  else
    puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
  end
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed

  # The "last update" note only makes sense for submitted, unfinished jobs
  # that have already produced output
  output_msg = File.exist?(fout).to_s
  output_msg += " (last update " + Misc.format_seconds(time_diff) + " ago)" if id && time_diff && ! File.exist?(fexit)
  puts Log.color(:magenta, "Output: ") << output_msg

  if options[:batch_parameters]
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      text = CMD.cmd('grep "^#SBATCH" |tail -n +5', :in => command_txt).read.strip
    when 'lsf'
      text = CMD.cmd('grep "^#BSUB" |tail -n +5', :in => command_txt).read.strip
    else
      text = ""
    end
    lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
    puts Log.color :yellow, lines * "\n"
  end

  fprocpath = File.join(dir, 'procpath.sqlite3')
  if options[:batch_procpath] && Open.exists?(fprocpath)
    puts Log.color(:magenta, "Procpath summary: ")
    require 'rbbt/tsv/csv'
    meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
    perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))

    page_size = meta["page_size"].first.to_f
    clock_ticks = meta["clock_ticks"].first.to_f

    # cpu_average: pid => { timestamp => [user + system cpu ticks] }
    # rss_average: timestamp => [rss in bytes of every process sampled]
    cpu_average = {}
    rss_average = {}
    perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |_k, values|
      time, stat_pid, ucpu, scpu, _ccpu, _cscpu, rss = values
      time = time.to_f

      cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
      cpu_average[stat_pid] ||= {}
      cpu_average[stat_pid][time] ||= []
      cpu_average[stat_pid][time] << cpu.to_f
      rss_average[time] ||= []
      rss_average[time] << rss.to_f * page_size
    end

    if rss_average.empty?
      Log.warn "No procpath records found in #{fprocpath}"
    else
      # Ticks consumed by each process between its first and last sample
      ticks = 0
      cpu_average.each do |_stat_pid, cpu_average_pid|
        sample_times = cpu_average_pid.keys.sort
        ticks += Misc.sum(cpu_average_pid[sample_times.last]) - Misc.sum(cpu_average_pid[sample_times.first])
      end

      sample_times = rss_average.keys.sort
      start = sample_times.first
      eend = sample_times.last
      # Own variable: must not overwrite the job's time_elapsed computed above
      sampled_time = eend - start
      ticks = 1 if ticks == 0
      sampled_time = 1 if sampled_time == 0
      puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / sampled_time * 100)
      puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|_t,l| Misc.sum(l) / (1024 * 1024 * 1024)})
      puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
    end
  end

  if options[:sacct_peformance]
    begin
      raise "sacct not supported for LSF" unless batch_system == 'slurm'
      raise "No job id found in #{dir}" if id.nil? || id.empty?
      # "\\." keeps the backslash so grep matches a literal dot before "batch"
      tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\\.batch'"), :header_hash => '', :sep => "|", :type => :list)
      values = tsv[tsv.keys.first]
      if values.compact.any?
        puts Log.color(:magenta, "SACCT performance: ")
        puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
      end
    rescue
      Log.warn $!.message
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd(" cat #{ferr} | grep -v '^[^\\s:]*\\[3.m' | tail -n #{tail.to_i} ").read
    end
  end

  if progress
    step_line = command_txt.split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
    if step_line
      require 'rbbt/workflow'
      step = Step.new step_line.split(": ").last.strip
      step.load_dependencies_from_info
      # Report every unfinished step (the job's step and its recursive
      # dependencies) that has a progress file
      (step.rec_dependencies + [step]).reverse.each do |j|
        next if j.done?
        next unless j.file(:progress).exists?
        bar = Log::ProgressBar.new
        bar.load(j.file(:progress).yaml)
        puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
      end
    end
  end

  count += 1

end

puts
puts "Found #{count} jobs"