scout-gear 10.1.0 → 10.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.vimproject +49 -3
- data/VERSION +1 -1
- data/lib/scout/tsv/transformer.rb +17 -13
- data/lib/scout/tsv/util/melt.rb +13 -0
- data/lib/scout/tsv/util.rb +1 -0
- data/lib/scout/tsv.rb +7 -4
- data/lib/scout/workflow/definition.rb +10 -4
- data/lib/scout/workflow/step/dependencies.rb +1 -1
- data/lib/scout/workflow/step/file.rb +2 -1
- data/lib/scout/workflow/step/provenance.rb +2 -2
- data/lib/scout/workflow/step/status.rb +4 -4
- data/lib/scout/workflow/step.rb +11 -2
- data/lib/scout/workflow/task/inputs.rb +6 -2
- data/lib/scout/workflow/usage.rb +4 -2
- data/lib/scout/workflow/util.rb +5 -0
- data/scout-gear.gemspec +12 -4
- data/scout_commands/batch/clean +211 -0
- data/scout_commands/batch/list +357 -0
- data/scout_commands/log +45 -0
- data/scout_commands/rbbt +2 -1
- data/scout_commands/workflow/prov +133 -0
- data/scout_commands/workflow/task +6 -5
- data/scout_commands/workflow/trace +49 -0
- data/scout_commands/workflow/write_info +69 -0
- data/test/scout/offsite/test_ssh.rb +1 -1
- data/test/scout/test_tsv.rb +21 -0
- data/test/scout/tsv/util/test_melt.rb +28 -0
- metadata +11 -3
@@ -0,0 +1,211 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
require 'rbbt/hpc'
|
6
|
+
|
7
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
8
|
+
|
9
|
+
# Command-line interface of the clean command.
# --batch_system is declared because the script reads options[:batch_system]
# right after parsing; without the declaration the value could never be set.
# The script takes no positional arguments, so the usage line lists only options.
options = SOPT.setup <<EOF

Erase batch job directories

$ #{$0} [<options>]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-BP--batch_parameters show batch parameters
-dr--dry_run Do not erase anything
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF
|
26
|
+
if options[:help]
|
27
|
+
if defined? scout_usage
|
28
|
+
scout_usage
|
29
|
+
else
|
30
|
+
puts SOPT.doc
|
31
|
+
end
|
32
|
+
exit 0
|
33
|
+
end
|
34
|
+
|
35
|
+
# Resolve the batch system module; 'auto' lets HPC.batch_system pick one.
batch_system = options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system batch_system

raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?

# Replace the requested name (possibly 'auto') with the lowercase name of the
# module actually selected. Each job's BATCH_SYSTEM value is compared against
# this variable further down; left as 'auto' it would never match and every
# job would be flagged as belonging to a different system.
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
|
41
|
+
|
42
|
+
Log.severity = 4
|
43
|
+
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run
|
44
|
+
|
45
|
+
workdir = File.expand_path('~/scout-batch')
|
46
|
+
Path.setup(workdir)
|
47
|
+
|
48
|
+
running_jobs = begin
|
49
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
50
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
51
|
+
rescue
|
52
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
53
|
+
squeue_txt = nil
|
54
|
+
$norunningjobs = true
|
55
|
+
[]
|
56
|
+
end
|
57
|
+
|
58
|
+
if squeue_txt
|
59
|
+
job_nodes = {}
|
60
|
+
squeue_txt.split("\n").each do |line|
|
61
|
+
parts = line.strip.split(/\s+/)
|
62
|
+
job_nodes[parts.first] = parts.last.split(",")
|
63
|
+
end
|
64
|
+
else
|
65
|
+
job_nodes = nil
|
66
|
+
end
|
67
|
+
|
68
|
+
count = 0
|
69
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
70
|
+
dir = File.dirname(fcmd)
|
71
|
+
command_txt = Open.read(fcmd)
|
72
|
+
|
73
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
74
|
+
cmd = m[1]
|
75
|
+
else
|
76
|
+
cmd = nil
|
77
|
+
end
|
78
|
+
|
79
|
+
if m = command_txt.match(/# Run command\n(.*?)\n/im)
|
80
|
+
exe = m[1]
|
81
|
+
else
|
82
|
+
exe = nil
|
83
|
+
end
|
84
|
+
|
85
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
86
|
+
container_home = m[1]
|
87
|
+
else
|
88
|
+
container_home = nil
|
89
|
+
end
|
90
|
+
|
91
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
92
|
+
job_batch_system = m[1].downcase
|
93
|
+
else
|
94
|
+
job_batch_system = nil
|
95
|
+
end
|
96
|
+
|
97
|
+
different_system = job_batch_system != batch_system
|
98
|
+
|
99
|
+
if File.exist?(fid = File.join(dir, 'job.id'))
|
100
|
+
id = Open.read(fid).chomp
|
101
|
+
else
|
102
|
+
id = nil
|
103
|
+
end
|
104
|
+
|
105
|
+
if File.exist?(fstatus = File.join(dir, 'exit.status'))
|
106
|
+
exit_status = Open.read(fstatus).to_i
|
107
|
+
else
|
108
|
+
exit_status = nil
|
109
|
+
end
|
110
|
+
|
111
|
+
# Determine the nodes assigned to the job.
# Preference order: the last line of the job.status file saved in the job
# directory, then the live queue listing (job_nodes), otherwise no nodes.
if File.exist?(fstatus = File.join(dir, 'job.status'))
  fstatus_txt = Open.read(fstatus)
  begin
    fields = fstatus_txt.split("\n").last.split(/\s+/)
    # LSF status lines carry the node list in the sixth column; for the other
    # systems it is taken from the last column.
    node_field = job_batch_system == "lsf" ? fields[5] : fields.last
    nodes = node_field.split(",")
  rescue
    # Empty or unexpectedly formatted status file
    nodes = []
  end
elsif job_nodes && job_nodes[id]
  # job_nodes is nil when the queue could not be queried, hence the guard.
  # Entries containing "(" are dropped, as the list command does; presumably
  # they are scheduler reason placeholders rather than node names -- TODO confirm.
  nodes = job_nodes[id].reject{|n| n.include? "("}
else
  nodes = []
end
|
127
|
+
|
128
|
+
# Seconds elapsed since the most recent write to std.out or std.err.
# Only files that exist are examined, so a job directory holding just one of
# the two logs no longer raises Errno::ENOENT; time_diff is nil when neither
# log exists.
log_mtimes = %w(std.out std.err).
  collect{|name| File.join(dir, name) }.
  select{|file| File.exist?(file) }.
  collect{|file| File.mtime(file) }
time_diff = log_mtimes.any? ? Time.now - log_mtimes.max : nil
|
133
|
+
|
134
|
+
fdep = File.join(dir, 'dependencies.list')
|
135
|
+
deps = Open.read(fdep).split("\n") if File.exist?(fdep)
|
136
|
+
|
137
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
138
|
+
cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
|
139
|
+
|
140
|
+
aborted = error = true if ! done && aborted.nil? && error.nil?
|
141
|
+
#if done || error || aborted || running || queued || jobid || search
|
142
|
+
# select = false
|
143
|
+
# select = true if done && exit_status && exit_status.to_i == 0
|
144
|
+
# select = true if error && exit_status && exit_status.to_i != 0
|
145
|
+
# select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
146
|
+
# select = select && jobid.split(",").include?(id) if jobid
|
147
|
+
# select = select && cmd.match(/#{search}/) if search
|
148
|
+
# next unless select
|
149
|
+
#end
|
150
|
+
|
151
|
+
if done || error || aborted || queued || jobid
|
152
|
+
select = false
|
153
|
+
select = true if done && exit_status == 0
|
154
|
+
select = true if error && exit_status && exit_status != 0
|
155
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
156
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
157
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
158
|
+
select = true if jobid && jobid.split(",").include?(id)
|
159
|
+
select = select && cmd.match(/#{search}/) if search
|
160
|
+
next unless select
|
161
|
+
elsif search
|
162
|
+
select = false
|
163
|
+
select = true if search && cmd.match(/#{search}/)
|
164
|
+
next unless select
|
165
|
+
end
|
166
|
+
|
167
|
+
|
168
|
+
puts Log.color(:yellow, "**ERASING**")
|
169
|
+
puts Log.color :blue, dir
|
170
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
171
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
172
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
173
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
174
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
175
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
176
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
177
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
178
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
179
|
+
puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
180
|
+
|
181
|
+
if options[:batch_parameters]
|
182
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
183
|
+
case job_batch_system
|
184
|
+
when 'slurm'
|
185
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
186
|
+
when 'lsf'
|
187
|
+
puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
if tail && File.exist?(File.join(dir, 'std.err'))
|
192
|
+
if exit_status && exit_status != 0
|
193
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
194
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
195
|
+
elsif exit_status
|
196
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
197
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
198
|
+
else
|
199
|
+
puts Log.color(:magenta, "Log tail: ")
|
200
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
count += 1
|
205
|
+
|
206
|
+
Open.rm_rf dir unless dry_run
|
207
|
+
end
|
208
|
+
|
209
|
+
puts
|
210
|
+
puts "Found #{count} jobs"
|
211
|
+
|
@@ -0,0 +1,357 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
require 'rbbt/hpc'
|
6
|
+
|
7
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
8
|
+
|
9
|
+
# Command-line interface of the list command. The script takes no positional
# arguments, so the usage line lists only options.
# NOTE(review): the long name `sacct_peformance` is misspelled, but it is the
# key read later as options[:sacct_peformance], so it is kept unchanged here.
options = SOPT.setup <<EOF

List all batch jobs

$ #{$0} [<options>]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-l--long Show more entries
-c--compressed Show compressed information about entries
-p--progress Report progress of job and the dependencies
-BP--batch_parameters show batch parameters
-BPP--batch_procpath show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF
|
34
|
+
if options[:help]
|
35
|
+
if defined? scout_usage
|
36
|
+
scout_usage
|
37
|
+
else
|
38
|
+
puts SOPT.doc
|
39
|
+
end
|
40
|
+
exit 0
|
41
|
+
end
|
42
|
+
|
43
|
+
batch_system = options.delete :batch_system
|
44
|
+
batch_system ||= 'auto'
|
45
|
+
|
46
|
+
long = options.delete :long
|
47
|
+
|
48
|
+
HPC::BATCH_MODULE = HPC.batch_system batch_system
|
49
|
+
|
50
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
51
|
+
|
52
|
+
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
|
53
|
+
|
54
|
+
done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress
|
55
|
+
|
56
|
+
workdir = File.expand_path('~/scout-batch')
|
57
|
+
Path.setup(workdir)
|
58
|
+
|
59
|
+
running_jobs = begin
|
60
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
61
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
62
|
+
rescue
|
63
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
64
|
+
squeue_txt = nil
|
65
|
+
$norunningjobs = true
|
66
|
+
[]
|
67
|
+
end
|
68
|
+
|
69
|
+
if squeue_txt
|
70
|
+
job_nodes = {}
|
71
|
+
squeue_txt.split("\n").each do |line|
|
72
|
+
parts = line.strip.split(/\s+/)
|
73
|
+
job_nodes[parts.first] = parts.last.split(",")
|
74
|
+
end
|
75
|
+
else
|
76
|
+
job_nodes = nil
|
77
|
+
end
|
78
|
+
|
79
|
+
count = 0
|
80
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
81
|
+
dir = File.dirname(fcmd)
|
82
|
+
command_txt = Open.read(fcmd)
|
83
|
+
|
84
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
85
|
+
cmd = m[1]
|
86
|
+
else
|
87
|
+
cmd = nil
|
88
|
+
end
|
89
|
+
|
90
|
+
if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
|
91
|
+
job_batch_system = m[1].downcase
|
92
|
+
else
|
93
|
+
job_batch_system = nil
|
94
|
+
end
|
95
|
+
|
96
|
+
different_system = job_batch_system != batch_system
|
97
|
+
|
98
|
+
if m = command_txt.match(/#MANIFEST: (.*)/)
|
99
|
+
manifest = m[1]
|
100
|
+
else
|
101
|
+
manifest = nil
|
102
|
+
end
|
103
|
+
|
104
|
+
if m = command_txt.match(/#STEP_PATH: (.*)/)
|
105
|
+
step_path = m[1]
|
106
|
+
else
|
107
|
+
step_path = nil
|
108
|
+
end
|
109
|
+
|
110
|
+
if m = command_txt.match(/#EXEC_CMD: (.*)/)
|
111
|
+
exe = m[1]
|
112
|
+
else
|
113
|
+
exe = nil
|
114
|
+
end
|
115
|
+
|
116
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
117
|
+
container_home = m[1]
|
118
|
+
else
|
119
|
+
container_home = nil
|
120
|
+
end
|
121
|
+
|
122
|
+
if File.exist?(fid = File.join(dir, 'job.id'))
|
123
|
+
id = Open.read(fid).chomp
|
124
|
+
else
|
125
|
+
id = nil
|
126
|
+
end
|
127
|
+
|
128
|
+
if File.exist?(fstatus = File.join(dir, 'exit.status'))
|
129
|
+
exit_status = Open.read(fstatus).to_i
|
130
|
+
else
|
131
|
+
exit_status = nil
|
132
|
+
end
|
133
|
+
|
134
|
+
if File.exist?(fstatus = File.join(dir, 'job.status'))
|
135
|
+
fstatus_txt = Open.read(fstatus)
|
136
|
+
begin
|
137
|
+
if job_batch_system == "lsf"
|
138
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
139
|
+
else
|
140
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
141
|
+
end
|
142
|
+
rescue
|
143
|
+
nodes = []
|
144
|
+
end
|
145
|
+
elsif job_nodes && job_nodes[id]
|
146
|
+
nodes = job_nodes[id].reject{|n| n.include? "("}
|
147
|
+
else
|
148
|
+
nodes = []
|
149
|
+
end
|
150
|
+
|
151
|
+
if File.exist?(File.join(dir, 'exit.status'))
|
152
|
+
now = File.ctime(File.join(dir, 'exit.status'))
|
153
|
+
else
|
154
|
+
now = Time.now
|
155
|
+
end
|
156
|
+
|
157
|
+
if File.exist?(File.join(dir, 'std.out'))
|
158
|
+
cerrt = File.ctime File.join(dir, 'std.err')
|
159
|
+
coutt = File.ctime File.join(dir, 'std.out')
|
160
|
+
outt = File.mtime File.join(dir, 'std.out')
|
161
|
+
errt = File.mtime File.join(dir, 'std.err')
|
162
|
+
time_diff = now - [outt, errt].max
|
163
|
+
time_elapsed = now - [cerrt, coutt].min
|
164
|
+
end
|
165
|
+
|
166
|
+
fdep = File.join(dir, 'dependencies.list')
|
167
|
+
deps = Open.read(fdep).split("\n") if File.exist?(fdep)
|
168
|
+
|
169
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
170
|
+
cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
|
171
|
+
|
172
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (deps.nil? || (running_jobs & deps).empty?)) || different_system )
|
173
|
+
if done || error || aborted || running || queued || jobid
|
174
|
+
select = false
|
175
|
+
select = true if done && exit_status == 0
|
176
|
+
select = true if error && exit_status && exit_status != 0
|
177
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
178
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
179
|
+
select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
180
|
+
select = true if jobid && jobid.split(",").include?(id)
|
181
|
+
select = select && step_path.match(/#{search}/) if search
|
182
|
+
next unless select
|
183
|
+
elsif search
|
184
|
+
select = false
|
185
|
+
select = true if search && cmd.match(/#{search}/)
|
186
|
+
next unless select
|
187
|
+
end
|
188
|
+
|
189
|
+
|
190
|
+
count += 1
|
191
|
+
|
192
|
+
if options[:compressed]
|
193
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
|
194
|
+
if different_system
|
195
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
|
196
|
+
else
|
197
|
+
#status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
|
198
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
|
199
|
+
(running_jobs.include?(id) || $norunningjobs ?
|
200
|
+
(is_running ? Log.color(:cyan, id) : Log.color(:yellow, id) ) :
|
201
|
+
Log.color(:red, id) )
|
202
|
+
end
|
203
|
+
prog_rep = []
|
204
|
+
if options[:progress]
|
205
|
+
step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
|
206
|
+
if step_line
|
207
|
+
step_path = step_line.split(": ").last.strip
|
208
|
+
step = Step.new step_path
|
209
|
+
has_bar = false
|
210
|
+
[step].reverse.each do |j|
|
211
|
+
next if j.done?
|
212
|
+
if j.file(:progress).exists?
|
213
|
+
bar = Log::ProgressBar.new
|
214
|
+
bar.load(j.file(:progress).yaml)
|
215
|
+
rep = bar.report_msg.split("·")[1]
|
216
|
+
rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
|
217
|
+
prog_rep << [rep]
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
workflow, task, name = step_path.split("/")[-3..-1]
|
223
|
+
job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
|
224
|
+
puts [job_str, status, prog_rep ].flatten * " "
|
225
|
+
next
|
226
|
+
end
|
227
|
+
|
228
|
+
puts Log.color :blue, dir
|
229
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
|
230
|
+
puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
|
231
|
+
puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
|
232
|
+
puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
|
233
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
234
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
|
235
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
|
236
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
|
237
|
+
if different_system
|
238
|
+
puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
|
239
|
+
else
|
240
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
241
|
+
end
|
242
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
243
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
244
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
|
245
|
+
puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
|
246
|
+
puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
|
247
|
+
|
248
|
+
if options[:batch_parameters]
|
249
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
250
|
+
case job_batch_system
|
251
|
+
when 'slurm'
|
252
|
+
text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
|
253
|
+
when 'lsf'
|
254
|
+
text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
|
255
|
+
else
|
256
|
+
text = ""
|
257
|
+
end
|
258
|
+
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
259
|
+
puts Log.color :yellow, lines * "\n"
|
260
|
+
end
|
261
|
+
|
262
|
+
fprocpath = File.join(dir, 'procpath.sqlite3')
|
263
|
+
if options[:batch_procpath] && Open.exists?(fprocpath)
|
264
|
+
puts Log.color(:magenta, "Procpath summary: ")
|
265
|
+
require 'rbbt/tsv/csv'
|
266
|
+
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
267
|
+
perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
|
268
|
+
|
269
|
+
page_size = meta["page_size"].first.to_f
|
270
|
+
clock_ticks = meta["clock_ticks"].first.to_f
|
271
|
+
|
272
|
+
cpu_average = {}
|
273
|
+
rss_average = {}
|
274
|
+
perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
|
275
|
+
time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
|
276
|
+
time = time.to_f
|
277
|
+
|
278
|
+
cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
|
279
|
+
cpu_average[stat_pid] ||= {}
|
280
|
+
cpu_average[stat_pid][time] ||= []
|
281
|
+
cpu_average[stat_pid][time] << cpu.to_f
|
282
|
+
rss_average[time] ||= []
|
283
|
+
rss_average[time] << rss.to_f * page_size
|
284
|
+
end
|
285
|
+
|
286
|
+
ticks = 0
|
287
|
+
cpu_average.each do |stat_pid, cpu_average_pid|
|
288
|
+
start = cpu_average_pid.keys.sort.first
|
289
|
+
eend = cpu_average_pid.keys.sort.last
|
290
|
+
ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
|
291
|
+
end
|
292
|
+
start = rss_average.keys.sort.first
|
293
|
+
eend = rss_average.keys.sort.last
|
294
|
+
time_elapsed = eend - start
|
295
|
+
ticks = 1 if ticks == 0
|
296
|
+
time_elapsed = 1 if time_elapsed == 0
|
297
|
+
puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
|
298
|
+
puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
|
299
|
+
puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
|
300
|
+
|
301
|
+
end
|
302
|
+
|
303
|
+
# Optional sacct summary for the job (SLURM only). Any failure is reported as
# a warning and does not interrupt the listing.
if options[:sacct_peformance]
  begin
    raise "sacct not supported for LSF" unless batch_system == 'slurm'
    # Keep the header line and the ".batch" step. The dot is written as \\. so
    # that grep receives an escaped, literal dot; a single backslash before
    # the dot is dropped by Ruby inside a double-quoted string.
    sacct_cmd = "sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\\.batch'"
    tsv = TSV.open(CMD.cmd(sacct_cmd), :header_hash => '', :sep => "|", :type => :list)
    values = tsv[tsv.keys.first]
    if values.compact.any?
      puts Log.color(:magenta, "SACCT performance: ")
      puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
    end
  rescue
    Log.warn $!.message
  end
end
|
316
|
+
|
317
|
+
|
318
|
+
if tail && File.exist?(File.join(dir, 'std.err'))
|
319
|
+
if exit_status && exit_status != 0
|
320
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
321
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
322
|
+
elsif exit_status
|
323
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
324
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
325
|
+
else
|
326
|
+
puts Log.color(:magenta, "Log tail: ")
|
327
|
+
puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
if options[:progress]
|
332
|
+
step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
|
333
|
+
if step_line
|
334
|
+
step_path = step_line.split(": ").last.strip
|
335
|
+
step = Step.new step_path
|
336
|
+
has_bar = false
|
337
|
+
(step.rec_dependencies + [step]).reverse.each do |j|
|
338
|
+
next if j.done?
|
339
|
+
if j.file(:progress).exists?
|
340
|
+
bar = Log::ProgressBar.new
|
341
|
+
bar.load(j.file(:progress).yaml)
|
342
|
+
puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
|
343
|
+
has_bar = true
|
344
|
+
end
|
345
|
+
end
|
346
|
+
step_status = step.status
|
347
|
+
step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
|
348
|
+
step_status = Log.color :green, step_status if step_status.to_s == 'done'
|
349
|
+
puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
|
350
|
+
end
|
351
|
+
end
|
352
|
+
|
353
|
+
end
|
354
|
+
|
355
|
+
puts
|
356
|
+
puts Log.color :clear, "Found #{count} jobs"
|
357
|
+
|
data/scout_commands/log
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
6
|
+
|
7
|
+
options = SOPT.setup <<EOF
|
8
|
+
|
9
|
+
Change log level
|
10
|
+
|
11
|
+
$ #{$0} <level>
|
12
|
+
|
13
|
+
DEBUG
|
14
|
+
LOW
|
15
|
+
MEDIUM
|
16
|
+
HIGH
|
17
|
+
INFO
|
18
|
+
WARN
|
19
|
+
ERROR
|
20
|
+
NONE
|
21
|
+
|
22
|
+
-h--help Print this help
|
23
|
+
EOF
|
24
|
+
if options[:help]
|
25
|
+
if defined? scout_usage
|
26
|
+
scout_usage
|
27
|
+
else
|
28
|
+
puts SOPT.doc
|
29
|
+
end
|
30
|
+
exit 0
|
31
|
+
end
|
32
|
+
|
33
|
+
# Without arguments: print the stored log level, or report that the file does
# not exist. With an argument: store the level, given by name or by its
# numeric index in the list below.
if ARGV.empty?
  if Scout.etc.log_severity.exists?
    puts Scout.etc.log_severity.read
  else
    puts Scout.etc.log_severity.find + ' does not exist'
  end
else
  levels = %w(DEBUG LOW MEDIUM HIGH INFO WARN ERROR NONE)
  level = ARGV[0]
  level = levels[level.to_i] if level =~ /^\d+$/
  # A numeric index past the end of the list yields nil; to_s keeps upcase safe
  level = level.to_s.upcase
  # Reject anything that is not one of the documented levels instead of
  # writing it to the configuration file
  raise ParameterException.new("Unknown log level: #{ARGV[0]}. Use one of: #{levels * ", "}") unless levels.include?(level)
  Open.write(Scout.etc.log_severity, level)
end
|
44
|
+
|
45
|
+
|
data/scout_commands/rbbt
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
|
2
3
|
case
|
3
4
|
when File.exist?(file = File.join(__dir__, '../modules/rbbt-util/bin/rbbt'))
|
4
5
|
$LOAD_PATH.unshift(lib_dir = File.join(file, '../..', 'lib'))
|
@@ -19,5 +20,5 @@ when File.exist?(file = File.join(ENV["HOME"], 'git/rbbt-util/bin/rbbt'))
|
|
19
20
|
Path.default_pkgdir = Rbbt
|
20
21
|
load file
|
21
22
|
else
|
22
|
-
raise "
|
23
|
+
raise "Can't find rbbt"
|
23
24
|
end
|