scout-gear 10.1.0 → 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +49 -3
- data/VERSION +1 -1
- data/lib/scout/tsv/transformer.rb +17 -13
- data/lib/scout/tsv/util/melt.rb +13 -0
- data/lib/scout/tsv/util.rb +1 -0
- data/lib/scout/tsv.rb +7 -4
- data/lib/scout/workflow/definition.rb +10 -4
- data/lib/scout/workflow/step/dependencies.rb +1 -1
- data/lib/scout/workflow/step/file.rb +2 -1
- data/lib/scout/workflow/step/provenance.rb +2 -2
- data/lib/scout/workflow/step/status.rb +4 -4
- data/lib/scout/workflow/step.rb +11 -2
- data/lib/scout/workflow/task/inputs.rb +6 -2
- data/lib/scout/workflow/usage.rb +4 -2
- data/lib/scout/workflow/util.rb +5 -0
- data/scout-gear.gemspec +12 -4
- data/scout_commands/batch/clean +211 -0
- data/scout_commands/batch/list +357 -0
- data/scout_commands/log +45 -0
- data/scout_commands/rbbt +2 -1
- data/scout_commands/workflow/prov +133 -0
- data/scout_commands/workflow/task +6 -5
- data/scout_commands/workflow/trace +49 -0
- data/scout_commands/workflow/write_info +69 -0
- data/test/scout/offsite/test_ssh.rb +1 -1
- data/test/scout/test_tsv.rb +21 -0
- data/test/scout/tsv/util/test_melt.rb +28 -0
- metadata +11 -3
@@ -0,0 +1,211 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
require 'rbbt/hpc'
|
6
|
+
|
7
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
8
|
+
|
9
|
+
options = SOPT.setup <<EOF
|
10
|
+
|
11
|
+
Clean (erase) the directories of batch jobs
|
12
|
+
|
13
|
+
$ #{$0} [<options>] <filename> [<other|->]*
|
14
|
+
|
15
|
+
-h--help Print this help
|
16
|
+
-d--done Done jobs only
|
17
|
+
-e--error Error jobs only
|
18
|
+
-a--aborted SLURM aborted jobs
|
19
|
+
-q--queued Queued jobs only
|
20
|
+
-j--job* Job ids
|
21
|
+
-s--search* Regular expression
|
22
|
+
-t--tail* Show the last lines of the STDERR
|
23
|
+
-BP--batch_parameters show batch parameters
|
24
|
+
-dr--dry_run Do not erase anything
|
25
|
+
EOF
|
26
|
+
if options[:help]
|
27
|
+
if defined? scout_usage
|
28
|
+
scout_usage
|
29
|
+
else
|
30
|
+
puts SOPT.doc
|
31
|
+
end
|
32
|
+
exit 0
|
33
|
+
end
|
34
|
+
|
35
|
+
batch_system = options.delete :batch_system
|
36
|
+
batch_system ||= 'auto'
|
37
|
+
|
38
|
+
HPC::BATCH_MODULE = HPC.batch_system batch_system
|
39
|
+
|
40
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
41
|
+
|
42
|
+
Log.severity = 4
|
43
|
+
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run
|
44
|
+
|
45
|
+
workdir = File.expand_path('~/scout-batch')
|
46
|
+
Path.setup(workdir)
|
47
|
+
|
48
|
+
running_jobs = begin
|
49
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
50
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
51
|
+
rescue
|
52
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
53
|
+
squeue_txt = nil
|
54
|
+
$norunningjobs = true
|
55
|
+
[]
|
56
|
+
end
|
57
|
+
|
58
|
+
if squeue_txt
|
59
|
+
job_nodes = {}
|
60
|
+
squeue_txt.split("\n").each do |line|
|
61
|
+
parts = line.strip.split(/\s+/)
|
62
|
+
job_nodes[parts.first] = parts.last.split(",")
|
63
|
+
end
|
64
|
+
else
|
65
|
+
job_nodes = nil
|
66
|
+
end
|
67
|
+
|
68
|
+
count = 0
|
69
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
70
|
+
dir = File.dirname(fcmd)
|
71
|
+
command_txt = Open.read(fcmd)
|
72
|
+
|
73
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
74
|
+
cmd = m[1]
|
75
|
+
else
|
76
|
+
cmd = nil
|
77
|
+
end
|
78
|
+
|
79
|
+
if m = command_txt.match(/# Run command\n(.*?)\n/im)
|
80
|
+
exe = m[1]
|
81
|
+
else
|
82
|
+
exe = nil
|
83
|
+
end
|
84
|
+
|
85
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
86
|
+
container_home = m[1]
|
87
|
+
else
|
88
|
+
container_home = nil
|
89
|
+
end
|
90
|
+
|
91
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
92
|
+
job_batch_system = m[1].downcase
|
93
|
+
else
|
94
|
+
job_batch_system = nil
|
95
|
+
end
|
96
|
+
|
97
|
+
different_system = job_batch_system != batch_system
|
98
|
+
|
99
|
+
if File.exist?(fid = File.join(dir, 'job.id'))
|
100
|
+
id = Open.read(fid).chomp
|
101
|
+
else
|
102
|
+
id = nil
|
103
|
+
end
|
104
|
+
|
105
|
+
if File.exist?(fstatus = File.join(dir, 'exit.status'))
|
106
|
+
exit_status = Open.read(fstatus).to_i
|
107
|
+
else
|
108
|
+
exit_status = nil
|
109
|
+
end
|
110
|
+
|
111
|
+
if File.exist?(fstatus = File.join(dir, 'job.status'))
|
112
|
+
fstatus_txt = Open.read(fstatus)
|
113
|
+
begin
|
114
|
+
if job_batch_system == "lsf"
|
115
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
116
|
+
else
|
117
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
118
|
+
end
|
119
|
+
rescue
|
120
|
+
nodes = []
|
121
|
+
end
|
122
|
+
elsif job_nodes && job_nodes[id] # job_nodes is nil when the batch status query failed above
|
123
|
+
nodes = job_nodes[id]
|
124
|
+
else
|
125
|
+
nodes = []
|
126
|
+
end
|
127
|
+
|
128
|
+
if File.exist?(File.join(dir, 'std.out'))
|
129
|
+
outt = File.mtime File.join(dir, 'std.out')
|
130
|
+
errt = File.mtime File.join(dir, 'std.err')
|
131
|
+
time_diff = Time.now - [outt, errt].max
|
132
|
+
end
|
133
|
+
|
134
|
+
fdep = File.join(dir, 'dependencies.list')
|
135
|
+
deps = Open.read(fdep).split("\n") if File.exist?(fdep)
|
136
|
+
|
137
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
138
|
+
cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
|
139
|
+
|
140
|
+
aborted = error = true if ! done && aborted.nil? && error.nil?
|
141
|
+
#if done || error || aborted || running || queued || jobid || search
|
142
|
+
# select = false
|
143
|
+
# select = true if done && exit_status && exit_status.to_i == 0
|
144
|
+
# select = true if error && exit_status && exit_status.to_i != 0
|
145
|
+
# select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
146
|
+
# select = select && jobid.split(",").include?(id) if jobid
|
147
|
+
# select = select && cmd.match(/#{search}/) if search
|
148
|
+
# next unless select
|
149
|
+
#end
|
150
|
+
|
151
|
+
if done || error || aborted || queued || jobid
|
152
|
+
select = false
|
153
|
+
select = true if done && exit_status == 0
|
154
|
+
select = true if error && exit_status && exit_status != 0
|
155
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
156
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
157
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
158
|
+
select = true if jobid && jobid.split(",").include?(id)
|
159
|
+
select = select && cmd.match(/#{search}/) if search
|
160
|
+
next unless select
|
161
|
+
elsif search
|
162
|
+
select = false
|
163
|
+
select = true if search && cmd.match(/#{search}/)
|
164
|
+
next unless select
|
165
|
+
end
|
166
|
+
|
167
|
+
|
168
|
+
puts Log.color(:yellow, "**ERASING**")
|
169
|
+
puts Log.color :blue, dir
|
170
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
171
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
172
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
173
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
174
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
175
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
176
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
177
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
178
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
179
|
+
puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
180
|
+
|
181
|
+
if options[:batch_parameters]
|
182
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
183
|
+
case job_batch_system
|
184
|
+
when 'slurm'
|
185
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
186
|
+
when 'lsf'
|
187
|
+
puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
if tail && File.exist?(File.join(dir, 'std.err'))
|
192
|
+
if exit_status && exit_status != 0
|
193
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
194
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
195
|
+
elsif exit_status
|
196
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
197
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
198
|
+
else
|
199
|
+
puts Log.color(:magenta, "Log tail: ")
|
200
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
count += 1
|
205
|
+
|
206
|
+
Open.rm_rf dir unless dry_run
|
207
|
+
end
|
208
|
+
|
209
|
+
puts
|
210
|
+
puts "Found #{count} jobs"
|
211
|
+
|
@@ -0,0 +1,357 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
require 'rbbt/hpc'
|
6
|
+
|
7
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
8
|
+
|
9
|
+
options = SOPT.setup <<EOF
|
10
|
+
|
11
|
+
List all batch jobs
|
12
|
+
|
13
|
+
$ #{$0} [<options>] <filename> [<other|->]*
|
14
|
+
|
15
|
+
$ rbbt slurm list [options]
|
16
|
+
|
17
|
+
-h--help Print this help
|
18
|
+
-d--done Done jobs only
|
19
|
+
-e--error Error jobs only
|
20
|
+
-a--aborted SLURM aborted jobs
|
21
|
+
-r--running Running jobs only
|
22
|
+
-q--queued Queued jobs only
|
23
|
+
-j--job* Job ids
|
24
|
+
-s--search* Regular expression
|
25
|
+
-t--tail* Show the last lines of the STDERR
|
26
|
+
-l--long Show more entries
|
27
|
+
-c--compressed Show compressed information about entries
|
28
|
+
-p--progress Report progress of job and the dependencies
|
29
|
+
-BP--batch_parameters show batch parameters
|
30
|
+
-BPP--batch_procpath show Procpath performance summary
|
31
|
+
-sacct--sacct_peformance show sacct performance summary
|
32
|
+
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
|
33
|
+
EOF
|
34
|
+
if options[:help]
|
35
|
+
if defined? scout_usage
|
36
|
+
scout_usage
|
37
|
+
else
|
38
|
+
puts SOPT.doc
|
39
|
+
end
|
40
|
+
exit 0
|
41
|
+
end
|
42
|
+
|
43
|
+
batch_system = options.delete :batch_system
|
44
|
+
batch_system ||= 'auto'
|
45
|
+
|
46
|
+
long = options.delete :long
|
47
|
+
|
48
|
+
HPC::BATCH_MODULE = HPC.batch_system batch_system
|
49
|
+
|
50
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
51
|
+
|
52
|
+
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
|
53
|
+
|
54
|
+
done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress
|
55
|
+
|
56
|
+
workdir = File.expand_path('~/scout-batch')
|
57
|
+
Path.setup(workdir)
|
58
|
+
|
59
|
+
running_jobs = begin
|
60
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
61
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
62
|
+
rescue
|
63
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
64
|
+
squeue_txt = nil
|
65
|
+
$norunningjobs = true
|
66
|
+
[]
|
67
|
+
end
|
68
|
+
|
69
|
+
if squeue_txt
|
70
|
+
job_nodes = {}
|
71
|
+
squeue_txt.split("\n").each do |line|
|
72
|
+
parts = line.strip.split(/\s+/)
|
73
|
+
job_nodes[parts.first] = parts.last.split(",")
|
74
|
+
end
|
75
|
+
else
|
76
|
+
job_nodes = nil
|
77
|
+
end
|
78
|
+
|
79
|
+
count = 0
|
80
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
81
|
+
dir = File.dirname(fcmd)
|
82
|
+
command_txt = Open.read(fcmd)
|
83
|
+
|
84
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
85
|
+
cmd = m[1]
|
86
|
+
else
|
87
|
+
cmd = nil
|
88
|
+
end
|
89
|
+
|
90
|
+
if m = command_txt.match(/^export BATCH_SYSTEM=(.*)/)
|
91
|
+
job_batch_system = m[1].downcase
|
92
|
+
else
|
93
|
+
job_batch_system = nil
|
94
|
+
end
|
95
|
+
|
96
|
+
different_system = job_batch_system != batch_system
|
97
|
+
|
98
|
+
if m = command_txt.match(/#MANIFEST: (.*)/)
|
99
|
+
manifest = m[1]
|
100
|
+
else
|
101
|
+
manifest = nil
|
102
|
+
end
|
103
|
+
|
104
|
+
if m = command_txt.match(/#STEP_PATH: (.*)/)
|
105
|
+
step_path = m[1]
|
106
|
+
else
|
107
|
+
step_path = nil
|
108
|
+
end
|
109
|
+
|
110
|
+
if m = command_txt.match(/#EXEC_CMD: (.*)/)
|
111
|
+
exe = m[1]
|
112
|
+
else
|
113
|
+
exe = nil
|
114
|
+
end
|
115
|
+
|
116
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
117
|
+
container_home = m[1]
|
118
|
+
else
|
119
|
+
container_home = nil
|
120
|
+
end
|
121
|
+
|
122
|
+
if File.exist?(fid = File.join(dir, 'job.id'))
|
123
|
+
id = Open.read(fid).chomp
|
124
|
+
else
|
125
|
+
id = nil
|
126
|
+
end
|
127
|
+
|
128
|
+
if File.exist?(fstatus = File.join(dir, 'exit.status'))
|
129
|
+
exit_status = Open.read(fstatus).to_i
|
130
|
+
else
|
131
|
+
exit_status = nil
|
132
|
+
end
|
133
|
+
|
134
|
+
if File.exist?(fstatus = File.join(dir, 'job.status'))
|
135
|
+
fstatus_txt = Open.read(fstatus)
|
136
|
+
begin
|
137
|
+
if job_batch_system == "lsf"
|
138
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
139
|
+
else
|
140
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
141
|
+
end
|
142
|
+
rescue
|
143
|
+
nodes = []
|
144
|
+
end
|
145
|
+
elsif job_nodes && job_nodes[id]
|
146
|
+
nodes = job_nodes[id].reject{|n| n.include? "("}
|
147
|
+
else
|
148
|
+
nodes = []
|
149
|
+
end
|
150
|
+
|
151
|
+
if File.exist?(File.join(dir, 'exit.status'))
|
152
|
+
now = File.ctime(File.join(dir, 'exit.status'))
|
153
|
+
else
|
154
|
+
now = Time.now
|
155
|
+
end
|
156
|
+
|
157
|
+
if File.exist?(File.join(dir, 'std.out'))
|
158
|
+
cerrt = File.ctime File.join(dir, 'std.err')
|
159
|
+
coutt = File.ctime File.join(dir, 'std.out')
|
160
|
+
outt = File.mtime File.join(dir, 'std.out')
|
161
|
+
errt = File.mtime File.join(dir, 'std.err')
|
162
|
+
time_diff = now - [outt, errt].max
|
163
|
+
time_elapsed = now - [cerrt, coutt].min
|
164
|
+
end
|
165
|
+
|
166
|
+
fdep = File.join(dir, 'dependencies.list')
|
167
|
+
deps = Open.read(fdep).split("\n") if File.exist?(fdep)
|
168
|
+
|
169
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
170
|
+
cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)
|
171
|
+
|
172
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (deps.nil? || (running_jobs & deps).empty?)) || different_system )
|
173
|
+
if done || error || aborted || running || queued || jobid
|
174
|
+
select = false
|
175
|
+
select = true if done && exit_status == 0
|
176
|
+
select = true if error && exit_status && exit_status != 0
|
177
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
178
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
179
|
+
select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
180
|
+
select = true if jobid && jobid.split(",").include?(id)
|
181
|
+
select = select && step_path && step_path.match(/#{search}/) if search # step_path is nil when command.batch has no STEP_PATH header
|
182
|
+
next unless select
|
183
|
+
elsif search
|
184
|
+
select = false
|
185
|
+
select = true if search && cmd.match(/#{search}/)
|
186
|
+
next unless select
|
187
|
+
end
|
188
|
+
|
189
|
+
|
190
|
+
count += 1
|
191
|
+
|
192
|
+
if options[:compressed]
|
193
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
|
194
|
+
if different_system
|
195
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
|
196
|
+
else
|
197
|
+
#status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) )
|
198
|
+
status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" :
|
199
|
+
(running_jobs.include?(id) || $norunningjobs ?
|
200
|
+
(is_running ? Log.color(:cyan, id) : Log.color(:yellow, id) ) :
|
201
|
+
Log.color(:red, id) )
|
202
|
+
end
|
203
|
+
prog_rep = []
|
204
|
+
if options[:progress]
|
205
|
+
step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
|
206
|
+
if step_line
|
207
|
+
step_path = step_line.split(": ").last.strip
|
208
|
+
step = Step.new step_path
|
209
|
+
has_bar = false
|
210
|
+
[step].reverse.each do |j|
|
211
|
+
next if j.done?
|
212
|
+
if j.file(:progress).exists?
|
213
|
+
bar = Log::ProgressBar.new
|
214
|
+
bar.load(j.file(:progress).yaml)
|
215
|
+
rep = bar.report_msg.split("·")[1]
|
216
|
+
rep = rep.sub(/.*?(\d+%)/, Log.color(:blue,'\1')).sub(/\-.*/,'')
|
217
|
+
prog_rep << [rep]
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
workflow, task, name = step_path.split("/")[-3..-1]
|
223
|
+
job_str = [Log.color(:yellow, workflow), Log.color(:magenta, task), name] * "/"
|
224
|
+
puts [job_str, status, prog_rep ].flatten * " "
|
225
|
+
next
|
226
|
+
end
|
227
|
+
|
228
|
+
puts Log.color :blue, dir
|
229
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s if long
|
230
|
+
puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err')) && long
|
231
|
+
puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest) if long
|
232
|
+
puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
|
233
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
234
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing") if long
|
235
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing") if long
|
236
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home && long
|
237
|
+
if different_system
|
238
|
+
puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
|
239
|
+
else
|
240
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
241
|
+
end
|
242
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
243
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
244
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", " if long
|
245
|
+
puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
|
246
|
+
puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exist?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)") if long
|
247
|
+
|
248
|
+
if options[:batch_parameters]
|
249
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
250
|
+
case job_batch_system
|
251
|
+
when 'slurm'
|
252
|
+
text = CMD.cmd('grep "^#SBATCH" ', :in => Open.read(fcmd)).read.strip
|
253
|
+
when 'lsf'
|
254
|
+
text = CMD.cmd('grep "^#BSUB" ', :in => Open.read(fcmd)).read.strip
|
255
|
+
else
|
256
|
+
text = ""
|
257
|
+
end
|
258
|
+
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
259
|
+
puts Log.color :yellow, lines * "\n"
|
260
|
+
end
|
261
|
+
|
262
|
+
fprocpath = File.join(dir, 'procpath.sqlite3')
|
263
|
+
if options[:batch_procpath] && Open.exists?(fprocpath)
|
264
|
+
puts Log.color(:magenta, "Procpath summary: ")
|
265
|
+
require 'rbbt/tsv/csv'
|
266
|
+
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
267
|
+
perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
|
268
|
+
|
269
|
+
page_size = meta["page_size"].first.to_f
|
270
|
+
clock_ticks = meta["clock_ticks"].first.to_f
|
271
|
+
|
272
|
+
cpu_average = {}
|
273
|
+
rss_average = {}
|
274
|
+
perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
|
275
|
+
time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
|
276
|
+
time = time.to_f
|
277
|
+
|
278
|
+
cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
|
279
|
+
cpu_average[stat_pid] ||= {}
|
280
|
+
cpu_average[stat_pid][time] ||= []
|
281
|
+
cpu_average[stat_pid][time] << cpu.to_f
|
282
|
+
rss_average[time] ||= []
|
283
|
+
rss_average[time] << rss.to_f * page_size
|
284
|
+
end
|
285
|
+
|
286
|
+
ticks = 0
|
287
|
+
cpu_average.each do |stat_pid, cpu_average_pid|
|
288
|
+
start = cpu_average_pid.keys.sort.first
|
289
|
+
eend = cpu_average_pid.keys.sort.last
|
290
|
+
ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
|
291
|
+
end
|
292
|
+
start = rss_average.keys.sort.first
|
293
|
+
eend = rss_average.keys.sort.last
|
294
|
+
time_elapsed = eend - start
|
295
|
+
ticks = 1 if ticks == 0
|
296
|
+
time_elapsed = 1 if time_elapsed == 0
|
297
|
+
puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
|
298
|
+
puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
|
299
|
+
puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
|
300
|
+
|
301
|
+
end
|
302
|
+
|
303
|
+
if options[:sacct_peformance]
|
304
|
+
begin
|
305
|
+
raise "sacct not supported for LSF" unless batch_system == 'slurm'
|
306
|
+
tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
|
307
|
+
values = tsv[tsv.keys.first]
|
308
|
+
if values.compact.any?
|
309
|
+
puts Log.color(:magenta, "SACCT performance: ")
|
310
|
+
puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
|
311
|
+
end
|
312
|
+
rescue
|
313
|
+
Log.warn $!.message
|
314
|
+
end
|
315
|
+
end
|
316
|
+
|
317
|
+
|
318
|
+
if tail && File.exist?(File.join(dir, 'std.err'))
|
319
|
+
if exit_status && exit_status != 0
|
320
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
321
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
322
|
+
elsif exit_status
|
323
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
324
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
325
|
+
else
|
326
|
+
puts Log.color(:magenta, "Log tail: ")
|
327
|
+
puts CMD.cmd(" cat #{File.join(dir, 'std.err')} | grep -v '^[^\\s:]*\\[3.m' | grep -v -e '^[[:space:]]*$' | grep -v \"\\(STDOUT\\|STDERR\\):[[:space:]]*$\" | tail -n #{tail.to_i} ").read
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
if options[:progress]
|
332
|
+
step_line = Open.read(fcmd).split("\n").select{|line| line =~ /^#STEP_PATH:/}.first
|
333
|
+
if step_line
|
334
|
+
step_path = step_line.split(": ").last.strip
|
335
|
+
step = Step.new step_path
|
336
|
+
has_bar = false
|
337
|
+
(step.rec_dependencies + [step]).reverse.each do |j|
|
338
|
+
next if j.done?
|
339
|
+
if j.file(:progress).exists?
|
340
|
+
bar = Log::ProgressBar.new
|
341
|
+
bar.load(j.file(:progress).yaml)
|
342
|
+
puts Log.color(:magenta, "Progress: ") + bar.report_msg + " " + Log.color(:yellow, j.task_signature)
|
343
|
+
has_bar = true
|
344
|
+
end
|
345
|
+
end
|
346
|
+
step_status = step.status
|
347
|
+
step_status = Log.color :red, step_status if step_status.to_s == 'cleaned'
|
348
|
+
step_status = Log.color :green, step_status if step_status.to_s == 'done'
|
349
|
+
puts Log.color(:magenta, "Progress: ") + Log.color(:yellow, step.task_signature) + " #{step_status}" unless has_bar
|
350
|
+
end
|
351
|
+
end
|
352
|
+
|
353
|
+
end
|
354
|
+
|
355
|
+
puts
|
356
|
+
puts Log.color :clear, "Found #{count} jobs"
|
357
|
+
|
data/scout_commands/log
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scout'
|
4
|
+
|
5
|
+
$0 = "scout #{$previous_commands.any? ? $previous_commands*" " + " " : "" }#{ File.basename(__FILE__) }" if $previous_commands
|
6
|
+
|
7
|
+
options = SOPT.setup <<EOF
|
8
|
+
|
9
|
+
Change log level
|
10
|
+
|
11
|
+
$ #{$0} <level>
|
12
|
+
|
13
|
+
DEBUG
|
14
|
+
LOW
|
15
|
+
MEDIUM
|
16
|
+
HIGH
|
17
|
+
INFO
|
18
|
+
WARN
|
19
|
+
ERROR
|
20
|
+
NONE
|
21
|
+
|
22
|
+
-h--help Print this help
|
23
|
+
EOF
|
24
|
+
if options[:help]
|
25
|
+
if defined? scout_usage
|
26
|
+
scout_usage
|
27
|
+
else
|
28
|
+
puts SOPT.doc
|
29
|
+
end
|
30
|
+
exit 0
|
31
|
+
end
|
32
|
+
|
33
|
+
if ARGV.empty?
|
34
|
+
if Scout.etc.log_severity.exists?
|
35
|
+
puts Scout.etc.log_severity.read
|
36
|
+
else
|
37
|
+
puts Scout.etc.log_severity.find + ' does not exist'
|
38
|
+
end
|
39
|
+
else
|
40
|
+
level = ARGV[0]
|
41
|
+
level = %w(DEBUG LOW MEDIUM HIGH INFO WARN ERROR NONE).fetch(level.to_i){ raise ParameterException.new("Unknown log level: #{level}") } if level =~ /^\d+$/ # out-of-range index would otherwise leave level nil
|
42
|
+
Open.write(Scout.etc.log_severity, level.upcase)
|
43
|
+
end
|
44
|
+
|
45
|
+
|
data/scout_commands/rbbt
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
|
2
3
|
case
|
3
4
|
when File.exist?(file = File.join(__dir__, '../modules/rbbt-util/bin/rbbt'))
|
4
5
|
$LOAD_PATH.unshift(lib_dir = File.join(file, '../..', 'lib'))
|
@@ -19,5 +20,5 @@ when File.exist?(file = File.join(ENV["HOME"], 'git/rbbt-util/bin/rbbt'))
|
|
19
20
|
Path.default_pkgdir = Rbbt
|
20
21
|
load file
|
21
22
|
else
|
22
|
-
raise "
|
23
|
+
raise "Can't find rbbt"
|
23
24
|
end
|