rbbt-util 5.29.2 → 5.29.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc/orchestrate.rb +12 -1
- data/lib/rbbt/hpc/slurm.rb +36 -15
- data/lib/rbbt/persist.rb +4 -0
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +120 -96
- data/share/rbbt_commands/slurm/orchestrate +2 -2
- data/share/rbbt_commands/workflow/task +9 -6
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ec6302ccfe3f38a074f7f0d10511090c8f4db4186228ad93adb2888e0edbf5e
|
4
|
+
data.tar.gz: fcaa50b654461f128b9539fc47ed00b008d3f713e89b8bbe963c2b898c3c168b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a2537aef150df77142a593742399d28bb96334decc0e33b69e9cbac2853085487100aa90cd711b342bac7eda536d15c080be22891d5b9f38bfa4601282ae5de
|
7
|
+
data.tar.gz: d79cc4afa294d63cebd79f73b759bc46c3c4e0285db7be7406a594aa49e5b572b0c4f91b98c73b5f64c31c908d58d8dee43853ec18b1ead9e814b4370de0d60d
|
data/lib/rbbt/hpc/orchestrate.rb
CHANGED
@@ -60,9 +60,14 @@ module HPC
|
|
60
60
|
return if job.done?
|
61
61
|
return unless job.path.split("/")[-4] == "jobs"
|
62
62
|
seen[:orchestration_target_job] ||= job
|
63
|
+
|
63
64
|
options.delete "recursive_clean"
|
65
|
+
options.delete "clean_task"
|
66
|
+
options.delete "clean"
|
64
67
|
options.delete "tail"
|
65
68
|
options.delete "printfile"
|
69
|
+
options.delete "detach"
|
70
|
+
|
66
71
|
rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
|
67
72
|
rules ||= {}
|
68
73
|
IndiferentHash.setup(rules)
|
@@ -75,7 +80,13 @@ module HPC
|
|
75
80
|
skip_dep = job_rules["chain_tasks"] &&
|
76
81
|
job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
|
77
82
|
job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
|
78
|
-
|
83
|
+
|
84
|
+
deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
|
85
|
+
if job.canfail_paths.include? dep.path
|
86
|
+
[deps].flatten.collect{|id| ['canfail', id] * ":"}
|
87
|
+
else
|
88
|
+
deps
|
89
|
+
end
|
79
90
|
end.flatten.compact.uniq
|
80
91
|
|
81
92
|
skip = true if job_rules[:skip]
|
data/lib/rbbt/hpc/slurm.rb
CHANGED
@@ -58,11 +58,11 @@ module HPC
|
|
58
58
|
when FalseClass
|
59
59
|
'--' << o << "=false"
|
60
60
|
else
|
61
|
-
['--' << o, "'#{v}'"] * " "
|
61
|
+
['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
|
62
62
|
end
|
63
63
|
end * " "
|
64
64
|
|
65
|
-
rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
|
65
|
+
rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
|
66
66
|
|
67
67
|
time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
|
68
68
|
|
@@ -76,6 +76,7 @@ module HPC
|
|
76
76
|
fjob = File.join(slurm_basedir, 'job.id')
|
77
77
|
fexit = File.join(slurm_basedir, 'exit.status')
|
78
78
|
fsync = File.join(slurm_basedir, 'sync.log')
|
79
|
+
fsyncexit = File.join(slurm_basedir, 'sync.status')
|
79
80
|
fcmd = File.join(slurm_basedir, 'command.slurm')
|
80
81
|
|
81
82
|
#{{{ GENERATE TEMPLATE
|
@@ -107,10 +108,6 @@ module HPC
|
|
107
108
|
EOF
|
108
109
|
end
|
109
110
|
|
110
|
-
header +=<<-EOF
|
111
|
-
#CMD: #{rbbt_cmd}
|
112
|
-
EOF
|
113
|
-
|
114
111
|
# ENV
|
115
112
|
env = ""
|
116
113
|
env +=<<-EOF
|
@@ -246,7 +243,7 @@ EOF
|
|
246
243
|
end
|
247
244
|
|
248
245
|
if contain
|
249
|
-
rbbt_cmd << " " << %(--workdir_all='#{contain}')
|
246
|
+
rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/.rbbt/var/jobs')
|
250
247
|
end
|
251
248
|
end
|
252
249
|
|
@@ -256,6 +253,10 @@ EOF
|
|
256
253
|
#{rbbt_cmd}
|
257
254
|
EOF
|
258
255
|
|
256
|
+
header +=<<-EOF
|
257
|
+
#CMD: #{rbbt_cmd}
|
258
|
+
EOF
|
259
|
+
|
259
260
|
run +=<<-EOF
|
260
261
|
|
261
262
|
# Run command
|
@@ -273,10 +274,10 @@ EOF
|
|
273
274
|
coda +=<<-EOF
|
274
275
|
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
|
275
276
|
EOF
|
276
|
-
else
|
277
|
-
coda +=<<-EOF
|
278
|
-
rbbt system clean all -q &>> #{fsync}
|
279
|
-
EOF
|
277
|
+
# else
|
278
|
+
# coda +=<<-EOF
|
279
|
+
#rbbt system clean all -q &>> #{fsync}
|
280
|
+
#EOF
|
280
281
|
end
|
281
282
|
|
282
283
|
if sync.include?("=>")
|
@@ -295,6 +296,7 @@ EOF
|
|
295
296
|
mkdir -p "$(dirname '#{target}')"
|
296
297
|
rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
|
297
298
|
sync_es="$?"
|
299
|
+
echo $sync_es > #{fsyncexit}
|
298
300
|
find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
|
299
301
|
EOF
|
300
302
|
|
@@ -320,23 +322,24 @@ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem
|
|
320
322
|
EOF
|
321
323
|
else
|
322
324
|
coda +=<<-EOF
|
323
|
-
|
325
|
+
##{exec_cmd} system clean
|
324
326
|
if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
325
327
|
rm -Rfv #{contain} &>> #{fsync}
|
326
328
|
else
|
327
329
|
echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
328
330
|
fi
|
329
|
-
unset sync_es
|
330
331
|
EOF
|
331
332
|
|
332
333
|
end
|
333
334
|
end
|
334
335
|
end
|
336
|
+
|
335
337
|
coda +=<<-EOF
|
336
338
|
|
337
339
|
# Write exit status to file
|
338
340
|
echo $exit_status > #{fexit}
|
339
341
|
EOF
|
342
|
+
|
340
343
|
if sync
|
341
344
|
coda +=<<-EOF
|
342
345
|
if [ "$sync_es" == '0' ]; then
|
@@ -362,6 +365,10 @@ EOF
|
|
362
365
|
slurm_basedir = options[:slurm_basedir]
|
363
366
|
dependencies = options.delete :slurm_dependencies
|
364
367
|
dependencies = [] if dependencies.nil?
|
368
|
+
|
369
|
+
canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
|
370
|
+
dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
|
371
|
+
|
365
372
|
Open.mkdir slurm_basedir
|
366
373
|
|
367
374
|
dry_run = options.delete :dry_run
|
@@ -370,6 +377,7 @@ EOF
|
|
370
377
|
ferr = File.join(slurm_basedir, 'std.err')
|
371
378
|
fjob = File.join(slurm_basedir, 'job.id')
|
372
379
|
fdep = File.join(slurm_basedir, 'dependencies.list')
|
380
|
+
fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
|
373
381
|
fexit = File.join(slurm_basedir, 'exit.status')
|
374
382
|
fsync = File.join(slurm_basedir, 'sync.log')
|
375
383
|
fcmd = File.join(slurm_basedir, 'command.slurm')
|
@@ -401,8 +409,21 @@ EOF
|
|
401
409
|
Open.rm fexit
|
402
410
|
Open.rm fout
|
403
411
|
Open.rm ferr
|
412
|
+
|
404
413
|
Open.write(fdep, dependencies * "\n") if dependencies.any?
|
405
|
-
|
414
|
+
Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
|
415
|
+
|
416
|
+
|
417
|
+
dep_str = '--dependency='
|
418
|
+
normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
|
419
|
+
canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
|
420
|
+
|
421
|
+
if normal_dep_str.nil? && canfail_dep_str.nil?
|
422
|
+
dep_str = ""
|
423
|
+
else
|
424
|
+
dep_str += [normal_dep_str, canfail_dep_str].compact * ","
|
425
|
+
end
|
426
|
+
|
406
427
|
job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
|
407
428
|
Log.debug "SBATCH job id: #{job}"
|
408
429
|
Open.write(fjob, job.to_s)
|
@@ -527,7 +548,7 @@ EOF
|
|
527
548
|
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
|
528
549
|
end
|
529
550
|
|
530
|
-
cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
|
551
|
+
cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
|
531
552
|
|
532
553
|
template = self.template(cmd, options)
|
533
554
|
jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
|
data/lib/rbbt/persist.rb
CHANGED
@@ -110,6 +110,8 @@ module Persist
|
|
110
110
|
def self.load_file(path, type)
|
111
111
|
begin
|
112
112
|
case (type || :marshal).to_sym
|
113
|
+
when :path
|
114
|
+
path
|
113
115
|
when :nil
|
114
116
|
nil
|
115
117
|
when :boolean
|
@@ -167,6 +169,8 @@ module Persist
|
|
167
169
|
end
|
168
170
|
|
169
171
|
case (type || :marshal).to_sym
|
172
|
+
when :path
|
173
|
+
nil
|
170
174
|
when :nil
|
171
175
|
nil
|
172
176
|
when :boolean
|
data/share/rbbt_commands/slurm/clean
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt-util'
|
4
|
+
require 'rbbt/util/simpleopt'
|
5
|
+
|
6
|
+
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
|
+
|
8
|
+
options = SOPT.setup <<EOF
|
9
|
+
|
10
|
+
Clean error or aborted jobs
|
11
|
+
|
12
|
+
$ rbbt mnl [options]
|
13
|
+
|
14
|
+
-h--help Print this help
|
15
|
+
-d--done Done jobs only
|
16
|
+
-e--error Error jobs only
|
17
|
+
-a--aborted SLURM aboted jobs
|
18
|
+
-j--job* Job ids
|
19
|
+
-s--search* Regular expression
|
20
|
+
-t--tail* Show the last lines of the STDERR
|
21
|
+
-SBP--sbatch_parameters show sbatch parameters
|
22
|
+
-dr--dry_run Do not erase anything
|
23
|
+
EOF
|
24
|
+
|
25
|
+
if options[:help]
|
26
|
+
if defined? rbbt_usage
|
27
|
+
rbbt_usage
|
28
|
+
else
|
29
|
+
puts SOPT.doc
|
30
|
+
end
|
31
|
+
exit 0
|
32
|
+
end
|
33
|
+
|
34
|
+
Log.severity = 4
|
35
|
+
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run
|
36
|
+
|
37
|
+
workdir = File.expand_path('~/rbbt-slurm')
|
38
|
+
Path.setup(workdir)
|
39
|
+
|
40
|
+
running_jobs = begin
|
41
|
+
squeue_txt = CMD.cmd('squeue').read
|
42
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
43
|
+
rescue
|
44
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
45
|
+
squeue_txt = nil
|
46
|
+
$norunningjobs = true
|
47
|
+
[]
|
48
|
+
end
|
49
|
+
|
50
|
+
if squeue_txt
|
51
|
+
job_nodes = {}
|
52
|
+
squeue_txt.split("\n").each do |line|
|
53
|
+
parts = line.strip.split(/\s+/)
|
54
|
+
job_nodes[parts.first] = parts.last.split(",")
|
55
|
+
end
|
56
|
+
else
|
57
|
+
job_nodes = nil
|
58
|
+
end
|
59
|
+
|
60
|
+
count = 0
|
61
|
+
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
62
|
+
dir = File.dirname(fcmd)
|
63
|
+
|
64
|
+
if m = Open.read(fcmd).match(/#CMD: (.*)/)
|
65
|
+
cmd = m[1]
|
66
|
+
else
|
67
|
+
cmd = nil
|
68
|
+
end
|
69
|
+
|
70
|
+
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
71
|
+
exe = m[1]
|
72
|
+
else
|
73
|
+
exe = nil
|
74
|
+
end
|
75
|
+
|
76
|
+
if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
|
77
|
+
container_home = m[1]
|
78
|
+
else
|
79
|
+
container_home = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
|
83
|
+
if File.exists?(fid = File.join(dir, 'job.id'))
|
84
|
+
id = Open.read(fid).chomp
|
85
|
+
else
|
86
|
+
id = nil
|
87
|
+
end
|
88
|
+
|
89
|
+
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
90
|
+
exit_status = Open.read(fstatus).to_i
|
91
|
+
else
|
92
|
+
exit_status = nil
|
93
|
+
end
|
94
|
+
|
95
|
+
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
96
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
97
|
+
elsif job_nodes[id]
|
98
|
+
nodes = job_nodes[id]
|
99
|
+
else
|
100
|
+
nodes = []
|
101
|
+
end
|
102
|
+
|
103
|
+
if File.exists?(File.join(dir, 'std.out'))
|
104
|
+
outt = File.mtime File.join(dir, 'std.out')
|
105
|
+
errt = File.mtime File.join(dir, 'std.err')
|
106
|
+
time_diff = Time.now - [outt, errt].max
|
107
|
+
end
|
108
|
+
|
109
|
+
fdep = File.join(dir, 'dependencies.list')
|
110
|
+
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
111
|
+
|
112
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
113
|
+
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
114
|
+
|
115
|
+
aborted = error = true if aborted.nil? && error.nil?
|
116
|
+
if done || error || aborted || running || queued || jobid || search
|
117
|
+
select = false
|
118
|
+
select = true if done && exit_status && exit_status.to_i == 0
|
119
|
+
select = true if error && exit_status && exit_status.to_i != 0
|
120
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
+
select = select && jobid.split(",").include?(id) if jobid
|
122
|
+
select = select && cmd.match(/#{search}/) if search
|
123
|
+
next unless select
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
puts Log.color(:yellow, "**ERASING**")
|
128
|
+
puts Log.color :blue, dir
|
129
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
130
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
131
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
132
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
133
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
134
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
135
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
136
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
137
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
138
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
139
|
+
|
140
|
+
if options[:sbatch_parameters]
|
141
|
+
puts Log.color(:magenta, "SBATCH parameters: ")
|
142
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
143
|
+
end
|
144
|
+
|
145
|
+
if tail && File.exists?(File.join(dir, 'std.err'))
|
146
|
+
if exit_status && exit_status != 0
|
147
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
148
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
149
|
+
elsif exit_status
|
150
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
151
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
152
|
+
else
|
153
|
+
puts Log.color(:magenta, "Log tail: ")
|
154
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
count += 1
|
159
|
+
|
160
|
+
Open.rm_rf dir unless dry_run
|
161
|
+
end
|
162
|
+
|
163
|
+
puts
|
164
|
+
puts "Found #{count} jobs"
|
165
|
+
|
data/share/rbbt_commands/slurm/list
CHANGED
@@ -20,15 +20,16 @@ $ rbbt mnl [options]
|
|
20
20
|
-j--job* Job ids
|
21
21
|
-s--search* Regular expression
|
22
22
|
-t--tail* Show the last lines of the STDERR
|
23
|
+
-SBP--sbatch_parameters show sbatch parameters
|
23
24
|
EOF
|
24
25
|
|
25
26
|
if options[:help]
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
if defined? rbbt_usage
|
28
|
+
rbbt_usage
|
29
|
+
else
|
30
|
+
puts SOPT.doc
|
31
|
+
end
|
32
|
+
exit 0
|
32
33
|
end
|
33
34
|
|
34
35
|
Log.severity = 4
|
@@ -38,101 +39,124 @@ workdir = File.expand_path('~/rbbt-slurm')
|
|
38
39
|
Path.setup(workdir)
|
39
40
|
|
40
41
|
running_jobs = begin
|
41
|
-
|
42
|
+
squeue_txt = CMD.cmd('squeue').read
|
43
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
42
44
|
rescue
|
43
|
-
|
44
|
-
|
45
|
-
|
45
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
46
|
+
squeue_txt = nil
|
47
|
+
$norunningjobs = true
|
48
|
+
[]
|
46
49
|
end
|
47
50
|
|
51
|
+
if squeue_txt
|
52
|
+
job_nodes = {}
|
53
|
+
squeue_txt.split("\n").each do |line|
|
54
|
+
parts = line.strip.split(/\s+/)
|
55
|
+
job_nodes[parts.first] = parts.last.split(",")
|
56
|
+
end
|
57
|
+
else
|
58
|
+
job_nodes = nil
|
59
|
+
end
|
60
|
+
|
48
61
|
count = 0
|
49
62
|
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
63
|
+
dir = File.dirname(fcmd)
|
64
|
+
|
65
|
+
if m = Open.read(fcmd).match(/#CMD: (.*)/)
|
66
|
+
cmd = m[1]
|
67
|
+
else
|
68
|
+
cmd = nil
|
69
|
+
end
|
70
|
+
|
71
|
+
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
72
|
+
exe = m[1]
|
73
|
+
else
|
74
|
+
exe = nil
|
75
|
+
end
|
76
|
+
|
77
|
+
if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
|
78
|
+
container_home = m[1]
|
79
|
+
else
|
80
|
+
container_home = nil
|
81
|
+
end
|
82
|
+
|
83
|
+
|
84
|
+
if File.exists?(fid = File.join(dir, 'job.id'))
|
85
|
+
id = Open.read(fid).chomp
|
86
|
+
else
|
87
|
+
id = nil
|
88
|
+
end
|
89
|
+
|
90
|
+
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
91
|
+
exit_status = Open.read(fstatus).to_i
|
92
|
+
else
|
93
|
+
exit_status = nil
|
94
|
+
end
|
95
|
+
|
96
|
+
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
97
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
98
|
+
elsif job_nodes[id]
|
99
|
+
nodes = job_nodes[id]
|
100
|
+
else
|
101
|
+
nodes = []
|
102
|
+
end
|
103
|
+
|
104
|
+
if File.exists?(File.join(dir, 'std.out'))
|
105
|
+
outt = File.mtime File.join(dir, 'std.out')
|
106
|
+
errt = File.mtime File.join(dir, 'std.err')
|
107
|
+
time_diff = Time.now - [outt, errt].max
|
108
|
+
end
|
109
|
+
|
110
|
+
fdep = File.join(dir, 'dependencies.list')
|
111
|
+
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
112
|
+
|
113
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
114
|
+
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
115
|
+
|
116
|
+
if done || error || aborted || running || queued || jobid || search
|
117
|
+
select = false
|
118
|
+
select = true if done && exit_status == 0
|
119
|
+
select = true if error && exit_status && exit_status != 0
|
120
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
+
select = true if queued && deps && (running_jobs & deps).any?
|
122
|
+
select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
123
|
+
select = true if jobid && jobid.split(",").include?(id)
|
124
|
+
select = true if search && cmd.match(/#{search}/)
|
125
|
+
next unless select
|
126
|
+
end
|
127
|
+
|
128
|
+
|
129
|
+
puts Log.color :blue, dir
|
130
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
131
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
132
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
133
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
134
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
135
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
136
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
137
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
138
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
139
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
140
|
+
|
141
|
+
if options[:sbatch_parameters]
|
142
|
+
puts Log.color(:magenta, "SBATCH parameters: ")
|
143
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
144
|
+
end
|
145
|
+
|
146
|
+
if tail && File.exists?(File.join(dir, 'std.err'))
|
147
|
+
if exit_status && exit_status != 0
|
148
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
149
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
150
|
+
elsif exit_status
|
151
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
152
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
153
|
+
else
|
154
|
+
puts Log.color(:magenta, "Log tail: ")
|
155
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
count += 1
|
136
160
|
|
137
161
|
end
|
138
162
|
|
data/share/rbbt_commands/slurm/orchestrate
CHANGED
@@ -25,7 +25,7 @@ $slurm_options = SOPT.get <<EOF
|
|
25
25
|
-t--task_cpus* Tasks
|
26
26
|
-W--workflows* Additional workflows
|
27
27
|
-tm--time* Time
|
28
|
-
-
|
28
|
+
-OR--orchestration_rules* Orchestration rules
|
29
29
|
-rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
|
30
30
|
EOF
|
31
31
|
|
@@ -43,5 +43,5 @@ class Step
|
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
|
46
|
+
ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
|
47
47
|
load Rbbt.share.rbbt_commands.workflow.task.find
|
data/share/rbbt_commands/workflow/task
CHANGED
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
|
|
20
20
|
puts
|
21
21
|
if workflow.nil?
|
22
22
|
puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
|
23
|
-
exit -1
|
23
|
+
exit! -1
|
24
24
|
end
|
25
25
|
|
26
26
|
if task.nil?
|
@@ -206,7 +206,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
|
|
206
206
|
EOF
|
207
207
|
|
208
208
|
workflow = ARGV.shift
|
209
|
-
usage and exit -1 if workflow.nil?
|
209
|
+
usage and exit! -1 if workflow.nil?
|
210
210
|
|
211
211
|
task = ARGV.shift
|
212
212
|
|
@@ -232,7 +232,8 @@ else
|
|
232
232
|
remote_workflows = {}
|
233
233
|
end
|
234
234
|
|
235
|
-
Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
|
235
|
+
#Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
|
236
|
+
Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
|
236
237
|
|
237
238
|
workflow = Workflow.require_workflow workflow
|
238
239
|
|
@@ -486,7 +487,7 @@ rescue ParameterException
|
|
486
487
|
puts
|
487
488
|
report_options saved_job_options
|
488
489
|
puts
|
489
|
-
exit -1
|
490
|
+
exit! -1
|
490
491
|
end
|
491
492
|
|
492
493
|
if options.delete(:list_job_files)
|
@@ -538,7 +539,7 @@ when Step
|
|
538
539
|
io.abort if io.respond_to? :abort
|
539
540
|
io.join if io.respond_to? :join
|
540
541
|
ensure
|
541
|
-
exit -1
|
542
|
+
exit! -1
|
542
543
|
end
|
543
544
|
rescue Exception
|
544
545
|
Log.exception $!
|
@@ -547,9 +548,11 @@ when Step
|
|
547
548
|
io.abort if io.respond_to? :abort
|
548
549
|
io.join if io.respond_to? :join
|
549
550
|
ensure
|
550
|
-
exit -1
|
551
|
+
exit! -1
|
551
552
|
end
|
552
553
|
end
|
554
|
+
elsif detach
|
555
|
+
exit! 0
|
553
556
|
else
|
554
557
|
res.join
|
555
558
|
out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-util
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.29.2
|
4
|
+
version: 5.29.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-01-
|
11
|
+
date: 2021-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -378,6 +378,7 @@ files:
|
|
378
378
|
- share/rbbt_commands/resource/produce
|
379
379
|
- share/rbbt_commands/resource/read
|
380
380
|
- share/rbbt_commands/rsync
|
381
|
+
- share/rbbt_commands/slurm/clean
|
381
382
|
- share/rbbt_commands/slurm/list
|
382
383
|
- share/rbbt_commands/slurm/orchestrate
|
383
384
|
- share/rbbt_commands/slurm/task
|