rbbt-util 5.29.2 → 5.29.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc/orchestrate.rb +12 -1
- data/lib/rbbt/hpc/slurm.rb +36 -15
- data/lib/rbbt/persist.rb +4 -0
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +120 -96
- data/share/rbbt_commands/slurm/orchestrate +2 -2
- data/share/rbbt_commands/workflow/task +9 -6
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ec6302ccfe3f38a074f7f0d10511090c8f4db4186228ad93adb2888e0edbf5e
|
4
|
+
data.tar.gz: fcaa50b654461f128b9539fc47ed00b008d3f713e89b8bbe963c2b898c3c168b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a2537aef150df77142a593742399d28bb96334decc0e33b69e9cbac2853085487100aa90cd711b342bac7eda536d15c080be22891d5b9f38bfa4601282ae5de
|
7
|
+
data.tar.gz: d79cc4afa294d63cebd79f73b759bc46c3c4e0285db7be7406a594aa49e5b572b0c4f91b98c73b5f64c31c908d58d8dee43853ec18b1ead9e814b4370de0d60d
|
data/lib/rbbt/hpc/orchestrate.rb
CHANGED
@@ -60,9 +60,14 @@ module HPC
|
|
60
60
|
return if job.done?
|
61
61
|
return unless job.path.split("/")[-4] == "jobs"
|
62
62
|
seen[:orchestration_target_job] ||= job
|
63
|
+
|
63
64
|
options.delete "recursive_clean"
|
65
|
+
options.delete "clean_task"
|
66
|
+
options.delete "clean"
|
64
67
|
options.delete "tail"
|
65
68
|
options.delete "printfile"
|
69
|
+
options.delete "detach"
|
70
|
+
|
66
71
|
rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
|
67
72
|
rules ||= {}
|
68
73
|
IndiferentHash.setup(rules)
|
@@ -75,7 +80,13 @@ module HPC
|
|
75
80
|
skip_dep = job_rules["chain_tasks"] &&
|
76
81
|
job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
|
77
82
|
job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
|
78
|
-
|
83
|
+
|
84
|
+
deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
|
85
|
+
if job.canfail_paths.include? dep.path
|
86
|
+
[deps].flatten.collect{|id| ['canfail', id] * ":"}
|
87
|
+
else
|
88
|
+
deps
|
89
|
+
end
|
79
90
|
end.flatten.compact.uniq
|
80
91
|
|
81
92
|
skip = true if job_rules[:skip]
|
data/lib/rbbt/hpc/slurm.rb
CHANGED
@@ -58,11 +58,11 @@ module HPC
|
|
58
58
|
when FalseClass
|
59
59
|
'--' << o << "=false"
|
60
60
|
else
|
61
|
-
['--' << o, "'#{v}'"] * " "
|
61
|
+
['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
|
62
62
|
end
|
63
63
|
end * " "
|
64
64
|
|
65
|
-
rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
|
65
|
+
rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
|
66
66
|
|
67
67
|
time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
|
68
68
|
|
@@ -76,6 +76,7 @@ module HPC
|
|
76
76
|
fjob = File.join(slurm_basedir, 'job.id')
|
77
77
|
fexit = File.join(slurm_basedir, 'exit.status')
|
78
78
|
fsync = File.join(slurm_basedir, 'sync.log')
|
79
|
+
fsyncexit = File.join(slurm_basedir, 'sync.status')
|
79
80
|
fcmd = File.join(slurm_basedir, 'command.slurm')
|
80
81
|
|
81
82
|
#{{{ GENERATE TEMPLATE
|
@@ -107,10 +108,6 @@ module HPC
|
|
107
108
|
EOF
|
108
109
|
end
|
109
110
|
|
110
|
-
header +=<<-EOF
|
111
|
-
#CMD: #{rbbt_cmd}
|
112
|
-
EOF
|
113
|
-
|
114
111
|
# ENV
|
115
112
|
env = ""
|
116
113
|
env +=<<-EOF
|
@@ -246,7 +243,7 @@ EOF
|
|
246
243
|
end
|
247
244
|
|
248
245
|
if contain
|
249
|
-
rbbt_cmd << " " << %(--workdir_all='#{contain}')
|
246
|
+
rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/.rbbt/var/jobs')
|
250
247
|
end
|
251
248
|
end
|
252
249
|
|
@@ -256,6 +253,10 @@ EOF
|
|
256
253
|
#{rbbt_cmd}
|
257
254
|
EOF
|
258
255
|
|
256
|
+
header +=<<-EOF
|
257
|
+
#CMD: #{rbbt_cmd}
|
258
|
+
EOF
|
259
|
+
|
259
260
|
run +=<<-EOF
|
260
261
|
|
261
262
|
# Run command
|
@@ -273,10 +274,10 @@ EOF
|
|
273
274
|
coda +=<<-EOF
|
274
275
|
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
|
275
276
|
EOF
|
276
|
-
else
|
277
|
-
coda +=<<-EOF
|
278
|
-
rbbt system clean all -q &>> #{fsync}
|
279
|
-
EOF
|
277
|
+
# else
|
278
|
+
# coda +=<<-EOF
|
279
|
+
#rbbt system clean all -q &>> #{fsync}
|
280
|
+
#EOF
|
280
281
|
end
|
281
282
|
|
282
283
|
if sync.include?("=>")
|
@@ -295,6 +296,7 @@ EOF
|
|
295
296
|
mkdir -p "$(dirname '#{target}')"
|
296
297
|
rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
|
297
298
|
sync_es="$?"
|
299
|
+
echo $sync_es > #{fsyncexit}
|
298
300
|
find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
|
299
301
|
EOF
|
300
302
|
|
@@ -320,23 +322,24 @@ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem
|
|
320
322
|
EOF
|
321
323
|
else
|
322
324
|
coda +=<<-EOF
|
323
|
-
|
325
|
+
##{exec_cmd} system clean
|
324
326
|
if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
325
327
|
rm -Rfv #{contain} &>> #{fsync}
|
326
328
|
else
|
327
329
|
echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
328
330
|
fi
|
329
|
-
unset sync_es
|
330
331
|
EOF
|
331
332
|
|
332
333
|
end
|
333
334
|
end
|
334
335
|
end
|
336
|
+
|
335
337
|
coda +=<<-EOF
|
336
338
|
|
337
339
|
# Write exit status to file
|
338
340
|
echo $exit_status > #{fexit}
|
339
341
|
EOF
|
342
|
+
|
340
343
|
if sync
|
341
344
|
coda +=<<-EOF
|
342
345
|
if [ "$sync_es" == '0' ]; then
|
@@ -362,6 +365,10 @@ EOF
|
|
362
365
|
slurm_basedir = options[:slurm_basedir]
|
363
366
|
dependencies = options.delete :slurm_dependencies
|
364
367
|
dependencies = [] if dependencies.nil?
|
368
|
+
|
369
|
+
canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
|
370
|
+
dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
|
371
|
+
|
365
372
|
Open.mkdir slurm_basedir
|
366
373
|
|
367
374
|
dry_run = options.delete :dry_run
|
@@ -370,6 +377,7 @@ EOF
|
|
370
377
|
ferr = File.join(slurm_basedir, 'std.err')
|
371
378
|
fjob = File.join(slurm_basedir, 'job.id')
|
372
379
|
fdep = File.join(slurm_basedir, 'dependencies.list')
|
380
|
+
fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
|
373
381
|
fexit = File.join(slurm_basedir, 'exit.status')
|
374
382
|
fsync = File.join(slurm_basedir, 'sync.log')
|
375
383
|
fcmd = File.join(slurm_basedir, 'command.slurm')
|
@@ -401,8 +409,21 @@ EOF
|
|
401
409
|
Open.rm fexit
|
402
410
|
Open.rm fout
|
403
411
|
Open.rm ferr
|
412
|
+
|
404
413
|
Open.write(fdep, dependencies * "\n") if dependencies.any?
|
405
|
-
|
414
|
+
Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
|
415
|
+
|
416
|
+
|
417
|
+
dep_str = '--dependency='
|
418
|
+
normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
|
419
|
+
canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
|
420
|
+
|
421
|
+
if normal_dep_str.nil? && canfail_dep_str.nil?
|
422
|
+
dep_str = ""
|
423
|
+
else
|
424
|
+
dep_str += [normal_dep_str, canfail_dep_str].compact * ","
|
425
|
+
end
|
426
|
+
|
406
427
|
job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
|
407
428
|
Log.debug "SBATCH job id: #{job}"
|
408
429
|
Open.write(fjob, job.to_s)
|
@@ -527,7 +548,7 @@ EOF
|
|
527
548
|
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
|
528
549
|
end
|
529
550
|
|
530
|
-
cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
|
551
|
+
cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
|
531
552
|
|
532
553
|
template = self.template(cmd, options)
|
533
554
|
jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
|
data/lib/rbbt/persist.rb
CHANGED
@@ -110,6 +110,8 @@ module Persist
|
|
110
110
|
def self.load_file(path, type)
|
111
111
|
begin
|
112
112
|
case (type || :marshal).to_sym
|
113
|
+
when :path
|
114
|
+
path
|
113
115
|
when :nil
|
114
116
|
nil
|
115
117
|
when :boolean
|
@@ -167,6 +169,8 @@ module Persist
|
|
167
169
|
end
|
168
170
|
|
169
171
|
case (type || :marshal).to_sym
|
172
|
+
when :path
|
173
|
+
nil
|
170
174
|
when :nil
|
171
175
|
nil
|
172
176
|
when :boolean
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt-util'
|
4
|
+
require 'rbbt/util/simpleopt'
|
5
|
+
|
6
|
+
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
|
+
|
8
|
+
options = SOPT.setup <<EOF
|
9
|
+
|
10
|
+
Clean error or aborted jobs
|
11
|
+
|
12
|
+
$ rbbt mnl [options]
|
13
|
+
|
14
|
+
-h--help Print this help
|
15
|
+
-d--done Done jobs only
|
16
|
+
-e--error Error jobs only
|
17
|
+
-a--aborted SLURM aboted jobs
|
18
|
+
-j--job* Job ids
|
19
|
+
-s--search* Regular expression
|
20
|
+
-t--tail* Show the last lines of the STDERR
|
21
|
+
-SBP--sbatch_parameters show sbatch parameters
|
22
|
+
-dr--dry_run Do not erase anything
|
23
|
+
EOF
|
24
|
+
|
25
|
+
if options[:help]
|
26
|
+
if defined? rbbt_usage
|
27
|
+
rbbt_usage
|
28
|
+
else
|
29
|
+
puts SOPT.doc
|
30
|
+
end
|
31
|
+
exit 0
|
32
|
+
end
|
33
|
+
|
34
|
+
Log.severity = 4
|
35
|
+
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run
|
36
|
+
|
37
|
+
workdir = File.expand_path('~/rbbt-slurm')
|
38
|
+
Path.setup(workdir)
|
39
|
+
|
40
|
+
running_jobs = begin
|
41
|
+
squeue_txt = CMD.cmd('squeue').read
|
42
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
43
|
+
rescue
|
44
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
45
|
+
squeue_txt = nil
|
46
|
+
$norunningjobs = true
|
47
|
+
[]
|
48
|
+
end
|
49
|
+
|
50
|
+
if squeue_txt
|
51
|
+
job_nodes = {}
|
52
|
+
squeue_txt.split("\n").each do |line|
|
53
|
+
parts = line.strip.split(/\s+/)
|
54
|
+
job_nodes[parts.first] = parts.last.split(",")
|
55
|
+
end
|
56
|
+
else
|
57
|
+
job_nodes = nil
|
58
|
+
end
|
59
|
+
|
60
|
+
count = 0
|
61
|
+
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
62
|
+
dir = File.dirname(fcmd)
|
63
|
+
|
64
|
+
if m = Open.read(fcmd).match(/#CMD: (.*)/)
|
65
|
+
cmd = m[1]
|
66
|
+
else
|
67
|
+
cmd = nil
|
68
|
+
end
|
69
|
+
|
70
|
+
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
71
|
+
exe = m[1]
|
72
|
+
else
|
73
|
+
exe = nil
|
74
|
+
end
|
75
|
+
|
76
|
+
if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
|
77
|
+
container_home = m[1]
|
78
|
+
else
|
79
|
+
container_home = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
|
83
|
+
if File.exists?(fid = File.join(dir, 'job.id'))
|
84
|
+
id = Open.read(fid).chomp
|
85
|
+
else
|
86
|
+
id = nil
|
87
|
+
end
|
88
|
+
|
89
|
+
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
90
|
+
exit_status = Open.read(fstatus).to_i
|
91
|
+
else
|
92
|
+
exit_status = nil
|
93
|
+
end
|
94
|
+
|
95
|
+
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
96
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
97
|
+
elsif job_nodes[id]
|
98
|
+
nodes = job_nodes[id]
|
99
|
+
else
|
100
|
+
nodes = []
|
101
|
+
end
|
102
|
+
|
103
|
+
if File.exists?(File.join(dir, 'std.out'))
|
104
|
+
outt = File.mtime File.join(dir, 'std.out')
|
105
|
+
errt = File.mtime File.join(dir, 'std.err')
|
106
|
+
time_diff = Time.now - [outt, errt].max
|
107
|
+
end
|
108
|
+
|
109
|
+
fdep = File.join(dir, 'dependencies.list')
|
110
|
+
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
111
|
+
|
112
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
113
|
+
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
114
|
+
|
115
|
+
aborted = error = true if aborted.nil? && error.nil?
|
116
|
+
if done || error || aborted || running || queued || jobid || search
|
117
|
+
select = false
|
118
|
+
select = true if done && exit_status && exit_status.to_i == 0
|
119
|
+
select = true if error && exit_status && exit_status.to_i != 0
|
120
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
+
select = select && jobid.split(",").include?(id) if jobid
|
122
|
+
select = select && cmd.match(/#{search}/) if search
|
123
|
+
next unless select
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
puts Log.color(:yellow, "**ERASING**")
|
128
|
+
puts Log.color :blue, dir
|
129
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
130
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
131
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
132
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
133
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
134
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
135
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
136
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
137
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
138
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
139
|
+
|
140
|
+
if options[:sbatch_parameters]
|
141
|
+
puts Log.color(:magenta, "SBATCH parameters: ")
|
142
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
143
|
+
end
|
144
|
+
|
145
|
+
if tail && File.exists?(File.join(dir, 'std.err'))
|
146
|
+
if exit_status && exit_status != 0
|
147
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
148
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
149
|
+
elsif exit_status
|
150
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
151
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
152
|
+
else
|
153
|
+
puts Log.color(:magenta, "Log tail: ")
|
154
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
count += 1
|
159
|
+
|
160
|
+
Open.rm_rf dir unless dry_run
|
161
|
+
end
|
162
|
+
|
163
|
+
puts
|
164
|
+
puts "Found #{count} jobs"
|
165
|
+
|
@@ -20,15 +20,16 @@ $ rbbt mnl [options]
|
|
20
20
|
-j--job* Job ids
|
21
21
|
-s--search* Regular expression
|
22
22
|
-t--tail* Show the last lines of the STDERR
|
23
|
+
-SBP--sbatch_parameters show sbatch parameters
|
23
24
|
EOF
|
24
25
|
|
25
26
|
if options[:help]
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
if defined? rbbt_usage
|
28
|
+
rbbt_usage
|
29
|
+
else
|
30
|
+
puts SOPT.doc
|
31
|
+
end
|
32
|
+
exit 0
|
32
33
|
end
|
33
34
|
|
34
35
|
Log.severity = 4
|
@@ -38,101 +39,124 @@ workdir = File.expand_path('~/rbbt-slurm')
|
|
38
39
|
Path.setup(workdir)
|
39
40
|
|
40
41
|
running_jobs = begin
|
41
|
-
|
42
|
+
squeue_txt = CMD.cmd('squeue').read
|
43
|
+
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
42
44
|
rescue
|
43
|
-
|
44
|
-
|
45
|
-
|
45
|
+
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
46
|
+
squeue_txt = nil
|
47
|
+
$norunningjobs = true
|
48
|
+
[]
|
46
49
|
end
|
47
50
|
|
51
|
+
if squeue_txt
|
52
|
+
job_nodes = {}
|
53
|
+
squeue_txt.split("\n").each do |line|
|
54
|
+
parts = line.strip.split(/\s+/)
|
55
|
+
job_nodes[parts.first] = parts.last.split(",")
|
56
|
+
end
|
57
|
+
else
|
58
|
+
job_nodes = nil
|
59
|
+
end
|
60
|
+
|
48
61
|
count = 0
|
49
62
|
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
63
|
+
dir = File.dirname(fcmd)
|
64
|
+
|
65
|
+
if m = Open.read(fcmd).match(/#CMD: (.*)/)
|
66
|
+
cmd = m[1]
|
67
|
+
else
|
68
|
+
cmd = nil
|
69
|
+
end
|
70
|
+
|
71
|
+
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
72
|
+
exe = m[1]
|
73
|
+
else
|
74
|
+
exe = nil
|
75
|
+
end
|
76
|
+
|
77
|
+
if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
|
78
|
+
container_home = m[1]
|
79
|
+
else
|
80
|
+
container_home = nil
|
81
|
+
end
|
82
|
+
|
83
|
+
|
84
|
+
if File.exists?(fid = File.join(dir, 'job.id'))
|
85
|
+
id = Open.read(fid).chomp
|
86
|
+
else
|
87
|
+
id = nil
|
88
|
+
end
|
89
|
+
|
90
|
+
if File.exists?(fstatus = File.join(dir, 'exit.status'))
|
91
|
+
exit_status = Open.read(fstatus).to_i
|
92
|
+
else
|
93
|
+
exit_status = nil
|
94
|
+
end
|
95
|
+
|
96
|
+
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
97
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
98
|
+
elsif job_nodes[id]
|
99
|
+
nodes = job_nodes[id]
|
100
|
+
else
|
101
|
+
nodes = []
|
102
|
+
end
|
103
|
+
|
104
|
+
if File.exists?(File.join(dir, 'std.out'))
|
105
|
+
outt = File.mtime File.join(dir, 'std.out')
|
106
|
+
errt = File.mtime File.join(dir, 'std.err')
|
107
|
+
time_diff = Time.now - [outt, errt].max
|
108
|
+
end
|
109
|
+
|
110
|
+
fdep = File.join(dir, 'dependencies.list')
|
111
|
+
deps = Open.read(fdep).split("\n") if File.exists?(fdep)
|
112
|
+
|
113
|
+
fcadep = File.join(dir, 'canfail_dependencies.list')
|
114
|
+
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
115
|
+
|
116
|
+
if done || error || aborted || running || queued || jobid || search
|
117
|
+
select = false
|
118
|
+
select = true if done && exit_status == 0
|
119
|
+
select = true if error && exit_status && exit_status != 0
|
120
|
+
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
+
select = true if queued && deps && (running_jobs & deps).any?
|
122
|
+
select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
123
|
+
select = true if jobid && jobid.split(",").include?(id)
|
124
|
+
select = true if search && cmd.match(/#{search}/)
|
125
|
+
next unless select
|
126
|
+
end
|
127
|
+
|
128
|
+
|
129
|
+
puts Log.color :blue, dir
|
130
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
131
|
+
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
132
|
+
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
133
|
+
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
134
|
+
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
135
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
136
|
+
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
137
|
+
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
138
|
+
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
139
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
140
|
+
|
141
|
+
if options[:sbatch_parameters]
|
142
|
+
puts Log.color(:magenta, "SBATCH parameters: ")
|
143
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
144
|
+
end
|
145
|
+
|
146
|
+
if tail && File.exists?(File.join(dir, 'std.err'))
|
147
|
+
if exit_status && exit_status != 0
|
148
|
+
puts Log.color(:magenta, "First error or exception found: ")
|
149
|
+
puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
|
150
|
+
elsif exit_status
|
151
|
+
puts Log.color(:magenta, "Completed jobs: ")
|
152
|
+
puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
|
153
|
+
else
|
154
|
+
puts Log.color(:magenta, "Log tail: ")
|
155
|
+
puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
count += 1
|
136
160
|
|
137
161
|
end
|
138
162
|
|
@@ -25,7 +25,7 @@ $slurm_options = SOPT.get <<EOF
|
|
25
25
|
-t--task_cpus* Tasks
|
26
26
|
-W--workflows* Additional workflows
|
27
27
|
-tm--time* Time
|
28
|
-
-
|
28
|
+
-OR--orchestration_rules* Orchestration rules
|
29
29
|
-rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
|
30
30
|
EOF
|
31
31
|
|
@@ -43,5 +43,5 @@ class Step
|
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
|
46
|
+
ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
|
47
47
|
load Rbbt.share.rbbt_commands.workflow.task.find
|
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
|
|
20
20
|
puts
|
21
21
|
if workflow.nil?
|
22
22
|
puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
|
23
|
-
exit -1
|
23
|
+
exit! -1
|
24
24
|
end
|
25
25
|
|
26
26
|
if task.nil?
|
@@ -206,7 +206,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
|
|
206
206
|
EOF
|
207
207
|
|
208
208
|
workflow = ARGV.shift
|
209
|
-
usage and exit -1 if workflow.nil?
|
209
|
+
usage and exit! -1 if workflow.nil?
|
210
210
|
|
211
211
|
task = ARGV.shift
|
212
212
|
|
@@ -232,7 +232,8 @@ else
|
|
232
232
|
remote_workflows = {}
|
233
233
|
end
|
234
234
|
|
235
|
-
Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
|
235
|
+
#Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
|
236
|
+
Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
|
236
237
|
|
237
238
|
workflow = Workflow.require_workflow workflow
|
238
239
|
|
@@ -486,7 +487,7 @@ rescue ParameterException
|
|
486
487
|
puts
|
487
488
|
report_options saved_job_options
|
488
489
|
puts
|
489
|
-
exit -1
|
490
|
+
exit! -1
|
490
491
|
end
|
491
492
|
|
492
493
|
if options.delete(:list_job_files)
|
@@ -538,7 +539,7 @@ when Step
|
|
538
539
|
io.abort if io.respond_to? :abort
|
539
540
|
io.join if io.respond_to? :join
|
540
541
|
ensure
|
541
|
-
exit -1
|
542
|
+
exit! -1
|
542
543
|
end
|
543
544
|
rescue Exception
|
544
545
|
Log.exception $!
|
@@ -547,9 +548,11 @@ when Step
|
|
547
548
|
io.abort if io.respond_to? :abort
|
548
549
|
io.join if io.respond_to? :join
|
549
550
|
ensure
|
550
|
-
exit -1
|
551
|
+
exit! -1
|
551
552
|
end
|
552
553
|
end
|
554
|
+
elsif detach
|
555
|
+
exit! 0
|
553
556
|
else
|
554
557
|
res.join
|
555
558
|
out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-util
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.29.2
|
4
|
+
version: 5.29.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-01-
|
11
|
+
date: 2021-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -378,6 +378,7 @@ files:
|
|
378
378
|
- share/rbbt_commands/resource/produce
|
379
379
|
- share/rbbt_commands/resource/read
|
380
380
|
- share/rbbt_commands/rsync
|
381
|
+
- share/rbbt_commands/slurm/clean
|
381
382
|
- share/rbbt_commands/slurm/list
|
382
383
|
- share/rbbt_commands/slurm/orchestrate
|
383
384
|
- share/rbbt_commands/slurm/task
|