rbbt-util 5.28.12 → 5.29.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +1 -549
- data/lib/rbbt/hpc/orchestrate.rb +111 -0
- data/lib/rbbt/hpc/slurm.rb +592 -0
- data/lib/rbbt/persist.rb +4 -0
- data/lib/rbbt/tsv/attach.rb +7 -4
- data/lib/rbbt/tsv/parallel.rb +0 -3
- data/lib/rbbt/util/misc/inspect.rb +13 -3
- data/lib/rbbt/workflow.rb +2 -1
- data/lib/rbbt/workflow/accessor.rb +7 -1
- data/lib/rbbt/workflow/definition.rb +1 -0
- data/lib/rbbt/workflow/examples.rb +2 -2
- data/lib/rbbt/workflow/step.rb +8 -5
- data/lib/rbbt/workflow/step/accessor.rb +25 -15
- data/lib/rbbt/workflow/step/dependencies.rb +1 -2
- data/lib/rbbt/workflow/step/run.rb +0 -1
- data/lib/rbbt/workflow/util/orchestrator.rb +14 -9
- data/lib/rbbt/workflow/util/provenance.rb +5 -2
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +165 -0
- data/share/rbbt_commands/slurm/orchestrate +47 -0
- data/share/rbbt_commands/{workflow/slurm → slurm/task} +10 -3
- data/share/rbbt_commands/workflow/task +9 -6
- data/test/rbbt/test_workflow.rb +7 -7
- data/test/rbbt/tsv/test_attach.rb +86 -6
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ec6302ccfe3f38a074f7f0d10511090c8f4db4186228ad93adb2888e0edbf5e
|
4
|
+
data.tar.gz: fcaa50b654461f128b9539fc47ed00b008d3f713e89b8bbe963c2b898c3c168b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a2537aef150df77142a593742399d28bb96334decc0e33b69e9cbac2853085487100aa90cd711b342bac7eda536d15c080be22891d5b9f38bfa4601282ae5de
|
7
|
+
data.tar.gz: d79cc4afa294d63cebd79f73b759bc46c3c4e0285db7be7406a594aa49e5b572b0c4f91b98c73b5f64c31c908d58d8dee43853ec18b1ead9e814b4370de0d60d
|
data/lib/rbbt/hpc.rb
CHANGED
@@ -1,551 +1,3 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/util/cmd'
|
3
|
-
|
4
|
-
module Marenostrum
|
5
|
-
SERVER='mn1'
|
6
|
-
class SBATCH < Exception;
|
7
|
-
attr_accessor :directory
|
8
|
-
def initialize(directory)
|
9
|
-
@directory = directory
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
module SLURM
|
14
|
-
|
15
|
-
def self.template(args, options = {})
|
16
|
-
|
17
|
-
development = options.delete :drbbt
|
18
|
-
singularity = options.delete :singularity
|
19
|
-
contain = options.delete :contain
|
20
|
-
sync = options.delete :sync
|
21
|
-
user_group = options.delete :user_group
|
22
|
-
contain_and_sync = options.delete :contain_and_sync
|
23
|
-
wipe_container = options.delete :wipe_container
|
24
|
-
copy_image = options.delete :copy_image
|
25
|
-
exclusive = options.delete :exclusive
|
26
|
-
highmem = options.delete :highmem
|
27
|
-
|
28
|
-
queue = options.delete(:queue) || 'bsc_ls'
|
29
|
-
task_cpus = options.delete(:task_cpus) || 1
|
30
|
-
nodes = options.delete(:nodes) || 1
|
31
|
-
time = options.delete(:time) || "0:00:10"
|
32
|
-
|
33
|
-
inputs_dir = options.delete :inputs_dir
|
34
|
-
config_keys = options.delete :config_keys
|
35
|
-
|
36
|
-
user = ENV['USER'] || `whoami`.strip
|
37
|
-
group = File.basename(File.dirname(ENV['HOME']))
|
38
|
-
|
39
|
-
if contain_and_sync
|
40
|
-
contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
|
41
|
-
sync = "~/.rbbt/var/jobs" if sync.nil?
|
42
|
-
wipe_container = "post" if wipe_container.nil?
|
43
|
-
end
|
44
|
-
|
45
|
-
contain = nil if contain == "" || contain == "none"
|
46
|
-
sync = nil if sync == "" || sync == "none"
|
47
|
-
|
48
|
-
contain = File.expand_path(contain) if contain
|
49
|
-
|
50
|
-
name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
|
51
|
-
options.delete(:name)
|
52
|
-
slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
|
53
|
-
options.delete(:slurm_basedir)
|
54
|
-
|
55
|
-
rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
|
56
|
-
|
57
|
-
rbbt_cmd += " " << options.collect do |o,v|
|
58
|
-
o = o.to_s
|
59
|
-
case v
|
60
|
-
when TrueClass
|
61
|
-
'--' << o
|
62
|
-
when FalseClass
|
63
|
-
'--' << o << "=false"
|
64
|
-
else
|
65
|
-
['--' << o, "'#{v}'"] * " "
|
66
|
-
end
|
67
|
-
end * " "
|
68
|
-
|
69
|
-
rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
|
70
|
-
|
71
|
-
|
72
|
-
time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
|
73
|
-
|
74
|
-
|
75
|
-
#{{{ PREPARE LOCAL LOGFILES
|
76
|
-
|
77
|
-
Open.mkdir slurm_basedir
|
78
|
-
|
79
|
-
fout = File.join(slurm_basedir, 'std.out')
|
80
|
-
ferr = File.join(slurm_basedir, 'std.err')
|
81
|
-
fjob = File.join(slurm_basedir, 'job.id')
|
82
|
-
fexit = File.join(slurm_basedir, 'exit.status')
|
83
|
-
fsync = File.join(slurm_basedir, 'sync.log')
|
84
|
-
fcmd = File.join(slurm_basedir, 'command.slurm')
|
85
|
-
|
86
|
-
#{{{ GENERATE TEMPLATE
|
87
|
-
|
88
|
-
# HEADER
|
89
|
-
header =<<-EOF
|
90
|
-
#!/bin/bash
|
91
|
-
#SBATCH --qos="#{queue}"
|
92
|
-
#SBATCH --job-name="#{name}"
|
93
|
-
#SBATCH --workdir="#{Dir.pwd}"
|
94
|
-
#SBATCH --output="#{fout}"
|
95
|
-
#SBATCH --error="#{ferr}"
|
96
|
-
#SBATCH --cpus-per-task="#{task_cpus}"
|
97
|
-
#SBATCH --time="#{time}"
|
98
|
-
#SBATCH --nodes="#{nodes}"
|
99
|
-
EOF
|
100
|
-
|
101
|
-
if highmem
|
102
|
-
header +=<<-EOF
|
103
|
-
#SBATCH --constraint=highmem
|
104
|
-
EOF
|
105
|
-
end
|
106
|
-
|
107
|
-
if exclusive
|
108
|
-
header +=<<-EOF
|
109
|
-
#SBATCH --exclusive
|
110
|
-
EOF
|
111
|
-
end
|
112
|
-
|
113
|
-
header +=<<-EOF
|
114
|
-
#CMD: #{rbbt_cmd}
|
115
|
-
EOF
|
116
|
-
|
117
|
-
# ENV
|
118
|
-
env = ""
|
119
|
-
env +=<<-EOF
|
120
|
-
# Prepare env
|
121
|
-
[[ -f ~/config/load.sh ]] && source ~/config/load.sh
|
122
|
-
module load java
|
123
|
-
|
124
|
-
# Calculate max available memory
|
125
|
-
let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK"
|
126
|
-
EOF
|
127
|
-
|
128
|
-
|
129
|
-
# RUN
|
130
|
-
run = ""
|
131
|
-
exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
|
132
|
-
|
133
|
-
|
134
|
-
if singularity
|
135
|
-
#{{{ SINGULARITY
|
136
|
-
|
137
|
-
singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
|
138
|
-
|
139
|
-
env +=<<-EOF
|
140
|
-
module load intel/2018.1
|
141
|
-
module load singularity
|
142
|
-
PROJECTS_ROOT="/gpfs/projects/bsc26/"
|
143
|
-
SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
|
144
|
-
SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
|
145
|
-
SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
|
146
|
-
mkdir -p "$SINGULARITY_RUBY_INLINE"
|
147
|
-
EOF
|
148
|
-
|
149
|
-
prep = ""
|
150
|
-
|
151
|
-
if contain
|
152
|
-
scratch_group_dir = File.join('/gpfs/scratch/', group)
|
153
|
-
projects_group_dir = File.join('/gpfs/projects/', group)
|
154
|
-
|
155
|
-
prep +=<<-EOF
|
156
|
-
|
157
|
-
# Prepare container dir
|
158
|
-
CONTAINER_DIR="#{contain}"
|
159
|
-
mkdir -p $CONTAINER_DIR/.rbbt/etc/
|
160
|
-
|
161
|
-
for dir in .ruby_inline git home; do
|
162
|
-
mkdir -p $CONTAINER_DIR/$dir
|
163
|
-
done
|
164
|
-
|
165
|
-
for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
|
166
|
-
mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
|
167
|
-
done
|
168
|
-
|
169
|
-
# Copy environment
|
170
|
-
cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
|
171
|
-
|
172
|
-
# Set search_paths
|
173
|
-
echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
|
174
|
-
echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
175
|
-
echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
176
|
-
echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
177
|
-
echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
178
|
-
echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
179
|
-
echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
180
|
-
EOF
|
181
|
-
|
182
|
-
if user_group && group != user_group
|
183
|
-
prep +=<<-EOF
|
184
|
-
|
185
|
-
# Add user_group search_path
|
186
|
-
echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
187
|
-
EOF
|
188
|
-
end
|
189
|
-
|
190
|
-
if inputs_dir
|
191
|
-
prep +=<<-EOF
|
192
|
-
|
193
|
-
# Copy inputs
|
194
|
-
[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
|
195
|
-
EOF
|
196
|
-
rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
|
197
|
-
end
|
198
|
-
|
199
|
-
if copy_image
|
200
|
-
prep +=<<EOF
|
201
|
-
|
202
|
-
# Copy image
|
203
|
-
rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
|
204
|
-
SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
|
205
|
-
EOF
|
206
|
-
end
|
207
|
-
|
208
|
-
if wipe_container == "pre" || wipe_container == "both"
|
209
|
-
if singularity
|
210
|
-
prep +=<<-EOF
|
211
|
-
|
212
|
-
# Clean container pre
|
213
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
|
214
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
|
215
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
|
216
|
-
EOF
|
217
|
-
end
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
if contain
|
222
|
-
singularity_exec << %( -C -H "$CONTAINER_DIR" \
|
223
|
-
-B /scratch/tmp \
|
224
|
-
#{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
|
225
|
-
-B #{scratch_group_dir} \
|
226
|
-
-B #{projects_group_dir} \
|
227
|
-
-B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
|
228
|
-
-B ~/git:"$CONTAINER_DIR/git":ro \
|
229
|
-
#{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
|
230
|
-
-B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
|
231
|
-
"$SINGULARITY_IMG")
|
232
|
-
exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
|
233
|
-
else
|
234
|
-
singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
|
235
|
-
end
|
236
|
-
|
237
|
-
if development
|
238
|
-
exec_cmd += " rbbt --dev='#{development}'"
|
239
|
-
else
|
240
|
-
exec_cmd += ' rbbt'
|
241
|
-
end
|
242
|
-
|
243
|
-
exec_cmd = singularity_exec + " " + exec_cmd
|
244
|
-
else
|
245
|
-
if development
|
246
|
-
exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
|
247
|
-
else
|
248
|
-
exec_cmd << " " << 'rbbt'
|
249
|
-
end
|
250
|
-
|
251
|
-
if contain
|
252
|
-
rbbt_cmd << " " << %(--workdir_all='#{contain}')
|
253
|
-
end
|
254
|
-
end
|
255
|
-
|
256
|
-
|
257
|
-
cmd =<<-EOF
|
258
|
-
#{exec_cmd} \\
|
259
|
-
#{rbbt_cmd}
|
260
|
-
EOF
|
261
|
-
|
262
|
-
run +=<<-EOF
|
263
|
-
|
264
|
-
# Run command
|
265
|
-
#{cmd}
|
266
|
-
|
267
|
-
# Save exit status
|
268
|
-
exit_status=$?
|
269
|
-
|
270
|
-
EOF
|
271
|
-
|
272
|
-
# CODA
|
273
|
-
coda = ""
|
274
|
-
if sync
|
275
|
-
if singularity
|
276
|
-
coda +=<<-EOF
|
277
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
|
278
|
-
EOF
|
279
|
-
else
|
280
|
-
coda +=<<-EOF
|
281
|
-
rbbt system clean all -q &>> #{fsync}
|
282
|
-
EOF
|
283
|
-
end
|
284
|
-
|
285
|
-
if sync.include?("=>")
|
286
|
-
source, _sep, sync = sync.partition("=>")
|
287
|
-
source = source.strip
|
288
|
-
sync = sync.strip
|
289
|
-
source = File.join(File.expand_path(contain), source)
|
290
|
-
else
|
291
|
-
source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
|
292
|
-
end
|
293
|
-
|
294
|
-
target = File.expand_path(sync)
|
295
|
-
coda +=<<-EOF
|
296
|
-
|
297
|
-
# Sync data to target location
|
298
|
-
mkdir -p "$(dirname '#{target}')"
|
299
|
-
rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
|
300
|
-
sync_es="$?"
|
301
|
-
find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
|
302
|
-
EOF
|
303
|
-
|
304
|
-
if contain && (wipe_container == "post" || wipe_container == "both")
|
305
|
-
prep =<<-EOF + prep
|
306
|
-
if ls -A '#{contain}' &> /dev/null ; then
|
307
|
-
echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
|
308
|
-
fi
|
309
|
-
EOF
|
310
|
-
if singularity
|
311
|
-
coda +=<<-EOF
|
312
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
|
313
|
-
|
314
|
-
|
315
|
-
# Clean container directory
|
316
|
-
#if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
317
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
|
318
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
|
319
|
-
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
|
320
|
-
#else
|
321
|
-
# echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
322
|
-
#fi
|
323
|
-
EOF
|
324
|
-
else
|
325
|
-
coda +=<<-EOF
|
326
|
-
#{exec_cmd} system clean
|
327
|
-
if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
328
|
-
rm -Rfv #{contain} &>> #{fsync}
|
329
|
-
else
|
330
|
-
echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
331
|
-
fi
|
332
|
-
unset sync_es
|
333
|
-
EOF
|
334
|
-
|
335
|
-
end
|
336
|
-
end
|
337
|
-
end
|
338
|
-
coda +=<<-EOF
|
339
|
-
|
340
|
-
# Write exit status to file
|
341
|
-
echo $exit_status > #{fexit}
|
342
|
-
EOF
|
343
|
-
if sync
|
344
|
-
coda +=<<-EOF
|
345
|
-
if [ "$sync_es" == '0' ]; then
|
346
|
-
unset sync_es
|
347
|
-
exit $exit_status
|
348
|
-
else
|
349
|
-
exit $sync_es
|
350
|
-
fi
|
351
|
-
EOF
|
352
|
-
else
|
353
|
-
coda +=<<-EOF
|
354
|
-
exit $exit_status
|
355
|
-
EOF
|
356
|
-
end
|
357
|
-
|
358
|
-
template = [header, env, prep, run, coda] * "\n"
|
359
|
-
|
360
|
-
template
|
361
|
-
end
|
362
|
-
|
363
|
-
def self.issue_template(template, options = {})
|
364
|
-
|
365
|
-
slurm_basedir = options[:slurm_basedir]
|
366
|
-
Open.mkdir slurm_basedir
|
367
|
-
|
368
|
-
dry_run = options.delete :dry_run
|
369
|
-
|
370
|
-
fout = File.join(slurm_basedir, 'std.out')
|
371
|
-
ferr = File.join(slurm_basedir, 'std.err')
|
372
|
-
fjob = File.join(slurm_basedir, 'job.id')
|
373
|
-
fexit = File.join(slurm_basedir, 'exit.status')
|
374
|
-
fsync = File.join(slurm_basedir, 'sync.log')
|
375
|
-
fcmd = File.join(slurm_basedir, 'command.slurm')
|
376
|
-
|
377
|
-
job = nil
|
378
|
-
if options[:clean_job]
|
379
|
-
[fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
|
380
|
-
Open.rm file if Open.exists? file
|
381
|
-
end
|
382
|
-
end
|
383
|
-
|
384
|
-
return if Open.exists?(fexit)
|
385
|
-
|
386
|
-
STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
|
387
|
-
STDERR.puts template
|
388
|
-
|
389
|
-
Open.write(fcmd, template) unless File.exists? fcmd
|
390
|
-
if File.exists?(fjob)
|
391
|
-
job = Open.read(fjob).to_i
|
392
|
-
else
|
393
|
-
if File.exists?(fout)
|
394
|
-
return
|
395
|
-
elsif dry_run
|
396
|
-
STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
|
397
|
-
STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
|
398
|
-
raise Marenostrum::SBATCH, slurm_basedir
|
399
|
-
else
|
400
|
-
Open.rm fsync
|
401
|
-
Open.rm fexit
|
402
|
-
Open.rm fout
|
403
|
-
Open.rm ferr
|
404
|
-
job = CMD.cmd("sbatch '#{fcmd}'").read.scan(/\d+/).first.to_i
|
405
|
-
Open.write(fjob, job.to_s)
|
406
|
-
end
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
def self.follow_job(slurm_basedir, tail = true)
|
411
|
-
fjob = File.join(slurm_basedir, 'job.id')
|
412
|
-
fout = File.join(slurm_basedir, 'std.out')
|
413
|
-
ferr = File.join(slurm_basedir, 'std.err')
|
414
|
-
fstatus = File.join(slurm_basedir, 'job.status')
|
415
|
-
|
416
|
-
job = Open.read(fjob).strip if Open.exists?(fjob)
|
417
|
-
|
418
|
-
if job
|
419
|
-
status_txt = CMD.cmd("squeue --job #{job}").read
|
420
|
-
STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
|
421
|
-
STDERR.puts status_txt
|
422
|
-
lines = status_txt.split("\n").length
|
423
|
-
end
|
424
|
-
|
425
|
-
if tail
|
426
|
-
Log.severity = 10
|
427
|
-
while ! File.exists? fout
|
428
|
-
if job
|
429
|
-
STDERR.puts
|
430
|
-
Log.clear_line(STDERR)
|
431
|
-
STDERR.write Log.color(:magenta, "Waiting for Output")
|
432
|
-
3.times do
|
433
|
-
STDERR.write Log.color(:magenta, ".")
|
434
|
-
sleep 1
|
435
|
-
end
|
436
|
-
status_txt = CMD.cmd("squeue --job #{job}").read
|
437
|
-
lines.times do
|
438
|
-
Log.clear_line(STDERR)
|
439
|
-
end
|
440
|
-
Log.clear_line(STDERR)
|
441
|
-
STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
|
442
|
-
STDERR.puts status_txt
|
443
|
-
lines = status_txt.split("\n").length
|
444
|
-
end
|
445
|
-
end
|
446
|
-
STDERR.puts
|
447
|
-
Log.clear_line(STDERR)
|
448
|
-
STDERR.puts Log.color(:magenta, "Output:")
|
449
|
-
begin
|
450
|
-
CMD.cmd("squeue --job #{job} > #{fstatus}")
|
451
|
-
out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
|
452
|
-
err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
|
453
|
-
|
454
|
-
terr = Misc.consume_stream(err, true, STDERR) if err
|
455
|
-
tout = Misc.consume_stream(out, true, STDOUT) if out
|
456
|
-
|
457
|
-
sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
|
458
|
-
rescue Aborted
|
459
|
-
ensure
|
460
|
-
begin
|
461
|
-
terr.exit if terr
|
462
|
-
tout.exit if tout
|
463
|
-
err.close if err
|
464
|
-
err.join if err
|
465
|
-
rescue Exception
|
466
|
-
end
|
467
|
-
|
468
|
-
begin
|
469
|
-
out.close if out
|
470
|
-
out.join if out
|
471
|
-
rescue Exception
|
472
|
-
end
|
473
|
-
end
|
474
|
-
end
|
475
|
-
end
|
476
|
-
|
477
|
-
def self.wait_for_job(slurm_basedir, time = 1)
|
478
|
-
fexit = File.join(slurm_basedir, 'exit.status')
|
479
|
-
fjob = File.join(slurm_basedir, 'job.id')
|
480
|
-
job = Open.read(fjob) if Open.exists?(fjob)
|
481
|
-
|
482
|
-
|
483
|
-
while ! Open.exists?(fexit)
|
484
|
-
sleep time
|
485
|
-
end
|
486
|
-
end
|
487
|
-
|
488
|
-
def self.run_job(job, options = {})
|
489
|
-
options = IndiferentHash.setup(options.dup)
|
490
|
-
|
491
|
-
dry_run = options.delete :dry_run
|
492
|
-
tail = options.delete :tail
|
493
|
-
|
494
|
-
workflow = job.workflow
|
495
|
-
task = job.task_name
|
496
|
-
|
497
|
-
keep_slurm_basedir = options.delete :keep_SLURM_slurm_basedir
|
498
|
-
slurm_basedir = options.delete :SLURM_basedir
|
499
|
-
slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
|
500
|
-
TmpFile.with_file(nil, !keep_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
|
501
|
-
options[:slurm_basedir] ||= tmp_directory
|
502
|
-
slurm_basedir = options[:slurm_basedir]
|
503
|
-
inputs_dir = File.join(tmp_directory, 'inputs_dir')
|
504
|
-
saved = Step.save_job_inputs(job, inputs_dir, options)
|
505
|
-
if saved
|
506
|
-
options[:inputs_dir] = inputs_dir
|
507
|
-
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
|
508
|
-
else
|
509
|
-
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
|
510
|
-
end
|
511
|
-
|
512
|
-
|
513
|
-
template = self.template(cmd, options)
|
514
|
-
self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run))
|
515
|
-
|
516
|
-
return unless tail
|
517
|
-
|
518
|
-
t_monitor = Thread.new do
|
519
|
-
self.follow_job(slurm_basedir, :STDERR)
|
520
|
-
end
|
521
|
-
self.wait_for_job(slurm_basedir)
|
522
|
-
t_monitor.raise Aborted
|
523
|
-
return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
|
524
|
-
path = Open.read(File.join(slurm_basedir, 'std.out')).strip
|
525
|
-
if Open.exists?(path) && job.path != path
|
526
|
-
Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
|
527
|
-
Open.ln path, job.path
|
528
|
-
Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
|
529
|
-
Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
|
530
|
-
end
|
531
|
-
end
|
532
|
-
end
|
533
|
-
end
|
534
|
-
|
535
|
-
def self.relay(job, options={})
|
536
|
-
options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
|
537
|
-
done_deps = job.dependencies.select do |dep|
|
538
|
-
dep.done?
|
539
|
-
end
|
540
|
-
|
541
|
-
error_deps = job.dependencies.select do |dep|
|
542
|
-
dep.error? && ! dep.recoverable_error?
|
543
|
-
end
|
544
|
-
|
545
|
-
(done_deps + error_deps).each do |dep|
|
546
|
-
Step.migrate(dep.path, options[:search_path], options)
|
547
|
-
end
|
548
|
-
|
549
|
-
end
|
550
|
-
end
|
551
|
-
|
3
|
+
require 'rbbt/hpc/slurm'
|