rbbt-util 5.28.10 → 5.29.1
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +1 -549
- data/lib/rbbt/hpc/orchestrate.rb +24 -0
- data/lib/rbbt/hpc/slurm.rb +570 -0
- data/lib/rbbt/persist.rb +8 -3
- data/lib/rbbt/resource.rb +12 -6
- data/lib/rbbt/resource/path.rb +1 -1
- data/lib/rbbt/tsv/attach.rb +7 -4
- data/lib/rbbt/tsv/parallel.rb +0 -3
- data/lib/rbbt/util/R.rb +2 -2
- data/lib/rbbt/util/cmd.rb +9 -0
- data/lib/rbbt/util/misc/indiferent_hash.rb +8 -0
- data/lib/rbbt/util/misc/inspect.rb +23 -9
- data/lib/rbbt/workflow.rb +2 -1
- data/lib/rbbt/workflow/accessor.rb +8 -2
- data/lib/rbbt/workflow/definition.rb +1 -0
- data/lib/rbbt/workflow/examples.rb +2 -2
- data/lib/rbbt/workflow/step.rb +12 -6
- data/lib/rbbt/workflow/step/accessor.rb +47 -27
- data/lib/rbbt/workflow/step/dependencies.rb +9 -4
- data/lib/rbbt/workflow/step/run.rb +22 -20
- data/lib/rbbt/workflow/util/orchestrator.rb +14 -9
- data/lib/rbbt/workflow/util/provenance.rb +12 -5
- data/share/rbbt_commands/slurm/list +141 -0
- data/share/rbbt_commands/slurm/orchestrate +47 -0
- data/share/rbbt_commands/{workflow/slurm → slurm/task} +10 -3
- data/share/rbbt_commands/system/status +22 -22
- data/share/rbbt_commands/workflow/info +12 -9
- data/share/rbbt_commands/workflow/prov +2 -1
- data/test/rbbt/test_workflow.rb +36 -4
- data/test/rbbt/tsv/test_attach.rb +86 -6
- metadata +7 -3
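
The bulk of the diff is a single refactor: the SLURM support that lived inline in data/lib/rbbt/hpc.rb (the Marenostrum module removed below) moves into data/lib/rbbt/hpc/slurm.rb, a new orchestration layer appears in data/lib/rbbt/hpc/orchestrate.rb, and the old workflow/slurm command becomes slurm/task alongside the new slurm/list and slurm/orchestrate commands. A sketch of the require-level effect, using only what this extract shows (the contents of the new slurm.rb and orchestrate.rb files are not included here):

    # rbbt-util 5.28.10: requiring rbbt/hpc loaded ~550 lines of inline
    # Marenostrum::SLURM code (see the removed lines further down).
    require 'rbbt/hpc'

    # rbbt-util 5.29.1: rbbt/hpc is now a three-line shim whose last line is
    require 'rbbt/hpc/slurm'   # the extracted module (+570 lines elsewhere)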
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9b88fc549c1c1dc5cd56f06d933776d56540e3d0f4bacf77f04a449abcda974f
+  data.tar.gz: caf80ab624418c6c0a744038d98569c3a32d18dc8d9de162b42363e1c281e8c2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8944f1d996afa5610f70046f4e61cf326461050f06f347529cd7e403440e0d9b47d0c33b862bc83e89d4479ce889e773aef0bbabfaf0dae797c3a2164f0e8a8c
+  data.tar.gz: fa0051835f35e23873bde6a075d81805d2878643e35e96e20c5d7423f0c7c48eaf25b2c5afad48215481b0ddd17b9d251555eae5b5c4a7fa420d9796dba9130b
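
checksums.yaml records SHA256 and SHA512 digests for the gem's two members, metadata.gz and data.tar.gz; the pre-5.29.1 values were not captured in this extract, so only the keys remain on the removed lines. As an illustration (not an rbbt feature), a minimal Ruby sketch that checks a locally unpacked data.tar.gz against the SHA256 recorded above:

    require 'digest'

    # Compare the packaged data.tar.gz against the recorded SHA256 digest.
    expected = "caf80ab624418c6c0a744038d98569c3a32d18dc8d9de162b42363e1c281e8c2"
    actual   = Digest::SHA256.file("data.tar.gz").hexdigest
    puts(actual == expected ? "checksum OK" : "checksum MISMATCH")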
data/lib/rbbt/hpc.rb CHANGED

@@ -1,551 +1,3 @@
 require 'rbbt-util'
 require 'rbbt/util/cmd'
-
-module Marenostrum
-  SERVER='mn1'
-  class SBATCH < Exception;
-    attr_accessor :directory
-    def initialize(directory)
-      @directory = directory
-    end
-  end
-
-  module SLURM
-
-    def self.template(args, options = {})
-
-      development = options.delete :drbbt
-      singularity = options.delete :singularity
-      contain = options.delete :contain
-      sync = options.delete :sync
-      user_group = options.delete :user_group
-      contain_and_sync = options.delete :contain_and_sync
-      wipe_container = options.delete :wipe_container
-      copy_image = options.delete :copy_image
-      exclusive = options.delete :exclusive
-      highmem = options.delete :highmem
-
-      queue = options.delete(:queue) || 'bsc_ls'
-      task_cpus = options.delete(:task_cpus) || 1
-      nodes = options.delete(:nodes) || 1
-      time = options.delete(:time) || "0:00:10"
-
-      inputs_dir = options.delete :inputs_dir
-      config_keys = options.delete :config_keys
-
-      user = ENV['USER'] || `whoami`.strip
-      group = File.basename(File.dirname(ENV['HOME']))
-
-      if contain_and_sync
-        contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
-        sync = "~/.rbbt/var/jobs" if sync.nil?
-        wipe_container = "post" if wipe_container.nil?
-      end
-
-      contain = nil if contain == "" || contain == "none"
-      sync = nil if sync == "" || sync == "none"
-
-      contain = File.expand_path(contain) if contain
-
-      name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
-      options.delete(:name)
-      slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
-      options.delete(:slurm_basedir)
-
-      rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
-
-      rbbt_cmd += " " << options.collect do |o,v|
-        o = o.to_s
-        case v
-        when TrueClass
-          '--' << o
-        when FalseClass
-          '--' << o << "=false"
-        else
-          ['--' << o, "'#{v}'"] * " "
-        end
-      end * " "
-
-      rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
-
-
-      time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
-
-
-      #{{{ PREPARE LOCAL LOGFILES
-
-      Open.mkdir slurm_basedir
-
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fjob = File.join(slurm_basedir, 'job.id')
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fsync = File.join(slurm_basedir, 'sync.log')
-      fcmd = File.join(slurm_basedir, 'command.slurm')
-
-      #{{{ GENERATE TEMPLATE
-
-      # HEADER
-      header =<<-EOF
-#!/bin/bash
-#SBATCH --qos="#{queue}"
-#SBATCH --job-name="#{name}"
-#SBATCH --workdir="#{Dir.pwd}"
-#SBATCH --output="#{fout}"
-#SBATCH --error="#{ferr}"
-#SBATCH --cpus-per-task="#{task_cpus}"
-#SBATCH --time="#{time}"
-#SBATCH --nodes="#{nodes}"
-      EOF
-
-      if highmem
-        header +=<<-EOF
-#SBATCH --constraint=highmem
-        EOF
-      end
-
-      if exclusive
-        header +=<<-EOF
-#SBATCH --exclusive
-        EOF
-      end
-
-      header +=<<-EOF
-#CMD: #{rbbt_cmd}
-      EOF
-
-      # ENV
-      env = ""
-      env +=<<-EOF
-# Prepare env
-[[ -f ~/config/load.sh ]] && source ~/config/load.sh
-module load java
-
-# Calculate max available memory
-let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_ON_NODE"
-      EOF
-
-
-      # RUN
-      run = ""
-      exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
-
-
-      if singularity
-        #{{{ SINGULARITY
-
-        singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
-
-        env +=<<-EOF
-module load intel/2018.1
-module load singularity
-PROJECTS_ROOT="/gpfs/projects/bsc26/"
-SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
-SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
-SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
-mkdir -p "$SINGULARITY_RUBY_INLINE"
-        EOF
-
-        prep = ""
-
-        if contain
-          scratch_group_dir = File.join('/gpfs/scratch/', group)
-          projects_group_dir = File.join('/gpfs/projects/', group)
-
-          prep +=<<-EOF
-
-# Prepare container dir
-CONTAINER_DIR="#{contain}"
-mkdir -p $CONTAINER_DIR/.rbbt/etc/
-
-for dir in .ruby_inline git home; do
-    mkdir -p $CONTAINER_DIR/$dir
-done
-
-for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
-    mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
-done
-
-# Copy environment
-cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
-
-# Set search_paths
-echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-          EOF
-
-          if user_group && group != user_group
-            prep +=<<-EOF
-
-# Add user_group search_path
-echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-            EOF
-          end
-
-          if inputs_dir
-            prep +=<<-EOF
-
-# Copy inputs
-[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
-            EOF
-            rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
-          end
-
-          if copy_image
-            prep +=<<EOF
-
-# Copy image
-rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
-SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
-EOF
-          end
-
-          if wipe_container == "pre" || wipe_container == "both"
-            if singularity
-              prep +=<<-EOF
-
-# Clean container pre
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-              EOF
-            end
-          end
-        end
-
-        if contain
-          singularity_exec << %( -C -H "$CONTAINER_DIR" \
-            -B /scratch/tmp \
-            #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
-            -B #{scratch_group_dir} \
-            -B #{projects_group_dir} \
-            -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
-            -B ~/git:"$CONTAINER_DIR/git":ro \
-            #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
-            -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
-            "$SINGULARITY_IMG")
-          exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
-        else
-          singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
-        end
-
-        if development
-          exec_cmd += " rbbt --dev='#{development}'"
-        else
-          exec_cmd += ' rbbt'
-        end
-
-        exec_cmd = singularity_exec + " " + exec_cmd
-      else
-        if development
-          exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
-        else
-          exec_cmd << " " << 'rbbt'
-        end
-
-        if contain
-          rbbt_cmd << " " << %(--workdir_all='#{contain}')
-        end
-      end
-
-
-      cmd =<<-EOF
-#{exec_cmd} \\
-#{rbbt_cmd}
-      EOF
-
-      run +=<<-EOF
-
-# Run command
-#{cmd}
-
-# Save exit status
-exit_status=$?
-
-      EOF
-
-      # CODA
-      coda = ""
-      if sync
-        if singularity
-          coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
-          EOF
-        else
-          coda +=<<-EOF
-rbbt system clean all -q &>> #{fsync}
-          EOF
-        end
-
-        if sync.include?("=>")
-          source, _sep, sync = sync.partition("=>")
-          source = source.strip
-          sync = sync.strip
-          source = File.join(File.expand_path(contain), source)
-        else
-          source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
-        end
-
-        target = File.expand_path(sync)
-        coda +=<<-EOF
-
-# Sync data to target location
-mkdir -p "$(dirname '#{target}')"
-rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
-sync_es="$?"
-find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
-        EOF
-
-        if contain && (wipe_container == "post" || wipe_container == "both")
-          prep =<<-EOF + prep
-if ls -A '#{contain}' &> /dev/null ; then
-  echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
-fi
-          EOF
-          if singularity
-            coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
-
-
-# Clean container directory
-#if [ $exit_status == '0' -a $sync_es == '0' ]; then
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-#else
-# echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-#fi
-            EOF
-          else
-            coda +=<<-EOF
-#{exec_cmd} system clean
-if [ $exit_status == '0' -a $sync_es == '0' ]; then
-  rm -Rfv #{contain} &>> #{fsync}
-else
-  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-fi
-unset sync_es
-            EOF
-
-          end
-        end
-      end
-      coda +=<<-EOF
-
-# Write exit status to file
-echo $exit_status > #{fexit}
-      EOF
-      if sync
-        coda +=<<-EOF
-if [ "$sync_es" == '0' ]; then
-  unset sync_es
-  exit $exit_status
-else
-  exit $sync_es
-fi
-        EOF
-      else
-        coda +=<<-EOF
-exit $exit_status
-        EOF
-      end
-
-      template = [header, env, prep, run, coda] * "\n"
-
-      template
-    end
-
-    def self.issue_template(template, options = {})
-
-      slurm_basedir = options[:slurm_basedir]
-      Open.mkdir slurm_basedir
-
-      dry_run = options.delete :dry_run
-
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fjob = File.join(slurm_basedir, 'job.id')
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fsync = File.join(slurm_basedir, 'sync.log')
-      fcmd = File.join(slurm_basedir, 'command.slurm')
-
-      job = nil
-      if options[:clean_job]
-        [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
-          Open.rm file if Open.exists? file
-        end
-      end
-
-      return if Open.exists?(fexit)
-
-      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
-      STDERR.puts template
-
-      Open.write(fcmd, template) unless File.exists? fcmd
-      if File.exists?(fjob)
-        job = Open.read(fjob).to_i
-      else
-        if File.exists?(fout)
-          return
-        elsif dry_run
-          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
-          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
-          raise Marenostrum::SBATCH, slurm_basedir
-        else
-          Open.rm fsync
-          Open.rm fexit
-          Open.rm fout
-          Open.rm ferr
-          job = CMD.cmd("sbatch '#{fcmd}'").read.scan(/\d+/).first.to_i
-          Open.write(fjob, job.to_s)
-        end
-      end
-    end
-
-    def self.follow_job(slurm_basedir, tail = true)
-      fjob = File.join(slurm_basedir, 'job.id')
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fstatus = File.join(slurm_basedir, 'job.status')
-
-      job = Open.read(fjob).strip if Open.exists?(fjob)
-
-      if job
-        status_txt = CMD.cmd("squeue --job #{job}").read
-        STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-        STDERR.puts status_txt
-        lines = status_txt.split("\n").length
-      end
-
-      if tail
-        Log.severity = 10
-        while ! File.exists? fout
-          if job
-            STDERR.puts
-            Log.clear_line(STDERR)
-            STDERR.write Log.color(:magenta, "Waiting for Output")
-            3.times do
-              STDERR.write Log.color(:magenta, ".")
-              sleep 1
-            end
-            status_txt = CMD.cmd("squeue --job #{job}").read
-            lines.times do
-              Log.clear_line(STDERR)
-            end
-            Log.clear_line(STDERR)
-            STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-            STDERR.puts status_txt
-            lines = status_txt.split("\n").length
-          end
-        end
-        STDERR.puts
-        Log.clear_line(STDERR)
-        STDERR.puts Log.color(:magenta, "Output:")
-        begin
-          CMD.cmd("squeue --job #{job} > #{fstatus}")
-          out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
-          err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
-
-          terr = Misc.consume_stream(err, true, STDERR) if err
-          tout = Misc.consume_stream(out, true, STDOUT) if out
-
-          sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
-        rescue Aborted
-        ensure
-          begin
-            terr.exit if terr
-            tout.exit if tout
-            err.close if err
-            err.join if err
-          rescue Exception
-          end
-
-          begin
-            out.close if out
-            out.join if out
-          rescue Exception
-          end
-        end
-      end
-    end
-
-    def self.wait_for_job(slurm_basedir, time = 1)
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fjob = File.join(slurm_basedir, 'job.id')
-      job = Open.read(fjob) if Open.exists?(fjob)
-
-
-      while ! Open.exists?(fexit)
-        sleep time
-      end
-    end
-
-    def self.run_job(job, options = {})
-      options = IndiferentHash.setup(options.dup)
-
-      dry_run = options.delete :dry_run
-      tail = options.delete :tail
-
-      workflow = job.workflow
-      task = job.task_name
-
-      keep_slurm_basedir = options.delete :keep_SLURM_slurm_basedir
-      slurm_basedir = options.delete :SLURM_basedir
-      slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
-      TmpFile.with_file(nil, !keep_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
-        options[:slurm_basedir] ||= tmp_directory
-        slurm_basedir = options[:slurm_basedir]
-        inputs_dir = File.join(tmp_directory, 'inputs_dir')
-        saved = Step.save_job_inputs(job, inputs_dir, options)
-        if saved
-          options[:inputs_dir] = inputs_dir
-          cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
-        else
-          cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
-        end
-
-
-        template = self.template(cmd, options)
-        self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run))
-
-        return unless tail
-
-        t_monitor = Thread.new do
-          self.follow_job(slurm_basedir, :STDERR)
-        end
-        self.wait_for_job(slurm_basedir)
-        t_monitor.raise Aborted
-        return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
-        path = Open.read(File.join(slurm_basedir, 'std.out')).strip
-        if Open.exists?(path) && job.path != path
-          Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
-          Open.ln path, job.path
-          Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
-          Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
-        end
-      end
-    end
-  end
-
-  def self.relay(job, options={})
-    options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
-    done_deps = job.dependencies.select do |dep|
-      dep.done?
-    end
-
-    error_deps = job.dependencies.select do |dep|
-      dep.error? && ! dep.recoverable_error?
-    end
-
-    (done_deps + error_deps).each do |dep|
-      Step.migrate(dep.path, options[:search_path], options)
-    end
-
-  end
-end
-
+require 'rbbt/hpc/slurm'