rbbt-util 5.28.10 → 5.29.1
The following files were changed between these two versions:
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +1 -549
- data/lib/rbbt/hpc/orchestrate.rb +24 -0
- data/lib/rbbt/hpc/slurm.rb +570 -0
- data/lib/rbbt/persist.rb +8 -3
- data/lib/rbbt/resource.rb +12 -6
- data/lib/rbbt/resource/path.rb +1 -1
- data/lib/rbbt/tsv/attach.rb +7 -4
- data/lib/rbbt/tsv/parallel.rb +0 -3
- data/lib/rbbt/util/R.rb +2 -2
- data/lib/rbbt/util/cmd.rb +9 -0
- data/lib/rbbt/util/misc/indiferent_hash.rb +8 -0
- data/lib/rbbt/util/misc/inspect.rb +23 -9
- data/lib/rbbt/workflow.rb +2 -1
- data/lib/rbbt/workflow/accessor.rb +8 -2
- data/lib/rbbt/workflow/definition.rb +1 -0
- data/lib/rbbt/workflow/examples.rb +2 -2
- data/lib/rbbt/workflow/step.rb +12 -6
- data/lib/rbbt/workflow/step/accessor.rb +47 -27
- data/lib/rbbt/workflow/step/dependencies.rb +9 -4
- data/lib/rbbt/workflow/step/run.rb +22 -20
- data/lib/rbbt/workflow/util/orchestrator.rb +14 -9
- data/lib/rbbt/workflow/util/provenance.rb +12 -5
- data/share/rbbt_commands/slurm/list +141 -0
- data/share/rbbt_commands/slurm/orchestrate +47 -0
- data/share/rbbt_commands/{workflow/slurm → slurm/task} +10 -3
- data/share/rbbt_commands/system/status +22 -22
- data/share/rbbt_commands/workflow/info +12 -9
- data/share/rbbt_commands/workflow/prov +2 -1
- data/test/rbbt/test_workflow.rb +36 -4
- data/test/rbbt/tsv/test_attach.rb +86 -6
- metadata +7 -3
require 'rbbt/workflow/util/orchestrator'
module HPC
  module SLURM
    # Recursively submit +job+ and its (input) dependencies to SLURM.
    #
    # Walks the dependency tree depth-first, submitting each dependency
    # first and collecting the resulting SLURM job ids so the parent job
    # can be submitted with `--dependency=afterok:...` constraints.
    #
    # job     - a workflow Step to run
    # options - submission options; :rules may point to a YAML file of
    #           per-task resource rules (see Workflow::Orchestrator)
    # seen    - memo hash (job path => SLURM job id) that prevents
    #           submitting the same dependency twice
    #
    # Returns the SLURM job id of +job+, or nil when the job is already
    # done or is not a standard workflow job.
    def self.orchestrate_job(job, options, seen = {})
      return if job.done?
      # Only submit jobs stored under a ".../jobs/<workflow>/<task>/<name>"
      # layout; anything else is not a regular workflow job.
      return unless job.path.split("/")[-4] == "jobs"

      # These flags only make sense for the top-level invocation, not for
      # every recursive submission.
      options.delete "recursive_clean"
      options.delete "tail"

      rules = YAML.load(Open.read(options[:rules])) if options[:rules]
      rules ||= {}

      deps = job.dependencies || []
      deps += job.input_dependencies || []

      # Submit dependencies first; memoize by path so shared dependencies
      # are only submitted once. Completed deps return nil and are dropped.
      dep_ids = deps.collect do |dep|
        seen[dep.path] ||= self.orchestrate_job(dep, options.dup, seen)
      end.compact

      job_rules = Workflow::Orchestrator.job_rules(rules, job)
      job_options = options.merge(job_rules).merge(:slurm_dependencies => dep_ids)
      run_job(job, job_options)
    end
  end
end
@@ -0,0 +1,570 @@
|
|
1
|
+
module HPC
|
2
|
+
class SBATCH < Exception;
|
3
|
+
attr_accessor :directory
|
4
|
+
def initialize(directory)
|
5
|
+
@directory = directory
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
module SLURM
|
10
|
+
|
11
|
+
def self.template(args, options = {})
|
12
|
+
|
13
|
+
development = options.delete :drbbt
|
14
|
+
singularity = options.delete :singularity
|
15
|
+
contain = options.delete :contain
|
16
|
+
sync = options.delete :sync
|
17
|
+
user_group = options.delete :user_group
|
18
|
+
contain_and_sync = options.delete :contain_and_sync
|
19
|
+
wipe_container = options.delete :wipe_container
|
20
|
+
copy_image = options.delete :copy_image
|
21
|
+
exclusive = options.delete :exclusive
|
22
|
+
highmem = options.delete :highmem
|
23
|
+
|
24
|
+
queue = options.delete(:queue) || 'bsc_ls'
|
25
|
+
task_cpus = options.delete(:task_cpus) || 1
|
26
|
+
nodes = options.delete(:nodes) || 1
|
27
|
+
time = options.delete(:time) || "0:00:10"
|
28
|
+
|
29
|
+
inputs_dir = options.delete :inputs_dir
|
30
|
+
config_keys = options.delete :config_keys
|
31
|
+
|
32
|
+
user = ENV['USER'] || `whoami`.strip
|
33
|
+
group = File.basename(File.dirname(ENV['HOME']))
|
34
|
+
|
35
|
+
if contain_and_sync
|
36
|
+
contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
|
37
|
+
sync = "~/.rbbt/var/jobs" if sync.nil?
|
38
|
+
wipe_container = "post" if wipe_container.nil?
|
39
|
+
end
|
40
|
+
|
41
|
+
contain = nil if contain == "" || contain == "none"
|
42
|
+
sync = nil if sync == "" || sync == "none"
|
43
|
+
|
44
|
+
contain = File.expand_path(contain) if contain
|
45
|
+
|
46
|
+
name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
|
47
|
+
options.delete(:name)
|
48
|
+
slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
|
49
|
+
options.delete(:slurm_basedir)
|
50
|
+
|
51
|
+
rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
|
52
|
+
|
53
|
+
rbbt_cmd += " " << options.collect do |o,v|
|
54
|
+
o = o.to_s
|
55
|
+
case v
|
56
|
+
when TrueClass
|
57
|
+
'--' << o
|
58
|
+
when FalseClass
|
59
|
+
'--' << o << "=false"
|
60
|
+
else
|
61
|
+
['--' << o, "'#{v}'"] * " "
|
62
|
+
end
|
63
|
+
end * " "
|
64
|
+
|
65
|
+
rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
|
66
|
+
|
67
|
+
time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
|
68
|
+
|
69
|
+
|
70
|
+
#{{{ PREPARE LOCAL LOGFILES
|
71
|
+
|
72
|
+
Open.mkdir slurm_basedir
|
73
|
+
|
74
|
+
fout = File.join(slurm_basedir, 'std.out')
|
75
|
+
ferr = File.join(slurm_basedir, 'std.err')
|
76
|
+
fjob = File.join(slurm_basedir, 'job.id')
|
77
|
+
fexit = File.join(slurm_basedir, 'exit.status')
|
78
|
+
fsync = File.join(slurm_basedir, 'sync.log')
|
79
|
+
fcmd = File.join(slurm_basedir, 'command.slurm')
|
80
|
+
|
81
|
+
#{{{ GENERATE TEMPLATE
|
82
|
+
|
83
|
+
# HEADER
|
84
|
+
header =<<-EOF
|
85
|
+
#!/bin/bash
|
86
|
+
#SBATCH --qos="#{queue}"
|
87
|
+
#SBATCH --job-name="#{name}"
|
88
|
+
#SBATCH --workdir="#{Dir.pwd}"
|
89
|
+
#SBATCH --output="#{fout}"
|
90
|
+
#SBATCH --error="#{ferr}"
|
91
|
+
#SBATCH --cpus-per-task="#{task_cpus}"
|
92
|
+
#SBATCH --time="#{time}"
|
93
|
+
#SBATCH --nodes="#{nodes}"
|
94
|
+
EOF
|
95
|
+
|
96
|
+
prep = ""
|
97
|
+
|
98
|
+
if highmem
|
99
|
+
header +=<<-EOF
|
100
|
+
#SBATCH --constraint=highmem
|
101
|
+
EOF
|
102
|
+
end
|
103
|
+
|
104
|
+
if exclusive
|
105
|
+
header +=<<-EOF
|
106
|
+
#SBATCH --exclusive
|
107
|
+
EOF
|
108
|
+
end
|
109
|
+
|
110
|
+
header +=<<-EOF
|
111
|
+
#CMD: #{rbbt_cmd}
|
112
|
+
EOF
|
113
|
+
|
114
|
+
# ENV
|
115
|
+
env = ""
|
116
|
+
env +=<<-EOF
|
117
|
+
# Prepare env
|
118
|
+
[[ -f ~/config/load.sh ]] && source ~/config/load.sh
|
119
|
+
module load java
|
120
|
+
|
121
|
+
# Calculate max available memory
|
122
|
+
let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
|
123
|
+
EOF
|
124
|
+
|
125
|
+
|
126
|
+
# RUN
|
127
|
+
run = ""
|
128
|
+
exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
|
129
|
+
|
130
|
+
|
131
|
+
if singularity
|
132
|
+
#{{{ SINGULARITY
|
133
|
+
|
134
|
+
singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
|
135
|
+
|
136
|
+
env +=<<-EOF
|
137
|
+
module load intel/2018.1
|
138
|
+
module load singularity
|
139
|
+
PROJECTS_ROOT="/gpfs/projects/bsc26/"
|
140
|
+
SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
|
141
|
+
SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
|
142
|
+
SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
|
143
|
+
mkdir -p "$SINGULARITY_RUBY_INLINE"
|
144
|
+
EOF
|
145
|
+
|
146
|
+
if contain
|
147
|
+
scratch_group_dir = File.join('/gpfs/scratch/', group)
|
148
|
+
projects_group_dir = File.join('/gpfs/projects/', group)
|
149
|
+
|
150
|
+
prep +=<<-EOF
|
151
|
+
|
152
|
+
# Prepare container dir
|
153
|
+
CONTAINER_DIR="#{contain}"
|
154
|
+
mkdir -p $CONTAINER_DIR/.rbbt/etc/
|
155
|
+
|
156
|
+
for dir in .ruby_inline git home; do
|
157
|
+
mkdir -p $CONTAINER_DIR/$dir
|
158
|
+
done
|
159
|
+
|
160
|
+
for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
|
161
|
+
mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
|
162
|
+
done
|
163
|
+
|
164
|
+
# Copy environment
|
165
|
+
cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
|
166
|
+
|
167
|
+
# Set search_paths
|
168
|
+
echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
|
169
|
+
echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
170
|
+
echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
171
|
+
echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
172
|
+
echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
173
|
+
echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
174
|
+
echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
175
|
+
EOF
|
176
|
+
|
177
|
+
if user_group && group != user_group
|
178
|
+
prep +=<<-EOF
|
179
|
+
|
180
|
+
# Add user_group search_path
|
181
|
+
echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
|
182
|
+
EOF
|
183
|
+
end
|
184
|
+
|
185
|
+
if inputs_dir
|
186
|
+
prep +=<<-EOF
|
187
|
+
|
188
|
+
# Copy inputs
|
189
|
+
[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
|
190
|
+
EOF
|
191
|
+
rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
|
192
|
+
end
|
193
|
+
|
194
|
+
if copy_image
|
195
|
+
prep +=<<EOF
|
196
|
+
|
197
|
+
# Copy image
|
198
|
+
rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
|
199
|
+
SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
|
200
|
+
EOF
|
201
|
+
end
|
202
|
+
|
203
|
+
if wipe_container == "pre" || wipe_container == "both"
|
204
|
+
if singularity
|
205
|
+
prep +=<<-EOF
|
206
|
+
|
207
|
+
# Clean container pre
|
208
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
|
209
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
|
210
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
|
211
|
+
EOF
|
212
|
+
else
|
213
|
+
prep = ""
|
214
|
+
end
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
if contain
|
219
|
+
singularity_exec << %( -C -H "$CONTAINER_DIR" \
|
220
|
+
-B /scratch/tmp \
|
221
|
+
#{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
|
222
|
+
-B #{scratch_group_dir} \
|
223
|
+
-B #{projects_group_dir} \
|
224
|
+
-B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
|
225
|
+
-B ~/git:"$CONTAINER_DIR/git":ro \
|
226
|
+
#{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
|
227
|
+
-B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
|
228
|
+
"$SINGULARITY_IMG")
|
229
|
+
exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
|
230
|
+
else
|
231
|
+
singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
|
232
|
+
end
|
233
|
+
|
234
|
+
if development
|
235
|
+
exec_cmd += " rbbt --dev='#{development}'"
|
236
|
+
else
|
237
|
+
exec_cmd += ' rbbt'
|
238
|
+
end
|
239
|
+
|
240
|
+
exec_cmd = singularity_exec + " " + exec_cmd
|
241
|
+
else
|
242
|
+
if development
|
243
|
+
exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
|
244
|
+
else
|
245
|
+
exec_cmd << " " << 'rbbt'
|
246
|
+
end
|
247
|
+
|
248
|
+
if contain
|
249
|
+
rbbt_cmd << " " << %(--workdir_all='#{contain}')
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
|
254
|
+
cmd =<<-EOF
|
255
|
+
#{exec_cmd} \\
|
256
|
+
#{rbbt_cmd}
|
257
|
+
EOF
|
258
|
+
|
259
|
+
run +=<<-EOF
|
260
|
+
|
261
|
+
# Run command
|
262
|
+
#{cmd}
|
263
|
+
|
264
|
+
# Save exit status
|
265
|
+
exit_status=$?
|
266
|
+
|
267
|
+
EOF
|
268
|
+
|
269
|
+
# CODA
|
270
|
+
coda = ""
|
271
|
+
if sync
|
272
|
+
if singularity
|
273
|
+
coda +=<<-EOF
|
274
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
|
275
|
+
EOF
|
276
|
+
else
|
277
|
+
coda +=<<-EOF
|
278
|
+
rbbt system clean all -q &>> #{fsync}
|
279
|
+
EOF
|
280
|
+
end
|
281
|
+
|
282
|
+
if sync.include?("=>")
|
283
|
+
source, _sep, sync = sync.partition("=>")
|
284
|
+
source = source.strip
|
285
|
+
sync = sync.strip
|
286
|
+
source = File.join(File.expand_path(contain), source)
|
287
|
+
else
|
288
|
+
source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
|
289
|
+
end
|
290
|
+
|
291
|
+
target = File.expand_path(sync)
|
292
|
+
coda +=<<-EOF
|
293
|
+
|
294
|
+
# Sync data to target location
|
295
|
+
mkdir -p "$(dirname '#{target}')"
|
296
|
+
rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
|
297
|
+
sync_es="$?"
|
298
|
+
find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
|
299
|
+
EOF
|
300
|
+
|
301
|
+
if contain && (wipe_container == "post" || wipe_container == "both")
|
302
|
+
prep =<<-EOF + prep
|
303
|
+
if ls -A '#{contain}' &> /dev/null ; then
|
304
|
+
echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
|
305
|
+
fi
|
306
|
+
EOF
|
307
|
+
if singularity
|
308
|
+
coda +=<<-EOF
|
309
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
|
310
|
+
|
311
|
+
|
312
|
+
# Clean container directory
|
313
|
+
#if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
314
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
|
315
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
|
316
|
+
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
|
317
|
+
#else
|
318
|
+
# echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
319
|
+
#fi
|
320
|
+
EOF
|
321
|
+
else
|
322
|
+
coda +=<<-EOF
|
323
|
+
#{exec_cmd} system clean
|
324
|
+
if [ $exit_status == '0' -a $sync_es == '0' ]; then
|
325
|
+
rm -Rfv #{contain} &>> #{fsync}
|
326
|
+
else
|
327
|
+
echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
|
328
|
+
fi
|
329
|
+
unset sync_es
|
330
|
+
EOF
|
331
|
+
|
332
|
+
end
|
333
|
+
end
|
334
|
+
end
|
335
|
+
coda +=<<-EOF
|
336
|
+
|
337
|
+
# Write exit status to file
|
338
|
+
echo $exit_status > #{fexit}
|
339
|
+
EOF
|
340
|
+
if sync
|
341
|
+
coda +=<<-EOF
|
342
|
+
if [ "$sync_es" == '0' ]; then
|
343
|
+
unset sync_es
|
344
|
+
exit $exit_status
|
345
|
+
else
|
346
|
+
exit $sync_es
|
347
|
+
fi
|
348
|
+
EOF
|
349
|
+
else
|
350
|
+
coda +=<<-EOF
|
351
|
+
exit $exit_status
|
352
|
+
EOF
|
353
|
+
end
|
354
|
+
|
355
|
+
template = [header, env, prep, run, coda] * "\n"
|
356
|
+
|
357
|
+
template
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.issue_template(template, options = {})
|
361
|
+
|
362
|
+
slurm_basedir = options[:slurm_basedir]
|
363
|
+
dependencies = options.delete :slurm_dependencies
|
364
|
+
Open.mkdir slurm_basedir
|
365
|
+
|
366
|
+
dry_run = options.delete :dry_run
|
367
|
+
|
368
|
+
fout = File.join(slurm_basedir, 'std.out')
|
369
|
+
ferr = File.join(slurm_basedir, 'std.err')
|
370
|
+
fjob = File.join(slurm_basedir, 'job.id')
|
371
|
+
fdep = File.join(slurm_basedir, 'dependencies.list')
|
372
|
+
fexit = File.join(slurm_basedir, 'exit.status')
|
373
|
+
fsync = File.join(slurm_basedir, 'sync.log')
|
374
|
+
fcmd = File.join(slurm_basedir, 'command.slurm')
|
375
|
+
|
376
|
+
job = nil
|
377
|
+
if options[:clean_job]
|
378
|
+
[fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
|
379
|
+
Open.rm file if Open.exists? file
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
return if Open.exists?(fexit)
|
384
|
+
|
385
|
+
STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
|
386
|
+
STDERR.puts template
|
387
|
+
|
388
|
+
Open.write(fcmd, template) unless File.exists? fcmd
|
389
|
+
if File.exists?(fjob)
|
390
|
+
job = Open.read(fjob).to_i
|
391
|
+
else
|
392
|
+
if File.exists?(fout)
|
393
|
+
return
|
394
|
+
elsif dry_run
|
395
|
+
STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
|
396
|
+
STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
|
397
|
+
raise HPC::SBATCH, slurm_basedir
|
398
|
+
else
|
399
|
+
Open.rm fsync
|
400
|
+
Open.rm fexit
|
401
|
+
Open.rm fout
|
402
|
+
Open.rm ferr
|
403
|
+
Open.write(fdep, dependencies * "\n") if dependencies.any?
|
404
|
+
dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
|
405
|
+
job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
|
406
|
+
Log.debug "SBATCH job id: #{job}"
|
407
|
+
Open.write(fjob, job.to_s)
|
408
|
+
job
|
409
|
+
end
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
def self.follow_job(slurm_basedir, tail = true)
|
414
|
+
fjob = File.join(slurm_basedir, 'job.id')
|
415
|
+
fout = File.join(slurm_basedir, 'std.out')
|
416
|
+
ferr = File.join(slurm_basedir, 'std.err')
|
417
|
+
fstatus = File.join(slurm_basedir, 'job.status')
|
418
|
+
|
419
|
+
job = Open.read(fjob).strip if Open.exists?(fjob)
|
420
|
+
|
421
|
+
if job
|
422
|
+
status_txt = CMD.cmd("squeue --job #{job}").read
|
423
|
+
STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
|
424
|
+
STDERR.puts status_txt
|
425
|
+
lines = status_txt.split("\n").length
|
426
|
+
end
|
427
|
+
|
428
|
+
if tail
|
429
|
+
Log.severity = 10
|
430
|
+
while ! File.exists? fout
|
431
|
+
if job
|
432
|
+
STDERR.puts
|
433
|
+
Log.clear_line(STDERR)
|
434
|
+
STDERR.write Log.color(:magenta, "Waiting for Output")
|
435
|
+
3.times do
|
436
|
+
STDERR.write Log.color(:magenta, ".")
|
437
|
+
sleep 1
|
438
|
+
end
|
439
|
+
status_txt = CMD.cmd("squeue --job #{job}").read
|
440
|
+
lines.times do
|
441
|
+
Log.clear_line(STDERR)
|
442
|
+
end
|
443
|
+
Log.clear_line(STDERR)
|
444
|
+
STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
|
445
|
+
STDERR.puts status_txt
|
446
|
+
lines = status_txt.split("\n").length
|
447
|
+
end
|
448
|
+
end
|
449
|
+
STDERR.puts
|
450
|
+
Log.clear_line(STDERR)
|
451
|
+
STDERR.puts Log.color(:magenta, "Output:")
|
452
|
+
begin
|
453
|
+
CMD.cmd("squeue --job #{job} > #{fstatus}")
|
454
|
+
out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
|
455
|
+
err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
|
456
|
+
|
457
|
+
terr = Misc.consume_stream(err, true, STDERR) if err
|
458
|
+
tout = Misc.consume_stream(out, true, STDOUT) if out
|
459
|
+
|
460
|
+
sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
|
461
|
+
rescue Aborted
|
462
|
+
ensure
|
463
|
+
begin
|
464
|
+
terr.exit if terr
|
465
|
+
tout.exit if tout
|
466
|
+
err.close if err
|
467
|
+
err.join if err
|
468
|
+
rescue Exception
|
469
|
+
end
|
470
|
+
|
471
|
+
begin
|
472
|
+
out.close if out
|
473
|
+
out.join if out
|
474
|
+
rescue Exception
|
475
|
+
end
|
476
|
+
end
|
477
|
+
end
|
478
|
+
end
|
479
|
+
|
480
|
+
def self.wait_for_job(slurm_basedir, time = 1)
|
481
|
+
fexit = File.join(slurm_basedir, 'exit.status')
|
482
|
+
fjob = File.join(slurm_basedir, 'job.id')
|
483
|
+
job = Open.read(fjob) if Open.exists?(fjob)
|
484
|
+
|
485
|
+
|
486
|
+
while ! Open.exists?(fexit)
|
487
|
+
sleep time
|
488
|
+
end
|
489
|
+
end
|
490
|
+
|
491
|
+
def self.run_job(job, options = {})
|
492
|
+
options = IndiferentHash.setup(options.dup)
|
493
|
+
|
494
|
+
dry_run = options.delete :dry_run
|
495
|
+
tail = options.delete :tail
|
496
|
+
dependencies = options.delete :slurm_dependencies
|
497
|
+
options[:jobname] = job.clean_name
|
498
|
+
|
499
|
+
workflow = job.workflow
|
500
|
+
|
501
|
+
task = Symbol === job.overriden ? job.overriden : job.task_name
|
502
|
+
|
503
|
+
if job.overriden
|
504
|
+
override_deps = job.rec_dependencies.
|
505
|
+
select{|dep| Symbol === dep.overriden }.
|
506
|
+
collect do |dep|
|
507
|
+
|
508
|
+
name = [dep.workflow.to_s, dep.task_name] * "#"
|
509
|
+
[name, dep.path] * "="
|
510
|
+
end * ","
|
511
|
+
end
|
512
|
+
|
513
|
+
remove_slurm_basedir = options.delete :remove_slurm_basedir
|
514
|
+
slurm_basedir = options.delete :SLURM_basedir
|
515
|
+
slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
|
516
|
+
TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
|
517
|
+
options[:slurm_basedir] ||= tmp_directory
|
518
|
+
slurm_basedir = options[:slurm_basedir]
|
519
|
+
inputs_dir = File.join(tmp_directory, 'inputs_dir')
|
520
|
+
saved = Step.save_job_inputs(job, inputs_dir)
|
521
|
+
|
522
|
+
if saved && saved.any?
|
523
|
+
options[:inputs_dir] = inputs_dir
|
524
|
+
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
|
525
|
+
else
|
526
|
+
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
|
527
|
+
end
|
528
|
+
|
529
|
+
cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
|
530
|
+
|
531
|
+
template = self.template(cmd, options)
|
532
|
+
jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
|
533
|
+
|
534
|
+
return jobid unless tail
|
535
|
+
|
536
|
+
t_monitor = Thread.new do
|
537
|
+
self.follow_job(slurm_basedir, :STDERR)
|
538
|
+
end
|
539
|
+
self.wait_for_job(slurm_basedir)
|
540
|
+
t_monitor.raise Aborted
|
541
|
+
return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
|
542
|
+
path = Open.read(File.join(slurm_basedir, 'std.out')).strip
|
543
|
+
if Open.exists?(path) && job.path != path
|
544
|
+
Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
|
545
|
+
Open.ln path, job.path
|
546
|
+
Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
|
547
|
+
Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
|
548
|
+
end
|
549
|
+
jobid
|
550
|
+
end
|
551
|
+
end
|
552
|
+
end
|
553
|
+
|
554
|
+
def self.relay(job, options={})
|
555
|
+
options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
|
556
|
+
done_deps = job.dependencies.select do |dep|
|
557
|
+
dep.done?
|
558
|
+
end
|
559
|
+
|
560
|
+
error_deps = job.dependencies.select do |dep|
|
561
|
+
dep.error? && ! dep.recoverable_error?
|
562
|
+
end
|
563
|
+
|
564
|
+
(done_deps + error_deps).each do |dep|
|
565
|
+
Step.migrate(dep.path, options[:search_path], options)
|
566
|
+
end
|
567
|
+
|
568
|
+
end
|
569
|
+
end
|
570
|
+
|