rbbt-util 5.28.10 → 5.29.1

@@ -0,0 +1,24 @@
+ require 'rbbt/workflow/util/orchestrator'
+ module HPC
+   module SLURM
+     def self.orchestrate_job(job, options, seen = {})
+       return if job.done?
+       return unless job.path.split("/")[-4] == "jobs"
+       options.delete "recursive_clean"
+       options.delete "tail"
+       rules = YAML.load(Open.read(options[:rules])) if options[:rules]
+       rules ||= {}
+
+       deps = job.dependencies || []
+       deps += job.input_dependencies || []
+
+       dep_ids = deps.collect do |dep|
+         seen[dep.path] ||= self.orchestrate_job(dep, options.dup, seen)
+       end.compact
+
+       job_rules = Workflow::Orchestrator.job_rules(rules, job)
+       job_options = options.merge(job_rules).merge(:slurm_dependencies => dep_ids)
+       run_job(job, job_options)
+     end
+   end
+ end
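
The orchestrate_job helper above walks a job's dependency tree, submits each unfinished dependency as its own SLURM job, and chains the resulting job ids into the parent submission via :slurm_dependencies; per-task resource rules come from the YAML file given as :rules. A minimal sketch of how it might be invoked follows; the require path for the HPC module, the Sample workflow, the task name, and the rules file path are illustrative assumptions, not part of this diff.

    require 'rbbt/workflow'
    require 'rbbt/hpc'                          # assumed entry point for HPC::SLURM

    Workflow.require_workflow "Sample"          # hypothetical workflow
    job = Sample.job(:some_task, "run1")        # hypothetical task and job name

    # Dependencies are submitted first; their SLURM ids reach the parent job
    # as an sbatch --dependency=afterok:<ids> constraint.
    HPC::SLURM.orchestrate_job(job,
      :queue => 'bsc_ls',
      :time  => '2:00:00',
      :rules => 'etc/slurm_rules.yaml')         # per-task resource rules (YAML)
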
@@ -0,0 +1,570 @@
+ module HPC
+   class SBATCH < Exception;
+     attr_accessor :directory
+     def initialize(directory)
+       @directory = directory
+     end
+   end
+
+   module SLURM
+
+     def self.template(args, options = {})
+
+       development = options.delete :drbbt
+       singularity = options.delete :singularity
+       contain = options.delete :contain
+       sync = options.delete :sync
+       user_group = options.delete :user_group
+       contain_and_sync = options.delete :contain_and_sync
+       wipe_container = options.delete :wipe_container
+       copy_image = options.delete :copy_image
+       exclusive = options.delete :exclusive
+       highmem = options.delete :highmem
+
+       queue = options.delete(:queue) || 'bsc_ls'
+       task_cpus = options.delete(:task_cpus) || 1
+       nodes = options.delete(:nodes) || 1
+       time = options.delete(:time) || "0:00:10"
+
+       inputs_dir = options.delete :inputs_dir
+       config_keys = options.delete :config_keys
+
+       user = ENV['USER'] || `whoami`.strip
+       group = File.basename(File.dirname(ENV['HOME']))
+
+       if contain_and_sync
+         contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
+         sync = "~/.rbbt/var/jobs" if sync.nil?
+         wipe_container = "post" if wipe_container.nil?
+       end
+
+       contain = nil if contain == "" || contain == "none"
+       sync = nil if sync == "" || sync == "none"
+
+       contain = File.expand_path(contain) if contain
+
+       name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
+       options.delete(:name)
+       slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
+       options.delete(:slurm_basedir)
+
+       rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
+
+       rbbt_cmd += " " << options.collect do |o,v|
+         o = o.to_s
+         case v
+         when TrueClass
+           '--' << o
+         when FalseClass
+           '--' << o << "=false"
+         else
+           ['--' << o, "'#{v}'"] * " "
+         end
+       end * " "
+
+       rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
+
+       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+
+       #{{{ PREPARE LOCAL LOGFILES
+
+       Open.mkdir slurm_basedir
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       #{{{ GENERATE TEMPLATE
+
+       # HEADER
+       header =<<-EOF
+ #!/bin/bash
+ #SBATCH --qos="#{queue}"
+ #SBATCH --job-name="#{name}"
+ #SBATCH --workdir="#{Dir.pwd}"
+ #SBATCH --output="#{fout}"
+ #SBATCH --error="#{ferr}"
+ #SBATCH --cpus-per-task="#{task_cpus}"
+ #SBATCH --time="#{time}"
+ #SBATCH --nodes="#{nodes}"
+       EOF
+
+       prep = ""
+
+       if highmem
+         header +=<<-EOF
+ #SBATCH --constraint=highmem
+         EOF
+       end
+
+       if exclusive
+         header +=<<-EOF
+ #SBATCH --exclusive
+         EOF
+       end
+
+       header +=<<-EOF
+ #CMD: #{rbbt_cmd}
+       EOF
+
+       # ENV
+       env = ""
+       env +=<<-EOF
+ # Prepare env
+ [[ -f ~/config/load.sh ]] && source ~/config/load.sh
+ module load java
+
+ # Calculate max available memory
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+       EOF
+
+
+       # RUN
+       run = ""
+       exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
+
+
+       if singularity
+         #{{{ SINGULARITY
+
+         singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
+
+         env +=<<-EOF
+ module load intel/2018.1
+ module load singularity
+ PROJECTS_ROOT="/gpfs/projects/bsc26/"
+ SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
+ SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
+ SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
+ mkdir -p "$SINGULARITY_RUBY_INLINE"
+         EOF
+
+         if contain
+           scratch_group_dir = File.join('/gpfs/scratch/', group)
+           projects_group_dir = File.join('/gpfs/projects/', group)
+
+           prep +=<<-EOF
+
+ # Prepare container dir
+ CONTAINER_DIR="#{contain}"
+ mkdir -p $CONTAINER_DIR/.rbbt/etc/
+
+ for dir in .ruby_inline git home; do
+ mkdir -p $CONTAINER_DIR/$dir
+ done
+
+ for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
+ mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
+ done
+
+ # Copy environment
+ cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
+
+ # Set search_paths
+ echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+           EOF
+
+           if user_group && group != user_group
+             prep +=<<-EOF
+
+ # Add user_group search_path
+ echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+             EOF
+           end
+
+           if inputs_dir
+             prep +=<<-EOF
+
+ # Copy inputs
+ [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
+             EOF
+             rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
+           end
+
+           if copy_image
+             prep +=<<EOF
+
+ # Copy image
+ rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
+ SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
+ EOF
+           end
+
+           if wipe_container == "pre" || wipe_container == "both"
+             if singularity
+               prep +=<<-EOF
+
+ # Clean container pre
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+               EOF
+             else
+               prep = ""
+             end
+           end
+         end
+
+         if contain
+           singularity_exec << %( -C -H "$CONTAINER_DIR" \
+             -B /scratch/tmp \
+             #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
+             -B #{scratch_group_dir} \
+             -B #{projects_group_dir} \
+             -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
+             -B ~/git:"$CONTAINER_DIR/git":ro \
+             #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
+             -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
+             "$SINGULARITY_IMG")
+           exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
+         else
+           singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
+         end
+
+         if development
+           exec_cmd += " rbbt --dev='#{development}'"
+         else
+           exec_cmd += ' rbbt'
+         end
+
+         exec_cmd = singularity_exec + " " + exec_cmd
+       else
+         if development
+           exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
+         else
+           exec_cmd << " " << 'rbbt'
+         end
+
+         if contain
+           rbbt_cmd << " " << %(--workdir_all='#{contain}')
+         end
+       end
+
+
+       cmd =<<-EOF
+ #{exec_cmd} \\
+ #{rbbt_cmd}
+       EOF
+
+       run +=<<-EOF
+
+ # Run command
+ #{cmd}
+
+ # Save exit status
+ exit_status=$?
+
+       EOF
+
+       # CODA
+       coda = ""
+       if sync
+         if singularity
+           coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
+           EOF
+         else
+           coda +=<<-EOF
+ rbbt system clean all -q &>> #{fsync}
+           EOF
+         end
+
+         if sync.include?("=>")
+           source, _sep, sync = sync.partition("=>")
+           source = source.strip
+           sync = sync.strip
+           source = File.join(File.expand_path(contain), source)
+         else
+           source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
+         end
+
+         target = File.expand_path(sync)
+         coda +=<<-EOF
+
+ # Sync data to target location
+ mkdir -p "$(dirname '#{target}')"
+ rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
+ sync_es="$?"
+ find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
+         EOF
+
+         if contain && (wipe_container == "post" || wipe_container == "both")
+           prep =<<-EOF + prep
+ if ls -A '#{contain}' &> /dev/null ; then
+ echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
+ fi
+           EOF
+           if singularity
+             coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
+
+
+ # Clean container directory
+ #if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+ #else
+ # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ #fi
+             EOF
+           else
+             coda +=<<-EOF
+ #{exec_cmd} system clean
+ if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ rm -Rfv #{contain} &>> #{fsync}
+ else
+ echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ fi
+ unset sync_es
+             EOF
+
+           end
+         end
+       end
+       coda +=<<-EOF
+
+ # Write exit status to file
+ echo $exit_status > #{fexit}
+       EOF
+       if sync
+         coda +=<<-EOF
+ if [ "$sync_es" == '0' ]; then
+ unset sync_es
+ exit $exit_status
+ else
+ exit $sync_es
+ fi
+         EOF
+       else
+         coda +=<<-EOF
+ exit $exit_status
+         EOF
+       end
+
+       template = [header, env, prep, run, coda] * "\n"
+
+       template
+     end
+
+     def self.issue_template(template, options = {})
+
+       slurm_basedir = options[:slurm_basedir]
+       dependencies = options.delete :slurm_dependencies
+       Open.mkdir slurm_basedir
+
+       dry_run = options.delete :dry_run
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fdep = File.join(slurm_basedir, 'dependencies.list')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       job = nil
+       if options[:clean_job]
+         [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
+           Open.rm file if Open.exists? file
+         end
+       end
+
+       return if Open.exists?(fexit)
+
+       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+       STDERR.puts template
+
+       Open.write(fcmd, template) unless File.exists? fcmd
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
+       else
+         if File.exists?(fout)
+           return
+         elsif dry_run
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
+           raise HPC::SBATCH, slurm_basedir
+         else
+           Open.rm fsync
+           Open.rm fexit
+           Open.rm fout
+           Open.rm ferr
+           Open.write(fdep, dependencies * "\n") if dependencies.any?
+           dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
+           job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+           Log.debug "SBATCH job id: #{job}"
+           Open.write(fjob, job.to_s)
+           job
+         end
+       end
+     end
+
+     def self.follow_job(slurm_basedir, tail = true)
+       fjob = File.join(slurm_basedir, 'job.id')
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fstatus = File.join(slurm_basedir, 'job.status')
+
+       job = Open.read(fjob).strip if Open.exists?(fjob)
+
+       if job
+         status_txt = CMD.cmd("squeue --job #{job}").read
+         STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+         STDERR.puts status_txt
+         lines = status_txt.split("\n").length
+       end
+
+       if tail
+         Log.severity = 10
+         while ! File.exists? fout
+           if job
+             STDERR.puts
+             Log.clear_line(STDERR)
+             STDERR.write Log.color(:magenta, "Waiting for Output")
+             3.times do
+               STDERR.write Log.color(:magenta, ".")
+               sleep 1
+             end
+             status_txt = CMD.cmd("squeue --job #{job}").read
+             lines.times do
+               Log.clear_line(STDERR)
+             end
+             Log.clear_line(STDERR)
+             STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+             STDERR.puts status_txt
+             lines = status_txt.split("\n").length
+           end
+         end
+         STDERR.puts
+         Log.clear_line(STDERR)
+         STDERR.puts Log.color(:magenta, "Output:")
+         begin
+           CMD.cmd("squeue --job #{job} > #{fstatus}")
+           out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
+           err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
+
+           terr = Misc.consume_stream(err, true, STDERR) if err
+           tout = Misc.consume_stream(out, true, STDOUT) if out
+
+           sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
+         rescue Aborted
+         ensure
+           begin
+             terr.exit if terr
+             tout.exit if tout
+             err.close if err
+             err.join if err
+           rescue Exception
+           end
+
+           begin
+             out.close if out
+             out.join if out
+           rescue Exception
+           end
+         end
+       end
+     end
+
+     def self.wait_for_job(slurm_basedir, time = 1)
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fjob = File.join(slurm_basedir, 'job.id')
+       job = Open.read(fjob) if Open.exists?(fjob)
+
+
+       while ! Open.exists?(fexit)
+         sleep time
+       end
+     end
+
+     def self.run_job(job, options = {})
+       options = IndiferentHash.setup(options.dup)
+
+       dry_run = options.delete :dry_run
+       tail = options.delete :tail
+       dependencies = options.delete :slurm_dependencies
+       options[:jobname] = job.clean_name
+
+       workflow = job.workflow
+
+       task = Symbol === job.overriden ? job.overriden : job.task_name
+
+       if job.overriden
+         override_deps = job.rec_dependencies.
+           select{|dep| Symbol === dep.overriden }.
+           collect do |dep|
+
+           name = [dep.workflow.to_s, dep.task_name] * "#"
+           [name, dep.path] * "="
+         end * ","
+       end
+
+       remove_slurm_basedir = options.delete :remove_slurm_basedir
+       slurm_basedir = options.delete :SLURM_basedir
+       slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
+       TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
+         options[:slurm_basedir] ||= tmp_directory
+         slurm_basedir = options[:slurm_basedir]
+         inputs_dir = File.join(tmp_directory, 'inputs_dir')
+         saved = Step.save_job_inputs(job, inputs_dir)
+
+         if saved && saved.any?
+           options[:inputs_dir] = inputs_dir
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
+         else
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
+         end
+
+         cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
+
+         template = self.template(cmd, options)
+         jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
+
+         return jobid unless tail
+
+         t_monitor = Thread.new do
+           self.follow_job(slurm_basedir, :STDERR)
+         end
+         self.wait_for_job(slurm_basedir)
+         t_monitor.raise Aborted
+         return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
+         path = Open.read(File.join(slurm_basedir, 'std.out')).strip
+         if Open.exists?(path) && job.path != path
+           Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
+           Open.ln path, job.path
+           Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
+           Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
+         end
+         jobid
+       end
+     end
+   end
+
+   def self.relay(job, options={})
+     options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
+     done_deps = job.dependencies.select do |dep|
+       dep.done?
+     end
+
+     error_deps = job.dependencies.select do |dep|
+       dep.error? && ! dep.recoverable_error?
+     end
+
+     (done_deps + error_deps).each do |dep|
+       Step.migrate(dep.path, options[:search_path], options)
+     end
+
+   end
+ end
+
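
For a single job, run_job assembles the sbatch script with template, writes it to a per-job slurm_basedir together with std.out, std.err, job.id and exit.status, submits it through issue_template, and, when :tail is set, monitors it with follow_job until wait_for_job sees exit.status. A hedged sketch of direct use; the workflow job and the option values are chosen only for illustration.

    job = Sample.job(:some_task, "run1")        # hypothetical workflow job

    jobid = HPC::SLURM.run_job(job,
      :queue            => 'bsc_ls',
      :task_cpus        => 4,
      :time             => '4:00:00',
      :highmem          => true,                # adds #SBATCH --constraint=highmem
      :contain_and_sync => true,                # work under /scratch/tmp, rsync results back
      :tail             => true)                # follow the job until exit.status appears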