rbbt-util 5.28.11 → 5.29.2

@@ -0,0 +1,100 @@
+ require 'rbbt/workflow/util/orchestrator'
+ module HPC
+   module SLURM
+
+     def self.job_rules(rules, job)
+       workflow = job.workflow.to_s
+       task_name = job.task_name.to_s
+       defaults = rules["defaults"] || {}
+
+       job_rules = IndiferentHash.setup(defaults.dup)
+
+       rules["chains"].each do |name,info|
+         IndiferentHash.setup(info)
+         chain_tasks = info[:tasks].split(/,\s*/)
+
+         chain_tasks.each do |task|
+           task_workflow, chain_task = task.split("#")
+           chain_task, task_workflow = task_workflow, info[:workflow] if chain_task.nil? or chain_task.empty?
+           job_rules["chain_tasks"] ||= {}
+           job_rules["chain_tasks"][task_workflow] ||= []
+           job_rules["chain_tasks"][task_workflow] << chain_task
+           next unless task_name == chain_task.to_s && workflow == task_workflow.to_s
+           config_keys = job_rules.delete :config_keys
+           job_rules = IndiferentHash.setup(job_rules.merge(info))
+           if config_keys
+             config_keys.gsub!(/,\s+/,',')
+             job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+           end
+         end
+
+         if job_rules["chain_tasks"][workflow] && job_rules["chain_tasks"][workflow].include?(task_name)
+           break
+         else
+           job_rules.delete "chain_tasks"
+         end
+       end if rules["chains"]
+
+       config_keys = job_rules.delete :config_keys
+       job_rules = IndiferentHash.setup(job_rules.merge(rules[workflow][task_name])) if rules[workflow] && rules[workflow][task_name]
+
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+       end
+
+       if rules["skip"] && rules["skip"][workflow]
+         job_rules["skip"] = true if rules["skip"][workflow].split(/,\s*/).include? task_name
+       end
+
+       job_rules
+     end
+
+     def self.get_job_dependencies(job, job_rules)
+       deps = job.dependencies || []
+       deps += job.input_dependencies || []
+       deps
+     end
+
+     def self.orchestrate_job(job, options, skip = false, seen = {})
+       return if job.done?
+       return unless job.path.split("/")[-4] == "jobs"
+       seen[:orchestration_target_job] ||= job
+       options.delete "recursive_clean"
+       options.delete "tail"
+       options.delete "printfile"
+       rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
+       rules ||= {}
+       IndiferentHash.setup(rules)
+
+       job_rules = self.job_rules(rules, job)
+
+       deps = get_job_dependencies(job, job_rules)
+
+       dep_ids = deps.collect do |dep|
+         skip_dep = job_rules["chain_tasks"] &&
+           job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
+           job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
+         seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
+       end.flatten.compact.uniq
+
+       skip = true if job_rules[:skip]
+       return dep_ids if skip and seen[:orchestration_target_job] != job
+
+       job_rules.delete :chain_tasks
+       job_rules.delete :tasks
+       job_rules.delete :workflow
+
+       config_keys = job_rules.delete(:config_keys)
+
+       job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
+       job_options.delete :orchestration_rules
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_options[:config_keys] = job_options[:config_keys] ? config_keys + "," + job_options[:config_keys] : config_keys
+       end
+
+       run_job(job, job_options)
+     end
+   end
+ end
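The rules file passed as `orchestration_rules` above is plain YAML. A minimal sketch of what `job_rules` consumes; all workflow and task names here (MyWorkflow, step1, step2, cached_step) are hypothetical:

    # orchestration_rules.yaml (illustrative names throughout)
    defaults:
      queue: bsc_ls
      time: "2:00:00"
    chains:
      preprocessing:
        workflow: MyWorkflow
        tasks: step1, step2
        task_cpus: 4
    MyWorkflow:
      step2:
        time: "12:00:00"
    skip:
      MyWorkflow: cached_step

Given such a file, `job_rules` layers, in order: the `defaults` section, any chain whose task list covers the job, per-task settings under `rules[workflow][task_name]`, and the `skip` list, into a single IndiferentHash of job settings.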
@@ -0,0 +1,571 @@
+ module HPC
+   class SBATCH < Exception
+     attr_accessor :directory
+     def initialize(directory)
+       @directory = directory
+     end
+   end
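`HPC::SBATCH` doubles as control flow: on dry runs, `issue_template` below raises it carrying the staging directory, so callers can inspect the generated files without submitting. A sketch of rescuing it, assuming `job` is a prepared workflow step:

    begin
      HPC::SLURM.run_job(job, :dry_run => true)
    rescue HPC::SBATCH => exception
      puts "SLURM files staged under #{exception.directory}"
    end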
+
+   module SLURM
+
+     def self.template(args, options = {})
+
+       development = options.delete :drbbt
+       singularity = options.delete :singularity
+       contain = options.delete :contain
+       sync = options.delete :sync
+       user_group = options.delete :user_group
+       contain_and_sync = options.delete :contain_and_sync
+       wipe_container = options.delete :wipe_container
+       copy_image = options.delete :copy_image
+       exclusive = options.delete :exclusive
+       highmem = options.delete :highmem
+
+       queue = options.delete(:queue) || 'bsc_ls'
+       task_cpus = options.delete(:task_cpus) || 1
+       nodes = options.delete(:nodes) || 1
+       time = options.delete(:time) || "0:00:10"
+
+       inputs_dir = options.delete :inputs_dir
+       config_keys = options.delete :config_keys
+
+       user = ENV['USER'] || `whoami`.strip
+       group = File.basename(File.dirname(ENV['HOME']))
+
+       if contain_and_sync
+         contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
+         sync = "~/.rbbt/var/jobs" if sync.nil?
+         wipe_container = "post" if wipe_container.nil?
+       end
+
+       contain = nil if contain == "" || contain == "none"
+       sync = nil if sync == "" || sync == "none"
+
+       contain = File.expand_path(contain) if contain
+
+       name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
+       options.delete(:name)
+       slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name))
+       options.delete(:slurm_basedir)
+
+       rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ") ? '"' + e + '"' : e } * " "
+
+       rbbt_cmd += " " << options.collect do |o,v|
+         o = o.to_s
+         case v
+         when TrueClass
+           '--' << o
+         when FalseClass
+           '--' << o << "=false"
+         else
+           ['--' << o, "'#{v}'"] * " "
+         end
+       end * " "
+
+       rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
+
+       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+
+       #{{{ PREPARE LOCAL LOGFILES
+
+       Open.mkdir slurm_basedir
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       #{{{ GENERATE TEMPLATE
+
+       # HEADER
+       header =<<-EOF
+ #!/bin/bash
+ #SBATCH --qos="#{queue}"
+ #SBATCH --job-name="#{name}"
+ #SBATCH --workdir="#{Dir.pwd}"
+ #SBATCH --output="#{fout}"
+ #SBATCH --error="#{ferr}"
+ #SBATCH --cpus-per-task="#{task_cpus}"
+ #SBATCH --time="#{time}"
+ #SBATCH --nodes="#{nodes}"
+       EOF
+
+       prep = ""
+
+       if highmem
+         header +=<<-EOF
+ #SBATCH --constraint=highmem
+         EOF
+       end
+
+       if exclusive
+         header +=<<-EOF
+ #SBATCH --exclusive
+         EOF
+       end
+
+       header +=<<-EOF
+ #CMD: #{rbbt_cmd}
+       EOF
+
+       # ENV
+       env = ""
+       env +=<<-EOF
+ # Prepare env
+ [[ -f ~/config/load.sh ]] && source ~/config/load.sh
+ module load java
+
+ # Calculate max available memory
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+       EOF
+
+
+       # RUN
+       run = ""
+       exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
+
+
+       if singularity
+         #{{{ SINGULARITY
+
+         singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
+
+         env +=<<-EOF
+ module load intel/2018.1
+ module load singularity
+ PROJECTS_ROOT="/gpfs/projects/bsc26/"
+ SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
+ SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
+ SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
+ mkdir -p "$SINGULARITY_RUBY_INLINE"
+         EOF
+
+         if contain
+           scratch_group_dir = File.join('/gpfs/scratch/', group)
+           projects_group_dir = File.join('/gpfs/projects/', group)
+
+           prep +=<<-EOF
+
+ # Prepare container dir
+ CONTAINER_DIR="#{contain}"
+ mkdir -p $CONTAINER_DIR/.rbbt/etc/
+
+ for dir in .ruby_inline git home; do
+ mkdir -p $CONTAINER_DIR/$dir
+ done
+
+ for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
+ mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
+ done
+
+ # Copy environment
+ cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
+
+ # Set search_paths
+ echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+           EOF
+
+           if user_group && group != user_group
+             prep +=<<-EOF
+
+ # Add user_group search_path
+ echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+             EOF
+           end
+
+           if inputs_dir
+             prep +=<<-EOF
+
+ # Copy inputs
+ [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
+             EOF
+             rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
+           end
+
+           if copy_image
+             prep +=<<-EOF
+
+ # Copy image
+ rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
+ SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
+             EOF
+           end
+
+           if wipe_container == "pre" || wipe_container == "both"
+             if singularity
+               prep +=<<-EOF
+
+ # Clean container pre
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+               EOF
+             else
+               prep = ""
+             end
+           end
+         end
+
+         if contain
+           singularity_exec << %( -C -H "$CONTAINER_DIR" \
+             -B /scratch/tmp \
+             #{ user_group && group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
+             -B #{scratch_group_dir} \
+             -B #{projects_group_dir} \
+             -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
+             -B ~/git:"$CONTAINER_DIR/git":ro \
+             #{Open.exists?('~/.rbbt/software/opt/') ? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
+             -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
+             "$SINGULARITY_IMG")
+           exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
+         else
+           singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
+         end
+
+         if development
+           exec_cmd += " rbbt --dev='#{development}'"
+         else
+           exec_cmd += ' rbbt'
+         end
+
+         exec_cmd = singularity_exec + " " + exec_cmd
+       else
+         if development
+           exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
+         else
+           exec_cmd << " " << 'rbbt'
+         end
+
+         if contain
+           rbbt_cmd << " " << %(--workdir_all='#{contain}')
+         end
+       end
+
+
+       cmd =<<-EOF
+ #{exec_cmd} \\
+ #{rbbt_cmd}
+       EOF
+
+       run +=<<-EOF
+
+ # Run command
+ #{cmd}
+
+ # Save exit status
+ exit_status=$?
+
+       EOF
+
+       # CODA
+       coda = ""
+       if sync
+         if singularity
+           coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
+           EOF
+         else
+           coda +=<<-EOF
+ rbbt system clean all -q &>> #{fsync}
+           EOF
+         end
+
+         if sync.include?("=>")
+           source, _sep, sync = sync.partition("=>")
+           source = source.strip
+           sync = sync.strip
+           source = File.join(File.expand_path(contain), source)
+         else
+           source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
+         end
+
+         target = File.expand_path(sync)
+         coda +=<<-EOF
+
+ # Sync data to target location
+ mkdir -p "$(dirname '#{target}')"
+ rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
+ sync_es="$?"
+ find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
+         EOF
+
+         if contain && (wipe_container == "post" || wipe_container == "both")
+           prep =<<-EOF + prep
+ if ls -A '#{contain}' &> /dev/null ; then
+ echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
+ fi
+           EOF
+           if singularity
+             coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
+
+
+ # Clean container directory
+ #if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+ #else
+ # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ #fi
+             EOF
+           else
+             coda +=<<-EOF
+ #{exec_cmd} system clean
+ if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ rm -Rfv #{contain} &>> #{fsync}
+ else
+ echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ fi
+ unset sync_es
+             EOF
+
+           end
+         end
+       end
+       coda +=<<-EOF
+
+ # Write exit status to file
+ echo $exit_status > #{fexit}
+       EOF
+       if sync
+         coda +=<<-EOF
+ if [ "$sync_es" == '0' ]; then
+ unset sync_es
+ exit $exit_status
+ else
+ exit $sync_es
+ fi
+         EOF
+       else
+         coda +=<<-EOF
+ exit $exit_status
+         EOF
+       end
+
+       template = [header, env, prep, run, coda] * "\n"
+
+       template
+     end
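A sketch of calling `template` directly; the args mirror an `rbbt workflow task` command line and the names are hypothetical. The return value is the complete sbatch script assembled from header, env, prep, run and coda (note it also creates `slurm_basedir` on disk as a side effect):

    args = %w(workflow task MyWorkflow step1 -pf)
    script = HPC::SLURM.template(args, :queue => 'bsc_ls', :task_cpus => 4, :time => '4:00:00')
    # script starts with the #SBATCH header, for example:
    #   #!/bin/bash
    #   #SBATCH --qos="bsc_ls"
    #   #SBATCH --cpus-per-task="4"
    # and ends with the coda that records $exit_status into exit.status.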
+
+     def self.issue_template(template, options = {})
+
+       slurm_basedir = options[:slurm_basedir]
+       dependencies = options.delete :slurm_dependencies
+       dependencies = [] if dependencies.nil?
+       Open.mkdir slurm_basedir
+
+       dry_run = options.delete :dry_run
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fdep = File.join(slurm_basedir, 'dependencies.list')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       job = nil
+       if options[:clean_job]
+         [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
+           Open.rm file if Open.exists? file
+         end
+       end
+
+       return if Open.exists?(fexit)
+
+       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+       STDERR.puts template
+
+       Open.write(fcmd, template) unless File.exists? fcmd
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
+       else
+         if File.exists?(fout)
+           return
+         elsif dry_run
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
+           raise HPC::SBATCH, slurm_basedir
+         else
+           Open.rm fsync
+           Open.rm fexit
+           Open.rm fout
+           Open.rm ferr
+           Open.write(fdep, dependencies * "\n") if dependencies.any?
+           dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
+           job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+           Log.debug "SBATCH job id: #{job}"
+           Open.write(fjob, job.to_s)
+           job
+         end
+       end
+     end
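Dependencies arrive as SLURM job ids and are chained through sbatch's afterok clause; a worked example with hypothetical ids:

    dependencies = [123456, 123457]
    dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
    # dep_str => "--dependency=afterok:123456:123457"
    # submitted as: sbatch --dependency=afterok:123456:123457 'command.slurm'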
+
+     def self.follow_job(slurm_basedir, tail = true)
+       fjob = File.join(slurm_basedir, 'job.id')
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fstatus = File.join(slurm_basedir, 'job.status')
+
+       job = Open.read(fjob).strip if Open.exists?(fjob)
+
+       if job
+         status_txt = CMD.cmd("squeue --job #{job}").read
+         STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+         STDERR.puts status_txt
+         lines = status_txt.split("\n").length
+       end
+
+       if tail
+         Log.severity = 10
+         while ! File.exists? fout
+           if job
+             STDERR.puts
+             Log.clear_line(STDERR)
+             STDERR.write Log.color(:magenta, "Waiting for Output")
+             3.times do
+               STDERR.write Log.color(:magenta, ".")
+               sleep 1
+             end
+             status_txt = CMD.cmd("squeue --job #{job}").read
+             lines.times do
+               Log.clear_line(STDERR)
+             end
+             Log.clear_line(STDERR)
+             STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+             STDERR.puts status_txt
+             lines = status_txt.split("\n").length
+           end
+         end
+         STDERR.puts
+         Log.clear_line(STDERR)
+         STDERR.puts Log.color(:magenta, "Output:")
+         begin
+           CMD.cmd("squeue --job #{job} > #{fstatus}")
+           out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
+           err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
+
+           terr = Misc.consume_stream(err, true, STDERR) if err
+           tout = Misc.consume_stream(out, true, STDOUT) if out
+
+           sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
+         rescue Aborted
+         ensure
+           begin
+             terr.exit if terr
+             tout.exit if tout
+             err.close if err
+             err.join if err
+           rescue Exception
+           end
+
+           begin
+             out.close if out
+             out.join if out
+           rescue Exception
+           end
+         end
+       end
+     end
+
+     def self.wait_for_job(slurm_basedir, time = 1)
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fjob = File.join(slurm_basedir, 'job.id')
+       job = Open.read(fjob) if Open.exists?(fjob)
+
+
+       while ! Open.exists?(fexit)
+         sleep time
+       end
+     end
+
+     def self.run_job(job, options = {})
+       options = IndiferentHash.setup(options.dup)
+
+       dry_run = options.delete :dry_run
+       tail = options.delete :tail
+       dependencies = options.delete :slurm_dependencies
+       options[:jobname] = job.clean_name
+
+       workflow = job.workflow
+
+       task = Symbol === job.overriden ? job.overriden : job.task_name
+
+       if job.overriden
+         override_deps = job.rec_dependencies.
+           select{|dep| Symbol === dep.overriden }.
+           collect do |dep|
+
+             name = [dep.workflow.to_s, dep.task_name] * "#"
+             [name, dep.path] * "="
+           end * ","
+       end
+
+       remove_slurm_basedir = options.delete :remove_slurm_basedir
+       slurm_basedir = options.delete :SLURM_basedir
+       slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
+       TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
+         options[:slurm_basedir] ||= tmp_directory
+         slurm_basedir = options[:slurm_basedir]
+         inputs_dir = File.join(tmp_directory, 'inputs_dir')
+         saved = Step.save_job_inputs(job, inputs_dir)
+
+         if saved && saved.any?
+           options[:inputs_dir] = inputs_dir
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
+         else
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
+         end
+
+         cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
+
+         template = self.template(cmd, options)
+         jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
+
+         return jobid unless tail
+
+         t_monitor = Thread.new do
+           self.follow_job(slurm_basedir, :STDERR)
+         end
+         self.wait_for_job(slurm_basedir)
+         t_monitor.raise Aborted
+         return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
+         path = Open.read(File.join(slurm_basedir, 'std.out')).strip
+         if Open.exists?(path) && job.path != path
+           Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Establishing link."
+           Open.ln path, job.path
+           Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
+           Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
+         end
+         jobid
+       end
+     end
+   end
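End to end, `run_job` stages the job inputs, renders the template, and submits it. A hypothetical usage sketch; the workflow and task names are illustrative:

    wf = Workflow.require_workflow "MyWorkflow"
    job = wf.job(:step1, "example", :input1 => "value")
    jobid = HPC::SLURM.run_job(job, :queue => 'bsc_ls', :time => '1:00:00')
    # With :tail => true, run_job blocks, mirrors std.out/std.err locally, and
    # links the remote result path back onto job.path when the exit status is 0.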
+
+   def self.relay(job, options={})
+     options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
+     done_deps = job.dependencies.select do |dep|
+       dep.done?
+     end
+
+     error_deps = job.dependencies.select do |dep|
+       dep.error? && ! dep.recoverable_error?
+     end
+
+     (done_deps + error_deps).each do |dep|
+       Step.migrate(dep.path, options[:search_path], options)
+     end
+
+   end
+ end
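`relay` ships a job's finished dependencies, plus those that failed unrecoverably, to another host before resubmitting there. A minimal sketch, assuming `job` is a local workflow step:

    HPC.relay(job, :target => 'mn1', :search_path => 'user')
    # migrates each done or unrecoverably failed dependency via Step.migrate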
+