rbbt-util 5.28.12 → 5.29.4

@@ -0,0 +1,111 @@
+ require 'rbbt/workflow/util/orchestrator'
+ module HPC
+   module SLURM
+
+     # Compute the effective rule set for a job by layering the rule file's
+     # defaults, chain membership, workflow/task-specific entries and skip lists.
+     def self.job_rules(rules, job)
+       workflow = job.workflow.to_s
+       task_name = job.task_name.to_s
+       defaults = rules["defaults"] || {}
+
+       job_rules = IndiferentHash.setup(defaults.dup)
+
+       rules["chains"].each do |name,info|
+         IndiferentHash.setup(info)
+         chain_tasks = info[:tasks].split(/,\s*/)
+
+         chain_tasks.each do |task|
+           task_workflow, chain_task = task.split("#")
+           chain_task, task_workflow = task_workflow, info[:workflow] if chain_task.nil? or chain_task.empty?
+           job_rules["chain_tasks"] ||= {}
+           job_rules["chain_tasks"][task_workflow] ||= []
+           job_rules["chain_tasks"][task_workflow] << chain_task
+           next unless task_name == chain_task.to_s && workflow == task_workflow.to_s
+           config_keys = job_rules.delete :config_keys
+           job_rules = IndiferentHash.setup(job_rules.merge(info))
+           if config_keys
+             config_keys.gsub!(/,\s+/,',')
+             job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+           end
+         end
+
+         if job_rules["chain_tasks"][workflow] && job_rules["chain_tasks"][workflow].include?(task_name)
+           break
+         else
+           job_rules.delete "chain_tasks"
+         end
+       end if rules["chains"]
+
+       config_keys = job_rules.delete :config_keys
+       job_rules = IndiferentHash.setup(job_rules.merge(rules[workflow][task_name])) if rules[workflow] && rules[workflow][task_name]
+
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+       end
+
+       if rules["skip"] && rules["skip"][workflow]
+         job_rules["skip"] = true if rules["skip"][workflow].split(/,\s*/).include? task_name
+       end
+
+       job_rules
+     end
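+
+     # A minimal sketch of the orchestration rules YAML this method consumes;
+     # the workflow, task, and config key names below are hypothetical
+     # placeholders:
+     #
+     #   defaults:
+     #     queue: bsc_ls
+     #     time: "2:00:00"
+     #   chains:
+     #     my_chain:
+     #       workflow: MyWorkflow
+     #       tasks: step1, step2
+     #       config_keys: "key1 value1"
+     #   MyWorkflow:
+     #     step2:
+     #       task_cpus: 4
+     #   skip:
+     #     MyWorkflow: step1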
+
+     # A job's SLURM dependencies are its regular dependencies plus its input
+     # dependencies.
+     def self.get_job_dependencies(job, job_rules)
+       deps = job.dependencies || []
+       deps += job.input_dependencies || []
+       deps
+     end
+
+     # Recursively submit a job and its dependency tree, skipping dependencies
+     # that belong to the same chain as the job (they run in one submission).
+     def self.orchestrate_job(job, options, skip = false, seen = {})
+       return if job.done?
+       return unless job.path.split("/")[-4] == "jobs"
+       seen[:orchestration_target_job] ||= job
+
+       options.delete "recursive_clean"
+       options.delete "clean_task"
+       options.delete "clean"
+       options.delete "tail"
+       options.delete "printfile"
+       options.delete "detach"
+
+       rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
+       rules ||= {}
+       IndiferentHash.setup(rules)
+
+       job_rules = self.job_rules(rules, job)
+
+       deps = get_job_dependencies(job, job_rules)
+
+       dep_ids = deps.collect do |dep|
+         skip_dep = job_rules["chain_tasks"] &&
+           job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
+           job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
+
+         dep_result = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
+         if job.canfail_paths.include? dep.path
+           [dep_result].flatten.collect{|id| ['canfail', id] * ":"}
+         else
+           dep_result
+         end
+       end.flatten.compact.uniq
+
+       skip = true if job_rules[:skip]
+       return dep_ids if skip and seen[:orchestration_target_job] != job
+
+       job_rules.delete :chain_tasks
+       job_rules.delete :tasks
+       job_rules.delete :workflow
+
+       config_keys = job_rules.delete(:config_keys)
+
+       job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
+       job_options.delete :orchestration_rules
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_options[:config_keys] = job_options[:config_keys] ? config_keys + "," + job_options[:config_keys] : config_keys
+       end
+
+       run_job(job, job_options)
+     end
+   end
+ end
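
Taken together, a caller loads a rules file and hands the target job to `orchestrate_job`, which submits the whole dependency tree. A minimal sketch, assuming both files in this release are loaded; the workflow name, task, input, and rules path are hypothetical:

    require 'rbbt/workflow'

    Workflow.require_workflow "MyWorkflow"              # hypothetical workflow
    job = MyWorkflow.job(:step2, nil, :input1 => 1)     # hypothetical task and input

    options = IndiferentHash.setup(
      "orchestration_rules" => "etc/slurm_rules.yaml"   # hypothetical rules file
    )

    HPC::SLURM.orchestrate_job(job, options)
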
@@ -0,0 +1,592 @@
+ module HPC
+   class SBATCH < Exception
+     attr_accessor :directory
+     def initialize(directory)
+       @directory = directory
+     end
+   end
+
+   module SLURM
+
+     # Render the sbatch script for a command line: an #SBATCH header, an
+     # environment block, container preparation, the command itself and a
+     # sync/cleanup coda.
+     def self.template(args, options = {})
+
+       development = options.delete :drbbt
+       singularity = options.delete :singularity
+       contain = options.delete :contain
+       sync = options.delete :sync
+       user_group = options.delete :user_group
+       contain_and_sync = options.delete :contain_and_sync
+       wipe_container = options.delete :wipe_container
+       copy_image = options.delete :copy_image
+       exclusive = options.delete :exclusive
+       highmem = options.delete :highmem
+
+       queue = options.delete(:queue) || 'bsc_ls'
+       task_cpus = options.delete(:task_cpus) || 1
+       nodes = options.delete(:nodes) || 1
+       time = options.delete(:time) || "0:00:10"
+
+       inputs_dir = options.delete :inputs_dir
+       config_keys = options.delete :config_keys
+
+       user = ENV['USER'] || `whoami`.strip
+       group = File.basename(File.dirname(ENV['HOME']))
+
+       if contain_and_sync
+         contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
+         sync = "~/.rbbt/var/jobs" if sync.nil?
+         wipe_container = "post" if wipe_container.nil?
+       end
+
+       contain = nil if contain == "" || contain == "none"
+       sync = nil if sync == "" || sync == "none"
+
+       contain = File.expand_path(contain) if contain
+
+       name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
+       options.delete(:name)
+       slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name))
+       options.delete(:slurm_basedir)
+
+       rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ") ? '"' + e + '"' : e } * " "
+
+       rbbt_cmd += " " << options.collect do |o,v|
+         o = o.to_s
+         case v
+         when TrueClass
+           '--' << o
+         when FalseClass
+           '--' << o << "=false"
+         else
+           ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
+         end
+       end * " "
+
+       rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
+
+       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+       #{{{ PREPARE LOCAL LOGFILES
+
+       Open.mkdir slurm_basedir
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fsyncexit = File.join(slurm_basedir, 'sync.status')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       #{{{ GENERATE TEMPLATE
+
+       # HEADER
+       header = <<-EOF
+ #!/bin/bash
+ #SBATCH --qos="#{queue}"
+ #SBATCH --job-name="#{name}"
+ #SBATCH --workdir="#{Dir.pwd}"
+ #SBATCH --output="#{fout}"
+ #SBATCH --error="#{ferr}"
+ #SBATCH --cpus-per-task="#{task_cpus}"
+ #SBATCH --time="#{time}"
+ #SBATCH --nodes="#{nodes}"
+       EOF
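+
+       # With the defaults above (queue 'bsc_ls', one task cpu, one node and a
+       # time of "0:00:10"), the rendered header would look roughly like:
+       #
+       #   #!/bin/bash
+       #   #SBATCH --qos="bsc_ls"
+       #   #SBATCH --job-name="<name digest>"
+       #   #SBATCH --workdir="<current directory>"
+       #   #SBATCH --output="<slurm_basedir>/std.out"
+       #   #SBATCH --error="<slurm_basedir>/std.err"
+       #   #SBATCH --cpus-per-task="1"
+       #   #SBATCH --time="0:00:10"
+       #   #SBATCH --nodes="1"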
+
+       prep = ""
+
+       if highmem
+         header += <<-EOF
+ #SBATCH --constraint=highmem
+         EOF
+       end
+
+       if exclusive
+         header += <<-EOF
+ #SBATCH --exclusive
+         EOF
+       end
+
+       # ENV
+       env = ""
+       env += <<-EOF
+ # Prepare env
+ [[ -f ~/config/load.sh ]] && source ~/config/load.sh
+ module load java
+
+ # Calculate max available memory
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+       EOF
+
+       # RUN
+       run = ""
+       exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
+
+       if singularity
+         #{{{ SINGULARITY
+
+         singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
+
+         env += <<-EOF
+ module load intel/2018.1
+ module load singularity
+ PROJECTS_ROOT="/gpfs/projects/bsc26/"
+ SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
+ SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
+ SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
+ mkdir -p "$SINGULARITY_RUBY_INLINE"
+         EOF
+
+         if contain
+           scratch_group_dir = File.join('/gpfs/scratch/', group)
+           projects_group_dir = File.join('/gpfs/projects/', group)
+
+           prep += <<-EOF
+
+ # Prepare container dir
+ CONTAINER_DIR="#{contain}"
+ mkdir -p $CONTAINER_DIR/.rbbt/etc/
+
+ for dir in .ruby_inline git home; do
+   mkdir -p $CONTAINER_DIR/$dir
+ done
+
+ for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
+   mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
+ done
+
+ # Copy environment
+ cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
+
+ # Set search_paths
+ echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+           EOF
+
+           if user_group && group != user_group
+             prep += <<-EOF
+
+ # Add user_group search_path
+ echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+             EOF
+           end
+
+           if inputs_dir
+             prep += <<-EOF
+
+ # Copy inputs
+ [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
+             EOF
+             rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
+           end
+
+           if copy_image
+             prep += <<EOF
+
+ # Copy image
+ rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
+ SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
+ EOF
+           end
+
+           if wipe_container == "pre" || wipe_container == "both"
+             if singularity
+               prep += <<-EOF
+
+ # Clean container pre
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+               EOF
+             else
+               prep = ""
+             end
+           end
+         end
+
+         if contain
+           singularity_exec << %( -C -H "$CONTAINER_DIR" \
+ -B /scratch/tmp \
+ #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
+ -B #{scratch_group_dir} \
+ -B #{projects_group_dir} \
+ -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
+ -B ~/git:"$CONTAINER_DIR/git":ro \
+ #{Open.exists?('~/.rbbt/software/opt/') ? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
+ -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
+ "$SINGULARITY_IMG")
+           exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
+         else
+           singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
+         end
+
+         if development
+           exec_cmd += " rbbt --dev='#{development}'"
+         else
+           exec_cmd += ' rbbt'
+         end
+
+         exec_cmd = singularity_exec + " " + exec_cmd
+       else
+         if development
+           exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
+         else
+           exec_cmd << " " << 'rbbt'
+         end
+
+         if contain
+           rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/.rbbt/var/jobs')
+         end
+       end
+
+       cmd = <<-EOF
+ #{exec_cmd} \\
+ #{rbbt_cmd}
+       EOF
+
+       header += <<-EOF
+ #CMD: #{rbbt_cmd}
+       EOF
+
+       run += <<-EOF
+
+ # Run command
+ #{cmd}
+
+ # Save exit status
+ exit_status=$?
+
+       EOF
+
+       # CODA
+       coda = ""
+       if sync
+         if singularity
+           coda += <<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
+           EOF
+         # else
+         #   coda += <<-EOF
+         #rbbt system clean all -q &>> #{fsync}
+         #EOF
+         end
+
+         if sync.include?("=>")
+           source, _sep, sync = sync.partition("=>")
+           source = source.strip
+           sync = sync.strip
+           source = File.join(File.expand_path(contain), source)
+         else
+           source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
+         end
+
+         target = File.expand_path(sync)
+         coda += <<-EOF
+
+ # Sync data to target location
+ mkdir -p "$(dirname '#{target}')"
+ rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
+ sync_es="$?"
+ echo $sync_es > #{fsyncexit}
+ find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
+         EOF
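+
+         # The find/awk pipeline above re-points absolute symlinks in the
+         # synced copy: links that still reference the container-side source
+         # tree are removed and re-created against the equivalent path under
+         # the sync target, so they do not dangle once the container directory
+         # is wiped.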
+
+         if contain && (wipe_container == "post" || wipe_container == "both")
+           prep = <<-EOF + prep
+ if ls -A '#{contain}' &> /dev/null ; then
+   echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
+ fi
+           EOF
+           if singularity
+             coda += <<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
+
+ # Clean container directory
+ #if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+ #else
+ #  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ #fi
+             EOF
+           else
+             coda += <<-EOF
+ ##{exec_cmd} system clean
+ if [ $exit_status == '0' -a $sync_es == '0' ]; then
+   rm -Rfv #{contain} &>> #{fsync}
+ else
+   echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ fi
+             EOF
+           end
+         end
+       end
+
+       coda += <<-EOF
+
+ # Write exit status to file
+ echo $exit_status > #{fexit}
+       EOF
+
+       if sync
+         coda += <<-EOF
+ if [ "$sync_es" == '0' ]; then
+   unset sync_es
+   exit $exit_status
+ else
+   exit $sync_es
+ fi
+         EOF
+       else
+         coda += <<-EOF
+ exit $exit_status
+         EOF
+       end
+
+       template = [header, env, prep, run, coda] * "\n"
+
+       template
+     end
+
+     # Write the script under slurm_basedir and submit it with sbatch, wiring
+     # in afterok/afterany dependency strings; returns the SLURM job id.
+     def self.issue_template(template, options = {})
+
+       slurm_basedir = options[:slurm_basedir]
+       dependencies = options.delete :slurm_dependencies
+       dependencies = [] if dependencies.nil?
+
+       canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
+       dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
+
+       Open.mkdir slurm_basedir
+
+       dry_run = options.delete :dry_run
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fdep = File.join(slurm_basedir, 'dependencies.list')
+       fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       job = nil
+       if options[:clean_job]
+         [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
+           Open.rm file if Open.exists? file
+         end
+       end
+
+       return if Open.exists?(fexit)
+
+       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+       STDERR.puts template
+
+       Open.write(fcmd, template) unless File.exists? fcmd
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
+       else
+         if File.exists?(fout)
+           return
+         elsif dry_run
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
+           raise HPC::SBATCH, slurm_basedir
+         else
+           Open.rm fsync
+           Open.rm fexit
+           Open.rm fout
+           Open.rm ferr
+
+           Open.write(fdep, dependencies * "\n") if dependencies.any?
+           Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
+
+           dep_str = '--dependency='
+           normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
+           canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
+
+           if normal_dep_str.nil? && canfail_dep_str.nil?
+             dep_str = ""
+           else
+             dep_str += [normal_dep_str, canfail_dep_str].compact * ","
+           end
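+
+           # For example, with dependencies ["10", "11"] and
+           # canfail_dependencies ["12"], dep_str becomes
+           # "--dependency=afterok:10:11,afterany:12".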
+
+           job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+           Log.debug "SBATCH job id: #{job}"
+           Open.write(fjob, job.to_s)
+           job
+         end
+       end
+     end
+
+     # Tail the job's std.out/std.err while it is queued or running, refreshing
+     # the squeue status lines in place.
+     def self.follow_job(slurm_basedir, tail = true)
+       fjob = File.join(slurm_basedir, 'job.id')
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fstatus = File.join(slurm_basedir, 'job.status')
+
+       job = Open.read(fjob).strip if Open.exists?(fjob)
+
+       if job
+         status_txt = CMD.cmd("squeue --job #{job}").read
+         STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+         STDERR.puts status_txt
+         lines = status_txt.split("\n").length
+       end
+
+       if tail
+         Log.severity = 10
+         while ! File.exists? fout
+           if job
+             STDERR.puts
+             Log.clear_line(STDERR)
+             STDERR.write Log.color(:magenta, "Waiting for Output")
+             3.times do
+               STDERR.write Log.color(:magenta, ".")
+               sleep 1
+             end
+             status_txt = CMD.cmd("squeue --job #{job}").read
+             lines.times do
+               Log.clear_line(STDERR)
+             end
+             Log.clear_line(STDERR)
+             STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+             STDERR.puts status_txt
+             lines = status_txt.split("\n").length
+           end
+         end
+         STDERR.puts
+         Log.clear_line(STDERR)
+         STDERR.puts Log.color(:magenta, "Output:")
+         begin
+           CMD.cmd("squeue --job #{job} > #{fstatus}")
+           out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
+           err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
+
+           terr = Misc.consume_stream(err, true, STDERR) if err
+           tout = Misc.consume_stream(out, true, STDOUT) if out
+
+           sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
+         rescue Aborted
+         ensure
+           begin
+             terr.exit if terr
+             tout.exit if tout
+             err.close if err
+             err.join if err
+           rescue Exception
+           end
+
+           begin
+             out.close if out
+             out.join if out
+           rescue Exception
+           end
+         end
+       end
+     end
+
+     # Block until the job writes its exit.status file.
+     def self.wait_for_job(slurm_basedir, time = 1)
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fjob = File.join(slurm_basedir, 'job.id')
+       job = Open.read(fjob) if Open.exists?(fjob)
+
+       while ! Open.exists?(fexit)
+         sleep time
+       end
+     end
+
+     # Save the job inputs, render the sbatch template for the equivalent
+     # 'rbbt workflow task' command line, and submit it.
+     def self.run_job(job, options = {})
+       options = IndiferentHash.setup(options.dup)
+
+       dry_run = options.delete :dry_run
+       tail = options.delete :tail
+       dependencies = options.delete :slurm_dependencies
+       options[:jobname] = job.clean_name
+
+       workflow = job.workflow
+
+       task = Symbol === job.overriden ? job.overriden : job.task_name
+
+       if job.overriden
+         override_deps = job.rec_dependencies.
+           select{|dep| Symbol === dep.overriden }.
+           collect do |dep|
+             name = [dep.workflow.to_s, dep.task_name] * "#"
+             [name, dep.path] * "="
+           end * ","
+       end
+
+       remove_slurm_basedir = options.delete :remove_slurm_basedir
+       slurm_basedir = options.delete :SLURM_basedir
+       slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
+       TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
+         options[:slurm_basedir] ||= tmp_directory
+         slurm_basedir = options[:slurm_basedir]
+         inputs_dir = File.join(tmp_directory, 'inputs_dir')
+         saved = Step.save_job_inputs(job, inputs_dir)
+
+         if saved && saved.any?
+           options[:inputs_dir] = inputs_dir
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
+         else
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
+         end
+
+         cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
+
+         template = self.template(cmd, options)
+         jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
+
+         return jobid unless tail
+
+         t_monitor = Thread.new do
+           self.follow_job(slurm_basedir, :STDERR)
+         end
+         self.wait_for_job(slurm_basedir)
+         t_monitor.raise Aborted
+         return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
+         path = Open.read(File.join(slurm_basedir, 'std.out')).strip
+         if Open.exists?(path) && job.path != path
+           Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Establishing link."
+           Open.ln path, job.path
+           Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
+           Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
+         end
+         jobid
+       end
+     end
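+
+     # A minimal sketch of submitting a job through run_job and waiting on its
+     # output; the workflow and option values are hypothetical:
+     #
+     #   job = MyWorkflow.job(:step2)
+     #   HPC::SLURM.run_job(job, IndiferentHash.setup(
+     #     "queue"     => "bsc_ls",
+     #     "task_cpus" => 4,
+     #     "time"      => "4:00:00",
+     #     "tail"      => true))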
+   end
+
+   # Migrate a job's finished (or unrecoverably failed) dependencies to the
+   # relay target host.
+   def self.relay(job, options={})
+     options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
+     done_deps = job.dependencies.select do |dep|
+       dep.done?
+     end
+
+     error_deps = job.dependencies.select do |dep|
+       dep.error? && ! dep.recoverable_error?
+     end
+
+     (done_deps + error_deps).each do |dep|
+       Step.migrate(dep.path, options[:search_path], options)
+     end
+   end
+ end