rbbt-util 5.28.14 → 5.30.0

@@ -0,0 +1,111 @@
+ require 'rbbt/workflow/util/orchestrator'
+ module HPC
+   module SLURM
+
+     def self.job_rules(rules, job)
+       workflow = job.workflow.to_s
+       task_name = job.task_name.to_s
+       defaults = rules["defaults"] || {}
+
+       job_rules = IndiferentHash.setup(defaults.dup)
+
+       rules["chains"].each do |name,info|
+         IndiferentHash.setup(info)
+         chain_tasks = info[:tasks].split(/,\s*/)
+
+         chain_tasks.each do |task|
+           task_workflow, chain_task = task.split("#")
+           chain_task, task_workflow = task_workflow, info[:workflow] if chain_task.nil? or chain_task.empty?
+           job_rules["chain_tasks"] ||= {}
+           job_rules["chain_tasks"][task_workflow] ||= []
+           job_rules["chain_tasks"][task_workflow] << chain_task
+           next unless task_name == chain_task.to_s && workflow == task_workflow.to_s
+           config_keys = job_rules.delete :config_keys
+           job_rules = IndiferentHash.setup(job_rules.merge(info))
+           if config_keys
+             config_keys.gsub!(/,\s+/,',')
+             job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+           end
+         end
+
+         if job_rules["chain_tasks"][workflow] && job_rules["chain_tasks"][workflow].include?(task_name)
+           break
+         else
+           job_rules.delete "chain_tasks"
+         end
+       end if rules["chains"]
+
+       config_keys = job_rules.delete :config_keys
+       job_rules = IndiferentHash.setup(job_rules.merge(rules[workflow][task_name])) if rules[workflow] && rules[workflow][task_name]
+
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
+       end
+
+       if rules["skip"] && rules["skip"][workflow]
+         job_rules["skip"] = true if rules["skip"][workflow].split(/,\s*/).include? task_name
+       end
+
+       job_rules
+     end
+
+     def self.get_job_dependencies(job, job_rules)
+       deps = job.dependencies || []
+       deps += job.input_dependencies || []
+       deps
+     end
+
+     def self.orchestrate_job(job, options, skip = false, seen = {})
+       return if job.done?
+       return unless job.path.split("/")[-4] == "jobs"
+       seen[:orchestration_target_job] ||= job
+
+       options.delete "recursive_clean"
+       options.delete "clean_task"
+       options.delete "clean"
+       options.delete "tail"
+       options.delete "printfile"
+       options.delete "detach"
+
+       rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
+       rules ||= {}
+       IndiferentHash.setup(rules)
+
+       job_rules = self.job_rules(rules, job)
+
+       deps = get_job_dependencies(job, job_rules)
+
+       dep_ids = deps.collect do |dep|
+         skip_dep = job_rules["chain_tasks"] &&
+           job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
+           job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
+
+         deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
+         if job.canfail_paths.include? dep.path
+           [deps].flatten.collect{|id| ['canfail', id] * ":"}
+         else
+           deps
+         end
+       end.flatten.compact.uniq
+
+       skip = true if job_rules[:skip]
+       return dep_ids if skip and seen[:orchestration_target_job] != job
+
+       job_rules.delete :chain_tasks
+       job_rules.delete :tasks
+       job_rules.delete :workflow
+
+       config_keys = job_rules.delete(:config_keys)
+
+       job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
+       job_options.delete :orchestration_rules
+       if config_keys
+         config_keys.gsub!(/,\s+/,',')
+         job_options[:config_keys] = job_options[:config_keys] ? config_keys + "," + job_options[:config_keys] : config_keys
+       end
+
+       run_job(job, job_options)
+     end
+   end
+ end
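
For reference, job_rules and orchestrate_job above consume a rules structure loaded from the YAML file named in options[:orchestration_rules]: "defaults" seeds every job, each entry in "chains" groups tasks into a single submission, "skip" lists tasks that never get their own job, and per-workflow sections override settings per task. A minimal sketch of an equivalent rules hash follows (illustrative only; "MyWorkflow", its task names, and the resource values are hypothetical):

# Illustrative sketch, not part of the diff: roughly what orchestrate_job
# ends up with after loading options[:orchestration_rules].
rules = IndiferentHash.setup(
  "defaults" => { "queue" => "bsc_ls", "time" => "2:00:00" },
  "chains" => {
    "preprocess" => {
      "workflow"  => "MyWorkflow",           # hypothetical; used for tasks without a "Workflow#" prefix
      "tasks"     => "align, call_variants", # comma-separated task list
      "task_cpus" => 8
    }
  },
  "skip" => { "MyWorkflow" => "cleanup" },   # tasks never submitted as their own SLURM job
  "MyWorkflow" => { "call_variants" => { "highmem" => true } }  # per-task overrides
)

job_rules = HPC::SLURM.job_rules(rules, job)  # job is a workflow Step, as in orchestrate_job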
@@ -0,0 +1,603 @@
+ module HPC
+   class SBATCH < Exception
+     attr_accessor :directory
+     def initialize(directory)
+       @directory = directory
+     end
+   end
+
+   module SLURM
+
+     def self.template(args, options = {})
+
+       development = options.delete :drbbt
+       singularity = options.delete :singularity
+       contain = options.delete :contain
+       sync = options.delete :sync
+       user_group = options.delete :user_group
+       contain_and_sync = options.delete :contain_and_sync
+       wipe_container = options.delete :wipe_container
+       copy_image = options.delete :copy_image
+       exclusive = options.delete :exclusive
+       highmem = options.delete :highmem
+
+       queue = options.delete(:queue) || 'bsc_ls'
+       task_cpus = options.delete(:task_cpus) || 1
+       nodes = options.delete(:nodes) || 1
+       time = options.delete(:time) || "0:00:10"
+
+       inputs_dir = options.delete :inputs_dir
+       config_keys = options.delete :config_keys
+
+       user = ENV['USER'] || `whoami`.strip
+       group = File.basename(File.dirname(ENV['HOME']))
+
+       if contain_and_sync
+         random_file = TmpFile.random_name
+         contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
+         sync = "~/.rbbt/var/jobs" if sync.nil?
+         wipe_container = "post" if wipe_container.nil?
+       end
+
+       contain = nil if contain == "" || contain == "none"
+       sync = nil if sync == "" || sync == "none"
+
+       contain = File.expand_path(contain) if contain
+
+       name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
+       options.delete(:name)
+       slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
+       options.delete(:slurm_basedir)
+
+       rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
+
+       rbbt_cmd += " " << options.collect do |o,v|
+         o = o.to_s
+         case v
+         when TrueClass
+           '--' << o
+         when FalseClass
+           '--' << o << "=false"
+         else
+           ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
+         end
+       end * " "
+
+       rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
+
+       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+
+       #{{{ PREPARE LOCAL LOGFILES
+
+       Open.mkdir slurm_basedir
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fsyncexit = File.join(slurm_basedir, 'sync.status')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       #{{{ GENERATE TEMPLATE
+
+       # HEADER
+       header =<<-EOF
+ #!/bin/bash
+ #SBATCH --qos="#{queue}"
+ #SBATCH --job-name="#{name}"
+ #SBATCH --workdir="#{Dir.pwd}"
+ #SBATCH --output="#{fout}"
+ #SBATCH --error="#{ferr}"
+ #SBATCH --cpus-per-task="#{task_cpus}"
+ #SBATCH --time="#{time}"
+ #SBATCH --nodes="#{nodes}"
+ EOF
+
+       prep = ""
+
+       if highmem
+         header +=<<-EOF
+ #SBATCH --constraint=highmem
+ EOF
+       end
+
+       if exclusive
+         header +=<<-EOF
+ #SBATCH --exclusive
+ EOF
+       end
+
+       # ENV
+       env = ""
+       env +=<<-EOF
+ # Prepare env
+ [[ -f ~/config/load.sh ]] && source ~/config/load.sh
+ module load java
+
+ # Calculate max available memory
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+ EOF
+
+
+       # RUN
+       run = ""
+       exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
+
+
+       if singularity
+         #{{{ SINGULARITY
+
+         singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
+
+         env +=<<-EOF
+ module load intel/2018.1
+ module load singularity
+ PROJECTS_ROOT="/gpfs/projects/bsc26/"
+ SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
+ SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
+ SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
+ mkdir -p "$SINGULARITY_RUBY_INLINE"
+ EOF
+
+         if contain
+           scratch_group_dir = File.join('/gpfs/scratch/', group)
+           projects_group_dir = File.join('/gpfs/projects/', group)
+
+           prep +=<<-EOF
+
+ # Prepare container dir
+ CONTAINER_DIR="#{contain}"
+ mkdir -p $CONTAINER_DIR/.rbbt/etc/
+
+ for dir in .ruby_inline git home; do
+ mkdir -p $CONTAINER_DIR/$dir
+ done
+
+ for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
+ mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
+ done
+
+ # Copy environment
+ cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
+
+ # Set search_paths
+ echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ EOF
+
+           if user_group && group != user_group
+             prep +=<<-EOF
+
+ # Add user_group search_path
+ echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
+ EOF
+           end
+
+           if inputs_dir
+             prep +=<<-EOF
+
+ # Copy inputs
+ [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
+ EOF
+             rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
+           end
+
+           if copy_image
+             prep +=<<EOF
+
+ # Copy image
+ rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
+ SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
+ EOF
+           end
+
+           if wipe_container == "pre" || wipe_container == "both"
+             if singularity
+               prep +=<<-EOF
+
+ # Clean container pre
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+ EOF
+             else
+               prep = ""
+             end
+           end
+         end
+
+         if contain
+           singularity_exec << %( -C -H "$CONTAINER_DIR" \
+             -B /scratch/tmp \
+             #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
+             -B #{scratch_group_dir} \
+             -B #{projects_group_dir} \
+             -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
+             -B ~/git:"$CONTAINER_DIR/git":ro \
+             #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
+             -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
+             "$SINGULARITY_IMG")
+           exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
+         else
+           singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
+         end
+
+         if development
+           exec_cmd += " rbbt --dev='#{development}'"
+         else
+           exec_cmd += ' rbbt'
+         end
+
+         exec_cmd = singularity_exec + " " + exec_cmd
+       else
+         if development
+           exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
+         else
+           exec_cmd << " " << 'rbbt'
+         end
+
+         if contain
+           rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
+         end
+       end
+
+
+       cmd =<<-EOF
+ #{exec_cmd} \\
+ #{rbbt_cmd}
+ EOF
+       annotate_cmd =<<-EOF
+ #{exec_cmd} \\
+ workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
+ EOF
+
+       header +=<<-EOF
+ #CMD: #{rbbt_cmd}
+ EOF
+
+       run +=<<-EOF
+
+ # Run command
+ step_path=$(#{cmd})
+
+ # Save exit status
+ exit_status=$?
+
+ # Annotate info with SLURM job_info
+ #{annotate_cmd}
+
+ EOF
+
+       # CODA
+       coda = ""
+       if sync
+         if singularity
+           coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
+ EOF
+         # else
+         #   coda +=<<-EOF
+         #rbbt system clean all -q &>> #{fsync}
+         #EOF
+         end
+
+         if sync.include?("=>")
+           source, _sep, sync = sync.partition("=>")
+           source = source.strip
+           sync = sync.strip
+           source = File.join(File.expand_path(contain), source)
+         else
+           source = File.join(File.expand_path(contain), 'workdir/var/jobs')
+         end
+
+         target = File.expand_path(sync)
+         coda +=<<-EOF
+
+ # Sync data to target location
+ mkdir -p "$(dirname '#{target}')"
+ rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
+ sync_es="$?"
+ echo $sync_es > #{fsyncexit}
+ find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
+ EOF
+
+         if contain && (wipe_container == "post" || wipe_container == "both")
+           prep =<<-EOF + prep
+ if ls -A '#{contain}' &> /dev/null ; then
+ echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
+ fi
+ EOF
+           if singularity
+             coda +=<<-EOF
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
+
+
+ # Clean container directory
+ #if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
+ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
+ #else
+ # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ #fi
+ EOF
+           else
+             coda +=<<-EOF
+ ##{exec_cmd} system clean
+ if [ $exit_status == '0' -a $sync_es == '0' ]; then
+ rm -Rfv #{contain} &>> #{fsync}
+ else
+ echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
+ fi
+ EOF
+
+           end
+         end
+       end
+
+       coda +=<<-EOF
+
+ # Write exit status to file
+ echo $exit_status > #{fexit}
+ EOF
+
+       if sync
+         coda +=<<-EOF
+ if [ "$sync_es" == '0' ]; then
+ unset sync_es
+ exit $exit_status
+ else
+ exit $sync_es
+ fi
+ EOF
+       else
+         coda +=<<-EOF
+ exit $exit_status
+ EOF
+       end
+
+       template = [header, env, prep, run, coda] * "\n"
+
+       template
+     end
+
+     def self.issue_template(template, options = {})
+
+       slurm_basedir = options[:slurm_basedir]
+       dependencies = options.delete :slurm_dependencies
+       dependencies = [] if dependencies.nil?
+
+       canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
+       dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
+
+       Open.mkdir slurm_basedir
+
+       dry_run = options.delete :dry_run
+
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fjob = File.join(slurm_basedir, 'job.id')
+       fdep = File.join(slurm_basedir, 'dependencies.list')
+       fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fsync = File.join(slurm_basedir, 'sync.log')
+       fcmd = File.join(slurm_basedir, 'command.slurm')
+
+       job = nil
+       if options[:clean_job]
+         [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
+           Open.rm file if Open.exists? file
+         end
+       end
+
+       return if Open.exists?(fexit)
+
+       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+       STDERR.puts template
+
+       Open.write(fcmd, template) unless File.exists? fcmd
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
+       else
+         if File.exists?(fout)
+           return
+         elsif dry_run
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
+           raise HPC::SBATCH, slurm_basedir
+         else
+           Open.rm fsync
+           Open.rm fexit
+           Open.rm fout
+           Open.rm ferr
+
+           Open.write(fdep, dependencies * "\n") if dependencies.any?
+           Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
+
+
+           dep_str = '--dependency='
+           normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
+           canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
+
+           if normal_dep_str.nil? && canfail_dep_str.nil?
+             dep_str = ""
+           else
+             dep_str += [normal_dep_str, canfail_dep_str].compact * ","
+           end
+
+           job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+           Log.debug "SBATCH job id: #{job}"
+           Open.write(fjob, job.to_s)
+           job
+         end
+       end
+     end
+
+     def self.follow_job(slurm_basedir, tail = true)
+       fjob = File.join(slurm_basedir, 'job.id')
+       fout = File.join(slurm_basedir, 'std.out')
+       ferr = File.join(slurm_basedir, 'std.err')
+       fstatus = File.join(slurm_basedir, 'job.status')
+
+       job = Open.read(fjob).strip if Open.exists?(fjob)
+
+       if job
+         status_txt = CMD.cmd("squeue --job #{job}").read
+         STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+         STDERR.puts status_txt
+         lines = status_txt.split("\n").length
+       end
+
+       if tail
+         Log.severity = 10
+         while ! File.exists? fout
+           if job
+             STDERR.puts
+             Log.clear_line(STDERR)
+             STDERR.write Log.color(:magenta, "Waiting for Output")
+             3.times do
+               STDERR.write Log.color(:magenta, ".")
+               sleep 1
+             end
+             status_txt = CMD.cmd("squeue --job #{job}").read
+             lines.times do
+               Log.clear_line(STDERR)
+             end
+             Log.clear_line(STDERR)
+             STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
+             STDERR.puts status_txt
+             lines = status_txt.split("\n").length
+           end
+         end
+         STDERR.puts
+         Log.clear_line(STDERR)
+         STDERR.puts Log.color(:magenta, "Output:")
+         begin
+           CMD.cmd("squeue --job #{job} > #{fstatus}")
+           out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
+           err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
+
+           terr = Misc.consume_stream(err, true, STDERR) if err
+           tout = Misc.consume_stream(out, true, STDOUT) if out
+
+           sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
+         rescue Aborted
+         ensure
+           begin
+             terr.exit if terr
+             tout.exit if tout
+             err.close if err
+             err.join if err
+           rescue Exception
+           end
+
+           begin
+             out.close if out
+             out.join if out
+           rescue Exception
+           end
+         end
+       end
+     end
+
+     def self.wait_for_job(slurm_basedir, time = 1)
+       fexit = File.join(slurm_basedir, 'exit.status')
+       fjob = File.join(slurm_basedir, 'job.id')
+       job = Open.read(fjob) if Open.exists?(fjob)
+
+
+       while ! Open.exists?(fexit)
+         sleep time
+       end
+     end
+
+     def self.run_job(job, options = {})
+       options = IndiferentHash.setup(options.dup)
+
+       dry_run = options.delete :dry_run
+       tail = options.delete :tail
+       dependencies = options.delete :slurm_dependencies
+       procpath = options.delete :SLURM_procpath
+       options[:jobname] = job.clean_name
+
+       workflow = job.workflow
+
+       task = Symbol === job.overriden ? job.overriden : job.task_name
+
+       if job.overriden
+         override_deps = job.rec_dependencies.
+           select{|dep| Symbol === dep.overriden }.
+           collect do |dep|
+
+             name = [dep.workflow.to_s, dep.task_name] * "#"
+             [name, dep.path] * "="
+           end * ","
+       end
+
+       remove_slurm_basedir = options.delete :remove_slurm_basedir
+       slurm_basedir = options.delete :SLURM_basedir
+       slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
+       TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
+         options[:slurm_basedir] ||= tmp_directory
+         slurm_basedir = options[:slurm_basedir]
+         inputs_dir = File.join(tmp_directory, 'inputs_dir')
+         saved = Step.save_job_inputs(job, inputs_dir)
+
+         if saved && saved.any?
+           options[:inputs_dir] = inputs_dir
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
+         else
+           cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', (options[:log] || Log.severity).to_s]
+         end
+
+         cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
+
+         cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
+
+         template = self.template(cmd, options)
+         jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
+
+         return jobid unless tail
+
+         t_monitor = Thread.new do
+           self.follow_job(slurm_basedir, :STDERR)
+         end
+         self.wait_for_job(slurm_basedir)
+         t_monitor.raise Aborted
+         return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
+         path = Open.read(File.join(slurm_basedir, 'std.out')).strip
+         if Open.exists?(path) && job.path != path
+           Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Establishing link."
+           Open.ln path, job.path
+           Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
+           Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
+         end
+         jobid
+       end
+     end
+   end
+
+   def self.relay(job, options={})
+     options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
+     done_deps = job.dependencies.select do |dep|
+       dep.done?
+     end
+
+     error_deps = job.dependencies.select do |dep|
+       dep.error? && ! dep.recoverable_error?
+     end
+
+     (done_deps + error_deps).each do |dep|
+       Step.migrate(dep.path, options[:search_path], options)
+     end
+
+   end
+ end
+
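
A rough usage sketch of the entry points defined above (illustrative only; the workflow, task, input, and option values are hypothetical, and the rules file path is an assumption):

# Illustrative sketch, not part of the diff. Assumes the HPC::SLURM files above
# are loaded alongside the usual rbbt workflow environment.
require 'rbbt/workflow'

Workflow.require_workflow "MyWorkflow"                                  # hypothetical workflow
job = MyWorkflow.job(:call_variants, "sample1", :bam => "sample1.bam")  # hypothetical task and inputs

# Submit this single step as one SLURM job: template renders command.slurm under
# the slurm_basedir, issue_template calls sbatch and records job.id / exit.status there.
jobid = HPC::SLURM.run_job(job, :queue => 'bsc_ls', :task_cpus => 4, :time => '4:00:00', :contain_and_sync => true)

# Or walk the whole dependency tree, one sbatch per step, chained through
# --dependency=afterok:... (and afterany:... for canfail dependencies).
HPC::SLURM.orchestrate_job(job, :orchestration_rules => "etc/slurm_rules.yaml")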