rbbt-util 5.28.10 → 5.29.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 443812f3080048a2e39b95ceda4fda2b3f403788c4d32ae4822b0b6920ac1d1f
4
- data.tar.gz: e586076cca9dbd72b9dae8ba8be0061269e9bef5d04163f382f2d3b3f353d1b8
3
+ metadata.gz: 9b88fc549c1c1dc5cd56f06d933776d56540e3d0f4bacf77f04a449abcda974f
4
+ data.tar.gz: caf80ab624418c6c0a744038d98569c3a32d18dc8d9de162b42363e1c281e8c2
5
5
  SHA512:
6
- metadata.gz: ca6ea9a139a48dfcc76fb04b889d45881d023e9f6fda362da0bb7bd4c63b31217ca0a9bb9f4702186c78b741a1eb1b5c7ca2607512e6dced51026081bdcb8aa7
7
- data.tar.gz: 76f021c8d565e2833506f17fb54436d0fe80d29d8bface099cece0445cb70718f8f2126e6fa6eb139f6555e33fe9382d1401339aacb100073bb4964c62f1a260
6
+ metadata.gz: 8944f1d996afa5610f70046f4e61cf326461050f06f347529cd7e403440e0d9b47d0c33b862bc83e89d4479ce889e773aef0bbabfaf0dae797c3a2164f0e8a8c
7
+ data.tar.gz: fa0051835f35e23873bde6a075d81805d2878643e35e96e20c5d7423f0c7c48eaf25b2c5afad48215481b0ddd17b9d251555eae5b5c4a7fa420d9796dba9130b
@@ -1,551 +1,3 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/util/cmd'
3
-
4
- module Marenostrum
5
- SERVER='mn1'
6
- class SBATCH < Exception;
7
- attr_accessor :directory
8
- def initialize(directory)
9
- @directory = directory
10
- end
11
- end
12
-
13
- module SLURM
14
-
15
- def self.template(args, options = {})
16
-
17
- development = options.delete :drbbt
18
- singularity = options.delete :singularity
19
- contain = options.delete :contain
20
- sync = options.delete :sync
21
- user_group = options.delete :user_group
22
- contain_and_sync = options.delete :contain_and_sync
23
- wipe_container = options.delete :wipe_container
24
- copy_image = options.delete :copy_image
25
- exclusive = options.delete :exclusive
26
- highmem = options.delete :highmem
27
-
28
- queue = options.delete(:queue) || 'bsc_ls'
29
- task_cpus = options.delete(:task_cpus) || 1
30
- nodes = options.delete(:nodes) || 1
31
- time = options.delete(:time) || "0:00:10"
32
-
33
- inputs_dir = options.delete :inputs_dir
34
- config_keys = options.delete :config_keys
35
-
36
- user = ENV['USER'] || `whoami`.strip
37
- group = File.basename(File.dirname(ENV['HOME']))
38
-
39
- if contain_and_sync
40
- contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
41
- sync = "~/.rbbt/var/jobs" if sync.nil?
42
- wipe_container = "post" if wipe_container.nil?
43
- end
44
-
45
- contain = nil if contain == "" || contain == "none"
46
- sync = nil if sync == "" || sync == "none"
47
-
48
- contain = File.expand_path(contain) if contain
49
-
50
- name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
51
- options.delete(:name)
52
- slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
53
- options.delete(:slurm_basedir)
54
-
55
- rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
56
-
57
- rbbt_cmd += " " << options.collect do |o,v|
58
- o = o.to_s
59
- case v
60
- when TrueClass
61
- '--' << o
62
- when FalseClass
63
- '--' << o << "=false"
64
- else
65
- ['--' << o, "'#{v}'"] * " "
66
- end
67
- end * " "
68
-
69
- rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
70
-
71
-
72
- time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
73
-
74
-
75
- #{{{ PREPARE LOCAL LOGFILES
76
-
77
- Open.mkdir slurm_basedir
78
-
79
- fout = File.join(slurm_basedir, 'std.out')
80
- ferr = File.join(slurm_basedir, 'std.err')
81
- fjob = File.join(slurm_basedir, 'job.id')
82
- fexit = File.join(slurm_basedir, 'exit.status')
83
- fsync = File.join(slurm_basedir, 'sync.log')
84
- fcmd = File.join(slurm_basedir, 'command.slurm')
85
-
86
- #{{{ GENERATE TEMPLATE
87
-
88
- # HEADER
89
- header =<<-EOF
90
- #!/bin/bash
91
- #SBATCH --qos="#{queue}"
92
- #SBATCH --job-name="#{name}"
93
- #SBATCH --workdir="#{Dir.pwd}"
94
- #SBATCH --output="#{fout}"
95
- #SBATCH --error="#{ferr}"
96
- #SBATCH --cpus-per-task="#{task_cpus}"
97
- #SBATCH --time="#{time}"
98
- #SBATCH --nodes="#{nodes}"
99
- EOF
100
-
101
- if highmem
102
- header +=<<-EOF
103
- #SBATCH --constraint=highmem
104
- EOF
105
- end
106
-
107
- if exclusive
108
- header +=<<-EOF
109
- #SBATCH --exclusive
110
- EOF
111
- end
112
-
113
- header +=<<-EOF
114
- #CMD: #{rbbt_cmd}
115
- EOF
116
-
117
- # ENV
118
- env = ""
119
- env +=<<-EOF
120
- # Prepare env
121
- [[ -f ~/config/load.sh ]] && source ~/config/load.sh
122
- module load java
123
-
124
- # Calculate max available memory
125
- let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_ON_NODE"
126
- EOF
127
-
128
-
129
- # RUN
130
- run = ""
131
- exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
132
-
133
-
134
- if singularity
135
- #{{{ SINGULARITY
136
-
137
- singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
138
-
139
- env +=<<-EOF
140
- module load intel/2018.1
141
- module load singularity
142
- PROJECTS_ROOT="/gpfs/projects/bsc26/"
143
- SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
144
- SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
145
- SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
146
- mkdir -p "$SINGULARITY_RUBY_INLINE"
147
- EOF
148
-
149
- prep = ""
150
-
151
- if contain
152
- scratch_group_dir = File.join('/gpfs/scratch/', group)
153
- projects_group_dir = File.join('/gpfs/projects/', group)
154
-
155
- prep +=<<-EOF
156
-
157
- # Prepare container dir
158
- CONTAINER_DIR="#{contain}"
159
- mkdir -p $CONTAINER_DIR/.rbbt/etc/
160
-
161
- for dir in .ruby_inline git home; do
162
- mkdir -p $CONTAINER_DIR/$dir
163
- done
164
-
165
- for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
166
- mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
167
- done
168
-
169
- # Copy environment
170
- cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
171
-
172
- # Set search_paths
173
- echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
174
- echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
175
- echo "home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
176
- echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
177
- echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
178
- echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
179
- echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
180
- EOF
181
-
182
- if user_group && group != user_group
183
- prep +=<<-EOF
184
-
185
- # Add user_group search_path
186
- echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
187
- EOF
188
- end
189
-
190
- if inputs_dir
191
- prep +=<<-EOF
192
-
193
- # Copy inputs
194
- [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
195
- EOF
196
- rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
197
- end
198
-
199
- if copy_image
200
- prep +=<<EOF
201
-
202
- # Copy image
203
- rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
204
- SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
205
- EOF
206
- end
207
-
208
- if wipe_container == "pre" || wipe_container == "both"
209
- if singularity
210
- prep +=<<-EOF
211
-
212
- # Clean container pre
213
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
214
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
215
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
216
- EOF
217
- end
218
- end
219
- end
220
-
221
- if contain
222
- singularity_exec << %( -C -H "$CONTAINER_DIR" \
223
- -B /scratch/tmp \
224
- #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
225
- -B #{scratch_group_dir} \
226
- -B #{projects_group_dir} \
227
- -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
228
- -B ~/git:"$CONTAINER_DIR/git":ro \
229
- #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
230
- -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
231
- "$SINGULARITY_IMG")
232
- exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
233
- else
234
- singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
235
- end
236
-
237
- if development
238
- exec_cmd += " rbbt --dev='#{development}'"
239
- else
240
- exec_cmd += ' rbbt'
241
- end
242
-
243
- exec_cmd = singularity_exec + " " + exec_cmd
244
- else
245
- if development
246
- exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
247
- else
248
- exec_cmd << " " << 'rbbt'
249
- end
250
-
251
- if contain
252
- rbbt_cmd << " " << %(--workdir_all='#{contain}')
253
- end
254
- end
255
-
256
-
257
- cmd =<<-EOF
258
- #{exec_cmd} \\
259
- #{rbbt_cmd}
260
- EOF
261
-
262
- run +=<<-EOF
263
-
264
- # Run command
265
- #{cmd}
266
-
267
- # Save exit status
268
- exit_status=$?
269
-
270
- EOF
271
-
272
- # CODA
273
- coda = ""
274
- if sync
275
- if singularity
276
- coda +=<<-EOF
277
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
278
- EOF
279
- else
280
- coda +=<<-EOF
281
- rbbt system clean all -q &>> #{fsync}
282
- EOF
283
- end
284
-
285
- if sync.include?("=>")
286
- source, _sep, sync = sync.partition("=>")
287
- source = source.strip
288
- sync = sync.strip
289
- source = File.join(File.expand_path(contain), source)
290
- else
291
- source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
292
- end
293
-
294
- target = File.expand_path(sync)
295
- coda +=<<-EOF
296
-
297
- # Sync data to target location
298
- mkdir -p "$(dirname '#{target}')"
299
- rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
300
- sync_es="$?"
301
- find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
302
- EOF
303
-
304
- if contain && (wipe_container == "post" || wipe_container == "both")
305
- prep =<<-EOF + prep
306
- if ls -A '#{contain}' &> /dev/null ; then
307
- echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
308
- fi
309
- EOF
310
- if singularity
311
- coda +=<<-EOF
312
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
313
-
314
-
315
- # Clean container directory
316
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
317
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
318
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
319
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
320
- #else
321
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
322
- #fi
323
- EOF
324
- else
325
- coda +=<<-EOF
326
- #{exec_cmd} system clean
327
- if [ $exit_status == '0' -a $sync_es == '0' ]; then
328
- rm -Rfv #{contain} &>> #{fsync}
329
- else
330
- echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
331
- fi
332
- unset sync_es
333
- EOF
334
-
335
- end
336
- end
337
- end
338
- coda +=<<-EOF
339
-
340
- # Write exit status to file
341
- echo $exit_status > #{fexit}
342
- EOF
343
- if sync
344
- coda +=<<-EOF
345
- if [ "$sync_es" == '0' ]; then
346
- unset sync_es
347
- exit $exit_status
348
- else
349
- exit $sync_es
350
- fi
351
- EOF
352
- else
353
- coda +=<<-EOF
354
- exit $exit_status
355
- EOF
356
- end
357
-
358
- template = [header, env, prep, run, coda] * "\n"
359
-
360
- template
361
- end
362
-
363
- def self.issue_template(template, options = {})
364
-
365
- slurm_basedir = options[:slurm_basedir]
366
- Open.mkdir slurm_basedir
367
-
368
- dry_run = options.delete :dry_run
369
-
370
- fout = File.join(slurm_basedir, 'std.out')
371
- ferr = File.join(slurm_basedir, 'std.err')
372
- fjob = File.join(slurm_basedir, 'job.id')
373
- fexit = File.join(slurm_basedir, 'exit.status')
374
- fsync = File.join(slurm_basedir, 'sync.log')
375
- fcmd = File.join(slurm_basedir, 'command.slurm')
376
-
377
- job = nil
378
- if options[:clean_job]
379
- [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
380
- Open.rm file if Open.exists? file
381
- end
382
- end
383
-
384
- return if Open.exists?(fexit)
385
-
386
- STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
387
- STDERR.puts template
388
-
389
- Open.write(fcmd, template) unless File.exists? fcmd
390
- if File.exists?(fjob)
391
- job = Open.read(fjob).to_i
392
- else
393
- if File.exists?(fout)
394
- return
395
- elsif dry_run
396
- STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
397
- STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
398
- raise Marenostrum::SBATCH, slurm_basedir
399
- else
400
- Open.rm fsync
401
- Open.rm fexit
402
- Open.rm fout
403
- Open.rm ferr
404
- job = CMD.cmd("sbatch '#{fcmd}'").read.scan(/\d+/).first.to_i
405
- Open.write(fjob, job.to_s)
406
- end
407
- end
408
- end
409
-
410
- def self.follow_job(slurm_basedir, tail = true)
411
- fjob = File.join(slurm_basedir, 'job.id')
412
- fout = File.join(slurm_basedir, 'std.out')
413
- ferr = File.join(slurm_basedir, 'std.err')
414
- fstatus = File.join(slurm_basedir, 'job.status')
415
-
416
- job = Open.read(fjob).strip if Open.exists?(fjob)
417
-
418
- if job
419
- status_txt = CMD.cmd("squeue --job #{job}").read
420
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
421
- STDERR.puts status_txt
422
- lines = status_txt.split("\n").length
423
- end
424
-
425
- if tail
426
- Log.severity = 10
427
- while ! File.exists? fout
428
- if job
429
- STDERR.puts
430
- Log.clear_line(STDERR)
431
- STDERR.write Log.color(:magenta, "Waiting for Output")
432
- 3.times do
433
- STDERR.write Log.color(:magenta, ".")
434
- sleep 1
435
- end
436
- status_txt = CMD.cmd("squeue --job #{job}").read
437
- lines.times do
438
- Log.clear_line(STDERR)
439
- end
440
- Log.clear_line(STDERR)
441
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
442
- STDERR.puts status_txt
443
- lines = status_txt.split("\n").length
444
- end
445
- end
446
- STDERR.puts
447
- Log.clear_line(STDERR)
448
- STDERR.puts Log.color(:magenta, "Output:")
449
- begin
450
- CMD.cmd("squeue --job #{job} > #{fstatus}")
451
- out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
452
- err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
453
-
454
- terr = Misc.consume_stream(err, true, STDERR) if err
455
- tout = Misc.consume_stream(out, true, STDOUT) if out
456
-
457
- sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
458
- rescue Aborted
459
- ensure
460
- begin
461
- terr.exit if terr
462
- tout.exit if tout
463
- err.close if err
464
- err.join if err
465
- rescue Exception
466
- end
467
-
468
- begin
469
- out.close if out
470
- out.join if out
471
- rescue Exception
472
- end
473
- end
474
- end
475
- end
476
-
477
- def self.wait_for_job(slurm_basedir, time = 1)
478
- fexit = File.join(slurm_basedir, 'exit.status')
479
- fjob = File.join(slurm_basedir, 'job.id')
480
- job = Open.read(fjob) if Open.exists?(fjob)
481
-
482
-
483
- while ! Open.exists?(fexit)
484
- sleep time
485
- end
486
- end
487
-
488
- def self.run_job(job, options = {})
489
- options = IndiferentHash.setup(options.dup)
490
-
491
- dry_run = options.delete :dry_run
492
- tail = options.delete :tail
493
-
494
- workflow = job.workflow
495
- task = job.task_name
496
-
497
- keep_slurm_basedir = options.delete :keep_SLURM_slurm_basedir
498
- slurm_basedir = options.delete :SLURM_basedir
499
- slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
500
- TmpFile.with_file(nil, !keep_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
501
- options[:slurm_basedir] ||= tmp_directory
502
- slurm_basedir = options[:slurm_basedir]
503
- inputs_dir = File.join(tmp_directory, 'inputs_dir')
504
- saved = Step.save_job_inputs(job, inputs_dir, options)
505
- if saved
506
- options[:inputs_dir] = inputs_dir
507
- cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
508
- else
509
- cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
510
- end
511
-
512
-
513
- template = self.template(cmd, options)
514
- self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run))
515
-
516
- return unless tail
517
-
518
- t_monitor = Thread.new do
519
- self.follow_job(slurm_basedir, :STDERR)
520
- end
521
- self.wait_for_job(slurm_basedir)
522
- t_monitor.raise Aborted
523
- return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
524
- path = Open.read(File.join(slurm_basedir, 'std.out')).strip
525
- if Open.exists?(path) && job.path != path
526
- Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
527
- Open.ln path, job.path
528
- Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
529
- Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
530
- end
531
- end
532
- end
533
- end
534
-
535
- def self.relay(job, options={})
536
- options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
537
- done_deps = job.dependencies.select do |dep|
538
- dep.done?
539
- end
540
-
541
- error_deps = job.dependencies.select do |dep|
542
- dep.error? && ! dep.recoverable_error?
543
- end
544
-
545
- (done_deps + error_deps).each do |dep|
546
- Step.migrate(dep.path, options[:search_path], options)
547
- end
548
-
549
- end
550
- end
551
-
3
+ require 'rbbt/hpc/slurm'