rbbt-util 5.28.14 → 5.30.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74253d97dc17c890ce9022e38ccd514847e785b69f4cfd74a8a5333a9aa1c97d
4
- data.tar.gz: d908fecdec1c5e6ccce81b3190883011d775f38cab689c136e7c90808c3c93ac
3
+ metadata.gz: b0e0e6d9165c16d73d1122a8d85e8038befd335215646bc3779400f5747b4b07
4
+ data.tar.gz: 7c0e92d49399111157fae43149c7ec8e598aa64510591fc2f74314bbefc89b2d
5
5
  SHA512:
6
- metadata.gz: 7c6b8f81e5f814a35ed3b738ad85d4e7314b9fb24b36ecdb3828d1d4644cc8b9ce4117c862ae7baa0bd7f60362e93be2aff282a2d24c474e327d1d5948f56fff
7
- data.tar.gz: 3a6235b141f139f245935b8858b175b2d54302b89dc3ac10101ffddfd10d1161760aaf0f176114a4fc6d8f0b8aa17556b8637e2f2ab815dcf875ea98c4825921
6
+ metadata.gz: 6f1ecdb8ea4481b12138f6b63d2508f888ad60d42b56cc187ac74e3d705bb40db76d52f7e615bbd3b7fba74e49298325dcf7e5477f5527e55aa4a42e6744586d
7
+ data.tar.gz: 5b3761d1300940911383fc0659a4335b422db217a9118a0fd41ebd2374cec87884775564f003d5a0a14555eff68fcac7b70c68c3cea72c6d802908a48b8b514c
data/lib/rbbt/hpc.rb CHANGED
@@ -1,553 +1,3 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/util/cmd'
3
-
4
- module Marenostrum
5
- SERVER='mn1'
6
- class SBATCH < Exception;
7
- attr_accessor :directory
8
- def initialize(directory)
9
- @directory = directory
10
- end
11
- end
12
-
13
- module SLURM
14
-
15
- def self.template(args, options = {})
16
-
17
- development = options.delete :drbbt
18
- singularity = options.delete :singularity
19
- contain = options.delete :contain
20
- sync = options.delete :sync
21
- user_group = options.delete :user_group
22
- contain_and_sync = options.delete :contain_and_sync
23
- wipe_container = options.delete :wipe_container
24
- copy_image = options.delete :copy_image
25
- exclusive = options.delete :exclusive
26
- highmem = options.delete :highmem
27
-
28
- queue = options.delete(:queue) || 'bsc_ls'
29
- task_cpus = options.delete(:task_cpus) || 1
30
- nodes = options.delete(:nodes) || 1
31
- time = options.delete(:time) || "0:00:10"
32
-
33
- inputs_dir = options.delete :inputs_dir
34
- config_keys = options.delete :config_keys
35
-
36
- user = ENV['USER'] || `whoami`.strip
37
- group = File.basename(File.dirname(ENV['HOME']))
38
-
39
- if contain_and_sync
40
- contain = "/scratch/tmp/rbbt-#{user}" if contain.nil?
41
- sync = "~/.rbbt/var/jobs" if sync.nil?
42
- wipe_container = "post" if wipe_container.nil?
43
- end
44
-
45
- contain = nil if contain == "" || contain == "none"
46
- sync = nil if sync == "" || sync == "none"
47
-
48
- contain = File.expand_path(contain) if contain
49
-
50
- name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
51
- options.delete(:name)
52
- slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
53
- options.delete(:slurm_basedir)
54
-
55
- rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
56
-
57
- rbbt_cmd += " " << options.collect do |o,v|
58
- o = o.to_s
59
- case v
60
- when TrueClass
61
- '--' << o
62
- when FalseClass
63
- '--' << o << "=false"
64
- else
65
- ['--' << o, "'#{v}'"] * " "
66
- end
67
- end * " "
68
-
69
- rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
70
-
71
-
72
- time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
73
-
74
-
75
- #{{{ PREPARE LOCAL LOGFILES
76
-
77
- Open.mkdir slurm_basedir
78
-
79
- fout = File.join(slurm_basedir, 'std.out')
80
- ferr = File.join(slurm_basedir, 'std.err')
81
- fjob = File.join(slurm_basedir, 'job.id')
82
- fexit = File.join(slurm_basedir, 'exit.status')
83
- fsync = File.join(slurm_basedir, 'sync.log')
84
- fcmd = File.join(slurm_basedir, 'command.slurm')
85
-
86
- #{{{ GENERATE TEMPLATE
87
-
88
- # HEADER
89
- header =<<-EOF
90
- #!/bin/bash
91
- #SBATCH --qos="#{queue}"
92
- #SBATCH --job-name="#{name}"
93
- #SBATCH --workdir="#{Dir.pwd}"
94
- #SBATCH --output="#{fout}"
95
- #SBATCH --error="#{ferr}"
96
- #SBATCH --cpus-per-task="#{task_cpus}"
97
- #SBATCH --time="#{time}"
98
- #SBATCH --nodes="#{nodes}"
99
- EOF
100
-
101
- prep = ""
102
-
103
- if highmem
104
- header +=<<-EOF
105
- #SBATCH --constraint=highmem
106
- EOF
107
- end
108
-
109
- if exclusive
110
- header +=<<-EOF
111
- #SBATCH --exclusive
112
- EOF
113
- end
114
-
115
- header +=<<-EOF
116
- #CMD: #{rbbt_cmd}
117
- EOF
118
-
119
- # ENV
120
- env = ""
121
- env +=<<-EOF
122
- # Prepare env
123
- [[ -f ~/config/load.sh ]] && source ~/config/load.sh
124
- module load java
125
-
126
- # Calculate max available memory
127
- let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK"
128
- EOF
129
-
130
-
131
- # RUN
132
- run = ""
133
- exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
134
-
135
-
136
- if singularity
137
- #{{{ SINGULARITY
138
-
139
- singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
140
-
141
- env +=<<-EOF
142
- module load intel/2018.1
143
- module load singularity
144
- PROJECTS_ROOT="/gpfs/projects/bsc26/"
145
- SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
146
- SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
147
- SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
148
- mkdir -p "$SINGULARITY_RUBY_INLINE"
149
- EOF
150
-
151
- if contain
152
- scratch_group_dir = File.join('/gpfs/scratch/', group)
153
- projects_group_dir = File.join('/gpfs/projects/', group)
154
-
155
- prep +=<<-EOF
156
-
157
- # Prepare container dir
158
- CONTAINER_DIR="#{contain}"
159
- mkdir -p $CONTAINER_DIR/.rbbt/etc/
160
-
161
- for dir in .ruby_inline git home; do
162
- mkdir -p $CONTAINER_DIR/$dir
163
- done
164
-
165
- for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
166
- mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
167
- done
168
-
169
- # Copy environment
170
- cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
171
-
172
- # Set search_paths
173
- echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
174
- echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
175
- echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
176
- echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
177
- echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
178
- echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
179
- echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
180
- EOF
181
-
182
- if user_group && group != user_group
183
- prep +=<<-EOF
184
-
185
- # Add user_group search_path
186
- echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
187
- EOF
188
- end
189
-
190
- if inputs_dir
191
- prep +=<<-EOF
192
-
193
- # Copy inputs
194
- [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
195
- EOF
196
- rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
197
- end
198
-
199
- if copy_image
200
- prep +=<<EOF
201
-
202
- # Copy image
203
- rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
204
- SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
205
- EOF
206
- end
207
-
208
- if wipe_container == "pre" || wipe_container == "both"
209
- if singularity
210
- prep +=<<-EOF
211
-
212
- # Clean container pre
213
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
214
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
215
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
216
- EOF
217
- else
218
- prep = ""
219
- end
220
- end
221
- end
222
-
223
- if contain
224
- singularity_exec << %( -C -H "$CONTAINER_DIR" \
225
- -B /scratch/tmp \
226
- #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
227
- -B #{scratch_group_dir} \
228
- -B #{projects_group_dir} \
229
- -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
230
- -B ~/git:"$CONTAINER_DIR/git":ro \
231
- #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
232
- -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
233
- "$SINGULARITY_IMG")
234
- exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
235
- else
236
- singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
237
- end
238
-
239
- if development
240
- exec_cmd += " rbbt --dev='#{development}'"
241
- else
242
- exec_cmd += ' rbbt'
243
- end
244
-
245
- exec_cmd = singularity_exec + " " + exec_cmd
246
- else
247
- if development
248
- exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
249
- else
250
- exec_cmd << " " << 'rbbt'
251
- end
252
-
253
- if contain
254
- rbbt_cmd << " " << %(--workdir_all='#{contain}')
255
- end
256
- end
257
-
258
-
259
- cmd =<<-EOF
260
- #{exec_cmd} \\
261
- #{rbbt_cmd}
262
- EOF
263
-
264
- run +=<<-EOF
265
-
266
- # Run command
267
- #{cmd}
268
-
269
- # Save exit status
270
- exit_status=$?
271
-
272
- EOF
273
-
274
- # CODA
275
- coda = ""
276
- if sync
277
- if singularity
278
- coda +=<<-EOF
279
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
280
- EOF
281
- else
282
- coda +=<<-EOF
283
- rbbt system clean all -q &>> #{fsync}
284
- EOF
285
- end
286
-
287
- if sync.include?("=>")
288
- source, _sep, sync = sync.partition("=>")
289
- source = source.strip
290
- sync = sync.strip
291
- source = File.join(File.expand_path(contain), source)
292
- else
293
- source = File.join(File.expand_path(contain), '.rbbt/var/jobs')
294
- end
295
-
296
- target = File.expand_path(sync)
297
- coda +=<<-EOF
298
-
299
- # Sync data to target location
300
- mkdir -p "$(dirname '#{target}')"
301
- rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
302
- sync_es="$?"
303
- find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
304
- EOF
305
-
306
- if contain && (wipe_container == "post" || wipe_container == "both")
307
- prep =<<-EOF + prep
308
- if ls -A '#{contain}' &> /dev/null ; then
309
- echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
310
- fi
311
- EOF
312
- if singularity
313
- coda +=<<-EOF
314
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
315
-
316
-
317
- # Clean container directory
318
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
319
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
320
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
321
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
322
- #else
323
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
324
- #fi
325
- EOF
326
- else
327
- coda +=<<-EOF
328
- #{exec_cmd} system clean
329
- if [ $exit_status == '0' -a $sync_es == '0' ]; then
330
- rm -Rfv #{contain} &>> #{fsync}
331
- else
332
- echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
333
- fi
334
- unset sync_es
335
- EOF
336
-
337
- end
338
- end
339
- end
340
- coda +=<<-EOF
341
-
342
- # Write exit status to file
343
- echo $exit_status > #{fexit}
344
- EOF
345
- if sync
346
- coda +=<<-EOF
347
- if [ "$sync_es" == '0' ]; then
348
- unset sync_es
349
- exit $exit_status
350
- else
351
- exit $sync_es
352
- fi
353
- EOF
354
- else
355
- coda +=<<-EOF
356
- exit $exit_status
357
- EOF
358
- end
359
-
360
- template = [header, env, prep, run, coda] * "\n"
361
-
362
- template
363
- end
364
-
365
- def self.issue_template(template, options = {})
366
-
367
- slurm_basedir = options[:slurm_basedir]
368
- Open.mkdir slurm_basedir
369
-
370
- dry_run = options.delete :dry_run
371
-
372
- fout = File.join(slurm_basedir, 'std.out')
373
- ferr = File.join(slurm_basedir, 'std.err')
374
- fjob = File.join(slurm_basedir, 'job.id')
375
- fexit = File.join(slurm_basedir, 'exit.status')
376
- fsync = File.join(slurm_basedir, 'sync.log')
377
- fcmd = File.join(slurm_basedir, 'command.slurm')
378
-
379
- job = nil
380
- if options[:clean_job]
381
- [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
382
- Open.rm file if Open.exists? file
383
- end
384
- end
385
-
386
- return if Open.exists?(fexit)
387
-
388
- STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
389
- STDERR.puts template
390
-
391
- Open.write(fcmd, template) unless File.exists? fcmd
392
- if File.exists?(fjob)
393
- job = Open.read(fjob).to_i
394
- else
395
- if File.exists?(fout)
396
- return
397
- elsif dry_run
398
- STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
399
- STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
400
- raise Marenostrum::SBATCH, slurm_basedir
401
- else
402
- Open.rm fsync
403
- Open.rm fexit
404
- Open.rm fout
405
- Open.rm ferr
406
- job = CMD.cmd("sbatch '#{fcmd}'").read.scan(/\d+/).first.to_i
407
- Open.write(fjob, job.to_s)
408
- end
409
- end
410
- end
411
-
412
- def self.follow_job(slurm_basedir, tail = true)
413
- fjob = File.join(slurm_basedir, 'job.id')
414
- fout = File.join(slurm_basedir, 'std.out')
415
- ferr = File.join(slurm_basedir, 'std.err')
416
- fstatus = File.join(slurm_basedir, 'job.status')
417
-
418
- job = Open.read(fjob).strip if Open.exists?(fjob)
419
-
420
- if job
421
- status_txt = CMD.cmd("squeue --job #{job}").read
422
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
423
- STDERR.puts status_txt
424
- lines = status_txt.split("\n").length
425
- end
426
-
427
- if tail
428
- Log.severity = 10
429
- while ! File.exists? fout
430
- if job
431
- STDERR.puts
432
- Log.clear_line(STDERR)
433
- STDERR.write Log.color(:magenta, "Waiting for Output")
434
- 3.times do
435
- STDERR.write Log.color(:magenta, ".")
436
- sleep 1
437
- end
438
- status_txt = CMD.cmd("squeue --job #{job}").read
439
- lines.times do
440
- Log.clear_line(STDERR)
441
- end
442
- Log.clear_line(STDERR)
443
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
444
- STDERR.puts status_txt
445
- lines = status_txt.split("\n").length
446
- end
447
- end
448
- STDERR.puts
449
- Log.clear_line(STDERR)
450
- STDERR.puts Log.color(:magenta, "Output:")
451
- begin
452
- CMD.cmd("squeue --job #{job} > #{fstatus}")
453
- out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
454
- err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
455
-
456
- terr = Misc.consume_stream(err, true, STDERR) if err
457
- tout = Misc.consume_stream(out, true, STDOUT) if out
458
-
459
- sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
460
- rescue Aborted
461
- ensure
462
- begin
463
- terr.exit if terr
464
- tout.exit if tout
465
- err.close if err
466
- err.join if err
467
- rescue Exception
468
- end
469
-
470
- begin
471
- out.close if out
472
- out.join if out
473
- rescue Exception
474
- end
475
- end
476
- end
477
- end
478
-
479
- def self.wait_for_job(slurm_basedir, time = 1)
480
- fexit = File.join(slurm_basedir, 'exit.status')
481
- fjob = File.join(slurm_basedir, 'job.id')
482
- job = Open.read(fjob) if Open.exists?(fjob)
483
-
484
-
485
- while ! Open.exists?(fexit)
486
- sleep time
487
- end
488
- end
489
-
490
- def self.run_job(job, options = {})
491
- options = IndiferentHash.setup(options.dup)
492
-
493
- dry_run = options.delete :dry_run
494
- tail = options.delete :tail
495
-
496
- workflow = job.workflow
497
- task = job.task_name
498
-
499
- keep_slurm_basedir = options.delete :keep_SLURM_slurm_basedir
500
- slurm_basedir = options.delete :SLURM_basedir
501
- slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
502
- TmpFile.with_file(nil, !keep_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
503
- options[:slurm_basedir] ||= tmp_directory
504
- slurm_basedir = options[:slurm_basedir]
505
- inputs_dir = File.join(tmp_directory, 'inputs_dir')
506
- saved = Step.save_job_inputs(job, inputs_dir, options)
507
- if saved
508
- options[:inputs_dir] = inputs_dir
509
- cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
510
- else
511
- cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
512
- end
513
-
514
-
515
- template = self.template(cmd, options)
516
- self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run))
517
-
518
- return unless tail
519
-
520
- t_monitor = Thread.new do
521
- self.follow_job(slurm_basedir, :STDERR)
522
- end
523
- self.wait_for_job(slurm_basedir)
524
- t_monitor.raise Aborted
525
- return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
526
- path = Open.read(File.join(slurm_basedir, 'std.out')).strip
527
- if Open.exists?(path) && job.path != path
528
- Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
529
- Open.ln path, job.path
530
- Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
531
- Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
532
- end
533
- end
534
- end
535
- end
536
-
537
- def self.relay(job, options={})
538
- options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
539
- done_deps = job.dependencies.select do |dep|
540
- dep.done?
541
- end
542
-
543
- error_deps = job.dependencies.select do |dep|
544
- dep.error? && ! dep.recoverable_error?
545
- end
546
-
547
- (done_deps + error_deps).each do |dep|
548
- Step.migrate(dep.path, options[:search_path], options)
549
- end
550
-
551
- end
552
- end
553
-
3
+ require 'rbbt/hpc/slurm'