rbbt-util 5.30.13 → 5.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +3 -0
- data/lib/rbbt/hpc/batch.rb +623 -0
- data/lib/rbbt/hpc/lsf.rb +119 -0
- data/lib/rbbt/hpc/orchestrate.rb +12 -11
- data/lib/rbbt/hpc/slurm.rb +62 -567
- data/lib/rbbt/resource/path.rb +3 -1
- data/lib/rbbt/tsv/accessor.rb +5 -2
- data/lib/rbbt/tsv/dumper.rb +1 -0
- data/lib/rbbt/tsv/parallel/traverse.rb +1 -1
- data/lib/rbbt/tsv/stream.rb +5 -6
- data/lib/rbbt/util/log.rb +22 -1
- data/lib/rbbt/util/misc/development.rb +2 -2
- data/lib/rbbt/util/misc/options.rb +5 -0
- data/lib/rbbt/workflow/step/accessor.rb +1 -1
- data/lib/rbbt/workflow/usage.rb +13 -13
- data/share/config.ru +3 -3
- data/share/rbbt_commands/{slurm → hpc}/clean +91 -18
- data/share/rbbt_commands/{slurm → hpc}/list +100 -30
- data/share/rbbt_commands/hpc/orchestrate +81 -0
- data/share/rbbt_commands/hpc/tail +81 -0
- data/share/rbbt_commands/hpc/task +80 -0
- data/test/rbbt/hpc/test_batch.rb +65 -0
- data/test/rbbt/hpc/test_slurm.rb +30 -0
- data/test/rbbt/util/misc/test_development.rb +11 -0
- data/test/test_helper.rb +3 -1
- metadata +16 -7
- data/share/rbbt_commands/slurm/orchestrate +0 -48
- data/share/rbbt_commands/slurm/task +0 -46
data/lib/rbbt/hpc/lsf.rb
ADDED
@@ -0,0 +1,119 @@
+require 'rbbt/hpc/batch'
+
+module HPC
+  module LSF
+    extend HPC::TemplateGeneration
+    extend HPC::Orchestration
+
+    def self.batch_system_variables
+      <<-EOF
+MAX_MEMORY=$LSB_MAX_MEM_RUSAGE || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+BATCH_JOB_ID=$LSF_JOBID
+BATCH_SYSTEM=LSF
+      EOF
+    end
+
+    def self.header(options = {})
+      options = options.dup
+
+      queue      = Misc.process_options options, :queue
+      task_cpus  = Misc.process_options options, :task_cpus
+      time       = Misc.process_options options, :time
+      nodes      = Misc.process_options options, :nodes
+      workdir    = Misc.process_options options, :workdir
+      exclusive  = Misc.process_options options, :exclusive
+
+      batch_dir  = Misc.process_options options, :batch_dir
+      batch_name = Misc.process_options options, :batch_name
+      batch_name ||= File.basename(batch_dir)
+
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')
+
+      time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+      time = time.split(":").values_at(0, 1) * ":"
+
+      header =<<-EOF
+#!/bin/bash
+#BSUB -J "#{batch_name}"
+#BSUB -cwd "#{workdir}"
+#BSUB -oo "#{fout}"
+#BSUB -eo "#{ferr}"
+#BSUB -q "#{queue}"
+#BSUB -n "#{task_cpus}"
+#BSUB -W "#{time}"
+      EOF
+
+      header << "#BSUB -x" << "\n" if exclusive
+
+      header
+    end
+
+    def self.run_template(batch_dir, dry_run)
+
+      fout   = File.join(batch_dir, 'std.out')
+      ferr   = File.join(batch_dir, 'std.err')
+      fjob   = File.join(batch_dir, 'job.id')
+      fdep   = File.join(batch_dir, 'dependencies.list')
+      fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+      fexit  = File.join(batch_dir, 'exit.status')
+      fsync  = File.join(batch_dir, 'sync.log')
+      fcmd   = File.join(batch_dir, 'command.batch')
+
+      return if Open.exists?(fexit)
+
+      STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
+      STDERR.puts Open.read(fcmd)
+
+      if File.exists?(fjob)
+        job = Open.read(fjob).to_i
+      else
+
+        dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+        canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
+
+        normal_dep_list  = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
+        canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "done(#{d})"} : []
+
+        dep_list = normal_dep_list + canfail_dep_list
+
+        if dep_list.any?
+          dep_str = '-w "' + dep_list * " && " + '"'
+        else
+          dep_str = ""
+        end
+
+        cmd = "bsub #{dep_str} < '#{fcmd}'"
+
+        if File.exists?(fout)
+          return
+        elsif dry_run
+          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
+          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt lsf tail '#{batch_dir}'")
+          raise HPC::SBATCH, batch_dir
+        else
+          Open.rm fsync
+          Open.rm fexit
+          Open.rm fout
+          Open.rm ferr
+
+
+          job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
+          Log.debug "BSUB job id: #{job}"
+          Open.write(fjob, job.to_s)
+          job
+        end
+      end
+    end
+
+    def self.job_status(job = nil)
+      if job.nil?
+        CMD.cmd("bjobs -w").read
+      else
+        CMD.cmd("bjobs -w #{job}").read
+      end
+    end
+  end
+end
+
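The new HPC::LSF module mirrors the SLURM backend on top of the shared rbbt/hpc/batch machinery: batch_system_variables exports MAX_MEMORY and BATCH_JOB_ID for LSF, header renders the #BSUB preamble, and run_template submits command.batch through bsub with -w dependency expressions. A minimal sketch of rendering the header locally, assuming the rbbt-util gem is installed; every option value below (queue name, paths, time) is illustrative, not taken from the release:

    # Illustrative only: print the #BSUB header produced by the new HPC::LSF.header
    require 'rbbt-util'
    require 'rbbt/hpc/lsf'

    puts HPC::LSF.header(
      :batch_dir  => "/tmp/rbbt-batch/example",  # hypothetical batch directory
      :batch_name => "example-job",
      :workdir    => Dir.pwd,
      :queue      => "normal",                   # assumed queue name
      :task_cpus  => 4,
      :time       => "2:00",
      :nodes      => 1,
      :exclusive  => true)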
data/lib/rbbt/hpc/orchestrate.rb
CHANGED
@@ -1,8 +1,8 @@
 require 'rbbt/workflow/util/orchestrator'
 module HPC
-  module
+  module Orchestration

-    def
+    def job_rules(rules, job)
       workflow = job.workflow.to_s
       task_name = job.task_name.to_s
       task_name = job.overriden.to_s if Symbol === job.overriden
@@ -53,18 +53,18 @@ module HPC
       job_rules
     end

-    def
+    def get_job_dependencies(job, job_rules = nil)
       deps = job.dependencies || []
       deps += job.input_dependencies || []
       deps
     end

-    def
+    def get_recursive_job_dependencies(job)
       deps = get_job_dependencies(job)
       (deps + deps.collect{|dep| get_recursive_job_dependencies(dep) }).flatten
     end

-    def
+    def piggyback(job, job_rules, job_deps)
       return false unless job_rules["skip"]
       final_deps = job_deps - job_deps.collect{|dep| get_recursive_job_dependencies(dep)}.flatten.uniq
       final_deps = final_deps.reject{|dep| dep.done? }
@@ -72,7 +72,7 @@ module HPC
       return false
     end

-    def
+    def get_chains(job, rules, chains = {})
       job_rules = self.job_rules(rules, job)
       job_deps = get_job_dependencies(job)

@@ -102,7 +102,7 @@ module HPC
       chains
     end

-    def
+    def workload(job, rules, chains, options, seen = nil)
       return [] if job.done?
       if seen.nil?
         seen = {}
@@ -145,7 +145,7 @@ module HPC
       job_rules.delete :workflow


-      job_options = IndiferentHash.setup(options.merge(job_rules).merge(:
+      job_options = IndiferentHash.setup(options.merge(job_rules).merge(:batch_dependencies => dep_ids))
       job_options.delete :orchestration_rules

       config_keys = job_rules.delete(:config_keys)
@@ -182,7 +182,7 @@ module HPC

       if options[:dry_run]
         puts Log.color(:magenta, "Manifest: ") + Log.color(:blue, job_options[:manifest] * ", ") + " - tasks: #{job_options[:task_cpus] || 1} - time: #{job_options[:time]} - config: #{job_options[:config_keys]}"
-        puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:
+        puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:batch_dependencies]*", ")
         job_options[:manifest].first
       else
         run_job(job, job_options)
@@ -190,13 +190,14 @@ module HPC
       end
     end

-    def
+    def orchestrate_job(job, options)
       options.delete "recursive_clean"
       options.delete "clean_task"
       options.delete "clean"
       options.delete "tail"
-      options.delete "
+      options.delete "printpath"
       options.delete "detach"
+      options.delete "jobname"

       rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
       rules ||= {}
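The orchestration change is largely a move to scheduler-agnostic names: the module becomes HPC::Orchestration and the per-job submission options now carry a :batch_dependencies key. A toy illustration of that merge, with hypothetical rule and dependency values, assuming IndiferentHash from rbbt-util:

    require 'rbbt-util'

    options   = { :time => "2:00", :task_cpus => 4 }
    job_rules = { "queue" => "normal" }        # hypothetical rule values
    dep_ids   = ["1001", "canfail:1002"]       # hypothetical ids of already-submitted batch jobs

    job_options = IndiferentHash.setup(options.merge(job_rules).merge(:batch_dependencies => dep_ids))
    job_options[:batch_dependencies]           # => ["1001", "canfail:1002"]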
data/lib/rbbt/hpc/slurm.rb
CHANGED
@@ -1,454 +1,101 @@
-
-
-    attr_accessor :directory
-    def initialize(directory)
-      @directory = directory
-    end
-  end
-
-  module SLURM
-
-    def self.template(args, options = {})
-
-      development      = options.delete :drbbt
-      singularity      = options.delete :singularity
-      contain          = options.delete :contain
-      sync             = options.delete :sync
-      user_group       = options.delete :user_group
-      contain_and_sync = options.delete :contain_and_sync
-      wipe_container   = options.delete :wipe_container
-      copy_image       = options.delete :copy_image
-      exclusive        = options.delete :exclusive
-      highmem          = options.delete :highmem
-
-      slurm_step_path  = options.delete :slurm_step_path
-
-      manifest         = options.delete :manifest
-
-      queue       = options.delete(:queue) || Rbbt::Config.get('queue', :slurm_queue, :slurm, :SLURM, :default => 'bsc_ls')
-      task_cpus   = options.delete(:task_cpus) || 1
-      nodes       = options.delete(:nodes) || 1
-      time        = options.delete(:time) || "0:02:00"
-
-      inputs_dir  = options.delete :inputs_dir
-      config_keys = options.delete :config_keys
+require 'rbbt/hpc/batch'
+require 'rbbt/hpc/orchestrate'

-
-
-
-
-        random_file = TmpFile.random_name
-        contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
-        sync = "~/.rbbt/var/jobs" if sync.nil?
-        wipe_container = "post" if wipe_container.nil?
-      end
-
-      contain = nil if contain == "" || contain == "none"
-      sync = nil if sync == "" || sync == "none"
+module HPC
+  module SLURM
+    extend HPC::TemplateGeneration
+    extend HPC::Orchestration

-
+    def self.batch_system_variables
+      <<-EOF
+let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+BATCH_JOB_ID=$SLURM_JOB_ID
+BATCH_SYSTEM=SLURM
+      EOF
+    end

-
-      options.
-      slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
-      options.delete(:slurm_basedir)
+    def self.header(options = {})
+      options = options.dup

-
+      queue      = Misc.process_options options, :queue
+      task_cpus  = Misc.process_options options, :task_cpus
+      time       = Misc.process_options options, :time
+      nodes      = Misc.process_options options, :nodes
+      workdir    = Misc.process_options options, :workdir
+      exclusive  = Misc.process_options options, :exclusive

-
-
-        case v
-        when TrueClass
-          '--' << o
-        when FalseClass
-          '--' << o << "=false"
-        else
-          ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
-        end
-      end * " "
+      batch_dir  = Misc.process_options options, :batch_dir
+      batch_name = Misc.process_options options, :batch_name

-
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')

       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"

-
-      #{{{ PREPARE LOCAL LOGFILES
-
-      Open.mkdir slurm_basedir
-
-      fout      = File.join(slurm_basedir, 'std.out')
-      ferr      = File.join(slurm_basedir, 'std.err')
-      fjob      = File.join(slurm_basedir, 'job.id')
-      fexit     = File.join(slurm_basedir, 'exit.status')
-      fsync     = File.join(slurm_basedir, 'sync.log')
-      fsyncexit = File.join(slurm_basedir, 'sync.status')
-      fcmd      = File.join(slurm_basedir, 'command.slurm')
-
-      #{{{ GENERATE TEMPLATE
-
-      # HEADER
       header =<<-EOF
 #!/bin/bash
-#SBATCH --
-#SBATCH --
-#SBATCH --workdir="#{Dir.pwd}"
+#SBATCH --job-name="#{batch_name}"
+#SBATCH --workdir="#{workdir}"
 #SBATCH --output="#{fout}"
 #SBATCH --error="#{ferr}"
+#SBATCH --qos="#{queue}"
 #SBATCH --cpus-per-task="#{task_cpus}"
 #SBATCH --time="#{time}"
 #SBATCH --nodes="#{nodes}"
 EOF

-
-
-      if highmem
-        header +=<<-EOF
-#SBATCH --constraint=highmem
-EOF
-      end
-
-      if exclusive
-        header +=<<-EOF
-#SBATCH --exclusive
-EOF
-      end
-
-      # ENV
-      env = ""
-      env +=<<-EOF
-# Prepare env
-[[ -f ~/config/load.sh ]] && source ~/config/load.sh
-module load java
-
-# Calculate max available memory
-let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
-EOF
-
-
-      # RUN
-      run = ""
-      exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
-
-
-      if singularity
-        #{{{ SINGULARITY
-
-        singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
-
-        env +=<<-EOF
-module load intel/2018.1
-module load singularity
-PROJECTS_ROOT="/gpfs/projects/bsc26/"
-SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
-SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
-SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
-mkdir -p "$SINGULARITY_RUBY_INLINE"
-EOF
-
-        if contain
-          scratch_group_dir = File.join('/gpfs/scratch/', group)
-          projects_group_dir = File.join('/gpfs/projects/', group)
+      header << "#SBATCH --exclusive" << "\n" if exclusive

-
-
-# Prepare container dir
-CONTAINER_DIR="#{contain}"
-mkdir -p $CONTAINER_DIR/.rbbt/etc/
-
-for dir in .ruby_inline git home; do
-    mkdir -p $CONTAINER_DIR/$dir
-done
-
-for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
-    mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
-done
-
-# Copy environment
-cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
-
-# Set search_paths
-echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-EOF
-
-          if user_group && group != user_group
-            prep +=<<-EOF
-
-# Add user_group search_path
-echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-EOF
-          end
-
-          if inputs_dir
-            prep +=<<-EOF
-
-# Copy inputs
-[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
-EOF
-            rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
-          end
-
-          if copy_image
-            prep +=<<EOF
-
-# Copy image
-rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
-SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
-EOF
-          end
+      header
+    end

-
-        if singularity
-          prep +=<<-EOF
+    def self.run_template(batch_dir, dry_run)

-
-
-
-
-
-
-
-
-          end
-        end
+      fout   = File.join(batch_dir, 'std.out')
+      ferr   = File.join(batch_dir, 'std.err')
+      fjob   = File.join(batch_dir, 'job.id')
+      fdep   = File.join(batch_dir, 'dependencies.list')
+      fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+      fexit  = File.join(batch_dir, 'exit.status')
+      fsync  = File.join(batch_dir, 'sync.log')
+      fcmd   = File.join(batch_dir, 'command.batch')

-
-        singularity_exec << %( -C -H "$CONTAINER_DIR" \
-          -B /scratch/tmp \
-          #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
-          -B #{scratch_group_dir} \
-          -B #{projects_group_dir} \
-          -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
-          -B ~/git:"$CONTAINER_DIR/git":ro \
-          #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
-          -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
-          "$SINGULARITY_IMG")
-        exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
-      else
-        singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
-      end
+      return if Open.exists?(fexit)

-
-
-      else
-        exec_cmd += ' rbbt'
-      end
+      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+      STDERR.puts Open.read(fcmd)

-
+      if File.exists?(fjob)
+        job = Open.read(fjob).to_i
       else
-        if development
-          exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
-        else
-          exec_cmd << " " << 'rbbt'
-        end
-
-        if contain
-          rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
-        end
-      end
-
-
-      cmd =<<-EOF
-#{exec_cmd} \\
-#{rbbt_cmd}
-EOF
-      annotate_cmd =<<-EOF
-#{exec_cmd} \\
-workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
-EOF
-
-      header +=<<-EOF if manifest
-#MANIFEST: #{manifest * ", "}
-EOF
-
-      header +=<<-EOF if slurm_step_path
-#STEP_PATH: #{slurm_step_path}
-EOF
-
-      header +=<<-EOF
-#CMD: #{rbbt_cmd}
-EOF
-
-      run +=<<-EOF
-
-# Run command
-step_path=$(#{cmd})
-
-# Save exit status
-exit_status=$?
-
-# Annotate info with SLURM job_info
-#{annotate_cmd}

-
+        dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+        canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep

-
-
-      if sync
-        if singularity
-          coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
-EOF
-        #  else
-        #    coda +=<<-EOF
-        #rbbt system clean all -q &>> #{fsync}
-        #EOF
-        end
+        normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
+        canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

-        if
-
-          source = source.strip
-          sync = sync.strip
-          source = File.join(File.expand_path(contain), source)
+        if normal_dep_str.nil? && canfail_dep_str.nil?
+          dep_str = ""
         else
-
-        end
-
-        target = File.expand_path(sync)
-        coda +=<<-EOF
-
-# Sync data to target location
-if [ $exit_status == '0' ]; then
-  mkdir -p "$(dirname '#{target}')"
-  rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
-  sync_es="$?"
-  echo $sync_es > #{fsyncexit}
-  find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
-else
-  sync_es="$exit_status"
-fi
-EOF
-
-        if contain && (wipe_container == "post" || wipe_container == "both")
-          prep =<<-EOF + prep
-if ls -A '#{contain}' &> /dev/null ; then
-  echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
-fi
-EOF
-          if singularity
-            coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
-
-
-# Clean container directory
-#if [ $exit_status == '0' -a $sync_es == '0' ]; then
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-#else
-#  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-#fi
-EOF
-          else
-            coda +=<<-EOF
-##{exec_cmd} system clean
-#if [ $exit_status == '0' -a $sync_es == '0' ]; then
-rm -Rfv #{contain} &>> #{fsync}
-#else
-#  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-#fi
-EOF
-
-          end
+          dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
         end
-      end
-
-      coda +=<<-EOF
-
-# Write exit status to file
-echo $exit_status > #{fexit}
-EOF
-
-      if sync
-        coda +=<<-EOF
-if [ "$sync_es" == '0' ]; then
-  unset sync_es
-  exit $exit_status
-else
-  exit $sync_es
-fi
-EOF
-      else
-        coda +=<<-EOF
-exit $exit_status
-EOF
-      end
-
-      template = [header, env, prep, run, coda] * "\n"
-
-      template
-    end
-
-    def self.issue_template(template, options = {})
-
-      slurm_basedir = options[:slurm_basedir]
-      dependencies = options.delete :slurm_dependencies
-      dependencies = [] if dependencies.nil?
-
-      canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
-      dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
-
-      Open.mkdir slurm_basedir

-
+        cmd = "sbatch #{dep_str} '#{fcmd}'"

-      fout   = File.join(slurm_basedir, 'std.out')
-      ferr   = File.join(slurm_basedir, 'std.err')
-      fjob   = File.join(slurm_basedir, 'job.id')
-      fdep   = File.join(slurm_basedir, 'dependencies.list')
-      fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
-      fexit  = File.join(slurm_basedir, 'exit.status')
-      fsync  = File.join(slurm_basedir, 'sync.log')
-      fcmd   = File.join(slurm_basedir, 'command.slurm')
-
-      job = nil
-      if options[:clean_job]
-        [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
-          Open.rm file if Open.exists? file
-        end
-      end
-
-      return if Open.exists?(fexit)
-
-      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
-      STDERR.puts template
-
-      Open.write(fcmd, template) unless File.exists? fcmd
-      if File.exists?(fjob)
-        job = Open.read(fjob).to_i
-      else
         if File.exists?(fout)
           return
         elsif dry_run
-          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{
-          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt
-          raise HPC::SBATCH,
+          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
+          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt slurm tail '#{batch_dir}'")
+          raise HPC::SBATCH, batch_dir
         else
           Open.rm fsync
           Open.rm fexit
           Open.rm fout
           Open.rm ferr

-
-          Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
-
-
-          dep_str = '--dependency='
-          normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
-          canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
-
-          if normal_dep_str.nil? && canfail_dep_str.nil?
-            dep_str = ""
-          else
-            dep_str += [normal_dep_str, canfail_dep_str].compact * ","
-          end
-
-          job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+          job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
           Log.debug "SBATCH job id: #{job}"
           Open.write(fjob, job.to_s)
           job
@@ -456,165 +103,13 @@ EOF
       end
     end

-    def self.
-
-
-
-
-
-      job = Open.read(fjob).strip if Open.exists?(fjob)
-
-      if job
-        status_txt = CMD.cmd("squeue --job #{job}").read
-        STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-        STDERR.puts status_txt
-        lines = status_txt.split("\n").length
-      end
-
-      if tail
-        Log.severity = 10
-        while ! File.exists? fout
-          if job
-            STDERR.puts
-            Log.clear_line(STDERR)
-            STDERR.write Log.color(:magenta, "Waiting for Output")
-            3.times do
-              STDERR.write Log.color(:magenta, ".")
-              sleep 1
-            end
-            status_txt = CMD.cmd("squeue --job #{job}").read
-            lines.times do
-              Log.clear_line(STDERR)
-            end
-            Log.clear_line(STDERR)
-            STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-            STDERR.puts status_txt
-            lines = status_txt.split("\n").length
-          end
-        end
-        STDERR.puts
-        Log.clear_line(STDERR)
-        STDERR.puts Log.color(:magenta, "Output:")
-        begin
-          CMD.cmd("squeue --job #{job} > #{fstatus}")
-          out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
-          err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
-
-          terr = Misc.consume_stream(err, true, STDERR) if err
-          tout = Misc.consume_stream(out, true, STDOUT) if out
-
-          sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
-        rescue Aborted
-        ensure
-          begin
-            terr.exit if terr
-            tout.exit if tout
-            err.close if err
-            err.join if err
-          rescue Exception
-          end
-
-          begin
-            out.close if out
-            out.join if out
-          rescue Exception
-          end
-        end
-      end
-    end
-
-    def self.wait_for_job(slurm_basedir, time = 1)
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fjob = File.join(slurm_basedir, 'job.id')
-      job = Open.read(fjob) if Open.exists?(fjob)
-
-
-      while ! Open.exists?(fexit)
-        sleep time
-      end
-    end
-
-    def self.run_job(job, options = {})
-      options = IndiferentHash.setup(options.dup)
-
-      dry_run = options.delete :dry_run
-      tail = options.delete :tail
-      dependencies = options.delete :slurm_dependencies
-      procpath = options.delete :SLURM_procpath
-
-      options[:jobname] = job.clean_name
-      options[:slurm_step_path] = job.path
-
-      log_level = options.delete :log
-      log_level ||= Log.severity
-
-      workflow = job.workflow
-
-      task = Symbol === job.overriden ? job.overriden : job.task_name
-
-      if job.overriden
-        override_deps = job.rec_dependencies.
-          select{|dep| Symbol === dep.overriden }.
-          collect do |dep|
-
-          name = [dep.workflow.to_s, dep.task_name] * "#"
-          [name, dep.path] * "="
-        end * ","
-      end
-
-      remove_slurm_basedir = options.delete :remove_slurm_basedir
-      slurm_basedir = options.delete :SLURM_basedir
-      slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
-      TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
-        options[:slurm_basedir] ||= tmp_directory
-        slurm_basedir = options[:slurm_basedir]
-        inputs_dir = File.join(tmp_directory, 'inputs_dir')
-        saved = Step.save_job_inputs(job, inputs_dir)
-
-        cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', log_level.to_s]
-
-        cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
-
-        cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
-
-        cmd << "--load_inputs='#{inputs_dir}'" if saved && saved.any?
-
-        template = self.template(cmd, options)
-        jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
-
-        return jobid unless tail
-
-        t_monitor = Thread.new do
-          self.follow_job(slurm_basedir, :STDERR)
-        end
-        self.wait_for_job(slurm_basedir)
-        t_monitor.raise Aborted
-        return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
-        path = Open.read(File.join(slurm_basedir, 'std.out')).strip
-        if Open.exists?(path) && job.path != path
-          Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
-          Open.ln path, job.path
-          Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
-          Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
-        end
-        jobid
+    def self.job_status(job = nil)
+      if job.nil?
+        CMD.cmd("squeue").read
+      else
+        CMD.cmd("squeue --job #{job}").read
       end
     end
-    end
-
-    def self.relay(job, options={})
-      options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
-      done_deps = job.dependencies.select do |dep|
-        dep.done?
-      end
-
-      error_deps = job.dependencies.select do |dep|
-        dep.error? && ! dep.recoverable_error?
-      end
-
-      (done_deps + error_deps).each do |dep|
-        Step.migrate(dep.path, options[:search_path], options)
-      end

   end
 end
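In the slimmed-down SLURM module, run_template builds the sbatch invocation from the two dependency list files written alongside command.batch. A toy illustration of the dependency-flag construction, using hypothetical job ids in place of the file contents:

    # Hypothetical ids standing in for dependencies.list and canfail_dependencies.list
    dependencies         = ["1001", "1002"]
    canfail_dependencies = ["1003"]

    normal_dep_str  = dependencies.any?         ? "afterok:"  + dependencies * ":"         : nil
    canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

    dep_str = [normal_dep_str, canfail_dep_str].compact.empty? ? "" :
              '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","

    puts "sbatch #{dep_str} 'command.batch'"
    # => sbatch --dependency=afterok:1001:1002,afterany:1003 'command.batch'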