rbbt-util 5.28.14 → 5.30.0
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +1 -551
- data/lib/rbbt/hpc/orchestrate.rb +111 -0
- data/lib/rbbt/hpc/slurm.rb +603 -0
- data/lib/rbbt/persist.rb +4 -0
- data/lib/rbbt/persist/tsv/adapter.rb +48 -13
- data/lib/rbbt/util/cmd.rb +6 -1
- data/lib/rbbt/util/misc/inspect.rb +13 -3
- data/lib/rbbt/util/misc/options.rb +0 -42
- data/lib/rbbt/util/procpath.rb +49 -0
- data/lib/rbbt/workflow.rb +2 -1
- data/lib/rbbt/workflow/accessor.rb +7 -1
- data/lib/rbbt/workflow/examples.rb +2 -2
- data/lib/rbbt/workflow/step.rb +8 -5
- data/lib/rbbt/workflow/step/accessor.rb +28 -19
- data/lib/rbbt/workflow/step/dependencies.rb +1 -2
- data/lib/rbbt/workflow/step/run.rb +2 -5
- data/lib/rbbt/workflow/usage.rb +1 -1
- data/lib/rbbt/workflow/util/orchestrator.rb +14 -9
- data/lib/rbbt/workflow/util/provenance.rb +5 -2
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +220 -0
- data/share/rbbt_commands/slurm/orchestrate +48 -0
- data/share/rbbt_commands/{workflow/slurm → slurm/task} +11 -3
- data/share/rbbt_commands/tsv/slice +3 -3
- data/share/rbbt_commands/workflow/info +1 -1
- data/share/rbbt_commands/workflow/task +17 -7
- data/share/rbbt_commands/workflow/write_info +52 -0
- data/test/rbbt/test_workflow.rb +7 -7
- data/test/rbbt/util/test_procpath.rb +23 -0
- metadata +12 -3
+++ data/lib/rbbt/hpc/orchestrate.rb
@@ -0,0 +1,111 @@
require 'rbbt/workflow/util/orchestrator'
module HPC
  module SLURM

    def self.job_rules(rules, job)
      workflow = job.workflow.to_s
      task_name = job.task_name.to_s
      defaults = rules["defaults"] || {}

      job_rules = IndiferentHash.setup(defaults.dup)

      rules["chains"].each do |name,info|
        IndiferentHash.setup(info)
        chain_tasks = info[:tasks].split(/,\s*/)

        chain_tasks.each do |task|
          task_workflow, chain_task = task.split("#")
          chain_task, task_workflow = task_workflow, info[:workflow] if chain_task.nil? or chain_tasks.empty?
          job_rules["chain_tasks"] ||= {}
          job_rules["chain_tasks"][task_workflow] ||= []
          job_rules["chain_tasks"][task_workflow] << chain_task
          next unless task_name == chain_task.to_s && workflow == task_workflow.to_s
          config_keys = job_rules.delete :config_keys
          job_rules = IndiferentHash.setup(job_rules.merge(info))
          if config_keys
            config_keys.gsub!(/,\s+/,',')
            job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
          end
        end

        if job_rules["chain_tasks"][workflow] && job_rules["chain_tasks"][workflow].include?(task_name)
          break
        else
          job_rules.delete "chain_tasks"
        end
      end if rules["chains"]

      config_keys = job_rules.delete :config_keys
      job_rules = IndiferentHash.setup(job_rules.merge(rules[workflow][task_name])) if rules[workflow] && rules[workflow][task_name]

      if config_keys
        config_keys.gsub!(/,\s+/,',')
        job_rules[:config_keys] = job_rules[:config_keys] ? config_keys + "," + job_rules[:config_keys] : config_keys
      end

      if rules["skip"] && rules["skip"][workflow]
        job_rules["skip"] = true if rules["skip"][workflow].split(/,\s*/).include? task_name
      end

      job_rules
    end

    def self.get_job_dependencies(job, job_rules)
      deps = job.dependencies || []
      deps += job.input_dependencies || []
      deps
    end

    def self.orchestrate_job(job, options, skip = false, seen = {})
      return if job.done?
      return unless job.path.split("/")[-4] == "jobs"
      seen[:orchestration_target_job] ||= job

      options.delete "recursive_clean"
      options.delete "clean_task"
      options.delete "clean"
      options.delete "tail"
      options.delete "printfile"
      options.delete "detach"

      rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
      rules ||= {}
      IndiferentHash.setup(rules)

      job_rules = self.job_rules(rules, job)

      deps = get_job_dependencies(job, job_rules)

      dep_ids = deps.collect do |dep|
        skip_dep = job_rules["chain_tasks"] &&
          job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
          job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)

        deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
        if job.canfail_paths.include? dep.path
          [deps].flatten.collect{|id| ['canfail', id] * ":"}
        else
          deps
        end
      end.flatten.compact.uniq

      skip = true if job_rules[:skip]
      return dep_ids if skip and seen[:orchestration_target_job] != job

      job_rules.delete :chain_tasks
      job_rules.delete :tasks
      job_rules.delete :workflow

      config_keys = job_rules.delete(:config_keys)

      job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
      job_options.delete :orchestration_rules
      if config_keys
        config_keys.gsub!(/,\s+/,',')
        job_options[:config_keys] = job_options[:config_keys] ? config_keys + "," + job_options[:config_keys] : config_keys
      end

      run_job(job, job_options)
    end
  end
end
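A usage sketch, not part of the patch: orchestrate_job reads its rules from the YAML file passed as orchestration_rules. The workflow, task, and resource values below are hypothetical and only illustrate the sections that job_rules looks up (defaults, chains, per-workflow/task overrides, and skip). Tasks grouped in a chain are not submitted as separate SLURM jobs, while every other dependency becomes its own job whose id is passed along through :slurm_dependencies.

require 'rbbt-util'
require 'rbbt/workflow'
require 'rbbt/hpc/orchestrate'

# Hypothetical rules file; the keys mirror what job_rules reads
rules_yaml =<<-EOF
defaults:
  queue: debug
  time: "2:00:00"
chains:
  example_chain:
    workflow: ExampleWorkflow
    tasks: step1, step2, final_step
    task_cpus: 8
ExampleWorkflow:
  heavy_task:
    highmem: true
    time: "12:00:00"
skip:
  ExampleWorkflow: cheap_task
EOF
Open.write('/tmp/example_rules.yaml', rules_yaml)

# Hypothetical workflow and job
Workflow.require_workflow "ExampleWorkflow"
job = ExampleWorkflow.job(:final_step, nil, :some_input => "value")

# Submits the job and its non-chained dependencies as SLURM jobs wired together with --dependency
HPC::SLURM.orchestrate_job(job, :orchestration_rules => '/tmp/example_rules.yaml')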
+++ data/lib/rbbt/hpc/slurm.rb
@@ -0,0 +1,603 @@
module HPC
  class SBATCH < Exception;
    attr_accessor :directory
    def initialize(directory)
      @directory = directory
    end
  end

  module SLURM

    def self.template(args, options = {})

      development      = options.delete :drbbt
      singularity      = options.delete :singularity
      contain          = options.delete :contain
      sync             = options.delete :sync
      user_group       = options.delete :user_group
      contain_and_sync = options.delete :contain_and_sync
      wipe_container   = options.delete :wipe_container
      copy_image       = options.delete :copy_image
      exclusive        = options.delete :exclusive
      highmem          = options.delete :highmem

      queue     = options.delete(:queue) || 'bsc_ls'
      task_cpus = options.delete(:task_cpus) || 1
      nodes     = options.delete(:nodes) || 1
      time      = options.delete(:time) || "0:00:10"

      inputs_dir  = options.delete :inputs_dir
      config_keys = options.delete :config_keys

      user = ENV['USER'] || `whoami`.strip
      group = File.basename(File.dirname(ENV['HOME']))

      if contain_and_sync
        random_file = TmpFile.random_name
        contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
        sync = "~/.rbbt/var/jobs" if sync.nil?
        wipe_container = "post" if wipe_container.nil?
      end

      contain = nil if contain == "" || contain == "none"
      sync = nil if sync == "" || sync == "none"

      contain = File.expand_path(contain) if contain

      name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
      options.delete(:name)
      slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
      options.delete(:slurm_basedir)

      rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "

      rbbt_cmd += " " << options.collect do |o,v|
        o = o.to_s
        case v
        when TrueClass
          '--' << o
        when FalseClass
          '--' << o << "=false"
        else
          ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
        end
      end * " "

      rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?

      time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"


      #{{{ PREPARE LOCAL LOGFILES

      Open.mkdir slurm_basedir

      fout  = File.join(slurm_basedir, 'std.out')
      ferr  = File.join(slurm_basedir, 'std.err')
      fjob  = File.join(slurm_basedir, 'job.id')
      fexit = File.join(slurm_basedir, 'exit.status')
      fsync = File.join(slurm_basedir, 'sync.log')
      fsyncexit = File.join(slurm_basedir, 'sync.status')
      fcmd  = File.join(slurm_basedir, 'command.slurm')

      #{{{ GENERATE TEMPLATE

      # HEADER
      header =<<-EOF
#!/bin/bash
#SBATCH --qos="#{queue}"
#SBATCH --job-name="#{name}"
#SBATCH --workdir="#{Dir.pwd}"
#SBATCH --output="#{fout}"
#SBATCH --error="#{ferr}"
#SBATCH --cpus-per-task="#{task_cpus}"
#SBATCH --time="#{time}"
#SBATCH --nodes="#{nodes}"
EOF

      prep = ""

      if highmem
        header +=<<-EOF
#SBATCH --constraint=highmem
EOF
      end

      if exclusive
        header +=<<-EOF
#SBATCH --exclusive
EOF
      end

      # ENV
      env = ""
      env +=<<-EOF
# Prepare env
[[ -f ~/config/load.sh ]] && source ~/config/load.sh
module load java

# Calculate max available memory
let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
EOF


      # RUN
      run = ""
      exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")


      if singularity
        #{{{ SINGULARITY

        singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)

        env +=<<-EOF
module load intel/2018.1
module load singularity
PROJECTS_ROOT="/gpfs/projects/bsc26/"
SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
mkdir -p "$SINGULARITY_RUBY_INLINE"
EOF

        if contain
          scratch_group_dir = File.join('/gpfs/scratch/', group)
          projects_group_dir = File.join('/gpfs/projects/', group)

          prep +=<<-EOF

# Prepare container dir
CONTAINER_DIR="#{contain}"
mkdir -p $CONTAINER_DIR/.rbbt/etc/

for dir in .ruby_inline git home; do
  mkdir -p $CONTAINER_DIR/$dir
done

for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
  mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
done

# Copy environment
cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/

# Set search_paths
echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
EOF

          if user_group && group != user_group
            prep +=<<-EOF

# Add user_group search_path
echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
EOF
          end

          if inputs_dir
            prep +=<<-EOF

# Copy inputs
[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
EOF
            rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
          end

          if copy_image
            prep +=<<EOF

# Copy image
rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
EOF
          end

          if wipe_container == "pre" || wipe_container == "both"
            if singularity
              prep +=<<-EOF

# Clean container pre
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
EOF
            else
              prep = ""
            end
          end
        end

        if contain
          singularity_exec << %( -C -H "$CONTAINER_DIR" \
            -B /scratch/tmp \
            #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
            -B #{scratch_group_dir} \
            -B #{projects_group_dir} \
            -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
            -B ~/git:"$CONTAINER_DIR/git":ro \
            #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
            -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
            "$SINGULARITY_IMG")
          exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
        else
          singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
        end

        if development
          exec_cmd += " rbbt --dev='#{development}'"
        else
          exec_cmd += ' rbbt'
        end

        exec_cmd = singularity_exec + " " + exec_cmd
      else
        if development
          exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
        else
          exec_cmd << " " << 'rbbt'
        end

        if contain
          rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
        end
      end


      cmd =<<-EOF
#{exec_cmd} \\
#{rbbt_cmd}
EOF
      annotate_cmd =<<-EOF
#{exec_cmd} \\
workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
EOF

      header +=<<-EOF
#CMD: #{rbbt_cmd}
EOF

      run +=<<-EOF

# Run command
step_path=$(#{cmd})

# Save exit status
exit_status=$?

# Annotate info with SLURM job_info
#{annotate_cmd}

EOF

      # CODA
      coda = ""
      if sync
        if singularity
          coda +=<<-EOF
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
EOF
        # else
        #   coda +=<<-EOF
#rbbt system clean all -q &>> #{fsync}
#EOF
        end

        if sync.include?("=>")
          source, _sep, sync = sync.partition("=>")
          source = source.strip
          sync = sync.strip
          source = File.join(File.expand_path(contain), source)
        else
          source = File.join(File.expand_path(contain), 'workdir/var/jobs')
        end

        target = File.expand_path(sync)
        coda +=<<-EOF

# Sync data to target location
mkdir -p "$(dirname '#{target}')"
rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
sync_es="$?"
echo $sync_es > #{fsyncexit}
find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
EOF

        if contain && (wipe_container == "post" || wipe_container == "both")
          prep =<<-EOF + prep
if ls -A '#{contain}' &> /dev/null ; then
  echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
fi
EOF
          if singularity
            coda +=<<-EOF
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}


# Clean container directory
#if [ $exit_status == '0' -a $sync_es == '0' ]; then
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
#else
#  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
#fi
EOF
          else
            coda +=<<-EOF
##{exec_cmd} system clean
if [ $exit_status == '0' -a $sync_es == '0' ]; then
  rm -Rfv #{contain} &>> #{fsync}
else
  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
fi
EOF

          end
        end
      end

      coda +=<<-EOF

# Write exit status to file
echo $exit_status > #{fexit}
EOF

      if sync
        coda +=<<-EOF
if [ "$sync_es" == '0' ]; then
  unset sync_es
  exit $exit_status
else
  exit $sync_es
fi
EOF
      else
        coda +=<<-EOF
exit $exit_status
EOF
      end

      template = [header, env, prep, run, coda] * "\n"

      template
    end

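To inspect the sbatch script that template assembles without submitting anything, it can be called directly with the command array and a few of the options it recognizes (a usage sketch, not part of the patch; values are hypothetical). Note that the call creates the slurm_basedir that later holds command.slurm, std.out, std.err and the exit/sync status files:

require 'rbbt-util'
require 'rbbt/hpc/slurm'

args = %w(workflow task ExampleWorkflow example_task --printpath)
script = HPC::SLURM.template(args,
                             :queue => 'debug', :task_cpus => 4, :time => '2:00:00',
                             :slurm_basedir => '/tmp/slurm-example')
puts script  # the header (#SBATCH lines), env, prep, run and coda sections joined together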
    def self.issue_template(template, options = {})

      slurm_basedir = options[:slurm_basedir]
      dependencies = options.delete :slurm_dependencies
      dependencies = [] if dependencies.nil?

      canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
      dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }

      Open.mkdir slurm_basedir

      dry_run = options.delete :dry_run

      fout   = File.join(slurm_basedir, 'std.out')
      ferr   = File.join(slurm_basedir, 'std.err')
      fjob   = File.join(slurm_basedir, 'job.id')
      fdep   = File.join(slurm_basedir, 'dependencies.list')
      fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
      fexit  = File.join(slurm_basedir, 'exit.status')
      fsync  = File.join(slurm_basedir, 'sync.log')
      fcmd   = File.join(slurm_basedir, 'command.slurm')

      job = nil
      if options[:clean_job]
        [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
          Open.rm file if Open.exists? file
        end
      end

      return if Open.exists?(fexit)

      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
      STDERR.puts template

      Open.write(fcmd, template) unless File.exists? fcmd
      if File.exists?(fjob)
        job = Open.read(fjob).to_i
      else
        if File.exists?(fout)
          return
        elsif dry_run
          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
          raise HPC::SBATCH, slurm_basedir
        else
          Open.rm fsync
          Open.rm fexit
          Open.rm fout
          Open.rm ferr

          Open.write(fdep, dependencies * "\n") if dependencies.any?
          Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?


          dep_str = '--dependency='
          normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
          canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

          if normal_dep_str.nil? && canfail_dep_str.nil?
            dep_str = ""
          else
            dep_str += [normal_dep_str, canfail_dep_str].compact * ","
          end

          job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
          Log.debug "SBATCH job id: #{job}"
          Open.write(fjob, job.to_s)
          job
        end
      end
    end

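Dependencies reach issue_template through the :slurm_dependencies option as a flat list of job ids; ids prefixed with "canfail:" (the form orchestrate_job emits for dependencies the job is allowed to fail) become afterany: constraints and the rest afterok:. A hypothetical continuation of the sketch above, with made-up job ids:

# Jobs 1001 and 1002 must succeed, 1003 may fail
deps = %w(1001 1002 canfail:1003)
HPC::SLURM.issue_template(script, :slurm_basedir => '/tmp/slurm-example',
                                  :slurm_dependencies => deps)
# Submits with something like:
#   sbatch --dependency=afterok:1001:1002,afterany:1003 '/tmp/slurm-example/command.slurm'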
    def self.follow_job(slurm_basedir, tail = true)
      fjob = File.join(slurm_basedir, 'job.id')
      fout = File.join(slurm_basedir, 'std.out')
      ferr = File.join(slurm_basedir, 'std.err')
      fstatus = File.join(slurm_basedir, 'job.status')

      job = Open.read(fjob).strip if Open.exists?(fjob)

      if job
        status_txt = CMD.cmd("squeue --job #{job}").read
        STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
        STDERR.puts status_txt
        lines = status_txt.split("\n").length
      end

      if tail
        Log.severity = 10
        while ! File.exists? fout
          if job
            STDERR.puts
            Log.clear_line(STDERR)
            STDERR.write Log.color(:magenta, "Waiting for Output")
            3.times do
              STDERR.write Log.color(:magenta, ".")
              sleep 1
            end
            status_txt = CMD.cmd("squeue --job #{job}").read
            lines.times do
              Log.clear_line(STDERR)
            end
            Log.clear_line(STDERR)
            STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
            STDERR.puts status_txt
            lines = status_txt.split("\n").length
          end
        end
        STDERR.puts
        Log.clear_line(STDERR)
        STDERR.puts Log.color(:magenta, "Output:")
        begin
          CMD.cmd("squeue --job #{job} > #{fstatus}")
          out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
          err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)

          terr = Misc.consume_stream(err, true, STDERR) if err
          tout = Misc.consume_stream(out, true, STDOUT) if out

          sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
        rescue Aborted
        ensure
          begin
            terr.exit if terr
            tout.exit if tout
            err.close if err
            err.join if err
          rescue Exception
          end

          begin
            out.close if out
            out.join if out
          rescue Exception
          end
        end
      end
    end

    def self.wait_for_job(slurm_basedir, time = 1)
      fexit = File.join(slurm_basedir, 'exit.status')
      fjob = File.join(slurm_basedir, 'job.id')
      job = Open.read(fjob) if Open.exists?(fjob)


      while ! Open.exists?(fexit)
        sleep time
      end
    end

    def self.run_job(job, options = {})
      options = IndiferentHash.setup(options.dup)

      dry_run = options.delete :dry_run
      tail = options.delete :tail
      dependencies = options.delete :slurm_dependencies
      procpath = options.delete :SLURM_procpath
      options[:jobname] = job.clean_name

      workflow = job.workflow

      task = Symbol === job.overriden ? job.overriden : job.task_name

      if job.overriden
        override_deps = job.rec_dependencies.
          select{|dep| Symbol === dep.overriden }.
          collect do |dep|

          name = [dep.workflow.to_s, dep.task_name] * "#"
          [name, dep.path] * "="
        end * ","
      end

      remove_slurm_basedir = options.delete :remove_slurm_basedir
      slurm_basedir = options.delete :SLURM_basedir
      slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
      TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
        options[:slurm_basedir] ||= tmp_directory
        slurm_basedir = options[:slurm_basedir]
        inputs_dir = File.join(tmp_directory, 'inputs_dir')
        saved = Step.save_job_inputs(job, inputs_dir)

        if saved && saved.any?
          options[:inputs_dir] = inputs_dir
          cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--load_inputs', inputs_dir, '--log', (options[:log] || Log.severity).to_s]
        else
          cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', (options[:log] || Log.severity).to_s]
        end

        cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?

        cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath

        template = self.template(cmd, options)
        jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))

        return jobid unless tail

        t_monitor = Thread.new do
          self.follow_job(slurm_basedir, :STDERR)
        end
        self.wait_for_job(slurm_basedir)
        t_monitor.raise Aborted
        return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
        path = Open.read(File.join(slurm_basedir, 'std.out')).strip
        if Open.exists?(path) && job.path != path
          Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
          Open.ln path, job.path
          Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
          Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
        end
        jobid
      end
    end
  end

  def self.relay(job, options={})
    options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
    done_deps = job.dependencies.select do |dep|
      dep.done?
    end

    error_deps = job.dependencies.select do |dep|
      dep.error? && ! dep.recoverable_error?
    end

    (done_deps + error_deps).each do |dep|
      Step.migrate(dep.path, options[:search_path], options)
    end

  end
end
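Putting the pieces together, run_job saves the job inputs, renders the template and submits it, returning the SLURM job id (or, with :tail, waiting for the job and linking the produced path back to the local job). A minimal end-to-end sketch with hypothetical workflow and task names; with :dry_run it stops after writing command.slurm and raises HPC::SBATCH carrying the staging directory:

require 'rbbt-util'
require 'rbbt/workflow'
require 'rbbt/hpc/slurm'

Workflow.require_workflow "ExampleWorkflow"   # hypothetical workflow
job = ExampleWorkflow.job(:example_task, nil, :some_input => "value")

begin
  jobid = HPC::SLURM.run_job(job, :queue => 'debug', :task_cpus => 4,
                             :time => '2:00:00', :dry_run => true)
rescue HPC::SBATCH => exception
  puts "Prepared but not submitted: #{exception.directory}/command.slurm"
end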