rbbt-util 5.30.12 → 5.31.3
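
This release reworks the HPC support around a batch-system-agnostic layer. The orchestration helpers move out of HPC::SLURM into a reusable HPC::Orchestration mixin, script generation is delegated to the shared HPC::TemplateGeneration (from rbbt/hpc/batch), a new LSF backend appears, and SLURM-specific names give way to batch-neutral ones (:slurm_dependencies becomes :batch_dependencies, the slurm_basedir becomes a batch_dir, command.slurm becomes command.batch). The three files touched below appear to be lib/rbbt/hpc/lsf.rb (new), lib/rbbt/hpc/orchestrate.rb, and lib/rbbt/hpc/slurm.rb.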

@@ -0,0 +1,119 @@
+ require 'rbbt/hpc/batch'
+
+ module HPC
+   module LSF
+     extend HPC::TemplateGeneration
+     extend HPC::Orchestration
+
+     def self.batch_system_variables
+       <<-EOF
+ MAX_MEMORY=$LSB_MAX_MEM_RUSAGE || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+ BATCH_JOB_ID=$LSF_JOBID
+ BATCH_SYSTEM=LSF
+       EOF
+     end
+
+     def self.header(options = {})
+       options = options.dup
+
+       queue = Misc.process_options options, :queue
+       task_cpus = Misc.process_options options, :task_cpus
+       time = Misc.process_options options, :time
+       nodes = Misc.process_options options, :nodes
+       workdir = Misc.process_options options, :workdir
+       exclusive = Misc.process_options options, :exclusive
+
+       batch_dir = Misc.process_options options, :batch_dir
+       batch_name = Misc.process_options options, :batch_name
+       batch_name ||= File.basename(batch_dir)
+
+       fout = File.join(batch_dir, 'std.out')
+       ferr = File.join(batch_dir, 'std.err')
+
+       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+       time = time.split(":").values_at(0, 1) * ":"
+
+       header =<<-EOF
+ #!/bin/bash
+ #BSUB -J "#{batch_name}"
+ #BSUB -cwd "#{workdir}"
+ #BSUB -oo "#{fout}"
+ #BSUB -eo "#{ferr}"
+ #BSUB -q "#{queue}"
+ #BSUB -n "#{task_cpus}"
+ #BSUB -W "#{time}"
+       EOF
+
+       header << "#BSUB -x" << "\n" if exclusive
+
+       header
+     end
+
+     def self.run_template(batch_dir, dry_run)
+
+       fout = File.join(batch_dir, 'std.out')
+       ferr = File.join(batch_dir, 'std.err')
+       fjob = File.join(batch_dir, 'job.id')
+       fdep = File.join(batch_dir, 'dependencies.list')
+       fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+       fexit = File.join(batch_dir, 'exit.status')
+       fsync = File.join(batch_dir, 'sync.log')
+       fcmd = File.join(batch_dir, 'command.batch')
+
+       return if Open.exists?(fexit)
+
+       STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
+       STDERR.puts Open.read(fcmd)
+
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
+       else
+
+         dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+         canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
+
+         normal_dep_list = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
+         canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "done(#{d})"} : []
+
+         dep_list = normal_dep_list + canfail_dep_list
+
+         if dep_list.any?
+           dep_str = '-w "' + dep_list * " && " + '"'
+         else
+           dep_str = ""
+         end
+
+         cmd = "bsub #{dep_str} < '#{fcmd}'"
+
+         if File.exists?(fout)
+           return
+         elsif dry_run
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt lsf tail '#{batch_dir}'")
+           raise HPC::SBATCH, batch_dir
+         else
+           Open.rm fsync
+           Open.rm fexit
+           Open.rm fout
+           Open.rm ferr
+
+
+           job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
+           Log.debug "BSUB job id: #{job}"
+           Open.write(fjob, job.to_s)
+           job
+         end
+       end
+     end
+
+     def self.job_status(job = nil)
+       if job.nil?
+         CMD.cmd("bjobs -w").read
+       else
+         CMD.cmd("bjobs -w #{job}").read
+       end
+     end
+   end
+ end
+
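
For orientation, a minimal sketch of what the header generation above produces for a small job. This is illustrative commentary, not part of the diff: the option values are hypothetical, and it assumes Misc.process_options extracts each key from the options hash as it does elsewhere in rbbt-util.

    require 'rbbt/hpc/lsf'

    # Hypothetical values; batch_name falls back to File.basename(batch_dir)
    puts HPC::LSF.header(:queue     => 'normal',
                         :task_cpus => 4,
                         :time      => '2:00:00',  # trimmed to HH:MM ("2:00") for -W
                         :workdir   => '/home/user/work',
                         :batch_dir => '/home/user/rbbt-batch/example',
                         :exclusive => true)
    # Expected output, roughly:
    #   #!/bin/bash
    #   #BSUB -J "example"
    #   #BSUB -cwd "/home/user/work"
    #   #BSUB -oo "/home/user/rbbt-batch/example/std.out"
    #   #BSUB -eo "/home/user/rbbt-batch/example/std.err"
    #   #BSUB -q "normal"
    #   #BSUB -n "4"
    #   #BSUB -W "2:00"
    #   #BSUB -x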
@@ -1,8 +1,8 @@
  require 'rbbt/workflow/util/orchestrator'
  module HPC
-   module SLURM
+   module Orchestration

-     def self.job_rules(rules, job)
+     def job_rules(rules, job)
        workflow = job.workflow.to_s
        task_name = job.task_name.to_s
        task_name = job.overriden.to_s if Symbol === job.overriden
@@ -53,18 +53,18 @@ module HPC
        job_rules
      end

-     def self.get_job_dependencies(job, job_rules = nil)
+     def get_job_dependencies(job, job_rules = nil)
        deps = job.dependencies || []
        deps += job.input_dependencies || []
        deps
      end

-     def self.get_recursive_job_dependencies(job)
+     def get_recursive_job_dependencies(job)
        deps = get_job_dependencies(job)
        (deps + deps.collect{|dep| get_recursive_job_dependencies(dep) }).flatten
      end

-     def self.piggyback(job, job_rules, job_deps)
+     def piggyback(job, job_rules, job_deps)
        return false unless job_rules["skip"]
        final_deps = job_deps - job_deps.collect{|dep| get_recursive_job_dependencies(dep)}.flatten.uniq
        final_deps = final_deps.reject{|dep| dep.done? }
@@ -72,7 +72,7 @@ module HPC
        return false
      end

-     def self.get_chains(job, rules, chains = {})
+     def get_chains(job, rules, chains = {})
        job_rules = self.job_rules(rules, job)
        job_deps = get_job_dependencies(job)

@@ -102,7 +102,7 @@ module HPC
        chains
      end

-     def self.workload(job, rules, chains, options, seen = nil)
+     def workload(job, rules, chains, options, seen = nil)
        return [] if job.done?
        if seen.nil?
          seen = {}
@@ -145,7 +145,7 @@ module HPC
        job_rules.delete :workflow


-       job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
+       job_options = IndiferentHash.setup(options.merge(job_rules).merge(:batch_dependencies => dep_ids))
        job_options.delete :orchestration_rules

        config_keys = job_rules.delete(:config_keys)
@@ -182,7 +182,7 @@ module HPC

        if options[:dry_run]
          puts Log.color(:magenta, "Manifest: ") + Log.color(:blue, job_options[:manifest] * ", ") + " - tasks: #{job_options[:task_cpus] || 1} - time: #{job_options[:time]} - config: #{job_options[:config_keys]}"
-         puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:slurm_dependencies]*", ")
+         puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:batch_dependencies]*", ")
          job_options[:manifest].first
        else
          run_job(job, job_options)
@@ -190,13 +190,14 @@ module HPC
      end


-     def self.orchestrate_job(job, options)
+     def orchestrate_job(job, options)
        options.delete "recursive_clean"
        options.delete "clean_task"
        options.delete "clean"
        options.delete "tail"
-       options.delete "printfile"
+       options.delete "printpath"
        options.delete "detach"
+       options.delete "jobname"

        rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
        rules ||= {}
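
The point of the renames above is that job_rules, get_job_dependencies, get_recursive_job_dependencies, piggyback, get_chains, workload and orchestrate_job stop being SLURM class methods and become a mixin shared by every batch backend, which pulls them in with extend (as the new lsf.rb and the rewritten slurm.rb do). A minimal sketch of a hypothetical third backend following the same pattern (PBS here is made up, for illustration only):

    require 'rbbt/hpc/batch'

    module HPC
      module PBS # hypothetical backend, not part of this release
        extend HPC::TemplateGeneration
        extend HPC::Orchestration

        # A backend only supplies its system-specific pieces, e.g. job_status
        def self.job_status(job = nil)
          job.nil? ? CMD.cmd("qstat").read : CMD.cmd("qstat #{job}").read
        end
      end
    end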
@@ -1,454 +1,101 @@
- module HPC
-   class SBATCH < Exception;
-     attr_accessor :directory
-     def initialize(directory)
-       @directory = directory
-     end
-   end
-
-   module SLURM
-
-     def self.template(args, options = {})
-
-       development = options.delete :drbbt
-       singularity = options.delete :singularity
-       contain = options.delete :contain
-       sync = options.delete :sync
-       user_group = options.delete :user_group
-       contain_and_sync = options.delete :contain_and_sync
-       wipe_container = options.delete :wipe_container
-       copy_image = options.delete :copy_image
-       exclusive = options.delete :exclusive
-       highmem = options.delete :highmem
-
-       slurm_step_path = options.delete :slurm_step_path
-
-       manifest = options.delete :manifest
-
-       queue = options.delete(:queue) || Rbbt::Config.get('queue', :slurm_queue, :slurm, :SLURM, :default => 'bsc_ls')
-       task_cpus = options.delete(:task_cpus) || 1
-       nodes = options.delete(:nodes) || 1
-       time = options.delete(:time) || "0:02:00"
-
-       inputs_dir = options.delete :inputs_dir
-       config_keys = options.delete :config_keys
+ require 'rbbt/hpc/batch'
+ require 'rbbt/hpc/orchestrate'

-       user = ENV['USER'] || `whoami`.strip
-       group = File.basename(File.dirname(ENV['HOME']))
-
-       if contain_and_sync
-         random_file = TmpFile.random_name
-         contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
-         sync = "~/.rbbt/var/jobs" if sync.nil?
-         wipe_container = "post" if wipe_container.nil?
-       end
-
-       contain = nil if contain == "" || contain == "none"
-       sync = nil if sync == "" || sync == "none"
+ module HPC
+   module SLURM
+     extend HPC::TemplateGeneration
+     extend HPC::Orchestration

-       contain = File.expand_path(contain) if contain
+     def self.batch_system_variables
+       <<-EOF
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+ BATCH_JOB_ID=$SLURM_JOB_ID
+ BATCH_SYSTEM=SLURM
+       EOF
+     end

-       name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
-       options.delete(:name)
-       slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
-       options.delete(:slurm_basedir)
+     def self.header(options = {})
+       options = options.dup

-       rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
+       queue = Misc.process_options options, :queue
+       task_cpus = Misc.process_options options, :task_cpus
+       time = Misc.process_options options, :time
+       nodes = Misc.process_options options, :nodes
+       workdir = Misc.process_options options, :workdir
+       exclusive = Misc.process_options options, :exclusive

-       rbbt_cmd += " " << options.collect do |o,v|
-         o = o.to_s
-         case v
-         when TrueClass
-           '--' << o
-         when FalseClass
-           '--' << o << "=false"
-         else
-           ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
-         end
-       end * " "
+       batch_dir = Misc.process_options options, :batch_dir
+       batch_name = Misc.process_options options, :batch_name

-       rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
+       fout = File.join(batch_dir, 'std.out')
+       ferr = File.join(batch_dir, 'std.err')

        time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"

-
-       #{{{ PREPARE LOCAL LOGFILES
-
-       Open.mkdir slurm_basedir
-
-       fout = File.join(slurm_basedir, 'std.out')
-       ferr = File.join(slurm_basedir, 'std.err')
-       fjob = File.join(slurm_basedir, 'job.id')
-       fexit = File.join(slurm_basedir, 'exit.status')
-       fsync = File.join(slurm_basedir, 'sync.log')
-       fsyncexit = File.join(slurm_basedir, 'sync.status')
-       fcmd = File.join(slurm_basedir, 'command.slurm')
-
-       #{{{ GENERATE TEMPLATE
-
-       # HEADER
        header =<<-EOF
  #!/bin/bash
- #SBATCH --qos="#{queue}"
- #SBATCH --job-name="#{name}"
- #SBATCH --workdir="#{Dir.pwd}"
+ #SBATCH --job-name="#{batch_name}"
+ #SBATCH --workdir="#{workdir}"
  #SBATCH --output="#{fout}"
  #SBATCH --error="#{ferr}"
+ #SBATCH --qos="#{queue}"
  #SBATCH --cpus-per-task="#{task_cpus}"
  #SBATCH --time="#{time}"
  #SBATCH --nodes="#{nodes}"
        EOF

-       prep = ""
-
-       if highmem
-         header +=<<-EOF
- #SBATCH --constraint=highmem
-         EOF
-       end
-
-       if exclusive
-         header +=<<-EOF
- #SBATCH --exclusive
-         EOF
-       end
-
-       # ENV
-       env = ""
-       env +=<<-EOF
- # Prepare env
- [[ -f ~/config/load.sh ]] && source ~/config/load.sh
- module load java
-
- # Calculate max available memory
- let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
-       EOF
-
-
-       # RUN
-       run = ""
-       exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
-
-
-       if singularity
-         #{{{ SINGULARITY
-
-         singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
-
-         env +=<<-EOF
- module load intel/2018.1
- module load singularity
- PROJECTS_ROOT="/gpfs/projects/bsc26/"
- SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
- SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
- SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
- mkdir -p "$SINGULARITY_RUBY_INLINE"
-         EOF
-
-         if contain
-           scratch_group_dir = File.join('/gpfs/scratch/', group)
-           projects_group_dir = File.join('/gpfs/projects/', group)
+       header << "#SBATCH --exclusive" << "\n" if exclusive

-           prep +=<<-EOF
-
- # Prepare container dir
- CONTAINER_DIR="#{contain}"
- mkdir -p $CONTAINER_DIR/.rbbt/etc/
-
- for dir in .ruby_inline git home; do
-   mkdir -p $CONTAINER_DIR/$dir
- done
-
- for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
-   mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
- done
-
- # Copy environment
- cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
-
- # Set search_paths
- echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
- echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-           EOF
-
-           if user_group && group != user_group
-             prep +=<<-EOF
-
- # Add user_group search_path
- echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-             EOF
-           end
-
-           if inputs_dir
-             prep +=<<-EOF
-
- # Copy inputs
- [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
-             EOF
-             rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
-           end
-
-           if copy_image
-             prep +=<<EOF
-
- # Copy image
- rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
- SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
- EOF
-           end
+       header
+     end

-           if wipe_container == "pre" || wipe_container == "both"
-             if singularity
-               prep +=<<-EOF
+     def self.run_template(batch_dir, dry_run)

- # Clean container pre
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-               EOF
-             else
-               prep = ""
-             end
-           end
-         end
+       fout = File.join(batch_dir, 'std.out')
+       ferr = File.join(batch_dir, 'std.err')
+       fjob = File.join(batch_dir, 'job.id')
+       fdep = File.join(batch_dir, 'dependencies.list')
+       fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+       fexit = File.join(batch_dir, 'exit.status')
+       fsync = File.join(batch_dir, 'sync.log')
+       fcmd = File.join(batch_dir, 'command.batch')

-         if contain
-           singularity_exec << %( -C -H "$CONTAINER_DIR" \
- -B /scratch/tmp \
- #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
- -B #{scratch_group_dir} \
- -B #{projects_group_dir} \
- -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
- -B ~/git:"$CONTAINER_DIR/git":ro \
- #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
- -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
- "$SINGULARITY_IMG")
-           exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
-         else
-           singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
-         end
+       return if Open.exists?(fexit)

-         if development
-           exec_cmd += " rbbt --dev='#{development}'"
-         else
-           exec_cmd += ' rbbt'
-         end
+       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+       STDERR.puts Open.read(fcmd)

-         exec_cmd = singularity_exec + " " + exec_cmd
+       if File.exists?(fjob)
+         job = Open.read(fjob).to_i
        else
-         if development
-           exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
-         else
-           exec_cmd << " " << 'rbbt'
-         end
-
-         if contain
-           rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
-         end
-       end
-
-
-       cmd =<<-EOF
- #{exec_cmd} \\
- #{rbbt_cmd}
-       EOF
-       annotate_cmd =<<-EOF
- #{exec_cmd} \\
- workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
-       EOF
-
-       header +=<<-EOF if manifest
- #MANIFEST: #{manifest * ", "}
-       EOF
-
-       header +=<<-EOF if slurm_step_path
- #STEP_PATH: #{slurm_step_path}
-       EOF
-
-       header +=<<-EOF
- #CMD: #{rbbt_cmd}
-       EOF
-
-       run +=<<-EOF
-
- # Run command
- step_path=$(#{cmd})
-
- # Save exit status
- exit_status=$?
-
- # Annotate info with SLURM job_info
- #{annotate_cmd}

-       EOF
+         dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+         canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep

-       # CODA
-       coda = ""
-       if sync
-         if singularity
-           coda +=<<-EOF
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
-           EOF
-         # else
-         # coda +=<<-EOF
-         #rbbt system clean all -q &>> #{fsync}
-         #EOF
-         end
+         normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
+         canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

-         if sync.include?("=>")
-           source, _sep, sync = sync.partition("=>")
-           source = source.strip
-           sync = sync.strip
-           source = File.join(File.expand_path(contain), source)
+         if normal_dep_str.nil? && canfail_dep_str.nil?
+           dep_str = ""
          else
-           source = File.join(File.expand_path(contain), 'workdir/var/jobs')
-         end
-
-         target = File.expand_path(sync)
-         coda +=<<-EOF
-
- # Sync data to target location
- if [ $exit_status == '0' ]; then
-   mkdir -p "$(dirname '#{target}')"
-   rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
-   sync_es="$?"
-   echo $sync_es > #{fsyncexit}
-   find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
- else
-   sync_es="$exit_status"
- fi
-         EOF
-
-         if contain && (wipe_container == "post" || wipe_container == "both")
-           prep =<<-EOF + prep
- if ls -A '#{contain}' &> /dev/null ; then
-   echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
- fi
-           EOF
-           if singularity
-             coda +=<<-EOF
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
-
-
- # Clean container directory
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
- #else
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
- #fi
-             EOF
-           else
-             coda +=<<-EOF
- ##{exec_cmd} system clean
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
- rm -Rfv #{contain} &>> #{fsync}
- #else
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
- #fi
-             EOF
-
-           end
+           dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
          end
-       end
-
-       coda +=<<-EOF
-
- # Write exit status to file
- echo $exit_status > #{fexit}
-       EOF
-
-       if sync
-         coda +=<<-EOF
- if [ "$sync_es" == '0' ]; then
-   unset sync_es
-   exit $exit_status
- else
-   exit $sync_es
- fi
-         EOF
-       else
-         coda +=<<-EOF
- exit $exit_status
-         EOF
-       end
-
-       template = [header, env, prep, run, coda] * "\n"
-
-       template
-     end
-
-     def self.issue_template(template, options = {})
-
-       slurm_basedir = options[:slurm_basedir]
-       dependencies = options.delete :slurm_dependencies
-       dependencies = [] if dependencies.nil?
-
-       canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
-       dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
-
-       Open.mkdir slurm_basedir

-       dry_run = options.delete :dry_run
+         cmd = "sbatch #{dep_str} '#{fcmd}'"

-       fout = File.join(slurm_basedir, 'std.out')
-       ferr = File.join(slurm_basedir, 'std.err')
-       fjob = File.join(slurm_basedir, 'job.id')
-       fdep = File.join(slurm_basedir, 'dependencies.list')
-       fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
-       fexit = File.join(slurm_basedir, 'exit.status')
-       fsync = File.join(slurm_basedir, 'sync.log')
-       fcmd = File.join(slurm_basedir, 'command.slurm')
-
-       job = nil
-       if options[:clean_job]
-         [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
-           Open.rm file if Open.exists? file
-         end
-       end
-
-       return if Open.exists?(fexit)
-
-       STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
-       STDERR.puts template
-
-       Open.write(fcmd, template) unless File.exists? fcmd
-       if File.exists?(fjob)
-         job = Open.read(fjob).to_i
-       else
          if File.exists?(fout)
            return
          elsif dry_run
-           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
-           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
-           raise HPC::SBATCH, slurm_basedir
+           STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
+           STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt slurm tail '#{batch_dir}'")
+           raise HPC::SBATCH, batch_dir
          else
            Open.rm fsync
            Open.rm fexit
            Open.rm fout
            Open.rm ferr

-           Open.write(fdep, dependencies * "\n") if dependencies.any?
-           Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
-
-
-           dep_str = '--dependency='
-           normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
-           canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
-
-           if normal_dep_str.nil? && canfail_dep_str.nil?
-             dep_str = ""
-           else
-             dep_str += [normal_dep_str, canfail_dep_str].compact * ","
-           end
-
-           job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+           job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
            Log.debug "SBATCH job id: #{job}"
            Open.write(fjob, job.to_s)
            job
@@ -456,165 +103,13 @@ EOF
        end
      end

-     def self.follow_job(slurm_basedir, tail = true)
-       fjob = File.join(slurm_basedir, 'job.id')
-       fout = File.join(slurm_basedir, 'std.out')
-       ferr = File.join(slurm_basedir, 'std.err')
-       fstatus = File.join(slurm_basedir, 'job.status')
-
-       job = Open.read(fjob).strip if Open.exists?(fjob)
-
-       if job
-         status_txt = CMD.cmd("squeue --job #{job}").read
-         STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-         STDERR.puts status_txt
-         lines = status_txt.split("\n").length
-       end
-
-       if tail
-         Log.severity = 10
-         while ! File.exists? fout
-           if job
-             STDERR.puts
-             Log.clear_line(STDERR)
-             STDERR.write Log.color(:magenta, "Waiting for Output")
-             3.times do
-               STDERR.write Log.color(:magenta, ".")
-               sleep 1
-             end
-             status_txt = CMD.cmd("squeue --job #{job}").read
-             lines.times do
-               Log.clear_line(STDERR)
-             end
-             Log.clear_line(STDERR)
-             STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-             STDERR.puts status_txt
-             lines = status_txt.split("\n").length
-           end
-         end
-         STDERR.puts
-         Log.clear_line(STDERR)
-         STDERR.puts Log.color(:magenta, "Output:")
-         begin
-           CMD.cmd("squeue --job #{job} > #{fstatus}")
-           out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
-           err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
-
-           terr = Misc.consume_stream(err, true, STDERR) if err
-           tout = Misc.consume_stream(out, true, STDOUT) if out
-
-           sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
-         rescue Aborted
-         ensure
-           begin
-             terr.exit if terr
-             tout.exit if tout
-             err.close if err
-             err.join if err
-           rescue Exception
-           end
-
-           begin
-             out.close if out
-             out.join if out
-           rescue Exception
-           end
-         end
-       end
-     end
-
-     def self.wait_for_job(slurm_basedir, time = 1)
-       fexit = File.join(slurm_basedir, 'exit.status')
-       fjob = File.join(slurm_basedir, 'job.id')
-       job = Open.read(fjob) if Open.exists?(fjob)
-
-
-       while ! Open.exists?(fexit)
-         sleep time
-       end
-     end
-
-     def self.run_job(job, options = {})
-       options = IndiferentHash.setup(options.dup)
-
-       dry_run = options.delete :dry_run
-       tail = options.delete :tail
-       dependencies = options.delete :slurm_dependencies
-       procpath = options.delete :SLURM_procpath
-
-       options[:jobname] = job.clean_name
-       options[:slurm_step_path] = job.path
-
-       log_level = options.delete :log
-       log_level ||= Log.severity
-
-       workflow = job.workflow
-
-       task = Symbol === job.overriden ? job.overriden : job.task_name
-
-       if job.overriden
-         override_deps = job.rec_dependencies.
-           select{|dep| Symbol === dep.overriden }.
-           collect do |dep|
-
-           name = [dep.workflow.to_s, dep.task_name] * "#"
-           [name, dep.path] * "="
-         end * ","
-       end
-
-       remove_slurm_basedir = options.delete :remove_slurm_basedir
-       slurm_basedir = options.delete :SLURM_basedir
-       slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
-       TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
-         options[:slurm_basedir] ||= tmp_directory
-         slurm_basedir = options[:slurm_basedir]
-         inputs_dir = File.join(tmp_directory, 'inputs_dir')
-         saved = Step.save_job_inputs(job, inputs_dir)
-
-         cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', log_level.to_s]
-
-         cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
-
-         cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
-
-         cmd << "--load_inputs='#{inputs_dir}'" if saved && saved.any?
-
-         template = self.template(cmd, options)
-         jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
-
-         return jobid unless tail
-
-         t_monitor = Thread.new do
-           self.follow_job(slurm_basedir, :STDERR)
-         end
-         self.wait_for_job(slurm_basedir)
-         t_monitor.raise Aborted
-         return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
-         path = Open.read(File.join(slurm_basedir, 'std.out')).strip
-         if Open.exists?(path) && job.path != path
-           Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
-           Open.ln path, job.path
-           Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
-           Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
-         end
-         jobid
+     def self.job_status(job = nil)
+       if job.nil?
+         CMD.cmd("squeue").read
+       else
+         CMD.cmd("squeue --job #{job}").read
        end
      end
-   end
-
-   def self.relay(job, options={})
-     options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
-     done_deps = job.dependencies.select do |dep|
-       dep.done?
-     end
-
-     error_deps = job.dependencies.select do |dep|
-       dep.error? && ! dep.recoverable_error?
-     end
-
-     (done_deps + error_deps).each do |dep|
-       Step.migrate(dep.path, options[:search_path], options)
-     end

    end
  end
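
The dependency wiring in run_template above can be traced by hand. A worked example in plain Ruby (no rbbt required; the job ids are made up): ids read from dependencies.list must succeed (afterok), while ids from canfail_dependencies.list only need to finish (afterany).

    dependencies         = %w(1001 1002)  # from dependencies.list
    canfail_dependencies = %w(1003)       # from canfail_dependencies.list

    normal_dep_str  = dependencies.any? ? "afterok:" + dependencies * ":" : nil
    canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

    dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
    # => "--dependency=afterok:1001:1002,afterany:1003"

    # The LSF backend expresses the same conditions as a bsub -w expression:
    #   -w "post_done(1001) && post_done(1002) && done(1003)"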