rbbt-util 5.30.13 → 5.31.0

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,119 @@
+require 'rbbt/hpc/batch'
+
+module HPC
+  module LSF
+    extend HPC::TemplateGeneration
+    extend HPC::Orchestration
+
+    def self.batch_system_variables
+      <<-EOF
+MAX_MEMORY=$LSB_MAX_MEM_RUSAGE || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+BATCH_JOB_ID=$LSF_JOBID
+BATCH_SYSTEM=LSF
+      EOF
+    end
+
+    def self.header(options = {})
+      options = options.dup
+
+      queue = Misc.process_options options, :queue
+      task_cpus = Misc.process_options options, :task_cpus
+      time = Misc.process_options options, :time
+      nodes = Misc.process_options options, :nodes
+      workdir = Misc.process_options options, :workdir
+      exclusive = Misc.process_options options, :exclusive
+
+      batch_dir = Misc.process_options options, :batch_dir
+      batch_name = Misc.process_options options, :batch_name
+      batch_name ||= File.basename(batch_dir)
+
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')
+
+      time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
+
+      time = time.split(":").values_at(0, 1) * ":"
+
+      header =<<-EOF
+#!/bin/bash
+#BSUB -J "#{batch_name}"
+#BSUB -cwd "#{workdir}"
+#BSUB -oo "#{fout}"
+#BSUB -eo "#{ferr}"
+#BSUB -q "#{queue}"
+#BSUB -n "#{task_cpus}"
+#BSUB -W "#{time}"
+      EOF
+
+      header << "#BSUB -x" << "\n" if exclusive
+
+      header
+    end
+
+    def self.run_template(batch_dir, dry_run)
+
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')
+      fjob = File.join(batch_dir, 'job.id')
+      fdep = File.join(batch_dir, 'dependencies.list')
+      fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+      fexit = File.join(batch_dir, 'exit.status')
+      fsync = File.join(batch_dir, 'sync.log')
+      fcmd = File.join(batch_dir, 'command.batch')
+
+      return if Open.exists?(fexit)
+
+      STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
+      STDERR.puts Open.read(fcmd)
+
+      if File.exists?(fjob)
+        job = Open.read(fjob).to_i
+      else
+
+        dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+        canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
+
+        normal_dep_list = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
+        canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "done(#{d})"} : []
+
+        dep_list = normal_dep_list + canfail_dep_list
+
+        if dep_list.any?
+          dep_str = '-w "' + dep_list * " && " + '"'
+        else
+          dep_str = ""
+        end
+
+        cmd = "bsub #{dep_str} < '#{fcmd}'"
+
+        if File.exists?(fout)
+          return
+        elsif dry_run
+          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
+          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt lsf tail '#{batch_dir}'")
+          raise HPC::SBATCH, batch_dir
+        else
+          Open.rm fsync
+          Open.rm fexit
+          Open.rm fout
+          Open.rm ferr
+
+
+          job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
+          Log.debug "BSUB job id: #{job}"
+          Open.write(fjob, job.to_s)
+          job
+        end
+      end
+    end
+
+    def self.job_status(job = nil)
+      if job.nil?
+        CMD.cmd("bjobs -w").read
+      else
+        CMD.cmd("bjobs -w #{job}").read
+      end
+    end
+  end
+end
+
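
Note: the new LSF backend above (apparently lib/rbbt/hpc/lsf.rb) builds its bsub dependency expression by joining `post_done(ID)` terms for the hard dependencies in dependencies.list with `done(ID)` terms for the can-fail ones in canfail_dependencies.list. As a sketch with hypothetical job IDs 1001 and 1002 (hard) and 1003 (can-fail), and BATCH_DIR standing in for the actual batch directory, the issued submission would be:

    bsub -w "post_done(1001) && post_done(1002) && done(1003)" < 'BATCH_DIR/command.batch'
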
@@ -1,8 +1,8 @@
 require 'rbbt/workflow/util/orchestrator'
 module HPC
-  module SLURM
+  module Orchestration
 
-    def self.job_rules(rules, job)
+    def job_rules(rules, job)
       workflow = job.workflow.to_s
       task_name = job.task_name.to_s
       task_name = job.overriden.to_s if Symbol === job.overriden
@@ -53,18 +53,18 @@ module HPC
       job_rules
     end
 
-    def self.get_job_dependencies(job, job_rules = nil)
+    def get_job_dependencies(job, job_rules = nil)
       deps = job.dependencies || []
       deps += job.input_dependencies || []
       deps
     end
 
-    def self.get_recursive_job_dependencies(job)
+    def get_recursive_job_dependencies(job)
       deps = get_job_dependencies(job)
       (deps + deps.collect{|dep| get_recursive_job_dependencies(dep) }).flatten
     end
 
-    def self.piggyback(job, job_rules, job_deps)
+    def piggyback(job, job_rules, job_deps)
       return false unless job_rules["skip"]
       final_deps = job_deps - job_deps.collect{|dep| get_recursive_job_dependencies(dep)}.flatten.uniq
       final_deps = final_deps.reject{|dep| dep.done? }
@@ -72,7 +72,7 @@ module HPC
       return false
     end
 
-    def self.get_chains(job, rules, chains = {})
+    def get_chains(job, rules, chains = {})
       job_rules = self.job_rules(rules, job)
       job_deps = get_job_dependencies(job)
 
@@ -102,7 +102,7 @@ module HPC
       chains
     end
 
-    def self.workload(job, rules, chains, options, seen = nil)
+    def workload(job, rules, chains, options, seen = nil)
       return [] if job.done?
       if seen.nil?
         seen = {}
@@ -145,7 +145,7 @@ module HPC
       job_rules.delete :workflow
 
 
-      job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
+      job_options = IndiferentHash.setup(options.merge(job_rules).merge(:batch_dependencies => dep_ids))
       job_options.delete :orchestration_rules
 
       config_keys = job_rules.delete(:config_keys)
@@ -182,7 +182,7 @@ module HPC
 
       if options[:dry_run]
         puts Log.color(:magenta, "Manifest: ") + Log.color(:blue, job_options[:manifest] * ", ") + " - tasks: #{job_options[:task_cpus] || 1} - time: #{job_options[:time]} - config: #{job_options[:config_keys]}"
-        puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:slurm_dependencies]*", ")
+        puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:batch_dependencies]*", ")
         job_options[:manifest].first
       else
         run_job(job, job_options)
@@ -190,13 +190,14 @@ module HPC
     end
 
 
-    def self.orchestrate_job(job, options)
+    def orchestrate_job(job, options)
       options.delete "recursive_clean"
       options.delete "clean_task"
       options.delete "clean"
       options.delete "tail"
-      options.delete "printfile"
+      options.delete "printpath"
       options.delete "detach"
+      options.delete "jobname"
 
       rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
       rules ||= {}
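
Note: the hunks above (apparently lib/rbbt/hpc/orchestrate.rb) turn the former `HPC::SLURM` module methods into instance methods of a new `HPC::Orchestration` mixin and rename the `:slurm_dependencies` option to the scheduler-neutral `:batch_dependencies`. Each batch backend then picks up the shared logic via `extend`, exactly as the new files in this diff do:

    module HPC
      module LSF
        extend HPC::TemplateGeneration
        extend HPC::Orchestration
      end
    end

    # The orchestration helpers become callable on the backend module itself:
    HPC::LSF.orchestrate_job(job, options)
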
@@ -1,454 +1,101 @@
-module HPC
-  class SBATCH < Exception;
-    attr_accessor :directory
-    def initialize(directory)
-      @directory = directory
-    end
-  end
-
-  module SLURM
-
-    def self.template(args, options = {})
-
-      development = options.delete :drbbt
-      singularity = options.delete :singularity
-      contain = options.delete :contain
-      sync = options.delete :sync
-      user_group = options.delete :user_group
-      contain_and_sync = options.delete :contain_and_sync
-      wipe_container = options.delete :wipe_container
-      copy_image = options.delete :copy_image
-      exclusive = options.delete :exclusive
-      highmem = options.delete :highmem
-
-      slurm_step_path = options.delete :slurm_step_path
-
-      manifest = options.delete :manifest
-
-      queue = options.delete(:queue) || Rbbt::Config.get('queue', :slurm_queue, :slurm, :SLURM, :default => 'bsc_ls')
-      task_cpus = options.delete(:task_cpus) || 1
-      nodes = options.delete(:nodes) || 1
-      time = options.delete(:time) || "0:02:00"
-
-      inputs_dir = options.delete :inputs_dir
-      config_keys = options.delete :config_keys
+require 'rbbt/hpc/batch'
+require 'rbbt/hpc/orchestrate'
 
-      user = ENV['USER'] || `whoami`.strip
-      group = File.basename(File.dirname(ENV['HOME']))
-
-      if contain_and_sync
-        random_file = TmpFile.random_name
-        contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
-        sync = "~/.rbbt/var/jobs" if sync.nil?
-        wipe_container = "post" if wipe_container.nil?
-      end
-
-      contain = nil if contain == "" || contain == "none"
-      sync = nil if sync == "" || sync == "none"
+module HPC
+  module SLURM
+    extend HPC::TemplateGeneration
+    extend HPC::Orchestration
 
-      contain = File.expand_path(contain) if contain
+    def self.batch_system_variables
+      <<-EOF
+let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
+BATCH_JOB_ID=$SLURM_JOB_ID
+BATCH_SYSTEM=SLURM
+      EOF
+    end
 
-      name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
-      options.delete(:name)
-      slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
-      options.delete(:slurm_basedir)
+    def self.header(options = {})
+      options = options.dup
 
-      rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
+      queue = Misc.process_options options, :queue
+      task_cpus = Misc.process_options options, :task_cpus
+      time = Misc.process_options options, :time
+      nodes = Misc.process_options options, :nodes
+      workdir = Misc.process_options options, :workdir
+      exclusive = Misc.process_options options, :exclusive
 
-      rbbt_cmd += " " << options.collect do |o,v|
-        o = o.to_s
-        case v
-        when TrueClass
-          '--' << o
-        when FalseClass
-          '--' << o << "=false"
-        else
-          ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
-        end
-      end * " "
+      batch_dir = Misc.process_options options, :batch_dir
+      batch_name = Misc.process_options options, :batch_name
 
-      rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')
 
       time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
 
-
-      #{{{ PREPARE LOCAL LOGFILES
-
-      Open.mkdir slurm_basedir
-
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fjob = File.join(slurm_basedir, 'job.id')
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fsync = File.join(slurm_basedir, 'sync.log')
-      fsyncexit = File.join(slurm_basedir, 'sync.status')
-      fcmd = File.join(slurm_basedir, 'command.slurm')
-
-      #{{{ GENERATE TEMPLATE
-
-      # HEADER
       header =<<-EOF
 #!/bin/bash
-#SBATCH --qos="#{queue}"
-#SBATCH --job-name="#{name}"
-#SBATCH --workdir="#{Dir.pwd}"
+#SBATCH --job-name="#{batch_name}"
+#SBATCH --workdir="#{workdir}"
 #SBATCH --output="#{fout}"
 #SBATCH --error="#{ferr}"
+#SBATCH --qos="#{queue}"
 #SBATCH --cpus-per-task="#{task_cpus}"
 #SBATCH --time="#{time}"
 #SBATCH --nodes="#{nodes}"
       EOF
 
-      prep = ""
-
-      if highmem
-        header +=<<-EOF
-#SBATCH --constraint=highmem
-        EOF
-      end
-
-      if exclusive
-        header +=<<-EOF
-#SBATCH --exclusive
-        EOF
-      end
-
-      # ENV
-      env = ""
-      env +=<<-EOF
-# Prepare env
-[[ -f ~/config/load.sh ]] && source ~/config/load.sh
-module load java
-
-# Calculate max available memory
-let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
-      EOF
-
-
-      # RUN
-      run = ""
-      exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
-
-
-      if singularity
-        #{{{ SINGULARITY
-
-        singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
-
-        env +=<<-EOF
-module load intel/2018.1
-module load singularity
-PROJECTS_ROOT="/gpfs/projects/bsc26/"
-SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
-SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
-SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
-mkdir -p "$SINGULARITY_RUBY_INLINE"
-        EOF
-
-        if contain
-          scratch_group_dir = File.join('/gpfs/scratch/', group)
-          projects_group_dir = File.join('/gpfs/projects/', group)
+      header << "#SBATCH --exclusive" << "\n" if exclusive
 
-          prep +=<<-EOF
-
-# Prepare container dir
-CONTAINER_DIR="#{contain}"
-mkdir -p $CONTAINER_DIR/.rbbt/etc/
-
-for dir in .ruby_inline git home; do
-mkdir -p $CONTAINER_DIR/$dir
-done
-
-for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
-mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
-done
-
-# Copy environment
-cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
-
-# Set search_paths
-echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-          EOF
-
-          if user_group && group != user_group
-            prep +=<<-EOF
-
-# Add user_group search_path
-echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
-            EOF
-          end
-
-          if inputs_dir
-            prep +=<<-EOF
-
-# Copy inputs
-[[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
-            EOF
-            rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
-          end
-
-          if copy_image
-            prep +=<<EOF
-
-# Copy image
-rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
-SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
-EOF
-          end
+      header
+    end
 
-          if wipe_container == "pre" || wipe_container == "both"
-            if singularity
-              prep +=<<-EOF
+    def self.run_template(batch_dir, dry_run)
 
-# Clean container pre
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-              EOF
-            else
-              prep = ""
-            end
-          end
-        end
+      fout = File.join(batch_dir, 'std.out')
+      ferr = File.join(batch_dir, 'std.err')
+      fjob = File.join(batch_dir, 'job.id')
+      fdep = File.join(batch_dir, 'dependencies.list')
+      fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
+      fexit = File.join(batch_dir, 'exit.status')
+      fsync = File.join(batch_dir, 'sync.log')
+      fcmd = File.join(batch_dir, 'command.batch')
 
-        if contain
-          singularity_exec << %( -C -H "$CONTAINER_DIR" \
-            -B /scratch/tmp \
-            #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
-            -B #{scratch_group_dir} \
-            -B #{projects_group_dir} \
-            -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
-            -B ~/git:"$CONTAINER_DIR/git":ro \
-            #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
-            -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
-            "$SINGULARITY_IMG")
-          exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
-        else
-          singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
-        end
+      return if Open.exists?(fexit)
 
-        if development
-          exec_cmd += " rbbt --dev='#{development}'"
-        else
-          exec_cmd += ' rbbt'
-        end
+      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
+      STDERR.puts Open.read(fcmd)
 
-        exec_cmd = singularity_exec + " " + exec_cmd
+      if File.exists?(fjob)
+        job = Open.read(fjob).to_i
       else
-        if development
-          exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
-        else
-          exec_cmd << " " << 'rbbt'
-        end
-
-        if contain
-          rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
-        end
-      end
-
-
-      cmd =<<-EOF
-#{exec_cmd} \\
-#{rbbt_cmd}
-      EOF
-      annotate_cmd =<<-EOF
-#{exec_cmd} \\
-workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
-      EOF
-
-      header +=<<-EOF if manifest
-#MANIFEST: #{manifest * ", "}
-      EOF
-
-      header +=<<-EOF if slurm_step_path
-#STEP_PATH: #{slurm_step_path}
-      EOF
-
-      header +=<<-EOF
-#CMD: #{rbbt_cmd}
-      EOF
-
-      run +=<<-EOF
-
-# Run command
-step_path=$(#{cmd})
-
-# Save exit status
-exit_status=$?
-
-# Annotate info with SLURM job_info
-#{annotate_cmd}
 
-      EOF
+        dependencies = Open.read(fdep).split("\n") if File.exists? fdep
+        canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
 
-      # CODA
-      coda = ""
-      if sync
-        if singularity
-          coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
-          EOF
-        # else
-        # coda +=<<-EOF
-        #rbbt system clean all -q &>> #{fsync}
-        #EOF
-        end
+        normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
+        canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
 
-        if sync.include?("=>")
-          source, _sep, sync = sync.partition("=>")
-          source = source.strip
-          sync = sync.strip
-          source = File.join(File.expand_path(contain), source)
+        if normal_dep_str.nil? && canfail_dep_str.nil?
+          dep_str = ""
         else
-          source = File.join(File.expand_path(contain), 'workdir/var/jobs')
-        end
-
-        target = File.expand_path(sync)
-        coda +=<<-EOF
-
-# Sync data to target location
-if [ $exit_status == '0' ]; then
-mkdir -p "$(dirname '#{target}')"
-rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
-sync_es="$?"
-echo $sync_es > #{fsyncexit}
-find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
-else
-sync_es="$exit_status"
-fi
-        EOF
-
-        if contain && (wipe_container == "post" || wipe_container == "both")
-          prep =<<-EOF + prep
-if ls -A '#{contain}' &> /dev/null ; then
-echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
-fi
-          EOF
-          if singularity
-            coda +=<<-EOF
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
-
-
-# Clean container directory
-#if [ $exit_status == '0' -a $sync_es == '0' ]; then
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
-singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
-#else
-# echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-#fi
-            EOF
-          else
-            coda +=<<-EOF
-##{exec_cmd} system clean
-#if [ $exit_status == '0' -a $sync_es == '0' ]; then
-rm -Rfv #{contain} &>> #{fsync}
-#else
-# echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
-#fi
-            EOF
-
-          end
+          dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
         end
-      end
-
-      coda +=<<-EOF
-
-# Write exit status to file
-echo $exit_status > #{fexit}
-      EOF
-
-      if sync
-        coda +=<<-EOF
-if [ "$sync_es" == '0' ]; then
-unset sync_es
-exit $exit_status
-else
-exit $sync_es
-fi
-        EOF
-      else
-        coda +=<<-EOF
-exit $exit_status
-        EOF
-      end
-
-      template = [header, env, prep, run, coda] * "\n"
-
-      template
-    end
-
-    def self.issue_template(template, options = {})
-
-      slurm_basedir = options[:slurm_basedir]
-      dependencies = options.delete :slurm_dependencies
-      dependencies = [] if dependencies.nil?
-
-      canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
-      dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
-
-      Open.mkdir slurm_basedir
 
-      dry_run = options.delete :dry_run
+        cmd = "sbatch #{dep_str} '#{fcmd}'"
 
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fjob = File.join(slurm_basedir, 'job.id')
-      fdep = File.join(slurm_basedir, 'dependencies.list')
-      fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fsync = File.join(slurm_basedir, 'sync.log')
-      fcmd = File.join(slurm_basedir, 'command.slurm')
-
-      job = nil
-      if options[:clean_job]
-        [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
-          Open.rm file if Open.exists? file
-        end
-      end
-
-      return if Open.exists?(fexit)
-
-      STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
-      STDERR.puts template
-
-      Open.write(fcmd, template) unless File.exists? fcmd
-      if File.exists?(fjob)
-        job = Open.read(fjob).to_i
-      else
         if File.exists?(fout)
           return
         elsif dry_run
-          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
-          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
-          raise HPC::SBATCH, slurm_basedir
+          STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
+          STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt slurm tail '#{batch_dir}'")
+          raise HPC::SBATCH, batch_dir
         else
           Open.rm fsync
           Open.rm fexit
           Open.rm fout
           Open.rm ferr
 
-          Open.write(fdep, dependencies * "\n") if dependencies.any?
-          Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
-
-
-          dep_str = '--dependency='
-          normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
-          canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
-
-          if normal_dep_str.nil? && canfail_dep_str.nil?
-            dep_str = ""
-          else
-            dep_str += [normal_dep_str, canfail_dep_str].compact * ","
-          end
-
-          job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
+          job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
           Log.debug "SBATCH job id: #{job}"
           Open.write(fjob, job.to_s)
           job
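
Note: the rewritten SLURM `run_template` above compresses the old inline dependency logic into a single `--dependency` flag, using `afterok:` for hard dependencies and `afterany:` for can-fail ones. With hypothetical job IDs 1001 and 1002 in dependencies.list and 1003 in canfail_dependencies.list, the issued command would be:

    sbatch --dependency=afterok:1001:1002,afterany:1003 'BATCH_DIR/command.batch'
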
@@ -456,165 +103,13 @@ EOF
       end
     end
 
-    def self.follow_job(slurm_basedir, tail = true)
-      fjob = File.join(slurm_basedir, 'job.id')
-      fout = File.join(slurm_basedir, 'std.out')
-      ferr = File.join(slurm_basedir, 'std.err')
-      fstatus = File.join(slurm_basedir, 'job.status')
-
-      job = Open.read(fjob).strip if Open.exists?(fjob)
-
-      if job
-        status_txt = CMD.cmd("squeue --job #{job}").read
-        STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-        STDERR.puts status_txt
-        lines = status_txt.split("\n").length
-      end
-
-      if tail
-        Log.severity = 10
-        while ! File.exists? fout
-          if job
-            STDERR.puts
-            Log.clear_line(STDERR)
-            STDERR.write Log.color(:magenta, "Waiting for Output")
-            3.times do
-              STDERR.write Log.color(:magenta, ".")
-              sleep 1
-            end
-            status_txt = CMD.cmd("squeue --job #{job}").read
-            lines.times do
-              Log.clear_line(STDERR)
-            end
-            Log.clear_line(STDERR)
-            STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
-            STDERR.puts status_txt
-            lines = status_txt.split("\n").length
-          end
-        end
-        STDERR.puts
-        Log.clear_line(STDERR)
-        STDERR.puts Log.color(:magenta, "Output:")
-        begin
-          CMD.cmd("squeue --job #{job} > #{fstatus}")
-          out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
-          err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
-
-          terr = Misc.consume_stream(err, true, STDERR) if err
-          tout = Misc.consume_stream(out, true, STDOUT) if out
-
-          sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
-        rescue Aborted
-        ensure
-          begin
-            terr.exit if terr
-            tout.exit if tout
-            err.close if err
-            err.join if err
-          rescue Exception
-          end
-
-          begin
-            out.close if out
-            out.join if out
-          rescue Exception
-          end
-        end
-      end
-    end
-
-    def self.wait_for_job(slurm_basedir, time = 1)
-      fexit = File.join(slurm_basedir, 'exit.status')
-      fjob = File.join(slurm_basedir, 'job.id')
-      job = Open.read(fjob) if Open.exists?(fjob)
-
-
-      while ! Open.exists?(fexit)
-        sleep time
-      end
-    end
-
-    def self.run_job(job, options = {})
-      options = IndiferentHash.setup(options.dup)
-
-      dry_run = options.delete :dry_run
-      tail = options.delete :tail
-      dependencies = options.delete :slurm_dependencies
-      procpath = options.delete :SLURM_procpath
-
-      options[:jobname] = job.clean_name
-      options[:slurm_step_path] = job.path
-
-      log_level = options.delete :log
-      log_level ||= Log.severity
-
-      workflow = job.workflow
-
-      task = Symbol === job.overriden ? job.overriden : job.task_name
-
-      if job.overriden
-        override_deps = job.rec_dependencies.
-          select{|dep| Symbol === dep.overriden }.
-          collect do |dep|
-
-          name = [dep.workflow.to_s, dep.task_name] * "#"
-          [name, dep.path] * "="
-        end * ","
-      end
-
-      remove_slurm_basedir = options.delete :remove_slurm_basedir
-      slurm_basedir = options.delete :SLURM_basedir
-      slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
-      TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
-        options[:slurm_basedir] ||= tmp_directory
-        slurm_basedir = options[:slurm_basedir]
-        inputs_dir = File.join(tmp_directory, 'inputs_dir')
-        saved = Step.save_job_inputs(job, inputs_dir)
-
-        cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', log_level.to_s]
-
-        cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
-
-        cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
-
-        cmd << "--load_inputs='#{inputs_dir}'" if saved && saved.any?
-
-        template = self.template(cmd, options)
-        jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
-
-        return jobid unless tail
-
-        t_monitor = Thread.new do
-          self.follow_job(slurm_basedir, :STDERR)
-        end
-        self.wait_for_job(slurm_basedir)
-        t_monitor.raise Aborted
-        return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
-        path = Open.read(File.join(slurm_basedir, 'std.out')).strip
-        if Open.exists?(path) && job.path != path
-          Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
-          Open.ln path, job.path
-          Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
-          Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
-        end
-        jobid
+    def self.job_status(job = nil)
+      if job.nil?
+        CMD.cmd("squeue").read
+      else
+        CMD.cmd("squeue --job #{job}").read
       end
     end
-    end
-
-  def self.relay(job, options={})
-    options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
-    done_deps = job.dependencies.select do |dep|
-      dep.done?
-    end
-
-    error_deps = job.dependencies.select do |dep|
-      dep.error? && ! dep.recoverable_error?
-    end
-
-    (done_deps + error_deps).each do |dep|
-      Step.migrate(dep.path, options[:search_path], options)
-    end
 
   end
 end
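
Note: with the interactive helpers (`follow_job`, `wait_for_job`, `relay`) removed above, each backend is left with a uniform `job_status` that simply shells out to its scheduler (`squeue` here, `bjobs -w` for LSF). A caller could rebuild the old polling loop on top of it; a minimal sketch, with a hypothetical job ID:

    # Poll until the job no longer shows up in the squeue listing.
    sleep 3 while HPC::SLURM.job_status(1001).include? "1001"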