rbbt-util 5.30.9 → 5.31.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/hpc.rb +3 -0
  3. data/lib/rbbt/hpc/batch.rb +623 -0
  4. data/lib/rbbt/hpc/lsf.rb +119 -0
  5. data/lib/rbbt/hpc/orchestrate.rb +24 -19
  6. data/lib/rbbt/hpc/slurm.rb +62 -559
  7. data/lib/rbbt/resource/path.rb +3 -1
  8. data/lib/rbbt/tsv/accessor.rb +5 -2
  9. data/lib/rbbt/tsv/dumper.rb +1 -0
  10. data/lib/rbbt/tsv/parallel/traverse.rb +1 -1
  11. data/lib/rbbt/tsv/stream.rb +5 -6
  12. data/lib/rbbt/util/cmd.rb +15 -1
  13. data/lib/rbbt/util/config.rb +2 -2
  14. data/lib/rbbt/util/log.rb +22 -1
  15. data/lib/rbbt/util/log/progress.rb +17 -2
  16. data/lib/rbbt/util/log/progress/report.rb +36 -3
  17. data/lib/rbbt/util/misc/development.rb +2 -2
  18. data/lib/rbbt/util/misc/inspect.rb +17 -1
  19. data/lib/rbbt/util/misc/omics.rb +60 -1
  20. data/lib/rbbt/util/misc/options.rb +5 -0
  21. data/lib/rbbt/workflow/accessor.rb +7 -2
  22. data/lib/rbbt/workflow/definition.rb +7 -3
  23. data/lib/rbbt/workflow/step/accessor.rb +1 -1
  24. data/lib/rbbt/workflow/step/run.rb +9 -0
  25. data/lib/rbbt/workflow/usage.rb +13 -13
  26. data/lib/rbbt/workflow/util/archive.rb +5 -3
  27. data/lib/rbbt/workflow/util/provenance.rb +26 -21
  28. data/share/config.ru +3 -3
  29. data/share/rbbt_commands/{slurm → hpc}/clean +91 -18
  30. data/share/rbbt_commands/{slurm → hpc}/list +119 -31
  31. data/share/rbbt_commands/hpc/orchestrate +81 -0
  32. data/share/rbbt_commands/hpc/tail +81 -0
  33. data/share/rbbt_commands/hpc/task +80 -0
  34. data/test/rbbt/hpc/test_batch.rb +65 -0
  35. data/test/rbbt/hpc/test_slurm.rb +30 -0
  36. data/test/rbbt/util/misc/test_development.rb +11 -0
  37. data/test/rbbt/util/test_config.rb +13 -3
  38. data/test/test_helper.rb +3 -1
  39. metadata +16 -7
  40. data/share/rbbt_commands/slurm/orchestrate +0 -48
  41. data/share/rbbt_commands/slurm/task +0 -46
@@ -0,0 +1,119 @@
1
+ require 'rbbt/hpc/batch'
2
+
3
+ module HPC
4
+ module LSF
5
+ extend HPC::TemplateGeneration
6
+ extend HPC::Orchestration
7
+
8
+ def self.batch_system_variables
9
+ <<-EOF
10
+ MAX_MEMORY=$LSB_MAX_MEM_RUSAGE || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
11
+ BATCH_JOB_ID=$LSF_JOBID
12
+ BATCH_SYSTEM=LSF
13
+ EOF
14
+ end
15
+
16
+ def self.header(options = {})
17
+ options = options.dup
18
+
19
+ queue = Misc.process_options options, :queue
20
+ task_cpus = Misc.process_options options, :task_cpus
21
+ time = Misc.process_options options, :time
22
+ nodes = Misc.process_options options, :nodes
23
+ workdir = Misc.process_options options, :workdir
24
+ exclusive = Misc.process_options options, :exclusive
25
+
26
+ batch_dir = Misc.process_options options, :batch_dir
27
+ batch_name = Misc.process_options options, :batch_name
28
+ batch_name ||= File.basename(batch_dir)
29
+
30
+ fout = File.join(batch_dir, 'std.out')
31
+ ferr = File.join(batch_dir, 'std.err')
32
+
33
+ time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
34
+
35
+ time = time.split(":").values_at(0, 1) * ":"
36
+
37
+ header =<<-EOF
38
+ #!/bin/bash
39
+ #BSUB -J "#{batch_name}"
40
+ #BSUB -cwd "#{workdir}"
41
+ #BSUB -oo "#{fout}"
42
+ #BSUB -eo "#{ferr}"
43
+ #BSUB -q "#{queue}"
44
+ #BSUB -n "#{task_cpus}"
45
+ #BSUB -W "#{time}"
46
+ EOF
47
+
48
+ header << "#BSUB -x" << "\n" if exclusive
49
+
50
+ header
51
+ end
52
+
53
+ def self.run_template(batch_dir, dry_run)
54
+
55
+ fout = File.join(batch_dir, 'std.out')
56
+ ferr = File.join(batch_dir, 'std.err')
57
+ fjob = File.join(batch_dir, 'job.id')
58
+ fdep = File.join(batch_dir, 'dependencies.list')
59
+ fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
60
+ fexit = File.join(batch_dir, 'exit.status')
61
+ fsync = File.join(batch_dir, 'sync.log')
62
+ fcmd = File.join(batch_dir, 'command.batch')
63
+
64
+ return if Open.exists?(fexit)
65
+
66
+ STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
67
+ STDERR.puts Open.read(fcmd)
68
+
69
+ if File.exists?(fjob)
70
+ job = Open.read(fjob).to_i
71
+ else
72
+
73
+ dependencies = Open.read(fdep).split("\n") if File.exists? fdep
74
+ canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
75
+
76
+ normal_dep_list = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
77
+ canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "done(#{d})"} : []
78
+
79
+ dep_list = normal_dep_list + canfail_dep_list
80
+
81
+ if dep_list.any?
82
+ dep_str = '-w "' + dep_list * " && " + '"'
83
+ else
84
+ dep_str = ""
85
+ end
86
+
87
+ cmd = "bsub #{dep_str} < '#{fcmd}'"
88
+
89
+ if File.exists?(fout)
90
+ return
91
+ elsif dry_run
92
+ STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
93
+ STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt lsf tail '#{batch_dir}'")
94
+ raise HPC::SBATCH, batch_dir
95
+ else
96
+ Open.rm fsync
97
+ Open.rm fexit
98
+ Open.rm fout
99
+ Open.rm ferr
100
+
101
+
102
+ job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
103
+ Log.debug "BSUB job id: #{job}"
104
+ Open.write(fjob, job.to_s)
105
+ job
106
+ end
107
+ end
108
+ end
109
+
110
+ def self.job_status(job = nil)
111
+ if job.nil?
112
+ CMD.cmd("bjobs -w").read
113
+ else
114
+ CMD.cmd("bjobs -w #{job}").read
115
+ end
116
+ end
117
+ end
118
+ end
119
+
@@ -1,8 +1,8 @@
1
1
  require 'rbbt/workflow/util/orchestrator'
2
2
  module HPC
3
- module SLURM
3
+ module Orchestration
4
4
 
5
- def self.job_rules(rules, job)
5
+ def job_rules(rules, job)
6
6
  workflow = job.workflow.to_s
7
7
  task_name = job.task_name.to_s
8
8
  task_name = job.overriden.to_s if Symbol === job.overriden
@@ -53,25 +53,26 @@ module HPC
53
53
  job_rules
54
54
  end
55
55
 
56
- def self.get_job_dependencies(job, job_rules = nil)
56
+ def get_job_dependencies(job, job_rules = nil)
57
57
  deps = job.dependencies || []
58
58
  deps += job.input_dependencies || []
59
59
  deps
60
60
  end
61
61
 
62
- def self.get_recursive_job_dependencies(job)
62
+ def get_recursive_job_dependencies(job)
63
63
  deps = get_job_dependencies(job)
64
64
  (deps + deps.collect{|dep| get_recursive_job_dependencies(dep) }).flatten
65
65
  end
66
66
 
67
- def self.piggyback(job, job_rules, job_deps)
67
+ def piggyback(job, job_rules, job_deps)
68
68
  return false unless job_rules["skip"]
69
69
  final_deps = job_deps - job_deps.collect{|dep| get_recursive_job_dependencies(dep)}.flatten.uniq
70
+ final_deps = final_deps.reject{|dep| dep.done? }
70
71
  return final_deps.first if final_deps.length == 1
71
72
  return false
72
73
  end
73
74
 
74
- def self.get_chains(job, rules, chains = {})
75
+ def get_chains(job, rules, chains = {})
75
76
  job_rules = self.job_rules(rules, job)
76
77
  job_deps = get_job_dependencies(job)
77
78
 
@@ -101,22 +102,22 @@ module HPC
101
102
  chains
102
103
  end
103
104
 
104
- def self.workload(job, rules, chains, options, seen = nil)
105
+ def workload(job, rules, chains, options, seen = nil)
105
106
  return [] if job.done?
106
107
  if seen.nil?
107
- seen = {}
108
+ seen = {}
108
109
  target_job = true
109
110
  end
110
111
 
111
112
  job_rules = self.job_rules(rules, job)
112
113
  job_deps = get_job_dependencies(job)
113
114
 
114
-
115
115
  chain = chains[job]
116
- chain -= seen.keys if chain
116
+ chain = chain.reject{|j| seen.include? j.path} if chain
117
+ chain = chain.reject{|dep| dep.done? } if chain
117
118
  piggyback = piggyback(job, job_rules, job_deps)
118
119
  dep_ids = job_deps.collect do |dep|
119
- seen[dep] = nil if chain && chain.include?(dep) #&& ! job.input_dependencies.include?(dep)
120
+ seen[dep.path] ||= nil if chain && chain.include?(dep) #&& ! job.input_dependencies.include?(dep)
120
121
  next_options = IndiferentHash.setup(options.dup)
121
122
  if piggyback and piggyback == dep
122
123
  next_options[:piggyback] ||= []
@@ -129,19 +130,22 @@ module HPC
129
130
 
130
131
  ids = [ids].flatten.compact.collect{|id| ['canfail', id] * ":"} if job.canfail_paths.include? dep.path
131
132
 
132
- seen[dep] = ids
133
+ seen[dep.path] = ids
133
134
  ids
134
135
  end.compact.flatten.uniq
135
136
 
136
- return seen[job] || dep_ids if seen.include?(job)
137
- return seen[piggyback] if piggyback
137
+ return seen[job.path] || dep_ids if seen.include?(job.path)
138
+
139
+ if piggyback and seen[piggyback.path]
140
+ return seen[job.path] = seen[piggyback.path]
141
+ end
138
142
 
139
143
  job_rules.delete :chain_tasks
140
144
  job_rules.delete :tasks
141
145
  job_rules.delete :workflow
142
146
 
143
147
 
144
- job_options = IndiferentHash.setup(options.merge(job_rules).merge(:slurm_dependencies => dep_ids))
148
+ job_options = IndiferentHash.setup(options.merge(job_rules).merge(:batch_dependencies => dep_ids))
145
149
  job_options.delete :orchestration_rules
146
150
 
147
151
  config_keys = job_rules.delete(:config_keys)
@@ -172,13 +176,13 @@ module HPC
172
176
 
173
177
  manifest.uniq!
174
178
 
175
- job_options[:manifest] = manifest.collect{|j| j.workflow_short_path }
179
+ job_options[:manifest] = manifest.collect{|j| j.task_signature }
176
180
 
177
181
  job_options[:config_keys] = job_options[:config_keys].split(",").uniq * "," if job_options[:config_keys]
178
182
 
179
183
  if options[:dry_run]
180
184
  puts Log.color(:magenta, "Manifest: ") + Log.color(:blue, job_options[:manifest] * ", ") + " - tasks: #{job_options[:task_cpus] || 1} - time: #{job_options[:time]} - config: #{job_options[:config_keys]}"
181
- puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:slurm_dependencies]*", ")
185
+ puts Log.color(:yellow, "Deps: ") + Log.color(:blue, job_options[:batch_dependencies]*", ")
182
186
  job_options[:manifest].first
183
187
  else
184
188
  run_job(job, job_options)
@@ -186,13 +190,14 @@ module HPC
186
190
  end
187
191
 
188
192
 
189
- def self.orchestrate_job(job, options)
193
+ def orchestrate_job(job, options)
190
194
  options.delete "recursive_clean"
191
195
  options.delete "clean_task"
192
196
  options.delete "clean"
193
197
  options.delete "tail"
194
- options.delete "printfile"
198
+ options.delete "printpath"
195
199
  options.delete "detach"
200
+ options.delete "jobname"
196
201
 
197
202
  rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
198
203
  rules ||= {}
@@ -1,448 +1,101 @@
1
- module HPC
2
- class SBATCH < Exception;
3
- attr_accessor :directory
4
- def initialize(directory)
5
- @directory = directory
6
- end
7
- end
8
-
9
- module SLURM
10
-
11
- def self.template(args, options = {})
12
-
13
- development = options.delete :drbbt
14
- singularity = options.delete :singularity
15
- contain = options.delete :contain
16
- sync = options.delete :sync
17
- user_group = options.delete :user_group
18
- contain_and_sync = options.delete :contain_and_sync
19
- wipe_container = options.delete :wipe_container
20
- copy_image = options.delete :copy_image
21
- exclusive = options.delete :exclusive
22
- highmem = options.delete :highmem
23
-
24
- manifest = options.delete :manifest
25
-
26
- queue = options.delete(:queue) || Rbbt::Config.get('queue', :slurm_queue, :slurm, :SLURM, :default => 'bsc_ls')
27
- task_cpus = options.delete(:task_cpus) || 1
28
- nodes = options.delete(:nodes) || 1
29
- time = options.delete(:time) || "0:02:00"
30
-
31
- inputs_dir = options.delete :inputs_dir
32
- config_keys = options.delete :config_keys
33
-
34
- user = ENV['USER'] || `whoami`.strip
35
- group = File.basename(File.dirname(ENV['HOME']))
1
+ require 'rbbt/hpc/batch'
2
+ require 'rbbt/hpc/orchestrate'
36
3
 
37
- if contain_and_sync
38
- random_file = TmpFile.random_name
39
- contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
40
- sync = "~/.rbbt/var/jobs" if sync.nil?
41
- wipe_container = "post" if wipe_container.nil?
42
- end
43
-
44
- contain = nil if contain == "" || contain == "none"
45
- sync = nil if sync == "" || sync == "none"
4
+ module HPC
5
+ module SLURM
6
+ extend HPC::TemplateGeneration
7
+ extend HPC::Orchestration
46
8
 
47
- contain = File.expand_path(contain) if contain
9
+ def self.batch_system_variables
10
+ <<-EOF
11
+ let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
12
+ BATCH_JOB_ID=$SLURM_JOB_ID
13
+ BATCH_SYSTEM=SLURM
14
+ EOF
15
+ end
48
16
 
49
- name = options[:name] ||= Misc.obj2digest({:options => options.collect{|k,v| [k,v]}.sort_by{|k,v| k.to_s }, :args => args})
50
- options.delete(:name)
51
- slurm_basedir = options[:slurm_basedir] ||= File.expand_path(File.join('~/rbbt-slurm', name)) if slurm_basedir.nil?
52
- options.delete(:slurm_basedir)
17
+ def self.header(options = {})
18
+ options = options.dup
53
19
 
54
- rbbt_cmd = args.reject{|e| e == '--' }.collect{|e| e.include?(" ")? '"' + e + '"' : e } * " "
20
+ queue = Misc.process_options options, :queue
21
+ task_cpus = Misc.process_options options, :task_cpus
22
+ time = Misc.process_options options, :time
23
+ nodes = Misc.process_options options, :nodes
24
+ workdir = Misc.process_options options, :workdir
25
+ exclusive = Misc.process_options options, :exclusive
55
26
 
56
- rbbt_cmd += " " << options.collect do |o,v|
57
- o = o.to_s
58
- case v
59
- when TrueClass
60
- '--' << o
61
- when FalseClass
62
- '--' << o << "=false"
63
- else
64
- ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
65
- end
66
- end * " "
27
+ batch_dir = Misc.process_options options, :batch_dir
28
+ batch_name = Misc.process_options options, :batch_name
67
29
 
68
- rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
30
+ fout = File.join(batch_dir, 'std.out')
31
+ ferr = File.join(batch_dir, 'std.err')
69
32
 
70
33
  time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
71
34
 
72
-
73
- #{{{ PREPARE LOCAL LOGFILES
74
-
75
- Open.mkdir slurm_basedir
76
-
77
- fout = File.join(slurm_basedir, 'std.out')
78
- ferr = File.join(slurm_basedir, 'std.err')
79
- fjob = File.join(slurm_basedir, 'job.id')
80
- fexit = File.join(slurm_basedir, 'exit.status')
81
- fsync = File.join(slurm_basedir, 'sync.log')
82
- fsyncexit = File.join(slurm_basedir, 'sync.status')
83
- fcmd = File.join(slurm_basedir, 'command.slurm')
84
-
85
- #{{{ GENERATE TEMPLATE
86
-
87
- # HEADER
88
35
  header =<<-EOF
89
36
  #!/bin/bash
90
- #SBATCH --qos="#{queue}"
91
- #SBATCH --job-name="#{name}"
92
- #SBATCH --workdir="#{Dir.pwd}"
37
+ #SBATCH --job-name="#{batch_name}"
38
+ #SBATCH --workdir="#{workdir}"
93
39
  #SBATCH --output="#{fout}"
94
40
  #SBATCH --error="#{ferr}"
41
+ #SBATCH --qos="#{queue}"
95
42
  #SBATCH --cpus-per-task="#{task_cpus}"
96
43
  #SBATCH --time="#{time}"
97
44
  #SBATCH --nodes="#{nodes}"
98
45
  EOF
99
46
 
100
- prep = ""
101
-
102
- if highmem
103
- header +=<<-EOF
104
- #SBATCH --constraint=highmem
105
- EOF
106
- end
107
-
108
- if exclusive
109
- header +=<<-EOF
110
- #SBATCH --exclusive
111
- EOF
112
- end
113
-
114
- # ENV
115
- env = ""
116
- env +=<<-EOF
117
- # Prepare env
118
- [[ -f ~/config/load.sh ]] && source ~/config/load.sh
119
- module load java
120
-
121
- # Calculate max available memory
122
- let "MAX_MEMORY=$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK" || let MAX_MEMORY="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / 1024"
123
- EOF
124
-
125
-
126
- # RUN
127
- run = ""
128
- exec_cmd = %(env _JAVA_OPTIONS="-Xms1g -Xmx${MAX_MEMORY}m")
129
-
130
-
131
- if singularity
132
- #{{{ SINGULARITY
133
-
134
- singularity_exec = %(singularity exec -e -B $SINGULARITY_OPT_DIR:/singularity_opt/ -B /apps/)
135
-
136
- env +=<<-EOF
137
- module load intel/2018.1
138
- module load singularity
139
- PROJECTS_ROOT="/gpfs/projects/bsc26/"
140
- SINGULARITY_IMG="$PROJECTS_ROOT/rbbt.singularity.img"
141
- SINGULARITY_OPT_DIR="$PROJECTS_ROOT/singularity_opt/"
142
- SINGULARITY_RUBY_INLINE="$HOME/.singularity_ruby_inline"
143
- mkdir -p "$SINGULARITY_RUBY_INLINE"
144
- EOF
145
-
146
- if contain
147
- scratch_group_dir = File.join('/gpfs/scratch/', group)
148
- projects_group_dir = File.join('/gpfs/projects/', group)
47
+ header << "#SBATCH --exclusive" << "\n" if exclusive
149
48
 
150
- prep +=<<-EOF
151
-
152
- # Prepare container dir
153
- CONTAINER_DIR="#{contain}"
154
- mkdir -p $CONTAINER_DIR/.rbbt/etc/
155
-
156
- for dir in .ruby_inline git home; do
157
- mkdir -p $CONTAINER_DIR/$dir
158
- done
159
-
160
- for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrite_locks step_info_locks tsv_open_locks; do
161
- mkdir -p $CONTAINER_DIR/.rbbt/tmp/$tmpd
162
- done
163
-
164
- # Copy environment
165
- cp ~/.rbbt/etc/environment $CONTAINER_DIR/.rbbt/etc/
166
-
167
- # Set search_paths
168
- echo "singularity: /singularity_opt/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" > $CONTAINER_DIR/.rbbt/etc/search_paths
169
- echo "rbbt_user: /home/rbbt/.rbbt/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
170
- echo "outside_home: $CONTAINER_DIR/home/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
171
- echo "group_projects: #{projects_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
172
- echo "group_scratch: #{scratch_group_dir}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
173
- echo "user_projects: #{projects_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
174
- echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
175
- EOF
176
-
177
- if user_group && group != user_group
178
- prep +=<<-EOF
179
-
180
- # Add user_group search_path
181
- echo "#{user_group}: /gpfs/projects/#{user_group}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}" >> $CONTAINER_DIR/.rbbt/etc/search_paths
182
- EOF
183
- end
184
-
185
- if inputs_dir
186
- prep +=<<-EOF
187
-
188
- # Copy inputs
189
- [[ -d '#{inputs_dir}' ]] && cp -R '#{inputs_dir}' $CONTAINER_DIR/inputs
190
- EOF
191
- rbbt_cmd = rbbt_cmd.sub(inputs_dir, "#{contain}/inputs")
192
- end
193
-
194
- if copy_image
195
- prep +=<<EOF
196
-
197
- # Copy image
198
- rsync -avz "$SINGULARITY_IMG" "$CONTAINER_DIR/rbbt.singularity.img" 1>&2
199
- SINGULARITY_IMG="$CONTAINER_DIR/rbbt.singularity.img"
200
- EOF
201
- end
49
+ header
50
+ end
202
51
 
203
- if wipe_container == "pre" || wipe_container == "both"
204
- if singularity
205
- prep +=<<-EOF
52
+ def self.run_template(batch_dir, dry_run)
206
53
 
207
- # Clean container pre
208
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
209
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
210
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
211
- EOF
212
- else
213
- prep = ""
214
- end
215
- end
216
- end
54
+ fout = File.join(batch_dir, 'std.out')
55
+ ferr = File.join(batch_dir, 'std.err')
56
+ fjob = File.join(batch_dir, 'job.id')
57
+ fdep = File.join(batch_dir, 'dependencies.list')
58
+ fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
59
+ fexit = File.join(batch_dir, 'exit.status')
60
+ fsync = File.join(batch_dir, 'sync.log')
61
+ fcmd = File.join(batch_dir, 'command.batch')
217
62
 
218
- if contain
219
- singularity_exec << %( -C -H "$CONTAINER_DIR" \
220
- -B /scratch/tmp \
221
- #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
222
- -B #{scratch_group_dir} \
223
- -B #{projects_group_dir} \
224
- -B "$SINGULARITY_RUBY_INLINE":"$CONTAINER_DIR/.ruby_inline":rw \
225
- -B ~/git:"$CONTAINER_DIR/git":ro \
226
- #{Open.exists?('~/.rbbt/software/opt/')? '-B ~/.rbbt/software/opt/:"/opt/":ro' : '' } \
227
- -B ~/.rbbt:"$CONTAINER_DIR/home/":ro \
228
- "$SINGULARITY_IMG")
229
- exec_cmd << ' TMPDIR="$CONTAINER_DIR/.rbbt/tmp" '
230
- else
231
- singularity_exec += %( -B "$SINGULARITY_RUBY_INLINE":"$HOME/.ruby_inline":rw "$SINGULARITY_IMG" )
232
- end
63
+ return if Open.exists?(fexit)
233
64
 
234
- if development
235
- exec_cmd += " rbbt --dev='#{development}'"
236
- else
237
- exec_cmd += ' rbbt'
238
- end
65
+ STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
66
+ STDERR.puts Open.read(fcmd)
239
67
 
240
- exec_cmd = singularity_exec + " " + exec_cmd
68
+ if File.exists?(fjob)
69
+ job = Open.read(fjob).to_i
241
70
  else
242
- if development
243
- exec_cmd << " " << %(~/git/rbbt-util/bin/rbbt --dev=#{development})
244
- else
245
- exec_cmd << " " << 'rbbt'
246
- end
247
-
248
- if contain
249
- rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
250
- end
251
- end
252
-
253
-
254
- cmd =<<-EOF
255
- #{exec_cmd} \\
256
- #{rbbt_cmd}
257
- EOF
258
- annotate_cmd =<<-EOF
259
- #{exec_cmd} \\
260
- workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
261
- EOF
262
71
 
263
- header +=<<-EOF if manifest
264
- #MANIFEST: #{manifest * ", "}
265
- EOF
266
-
267
- header +=<<-EOF
268
- #CMD: #{rbbt_cmd}
269
- EOF
270
-
271
- run +=<<-EOF
272
-
273
- # Run command
274
- step_path=$(#{cmd})
72
+ dependencies = Open.read(fdep).split("\n") if File.exists? fdep
73
+ canfail_dependencies = Open.read(fcfdep).split("\n") if File.exists? fcfdep
275
74
 
276
- # Save exit status
277
- exit_status=$?
278
-
279
- # Annotate info with SLURM job_info
280
- #{annotate_cmd}
281
-
282
- EOF
283
-
284
- # CODA
285
- coda = ""
286
- if sync
287
- if singularity
288
- coda +=<<-EOF
289
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
290
- EOF
291
- # else
292
- # coda +=<<-EOF
293
- #rbbt system clean all -q &>> #{fsync}
294
- #EOF
295
- end
75
+ normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
76
+ canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
296
77
 
297
- if sync.include?("=>")
298
- source, _sep, sync = sync.partition("=>")
299
- source = source.strip
300
- sync = sync.strip
301
- source = File.join(File.expand_path(contain), source)
78
+ if normal_dep_str.nil? && canfail_dep_str.nil?
79
+ dep_str = ""
302
80
  else
303
- source = File.join(File.expand_path(contain), 'workdir/var/jobs')
81
+ dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
304
82
  end
305
83
 
306
- target = File.expand_path(sync)
307
- coda +=<<-EOF
84
+ cmd = "sbatch #{dep_str} '#{fcmd}'"
308
85
 
309
- # Sync data to target location
310
- if [ $exit_status == '0' ]; then
311
- mkdir -p "$(dirname '#{target}')"
312
- rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
313
- sync_es="$?"
314
- echo $sync_es > #{fsyncexit}
315
- find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
316
- else
317
- sync_es="$exit_status"
318
- fi
319
- EOF
320
-
321
- if contain && (wipe_container == "post" || wipe_container == "both")
322
- prep =<<-EOF + prep
323
- if ls -A '#{contain}' &> /dev/null ; then
324
- echo "ERROR: Container directory not empty, refusing to wipe. #{contain}" &>> #{fsync}
325
- fi
326
- EOF
327
- if singularity
328
- coda +=<<-EOF
329
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem.*.{in,out,process} /dev/shm/sem.Session-PID.*.sem 2> /dev/null >> #{fsync}
330
-
331
-
332
- # Clean container directory
333
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
334
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean -f &>> #{fsync}
335
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv .rbbt/var/jobs &>> #{fsync}
336
- singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -Rfv tmp/ &>> #{fsync}
337
- #else
338
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
339
- #fi
340
- EOF
341
- else
342
- coda +=<<-EOF
343
- ##{exec_cmd} system clean
344
- #if [ $exit_status == '0' -a $sync_es == '0' ]; then
345
- rm -Rfv #{contain} &>> #{fsync}
346
- #else
347
- # echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
348
- #fi
349
- EOF
350
-
351
- end
352
- end
353
- end
354
-
355
- coda +=<<-EOF
356
-
357
- # Write exit status to file
358
- echo $exit_status > #{fexit}
359
- EOF
360
-
361
- if sync
362
- coda +=<<-EOF
363
- if [ "$sync_es" == '0' ]; then
364
- unset sync_es
365
- exit $exit_status
366
- else
367
- exit $sync_es
368
- fi
369
- EOF
370
- else
371
- coda +=<<-EOF
372
- exit $exit_status
373
- EOF
374
- end
375
-
376
- template = [header, env, prep, run, coda] * "\n"
377
-
378
- template
379
- end
380
-
381
- def self.issue_template(template, options = {})
382
-
383
- slurm_basedir = options[:slurm_basedir]
384
- dependencies = options.delete :slurm_dependencies
385
- dependencies = [] if dependencies.nil?
386
-
387
- canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
388
- dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
389
-
390
- Open.mkdir slurm_basedir
391
-
392
- dry_run = options.delete :dry_run
393
-
394
- fout = File.join(slurm_basedir, 'std.out')
395
- ferr = File.join(slurm_basedir, 'std.err')
396
- fjob = File.join(slurm_basedir, 'job.id')
397
- fdep = File.join(slurm_basedir, 'dependencies.list')
398
- fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
399
- fexit = File.join(slurm_basedir, 'exit.status')
400
- fsync = File.join(slurm_basedir, 'sync.log')
401
- fcmd = File.join(slurm_basedir, 'command.slurm')
402
-
403
- job = nil
404
- if options[:clean_job]
405
- [fcmd, fjob, fout, ferr, fsync, fexit].each do |file|
406
- Open.rm file if Open.exists? file
407
- end
408
- end
409
-
410
- return if Open.exists?(fexit)
411
-
412
- STDERR.puts Log.color(:magenta, "Issuing SLURM file: #{fcmd}")
413
- STDERR.puts template
414
-
415
- Open.write(fcmd, template) unless File.exists? fcmd
416
- if File.exists?(fjob)
417
- job = Open.read(fjob).to_i
418
- else
419
86
  if File.exists?(fout)
420
87
  return
421
88
  elsif dry_run
422
- STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{slurm_basedir}/command.slurm'")
423
- STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt mn --tail -w '#{slurm_basedir}'")
424
- raise HPC::SBATCH, slurm_basedir
89
+ STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
90
+ STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt slurm tail '#{batch_dir}'")
91
+ raise HPC::SBATCH, batch_dir
425
92
  else
426
93
  Open.rm fsync
427
94
  Open.rm fexit
428
95
  Open.rm fout
429
96
  Open.rm ferr
430
97
 
431
- Open.write(fdep, dependencies * "\n") if dependencies.any?
432
- Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
433
-
434
-
435
- dep_str = '--dependency='
436
- normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
437
- canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
438
-
439
- if normal_dep_str.nil? && canfail_dep_str.nil?
440
- dep_str = ""
441
- else
442
- dep_str += [normal_dep_str, canfail_dep_str].compact * ","
443
- end
444
-
445
- job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
98
+ job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
446
99
  Log.debug "SBATCH job id: #{job}"
447
100
  Open.write(fjob, job.to_s)
448
101
  job
@@ -450,163 +103,13 @@ EOF
450
103
  end
451
104
  end
452
105
 
453
- def self.follow_job(slurm_basedir, tail = true)
454
- fjob = File.join(slurm_basedir, 'job.id')
455
- fout = File.join(slurm_basedir, 'std.out')
456
- ferr = File.join(slurm_basedir, 'std.err')
457
- fstatus = File.join(slurm_basedir, 'job.status')
458
-
459
- job = Open.read(fjob).strip if Open.exists?(fjob)
460
-
461
- if job
462
- status_txt = CMD.cmd("squeue --job #{job}").read
463
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
464
- STDERR.puts status_txt
465
- lines = status_txt.split("\n").length
466
- end
467
-
468
- if tail
469
- Log.severity = 10
470
- while ! File.exists? fout
471
- if job
472
- STDERR.puts
473
- Log.clear_line(STDERR)
474
- STDERR.write Log.color(:magenta, "Waiting for Output")
475
- 3.times do
476
- STDERR.write Log.color(:magenta, ".")
477
- sleep 1
478
- end
479
- status_txt = CMD.cmd("squeue --job #{job}").read
480
- lines.times do
481
- Log.clear_line(STDERR)
482
- end
483
- Log.clear_line(STDERR)
484
- STDERR.puts Log.color(:magenta, "Status [#{job.to_i}]:")
485
- STDERR.puts status_txt
486
- lines = status_txt.split("\n").length
487
- end
488
- end
489
- STDERR.puts
490
- Log.clear_line(STDERR)
491
- STDERR.puts Log.color(:magenta, "Output:")
492
- begin
493
- CMD.cmd("squeue --job #{job} > #{fstatus}")
494
- out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exists?(fout) and not tail == :STDERR
495
- err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exists?(ferr)
496
-
497
- terr = Misc.consume_stream(err, true, STDERR) if err
498
- tout = Misc.consume_stream(out, true, STDOUT) if out
499
-
500
- sleep 3 while CMD.cmd("squeue --job #{job}").read.include? job.to_s
501
- rescue Aborted
502
- ensure
503
- begin
504
- terr.exit if terr
505
- tout.exit if tout
506
- err.close if err
507
- err.join if err
508
- rescue Exception
509
- end
510
-
511
- begin
512
- out.close if out
513
- out.join if out
514
- rescue Exception
515
- end
516
- end
517
- end
518
- end
519
-
520
- def self.wait_for_job(slurm_basedir, time = 1)
521
- fexit = File.join(slurm_basedir, 'exit.status')
522
- fjob = File.join(slurm_basedir, 'job.id')
523
- job = Open.read(fjob) if Open.exists?(fjob)
524
-
525
-
526
- while ! Open.exists?(fexit)
527
- sleep time
528
- end
529
- end
530
-
531
- def self.run_job(job, options = {})
532
- options = IndiferentHash.setup(options.dup)
533
-
534
- dry_run = options.delete :dry_run
535
- tail = options.delete :tail
536
- dependencies = options.delete :slurm_dependencies
537
- procpath = options.delete :SLURM_procpath
538
-
539
- options[:jobname] = job.clean_name
540
- log_level = options.delete :log
541
- log_level ||= Log.severity
542
-
543
- workflow = job.workflow
544
-
545
- task = Symbol === job.overriden ? job.overriden : job.task_name
546
-
547
- if job.overriden
548
- override_deps = job.rec_dependencies.
549
- select{|dep| Symbol === dep.overriden }.
550
- collect do |dep|
551
-
552
- name = [dep.workflow.to_s, dep.task_name] * "#"
553
- [name, dep.path] * "="
554
- end * ","
555
- end
556
-
557
- remove_slurm_basedir = options.delete :remove_slurm_basedir
558
- slurm_basedir = options.delete :SLURM_basedir
559
- slurm_basedir = "~/rbbt-slurm" if slurm_basedir.nil?
560
- TmpFile.with_file(nil, remove_slurm_basedir, :tmpdir => slurm_basedir, :prefix => "SLURM_rbbt_job-") do |tmp_directory|
561
- options[:slurm_basedir] ||= tmp_directory
562
- slurm_basedir = options[:slurm_basedir]
563
- inputs_dir = File.join(tmp_directory, 'inputs_dir')
564
- saved = Step.save_job_inputs(job, inputs_dir)
565
-
566
- cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', log_level.to_s]
567
-
568
- cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
569
-
570
- cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
571
-
572
- cmd << "--load_inputs='#{inputs_dir}'" if saved && saved.any?
573
-
574
- template = self.template(cmd, options)
575
- jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
576
-
577
- return jobid unless tail
578
-
579
- t_monitor = Thread.new do
580
- self.follow_job(slurm_basedir, :STDERR)
581
- end
582
- self.wait_for_job(slurm_basedir)
583
- t_monitor.raise Aborted
584
- return unless Open.read(File.join(slurm_basedir, 'exit.status')).strip == '0'
585
- path = Open.read(File.join(slurm_basedir, 'std.out')).strip
586
- if Open.exists?(path) && job.path != path
587
- Log.info "Path of SLURM job #{path} is different from original job #{job.path}. Stablishing link."
588
- Open.ln path, job.path
589
- Open.ln path + '.info', job.path + '.info' if Open.exists?(path + '.info')
590
- Open.ln path + '.files', job.path + '.files' if Open.exists?(path + '.files')
591
- end
592
- jobid
106
+ def self.job_status(job = nil)
107
+ if job.nil?
108
+ CMD.cmd("squeue").read
109
+ else
110
+ CMD.cmd("squeue --job #{job}").read
593
111
  end
594
112
  end
595
- end
596
-
597
- def self.relay(job, options={})
598
- options = Misc.add_defaults options, :target => 'mn1', :search_path => 'user'
599
- done_deps = job.dependencies.select do |dep|
600
- dep.done?
601
- end
602
-
603
- error_deps = job.dependencies.select do |dep|
604
- dep.error? && ! dep.recoverable_error?
605
- end
606
-
607
- (done_deps + error_deps).each do |dep|
608
- Step.migrate(dep.path, options[:search_path], options)
609
- end
610
113
 
611
114
  end
612
115
  end