rbbt-util 5.30.12 → 5.31.3

@@ -2,6 +2,7 @@
 
  require 'rbbt-util'
  require 'rbbt/util/simpleopt'
+ require 'rbbt/hpc'
 
  #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
 
@@ -9,7 +10,7 @@ options = SOPT.setup <<EOF
 
  Queue a job in Marenostrum
 
- $ rbbt mnl [options]
+ $ rbbt slurm list [options]
 
  -h--help Print this help
  -d--done Done jobs only
@@ -21,9 +22,10 @@ $ rbbt mnl [options]
  -s--search* Regular expression
  -t--tail* Show the last lines of the STDERR
  -p--progress Report progress of job and the dependencies
- -SBP--sbatch_parameters show sbatch parameters
- -PERF--procpath_performance show Procpath performance summary
+ -BP--batch_parameters show batch parameters
+ -BPP--batch_procpath show Procpath performance summary
  -sacct--sacct_peformance show sacct performance summary
+ -bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
  EOF
 
  if options[:help]
@@ -35,14 +37,48 @@ if options[:help]
  exit 0
  end
 
- #Log.severity = 4
+ batch_system = options.delete :batch_system
+ batch_system ||= 'auto'
+
+ HPC::BATCH_MODULE = case batch_system.to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ when 'auto'
+ case $previous_commands.last
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case ENV["BATCH_SYSTEM"].to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ end
+ end
+ end
+ end
+
+ raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
+
+ batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
+
  done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress
 
- workdir = File.expand_path('~/rbbt-slurm')
+ workdir = File.expand_path('~/rbbt-batch')
  Path.setup(workdir)
 
  running_jobs = begin
- squeue_txt = CMD.cmd('squeue').read
+ squeue_txt = HPC::BATCH_MODULE.job_status
  squeue_txt.split("\n").collect{|l| l.to_i.to_s}
  rescue
  Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
@@ -62,35 +98,48 @@ else
  end
 
  count = 0
- workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
+ workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
+ command_txt = Open.read(fcmd)
 
- if m = Open.read(fcmd).match(/#CMD: (.*)/)
+ if m = command_txt.match(/#CMD: (.*)/)
  cmd = m[1]
  else
  cmd = nil
  end
 
- if m = Open.read(fcmd).match(/#MANIFEST: (.*)/)
+ if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
+ job_batch_system = m[1].downcase
+ else
+ job_batch_system = nil
+ end
+
+ different_system = job_batch_system != batch_system
+
+ if m = command_txt.match(/#MANIFEST: (.*)/)
  manifest = m[1]
  else
  manifest = nil
  end
 
+ if m = command_txt.match(/#STEP_PATH: (.*)/)
+ step_path = m[1]
+ else
+ step_path = nil
+ end
 
- if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
- exe = m[1].sub('step_path=$(','')
+ if m = command_txt.match(/#EXEC_CMD: (.*)/)
+ exe = m[1]
  else
  exe = nil
  end
 
- if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
+ if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
  container_home = m[1]
  else
  container_home = nil
  end
 
-
  if File.exists?(fid = File.join(dir, 'job.id'))
  id = Open.read(fid).chomp
  else
@@ -104,11 +153,20 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  end
 
  if File.exists?(fstatus = File.join(dir, 'job.status'))
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
+ fstatus_txt = Open.read(fstatus)
+ begin
+ if job_batch_system == "lsf"
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
+ else
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
+ end
+ rescue
+ nodes = []
+ end
  elsif job_nodes[id]
- nodes = job_nodes[id].reject{|n| n.include? "("}
+ nodes = job_nodes[id].reject{|n| n.include? "("}
  else
- nodes = []
+ nodes = []
  end
 
  if File.exists?(File.join(dir, 'exit.status'))
@@ -137,7 +195,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  select = true if done && exit_status == 0
  select = true if error && exit_status && exit_status != 0
  select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
- is_running = exit_status.nil? && running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)
+ is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
  select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
  select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
  select = true if jobid && jobid.split(",").include?(id)
@@ -151,29 +209,39 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
 
 
  puts Log.color :blue, dir
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err'))
  puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest)
+ puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
  puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
+ if different_system
+ puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
+ else
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
+ end
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
  puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exists?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
 
- if options[:sbatch_parameters]
- puts Log.color(:magenta, "SBATCH parameters: ")
- text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
+ if options[:batch_parameters]
+ puts Log.color(:magenta, "BATCH parameters: ")
+ case job_batch_system
+ when 'slurm'
+ text = CMD.cmd('grep "^#SBATCH" |tail -n +5', :in => Open.read(fcmd)).read.strip
+ when 'lsf'
+ text = CMD.cmd('grep "^#BSUB" |tail -n +5', :in => Open.read(fcmd)).read.strip
+ end
  lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
  puts Log.color :yellow, lines * "\n"
  end
 
  fprocpath = File.join(dir, 'procpath.sqlite3')
- if options[:procpath_performance] && Open.exists?(fprocpath)
+ if options[:batch_procpath] && Open.exists?(fprocpath)
  puts Log.color(:magenta, "Procpath summary: ")
  require 'rbbt/tsv/csv'
  meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
@@ -215,13 +283,15 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
 
  if options[:sacct_peformance]
  begin
- tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
- values = tsv[tsv.keys.first]
- if values.compact.any?
- puts Log.color(:magenta, "SACCT performance: ")
- puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
- end
+ raise "sacct not supported for LSF" unless batch_system == 'slurm'
+ tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
+ values = tsv[tsv.keys.first]
+ if values.compact.any?
+ puts Log.color(:magenta, "SACCT performance: ")
+ puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
+ end
  rescue
+ Log.warn $!.message
  end
  end
 
@@ -247,7 +317,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  step = Step.new step_path
  step.load_dependencies_from_info
  (step.rec_dependencies + [step]).reverse.each do |j|
- next if j.done? || ! j.running?
+ next if j.done?
  next unless j.file(:progress).exists?
  bar = Log::ProgressBar.new
  bar.load(j.file(:progress).yaml)
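
The main functional change to the listing command above is how the batch system is chosen: an explicit -bs/--batch_system flag wins, and with 'auto' the script falls back to the invoking subcommand ($previous_commands.last), then the Rbbt::Config keys, then the BATCH_SYSTEM environment variable, finally settling HPC::BATCH_MODULE on HPC::SLURM or HPC::LSF (or raising a ParameterException). A condensed, dependency-free sketch of that first-match-wins cascade; detect_batch_system and its arguments are illustrative stand-ins, not part of the gem:

# Condensed illustration of the auto-detection cascade above; `requested`,
# `subcommand` and `config_value` stand in for the -bs flag,
# $previous_commands.last and Rbbt::Config.get(...), respectively.
def detect_batch_system(requested, subcommand, config_value)
  candidates = requested.to_s.downcase == 'auto' ?
    [subcommand, config_value, ENV["BATCH_SYSTEM"]] : [requested]
  candidates.map { |c| c.to_s.downcase }.find { |c| %w(slurm lsf).include?(c) }
end

detect_batch_system('lsf',  'list',  nil)    # => "lsf"   (explicit flag wins)
detect_batch_system('auto', 'slurm', nil)    # => "slurm" (taken from the subcommand)
detect_batch_system('auto', 'list',  'lsf')  # => "lsf"   (config / environment fallback)
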
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+
+ require 'rbbt/util/simpleopt'
+ require 'rbbt/workflow'
+ require 'rbbt/workflow/usage'
+ require 'rbbt/hpc'
+ require 'rbbt/hpc/orchestrate'
+ require 'time'
+
+ $slurm_options = SOPT.get <<EOF
+ -dr--dry_run Print only the template
+ -cj--clean_job Clean job
+ --drbbt* Use development version of rbbt
+ -sing--singularity Use Singularity
+ -ug--user_group* Use alternative user group for group project directory
+ -c--contain* Contain in directory (using Singularity)
+ -s--sync* Contain in directory and sync jobs
+ -e--exclusive Make exclusive use of the node
+ -hm--highmem Make use of highmem cores
+ -wc--wipe_container* Wipe the jobs from the contain directory
+ -CS--contain_and_sync Contain and sync to default locations
+ -ci--copy_image When using a container directory, copy image there
+ -t--tail Tail the logs
+ -BPP--batch_procpath* Save Procpath performance for batch job; specify only options
+ -q--queue* Queue
+ -t--task_cpus* Tasks
+ -W--workflows* Additional workflows
+ -tm--time* Time
+ -OR--orchestration_rules* Orchestration rules
+ -rmb--remove_batch_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
+ EOF
+
+ batch_system = $slurm_options.delete :batch_system
+ batch_system ||= 'auto'
+
+ HPC::BATCH_MODULE = case batch_system.to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ when 'auto'
+ case $previous_commands.last
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case ENV["BATCH_SYSTEM"].to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ end
+ end
+ end
+ end
+
+ raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
+
+ class Step
+ def run(*args)
+ if done?
+ self.load
+ else
+ begin
+ Log.debug "Issuing SLURM job for #{self.path}"
+ HPC::BATCH_MODULE.orchestrate_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
+ rescue HPC::SBATCH
+ end
+ end
+ end
+ end
+
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
+ load Rbbt.share.rbbt_commands.workflow.task.find
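
The new script above patches Step#run so that, when the shared workflow task command is loaded at the end, each top-level job is handed to HPC::BATCH_MODULE.orchestrate_job instead of being computed in-process; the rescued HPC::SBATCH exception appears to be how the batch module signals that the job was submitted rather than run locally. A hypothetical caller-side sketch (the Baking workflow and job name are placeholders, not part of this change):

# Hypothetical illustration: with the Step#run override loaded, running a step
# submits a SLURM/LSF job rather than computing it locally.
require 'rbbt/workflow'
wf   = Workflow.require_workflow "Baking"   # placeholder workflow
step = wf.job(:bake_muffin_tray, "test")    # placeholder task and job name
step.run                                    # delegates to HPC::BATCH_MODULE.orchestrate_job(step, ...)
step.load if step.done?                     # result loads only once the batch job has finished
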
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+
+ require 'rbbt-util'
+ require 'rbbt/util/simpleopt'
+ require 'rbbt/hpc'
+
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
+
+ options = SOPT.setup <<EOF
+
+ Queue a job in Marenostrum
+
+ $ rbbt slurm tail <directory> [options]
+
+ -h--help Print this help
+ -d--done Done jobs only
+ -e--error Error jobs only
+ -a--aborted SLURM aborted jobs
+ -r--running Running jobs only
+ -q--queued Queued jobs only
+ -j--job* Job ids
+ -s--search* Regular expression
+ -t--tail* Show the last lines of the STDERR
+ -p--progress Report progress of job and the dependencies
+ -SBP--sbatch_parameters show sbatch parameters
+ -PERF--procpath_performance show Procpath performance summary
+ -sacct--sacct_peformance show sacct performance summary
+ -bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
+ EOF
+
+ if options[:help]
+ if defined? rbbt_usage
+ rbbt_usage
+ else
+ puts SOPT.doc
+ end
+ exit 0
+ end
+
+ batch_system = options.delete :batch_system
+ batch_system ||= 'auto'
+
+ HPC::BATCH_MODULE = case batch_system.to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ when 'auto'
+ case $previous_commands.last
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case ENV["BATCH_SYSTEM"].to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ end
+ end
+ end
+ end
+
+ raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
+
+ directory = ARGV.shift
+
+ raise ParameterException if directory.nil?
+
+ directory = File.dirname(directory) unless File.directory?(directory)
+
+ require 'rbbt/hpc/slurm'
+
+ HPC::BATCH_MODULE.follow_job directory, true
@@ -0,0 +1,80 @@
+ #!/usr/bin/env ruby
+
+ require 'rbbt/util/simpleopt'
+ require 'rbbt/workflow'
+ require 'rbbt/workflow/usage'
+ require 'rbbt/hpc'
+ require 'time'
+
+ $slurm_options = SOPT.get <<EOF
+ -dr--dry_run Print only the template
+ -cj--clean_job Clean job
+ --drbbt* Use development version of rbbt
+ -sing--singularity Use Singularity
+ -ug--user_group* Use alternative user group for group project directory
+ -c--contain* Contain in directory (using Singularity)
+ -s--sync* Contain in directory and sync jobs
+ -e--exclusive Make exclusive use of the node
+ -hm--highmem Make use of highmem cores
+ -wc--wipe_container* Wipe the jobs from the contain directory
+ -CS--contain_and_sync Contain and sync to default locations
+ -ci--copy_image When using a container directory, copy image there
+ -t--tail Tail the logs
+ -BPP--batch_procpath* Save Procpath performance for batch job; specify only options
+ -q--queue* Queue
+ -t--task_cpus* Tasks
+ -W--workflows* Additional workflows
+ -tm--time* Time
+ -rmb--remove_batch_dir Remove the batch working directory (command, STDIN, exit status, ...)
+ -bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
+ EOF
+
+ batch_system = $slurm_options.delete :batch_system
+ batch_system ||= 'auto'
+
+ HPC::BATCH_MODULE = case batch_system.to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ when 'auto'
+ case $previous_commands.last
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ else
+ case ENV["BATCH_SYSTEM"].to_s.downcase
+ when 'slurm'
+ HPC::SLURM
+ when 'lsf'
+ HPC::LSF
+ end
+ end
+ end
+ end
+
+ raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
+
+ class Step
+ def run(*args)
+ if done?
+ self.load
+ else
+ begin
+ Log.debug "Issuing SLURM job for #{self.path}"
+ HPC::BATCH_MODULE.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
+ rescue HPC::SBATCH
+ end
+ end
+ end
+ end
+
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
+ load Rbbt.share.rbbt_commands.workflow.task.find