rbbt-util 5.32.4 → 5.32.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/util/simpleopt'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/workflow/usage'
6
+ require 'rbbt/hpc'
7
+ require 'rbbt/hpc/orchestrate'
8
+ require 'time'
9
+
10
# Options handled by the batch layer, parsed by SOPT.get from the spec below
# (one option per line, written as -short--long followed by its description).
# Presumably a trailing * marks options that take a value -- confirm against
# SOPT.
#
# --batch_system is declared here because the code right after the spec reads
# :batch_system from the parsed options; without the declaration that lookup
# could never find a value and the backend was always 'auto'.
#
# NOTE(review): -t is declared for both --tail and --task_cpus; confirm how
# SOPT resolves the clash before relying on the short form.
$slurm_options = SOPT.get <<EOF
-dr--dry_run Print only the template
-cj--clean_job Clean job
--drbbt* Use development version of rbbt
-sing--singularity Use Singularity
-si--singularity_img* Singularity image to use
-ug--user_group* Use alternative user group for group project directory
-c--contain* Contain in directory (using Singularity)
-s--sync* Contain in directory and sync jobs
-e--exclusive Make exclusive use of the node
-hm--highmem Make use of highmem cores
-wc--wipe_container* Wipe the jobs from the contain directory
-CS--contain_and_sync Contain and sync to default locations
-ci--copy_image When using a container directory, copy image there
-t--tail Tail the logs
-BPP--batch_procpath* Save Procpath performance for batch job; specify only options
-q--queue* Queue
-t--task_cpus* Tasks
-tm--time* Time
-lin--licenses* SLURM licenses
-cons--constraint* SLURM constraint
-W--workflows* Additional workflows
-OR--orchestration_rules* Orchestration rules
-rmb--remove_batch_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

# The batch system option is removed from the option hash so it is not
# forwarded to the job; 'auto' is used when it was not given.
batch_system = $slurm_options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system(batch_system)

if HPC::BATCH_MODULE.nil?
  raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}")
end
42
+
43
# Reopens Step so that running a step hands it to the selected batch module
# instead of executing it in this process.
class Step
  # Returns the loaded result when the step is already done. Otherwise the
  # step is passed to HPC::BATCH_MODULE.orchestrate_job together with the
  # merged command-line options. An HPC::SBATCH raised while doing so is
  # rescued and ignored, in which case nil is returned.
  def run(*args)
    return self.load if done?

    begin
      Log.debug "Issuing SLURM job for #{self.path}"
      batch_options = SOPT::GOT_OPTIONS.merge($slurm_options)
      HPC::BATCH_MODULE.orchestrate_job(self, batch_options)
    rescue HPC::SBATCH
      nil
    end
  end
end
56
+
57
# Forward the additional workflows (plus --detach) to the regular workflow
# task command, which is then loaded to do the actual work.
extra_workflows = $slurm_options[:workflows]
ARGV.push("-W", extra_workflows, '--detach') if extra_workflows

load Rbbt.share.rbbt_commands.workflow.task.find
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+ require 'rbbt/hpc'
6
+
7
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
8
+
9
# Command-line interface of the tail command. The banner previously described
# queueing a job, which is not what this script does: it follows the job found
# in the given batch directory.
options = SOPT.setup <<EOF

Follow the job in a batch directory

$ rbbt slurm tail <directory> [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-p--progress Report progress of job and the dependencies
-SBP--sbatch_parameters show sbatch parameters
-PERF--procpath_performance show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

# Print the usage and stop. rbbt_usage is used when something has defined it;
# otherwise the documentation built by SOPT is printed.
if options[:help]
  defined?(rbbt_usage) ? rbbt_usage : puts(SOPT.doc)
  exit 0
end
39
+
40
# Select the batch backend ('auto' unless --batch_system was given).
batch_system = options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system(batch_system)

if HPC::BATCH_MODULE.nil?
  raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}")
end

# The first positional argument names the batch directory, or a path inside
# it; anything that is not a directory is replaced by its parent.
target = ARGV.shift
raise ParameterException if target.nil?

directory = File.directory?(target) ? target : File.dirname(target)

require 'rbbt/hpc/slurm'

HPC::BATCH_MODULE.follow_job directory, true
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/util/simpleopt'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/workflow/usage'
6
+ require 'rbbt/hpc'
7
+ require 'time'
8
+
9
# Options handled by the batch layer, parsed by SOPT.get from the spec below
# (one option per line, written as -short--long followed by its description).
$slurm_options = SOPT.get <<EOF
-dr--dry_run Print only the template
-cj--clean_job Clean job
--drbbt* Use development version of rbbt
-sing--singularity Use Singularity
-si--singularity_img* Singularity image to use
-ug--user_group* Use alternative user group for group project directory
-c--contain* Contain in directory (using Singularity)
-s--sync* Contain in directory and sync jobs
-e--exclusive Make exclusive use of the node
-hm--highmem Make use of highmem cores
-wc--wipe_container* Wipe the jobs from the contain directory
-CS--contain_and_sync Contain and sync to default locations
-ci--copy_image When using a container directory, copy image there
-t--tail Tail the logs
-BPP--batch_procpath* Save Procpath performance for batch job; specify only options
-q--queue* Queue
-t--task_cpus* Tasks
-tm--time* Time
-lin--licenses* SLURM licenses
-cons--constraint* SLURM constraint
-W--workflows* Additional workflows
-rmb--remove_batch_dir Remove the batch working directory (command, STDIN, exit status, ...)
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

# The batch system option is removed from the option hash so it is not
# forwarded to the job; 'auto' is used when it was not given.
batch_system = $slurm_options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system(batch_system)

if HPC::BATCH_MODULE.nil?
  raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}")
end
41
+
42
# Reopens Step so that running a step hands it to the selected batch module
# instead of executing it in this process.
class Step
  # Returns the loaded result when the step is already done. Otherwise the
  # step is passed to HPC::BATCH_MODULE.run_job together with the merged
  # command-line options. An HPC::SBATCH raised while doing so is rescued and
  # ignored, in which case nil is returned.
  def run(*args)
    return self.load if done?

    begin
      Log.debug "Issuing SLURM job for #{self.path}"
      batch_options = SOPT::GOT_OPTIONS.merge($slurm_options)
      HPC::BATCH_MODULE.run_job(self, batch_options)
    rescue HPC::SBATCH
      nil
    end
  end
end
55
+
56
# Forward the additional workflows to the regular workflow task command,
# which is then loaded to do the actual work.
extra_workflows = $slurm_options[:workflows]
ARGV.push("-W", extra_workflows) if extra_workflows

load Rbbt.share.rbbt_commands.workflow.task.find
@@ -2,7 +2,7 @@
2
2
 
3
3
  require 'rbbt-util'
4
4
  require 'rbbt/util/simpleopt'
5
- require 'rbbt/workflow/remote_workflow'
5
+ require 'rbbt/util/migrate'
6
6
 
7
7
  $0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
8
8
 
@@ -30,83 +30,10 @@ if options[:help]
30
30
  exit 0
31
31
  end
32
32
 
33
- #excludes = %w(.save .crap .source tmp filecache open-remote workflows apps software jobs PCAWG)
34
- excludes = %w(.save .crap .source tmp filecache open-remote)
35
- excludes += (options[:exclude] || "").split(/,\s*/)
36
- excludes_str = excludes.collect{|s| "--exclude '#{s}'" } * " "
37
-
38
- test_str = options[:test] ? '-nv' : ''
39
-
40
33
  path, search_path, _sep, *other = ARGV
41
34
 
42
35
  search_path = 'user' if search_path.nil?
43
- resource = Rbbt
44
-
45
- path, real_paths, lpath = if options[:source]
46
- lpath, *paths = Misc.ssh_run(options[:source], <<-EOF).split("\n")
47
- require 'rbbt-util'
48
- path = "#{path}"
49
- if Open.exists?(path)
50
- path = #{resource.to_s}.identify(path)
51
- else
52
- path = Path.setup(path)
53
- end
54
- puts path
55
- puts path.glob_all.collect{|p| File.directory?(p) ? p + "/" : p } * "\n"
56
- EOF
57
- [path, paths.collect{|p| [options[:source], p] * ":"}, lpath]
58
- else
59
- if File.exists?(path)
60
-
61
- path = resource.identify(path)
62
- else
63
- path = Path.setup(path)
64
- end
65
- [path, path.glob_all, path]
66
- end
67
-
68
- target = if options[:target]
69
- target = Misc.ssh_run(options[:target], <<-EOF).split("\n").first
70
- require 'rbbt-util'
71
- path = "#{path}"
72
- resource = #{resource.to_s}
73
- search_path = "#{search_path}"
74
- puts resource[path].find(search_path)
75
- EOF
76
- else
77
- resource[lpath].find(search_path)
78
- end
79
-
80
- real_paths.each do |source|
81
-
82
-
83
- if File.directory?(source) || source =~ /\/$/
84
- source += "/" unless source[-1] == "/"
85
- target += "/" unless target[-1] == "/"
86
- end
87
36
 
88
- next if source == target
37
+ options[:other] = other
89
38
 
90
- if options[:target]
91
- CMD.cmd("ssh #{options[:target]} mkdir -p '#{File.dirname(target)}'")
92
- else
93
- Open.mkdir File.dirname(target)
94
- end
95
-
96
- if options[:target]
97
- target_path = [options[:target], target] * ":"
98
- else
99
- target_path = target
100
- end
101
-
102
- cmd = "rsync -avztAXHP --copy-unsafe-links #{test_str} #{excludes_str} #{source} #{target_path} #{other * " "}"
103
-
104
- cmd << " && rm -Rf #{source}" if options[:delete]
105
-
106
- if options[:print]
107
- puts cmd
108
- exit 0
109
- else
110
- CMD.cmd_log(cmd, :log => Log::INFO)
111
- end
112
- end
39
+ Rbbt.migrate(path, search_path, options)
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+ require 'rbbt/hpc'
6
+
7
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
8
+
9
# Command-line interface of the clean command.
#
# --batch_system is declared because the script reads :batch_system from the
# parsed options right after this block; without the declaration the backend
# could never be chosen from the command line.
options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt slurm clean [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-BP--batch_parameters show batch parameters
-dr--dry_run Do not erase anything
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
EOF

# Print the usage and stop. rbbt_usage is used when something has defined it;
# otherwise the documentation built by SOPT is printed.
if options[:help]
  defined?(rbbt_usage) ? rbbt_usage : puts(SOPT.doc)
  exit 0
end
35
+
36
# Select the batch backend ('auto' unless a batch system was given).
batch_system = options.delete(:batch_system) || 'auto'

HPC::BATCH_MODULE = HPC.batch_system(batch_system)

if HPC::BATCH_MODULE.nil?
  raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}")
end

Log.severity = 4
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run

# Directory scanned for batch job directories
workdir = File.expand_path('~/rbbt-batch')
Path.setup(workdir)

# Ids of the jobs reported by the batch system: the leading integer of each
# line of the status listing. When the listing cannot be obtained the list is
# empty and $norunningjobs records that job liveness is unknown.
running_jobs = begin
                 squeue_txt = HPC::BATCH_MODULE.job_status
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map from the first field of each status line to the comma-separated values
# of its last field (used later as the nodes of the job). It is always a Hash,
# empty when no listing is available, so later lookups never hit nil; blank
# lines are skipped because they have no fields to split.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end
68
+
69
# Scan every batch directory under workdir (ordered by the modification time
# of its command.batch), print a report for each one matching the selection
# flags and erase it unless dry_run is set. The number of matches is printed
# at the end.

# With neither --aborted nor --error given, both selections are enabled.
aborted = error = true if aborted.nil? && error.nil?

count = 0
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)
  command_txt = Open.read(fcmd)

  # Fields recorded in the batch script; each one is nil when absent
  cmd              = command_txt[/#CMD: (.*)/, 1]
  exe              = command_txt[/# Run command\n(.*?)\n/im, 1]
  container_home   = command_txt[/^CONTAINER_DIR=(.*)/, 1]
  job_batch_system = command_txt[/^BATCH_SYSTEM=(.*)/, 1]
  job_batch_system = job_batch_system.downcase if job_batch_system

  # NOTE(review): batch_system holds the requested name, which may be the
  # literal 'auto'; in that case this is true for every job. Confirm whether
  # the detected system should be compared instead.
  different_system = job_batch_system != batch_system

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes come from the last line of job.status when the file exists (field 5
  # for lsf, the last field otherwise), else from the live job listing.
  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    begin
      fields = Open.read(fjob_status).split("\n").last.split(/\s+/)
      node_field = job_batch_system == "lsf" ? fields[5] : fields.last
      nodes = node_field.split(",")
    rescue
      nodes = []
    end
  elsif job_nodes && job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  # Seconds since the logs were last written; only computed when std.out
  # exists, and tolerant of a missing std.err.
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    last_write = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }.max
    time_diff = Time.now - last_write
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  running = running_jobs.include?(id)

  if done || error || aborted || queued || jobid
    select = false
    select = true if done && exit_status == 0
    select = true if error && exit_status && exit_status != 0
    # When the job listing could not be obtained, a job without exit status
    # cannot be told apart from a running one, so it is not treated as
    # aborted (otherwise live jobs would be erased).
    select = true if aborted && exit_status.nil? && ! running && ! $norunningjobs
    waiting_on_deps = deps && (running_jobs & deps).any?
    is_running = exit_status.nil? && ((running && ! waiting_on_deps) || different_system)
    select = true if queued && (waiting_on_deps || (is_running && nodes.empty?))
    select = true if jobid && jobid.split(",").include?(id)
    # A job with no recorded command can never match a search
    select = select && cmd && cmd.match(/#{search}/) if search
    next unless select
  elsif search
    next unless cmd && cmd.match(/#{search}/)
  end

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home

  # Finished jobs show Done/Error plus the id; unfinished ones show the id in
  # green when listed as running (or when liveness is unknown), red otherwise.
  job_label = if exit_status
                (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })"
              elsif running || $norunningjobs
                Log.color(:green, id)
              else
                Log.color(:red, id)
              end
  puts Log.color(:magenta, "Job ID: ") << job_label

  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "

  last_update = (id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)"
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << last_update

  if options[:batch_parameters]
    puts Log.color(:magenta, "BATCH parameters: ")
    case job_batch_system
    when 'slurm'
      puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
    when 'lsf'
      puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => Open.read(fcmd)).read.strip
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
212
+