rbbt-util 5.28.14 → 5.30.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -103,7 +103,7 @@ class Step
103
103
  end
104
104
 
105
105
  job.dup_inputs unless status == 'done' or job.started?
106
- job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started?
106
+ job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started? || ! Workflow.job_path?(job.path)
107
107
 
108
108
  canfail = ComputeDependency === job && job.canfail?
109
109
  end
@@ -130,7 +130,6 @@ class Step
130
130
  (inputs.flatten.select{|i| Step === i} + inputs.flatten.select{|dep| Path === dep && Step === dep.resource}.collect{|dep| dep.resource})
131
131
  end
132
132
 
133
-
134
133
  def execute_dependency(dependency, log = true)
135
134
  task_name = self.task_name
136
135
  canfail_paths = self.canfail_paths
@@ -122,7 +122,6 @@ class Step
122
122
  reject{|dependency| (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dependency) || Open.remote?(dependency.path) }.
123
123
  reject{|dependency| dependency.error? }.
124
124
  #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
125
- #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
126
125
  select{|dependency| dependency.updatable? }.
127
126
  collect{|dependency| Workflow.relocate_dependency(self, dependency)}
128
127
  end
@@ -374,7 +373,6 @@ class Step
374
373
  Log.exception $!
375
374
  ensure
376
375
  Step.purge_stream_cache
377
- set_info :pid, nil
378
376
  Open.rm pid_file if Open.exist?(pid_file)
379
377
  end
380
378
  end
@@ -389,7 +387,6 @@ class Step
389
387
  _clean_finished
390
388
  rescue
391
389
  stop_dependencies
392
- set_info :pid, nil
393
390
  Open.rm pid_file if Open.exist?(pid_file)
394
391
  end
395
392
  end
@@ -450,7 +447,7 @@ class Step
450
447
  ensure
451
448
  no_load = false unless IO === result
452
449
  Open.rm pid_file if Open.exist?(pid_file) unless no_load
453
- set_info :pid, nil unless no_load
450
+ #set_info :pid, nil unless no_load
454
451
  end
455
452
  end
456
453
 
@@ -560,7 +557,7 @@ class Step
560
557
  RbbtSemaphore.post_semaphore(semaphore) if semaphore
561
558
  Kernel.exit! -1
562
559
  end
563
- set_info :pid, nil
560
+ #set_info :pid, nil
564
561
  ensure
565
562
  RbbtSemaphore.post_semaphore(semaphore) if semaphore
566
563
  end
@@ -57,7 +57,7 @@ module Task
57
57
  puts Log.color(:magenta, "Input select options")
58
58
  puts
59
59
  selects.collect{|p| p}.uniq.each do |input,options|
60
- puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| o.to_s} * ", ") << "\n"
60
+ puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| Array === o ? o.first.to_s : o.to_s} * ", ") << "\n"
61
61
  puts unless Log.compact
62
62
  end
63
63
  puts
@@ -26,15 +26,25 @@ module Workflow
26
26
  workload
27
27
  end
28
28
 
29
# Build the combined workload for a list of jobs.
#
# Each job is expanded with Orchestrator.job_workload and the resulting
# entries are merged into a single hash. Entries are de-duplicated by
# +path+: the first entry seen for a given path is kept and any later entry
# with the same path is ignored.
#
# jobs - enumerable of jobs accepted by Orchestrator.job_workload.
#
# Returns a Hash holding the key/value pairs yielded by job_workload.
def self.workload(jobs)
  # Paths already present in the result; avoids rebuilding the list of
  # known paths for every candidate entry.
  seen_paths = {}

  jobs.each_with_object({}) do |job, acc|
    Orchestrator.job_workload(job).each do |j, d|
      path = j.path
      next if seen_paths.key?(path)

      seen_paths[path] = true
      acc[j] = d
    end
  end
end
37
+
29
38
  def self.job_rules(rules, job)
30
39
  workflow = job.workflow.to_s
31
40
  task_name = job.task_name.to_s
41
+ defaults = rules["defaults"] || {}
32
42
 
33
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow]
34
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow][task_name]
43
+ return IndiferentHash.setup(defaults) unless rules[workflow]
44
+ return IndiferentHash.setup(defaults) unless rules[workflow][task_name]
35
45
 
36
46
  job_rules = IndiferentHash.setup(rules[workflow][task_name])
37
- rules["defaults"].each{|k,v| job_rules[k] = v if job_rules[k].nil? } if rules["defaults"]
47
+ defaults.each{|k,v| job_rules[k] = v if job_rules[k].nil? } if defaults
38
48
  job_rules
39
49
  end
40
50
 
@@ -169,12 +179,7 @@ module Workflow
169
179
  def process(rules, jobs)
170
180
  begin
171
181
 
172
- workload = jobs.inject({}) do |acc,job|
173
- Orchestrator.job_workload(job).each do |j,d|
174
- acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
175
- end
176
- acc
177
- end
182
+ workload = Orchestrator.workload(jobs)
178
183
  all_jobs = workload.keys
179
184
 
180
185
  top_level_jobs = jobs.collect{|job| job.path }
@@ -22,11 +22,14 @@ class Step
22
22
  end
23
23
 
24
24
  def self.prov_report_msg(status, name, path, info = nil)
25
- parts = path.sub(/\{.*/,'').sub(/#{Regexp.quote(name)}$/,'').split "/"
25
+ parts = path.sub(/\{.*/,'').split "/"
26
26
 
27
+ parts.pop
28
+
27
29
  task = Log.color(:yellow, parts.pop)
28
30
  workflow = Log.color(:magenta, parts.pop)
29
- if status.to_s == 'noinfo' and parts.last != 'jobs'
31
+ #if status.to_s == 'noinfo' && parts.last != 'jobs'
32
+ if ! Workflow.job_path?(path)
30
33
  task, status, workflow = Log.color(:yellow, info[:task_name]), Log.color(:green, "file"), Log.color(:magenta, "-")
31
34
  end
32
35
 
@@ -0,0 +1,165 @@
1
#!/usr/bin/env ruby

# Erase the directories of SLURM jobs found under ~/rbbt-slurm.
#
# Every directory containing a command.slurm file is treated as a job. Jobs
# are selected by state (done / error / aborted), optionally narrowed by job
# id and by a regular expression over the recorded command. A summary of each
# selected job is printed and its directory is removed unless --dry_run is
# given.

require 'rbbt-util'
require 'rbbt/util/simpleopt'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt mnl [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aboted jobs
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-SBP--sbatch_parameters show sbatch parameters
-dr--dry_run Do not erase anything
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

Log.severity = 4
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run

workdir = File.expand_path('~/rbbt-slurm')
Path.setup(workdir)

# Ids of the jobs currently listed by squeue (first column of each line).
running_jobs = begin
                 squeue_txt = CMD.cmd('squeue').read
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Job id => node list, taken from the first and last columns of squeue.
# Left empty when squeue is unavailable so that lookups never hit nil.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

# With no state filter at all, clean error and aborted jobs. An explicit
# --done is respected on its own, as the help text says "Done jobs only".
aborted = error = true if done.nil? && error.nil? && aborted.nil?

count = 0
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)

  # Read the submission script once and extract the fields shown below.
  command_txt = Open.read(fcmd)

  m = command_txt.match(/#CMD: (.*)/)
  cmd = m ? m[1] : nil

  m = command_txt.match(/# Run command\n(.*?)\n/im)
  exe = m ? m[1] : nil

  m = command_txt.match(/^CONTAINER_DIR=(.*)/)
  container_home = m ? m[1] : nil

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    # Last whitespace-separated field of the last line; tolerate an empty file.
    last_line = Open.read(fjob_status).split("\n").last
    nodes = last_line.to_s.split(/\s+/).last.to_s.split(",")
  elsif job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    # std.err may be missing even when std.out exists
    log_mtimes = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }
    time_diff = Time.now - log_mtimes.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || jobid || search
    selected = false
    selected = true if done && exit_status && exit_status == 0
    selected = true if error && exit_status && exit_status != 0
    # When squeue could not be queried we cannot tell aborted from running
    # jobs, so nothing is treated as aborted (and nothing running is erased).
    selected = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
    selected &&= jobid.split(",").include?(id) if jobid
    selected &&= cmd && cmd.match(/#{search}/) if search
    next unless selected
  end

  if exit_status
    state = exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")
    job_id_txt = state + " (#{ id })"
  elsif running_jobs.include?(id) || $norunningjobs
    job_id_txt = Log.color(:green, id)
  else
    job_id_txt = Log.color(:red, id)
  end

  last_update_txt = (id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)"

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(fcmd).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << job_id_txt
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << last_update_txt

  if sbatch_parameters
    puts Log.color(:magenta, "SBATCH parameters: ")
    puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
165
+
@@ -0,0 +1,220 @@
1
#!/usr/bin/env ruby

# List the SLURM jobs found under ~/rbbt-slurm.
#
# Every directory containing a command.slurm file is treated as a job. When
# any filter option is given, a job is shown if it matches at least one of
# them (state, job id or regular expression over the recorded command).
# Optional sections show the sbatch parameters, a procpath summary, a sacct
# summary and an excerpt of std.err.

require 'rbbt-util'
require 'rbbt/util/simpleopt'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

Queue a job in Marenostrum

$ rbbt mnl [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aboted jobs
-r--running Running jobs only
-q--queued Queued jobs only
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-SBP--sbatch_parameters show sbatch parameters
-PERF--procpath_performance show Procpath performance summary
-sacct--sacct_peformance show sacct performance summary
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

Log.severity = 4
done, error, running, queued, aborted, jobid, search, tail = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail

workdir = File.expand_path('~/rbbt-slurm')
Path.setup(workdir)

# Ids of the jobs currently listed by squeue (first column of each line).
running_jobs = begin
                 squeue_txt = CMD.cmd('squeue').read
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Job id => node list, taken from the first and last columns of squeue.
# Left empty when squeue is unavailable so that lookups never hit nil.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)

  # Read the submission script once and extract the fields shown below.
  command_txt = Open.read(fcmd)

  m = command_txt.match(/#CMD: (.*)/)
  cmd = m ? m[1] : nil

  m = command_txt.match(/# Run command\n(.*?)\n/im)
  exe = m ? m[1] : nil

  m = command_txt.match(/^CONTAINER_DIR=(.*)/)
  container_home = m ? m[1] : nil

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    # Last whitespace-separated field of the last line; tolerate an empty file.
    last_line = Open.read(fjob_status).split("\n").last
    nodes = last_line.to_s.split(/\s+/).last.to_s.split(",")
  elsif job_nodes[id]
    nodes = job_nodes[id]
  else
    nodes = []
  end

  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    # std.err may be missing even when std.out exists
    log_mtimes = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }
    time_diff = Time.now - log_mtimes.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  if done || error || aborted || running || queued || jobid || search
    selected = false
    selected = true if done && exit_status == 0
    selected = true if error && exit_status && exit_status != 0
    # When squeue could not be queried, jobs are reported as alive (see the
    # warning above), so none of them is classified as aborted.
    selected = true if aborted && exit_status.nil? && ! $norunningjobs && ! running_jobs.include?(id)
    selected = true if queued && deps && (running_jobs & deps).any?
    selected = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
    selected = true if jobid && jobid.split(",").include?(id)
    selected = true if search && cmd && cmd.match(/#{search}/)
    next unless selected
  end

  if exit_status
    state = exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")
    job_id_txt = state + " (#{ id })"
  elsif running_jobs.include?(id) || $norunningjobs
    job_id_txt = Log.color(:green, id)
  else
    job_id_txt = Log.color(:red, id)
  end

  last_update_txt = (id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)"

  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(fcmd).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << job_id_txt
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << last_update_txt

  if options[:sbatch_parameters]
    puts Log.color(:magenta, "SBATCH parameters: ")
    text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
    lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
    puts Log.color :yellow, lines * "\n"
  end

  fprocpath = File.join(dir, 'procpath.sqlite3')
  if options[:procpath_performance] && Open.exists?(fprocpath)
    puts Log.color(:magenta, "Procpath summary: ")
    require 'rbbt/tsv/csv'
    meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
    perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))

    page_size = meta["page_size"].first.to_f
    clock_ticks = meta["clock_ticks"].first.to_f

    # cpu_average: pid => { timestamp => [utime + stime, ...] }
    # rss_average: timestamp => [rss * page_size, ...]
    cpu_average = {}
    rss_average = {}
    perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |_key, values|
      time, stat_pid, ucpu, scpu, _ccpu, _cscpu, rss = values
      time = time.to_f

      cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
      cpu_average[stat_pid] ||= {}
      cpu_average[stat_pid][time] ||= []
      cpu_average[stat_pid][time] << cpu.to_f
      rss_average[time] ||= []
      rss_average[time] << rss.to_f * page_size
    end

    # Ticks consumed by each pid between its first and last sample.
    ticks = 0
    cpu_average.each do |_stat_pid, cpu_average_pid|
      pid_times = cpu_average_pid.keys.sort
      ticks += Misc.sum(cpu_average_pid[pid_times.last]) - Misc.sum(cpu_average_pid[pid_times.first])
    end

    sample_times = rss_average.keys.sort
    time_elapsed = sample_times.empty? ? 0 : sample_times.last - sample_times.first

    # Skip the figures that cannot be computed (no samples, or a single
    # timestamp) instead of failing on nil or printing NaN/Infinity.
    if time_elapsed > 0 && clock_ticks > 0
      puts Log.color(:yellow, "CPU average: ") + "%.2f" % (ticks / clock_ticks / time_elapsed * 100)
    end
    unless rss_average.empty?
      puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)})
    end
  end

  # sacct needs a job id; without one there is nothing to query.
  if options[:sacct_peformance] && id
    begin
      tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
      values = tsv[tsv.keys.first]
      if values.compact.any?
        puts Log.color(:magenta, "SACCT performance: ")
        puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
      end
    rescue
      # Best effort: this section is simply omitted when the sacct query fails.
    end
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

end

puts
puts "Found #{count} jobs"
+ puts "Found #{count} jobs"
220
+