rbbt-util 5.29.0 → 5.30.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,15 +20,18 @@ $ rbbt mnl [options]
20
20
  -j--job* Job ids
21
21
  -s--search* Regular expression
22
22
  -t--tail* Show the last lines of the STDERR
23
+ -SBP--sbatch_parameters show sbatch parameters
24
+ -PERF--procpath_performance show Procpath performance summary
25
+ -sacct--sacct_peformance show sacct performance summary
23
26
  EOF
24
27
 
25
28
  if options[:help]
26
- if defined? rbbt_usage
27
- rbbt_usage
28
- else
29
- puts SOPT.doc
30
- end
31
- exit 0
29
+ if defined? rbbt_usage
30
+ rbbt_usage
31
+ else
32
+ puts SOPT.doc
33
+ end
34
+ exit 0
32
35
  end
33
36
 
34
37
  Log.severity = 4
@@ -38,101 +41,177 @@ workdir = File.expand_path('~/rbbt-slurm')
38
41
  Path.setup(workdir)
39
42
 
40
43
  running_jobs = begin
41
- CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
44
+ squeue_txt = CMD.cmd('squeue').read
45
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
42
46
  rescue
43
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
- $norunningjobs = true
45
- []
47
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
48
+ squeue_txt = nil
49
+ $norunningjobs = true
50
+ []
46
51
  end
47
52
 
53
+ if squeue_txt
54
+ job_nodes = {}
55
+ squeue_txt.split("\n").each do |line|
56
+ parts = line.strip.split(/\s+/)
57
+ job_nodes[parts.first] = parts.last.split(",")
58
+ end
59
+ else
60
+ job_nodes = nil
61
+ end
62
+
48
63
  count = 0
49
64
  workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
- dir = File.dirname(fcmd)
51
-
52
- if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
- cmd = m[1]
54
- else
55
- cmd = nil
65
+ dir = File.dirname(fcmd)
66
+
67
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
68
+ cmd = m[1]
69
+ else
70
+ cmd = nil
71
+ end
72
+
73
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
74
+ exe = m[1].sub('step_path=$(','')
75
+ else
76
+ exe = nil
77
+ end
78
+
79
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
80
+ container_home = m[1]
81
+ else
82
+ container_home = nil
83
+ end
84
+
85
+
86
+ if File.exists?(fid = File.join(dir, 'job.id'))
87
+ id = Open.read(fid).chomp
88
+ else
89
+ id = nil
90
+ end
91
+
92
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
93
+ exit_status = Open.read(fstatus).to_i
94
+ else
95
+ exit_status = nil
96
+ end
97
+
98
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
99
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
100
+ elsif job_nodes[id]
101
+ nodes = job_nodes[id]
102
+ else
103
+ nodes = []
104
+ end
105
+
106
+ if File.exists?(File.join(dir, 'std.out'))
107
+ outt = File.mtime File.join(dir, 'std.out')
108
+ errt = File.mtime File.join(dir, 'std.err')
109
+ time_diff = Time.now - [outt, errt].max
110
+ end
111
+
112
+ fdep = File.join(dir, 'dependencies.list')
113
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
114
+
115
+ fcadep = File.join(dir, 'canfail_dependencies.list')
116
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
117
+
118
+ if done || error || aborted || running || queued || jobid || search
119
+ select = false
120
+ select = true if done && exit_status == 0
121
+ select = true if error && exit_status && exit_status != 0
122
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
123
+ select = true if queued && deps && (running_jobs & deps).any?
124
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
125
+ select = true if jobid && jobid.split(",").include?(id)
126
+ select = true if search && cmd.match(/#{search}/)
127
+ next unless select
128
+ end
129
+
130
+
131
+ puts Log.color :blue, dir
132
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
133
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
134
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
135
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
136
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
137
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
138
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
139
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
140
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
141
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
142
+
143
+ if options[:sbatch_parameters]
144
+ puts Log.color(:magenta, "SBATCH parameters: ")
145
+ text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
146
+ lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
147
+ puts Log.color :yellow, lines * "\n"
148
+ end
149
+
150
+ fprocpath = File.join(dir, 'procpath.sqlite3')
151
+ if options[:procpath_performance] && Open.exists?(fprocpath)
152
+ puts Log.color(:magenta, "Procpath summary: ")
153
+ require 'rbbt/tsv/csv'
154
+ meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
155
+ perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
156
+
157
+ page_size = meta["page_size"].first.to_f
158
+ clock_ticks = meta["clock_ticks"].first.to_f
159
+
160
+ cpu_average = {}
161
+ rss_average = {}
162
+ perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
163
+ time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
164
+ time = time.to_f
165
+
166
+ cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
167
+ cpu_average[stat_pid] ||= {}
168
+ cpu_average[stat_pid][time] ||= []
169
+ cpu_average[stat_pid][time] << cpu.to_f
170
+ rss_average[time] ||= []
171
+ rss_average[time] << rss.to_f * page_size
172
+ end
173
+
174
+ ticks = 0
175
+ cpu_average.each do |stat_pid, cpu_average_pid|
176
+ start = cpu_average_pid.keys.sort.first
177
+ eend = cpu_average_pid.keys.sort.last
178
+ ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
179
+ end
180
+ start = rss_average.keys.sort.first
181
+ eend = rss_average.keys.sort.last
182
+ time_elapsed = eend - start
183
+ puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
184
+ puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
185
+
186
+ end
187
+
188
+ if options[:sacct_peformance]
189
+ begin
190
+ tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
191
+ values = tsv[tsv.keys.first]
192
+ if values.compact.any?
193
+ puts Log.color(:magenta, "SACCT performance: ")
194
+ puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
56
195
  end
57
-
58
- if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
- exe = m[1]
60
- else
61
- exe = nil
62
- end
63
-
64
- if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
- container_home = m[1]
66
- else
67
- container_home = nil
68
- end
69
-
70
-
71
- if File.exists?(fid = File.join(dir, 'job.id'))
72
- id = Open.read(fid).chomp
73
- else
74
- id = nil
75
- end
76
-
77
- if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
- exit_status = Open.read(fstatus).to_i
79
- else
80
- exit_status = nil
81
- end
82
-
83
- if File.exists?(fstatus = File.join(dir, 'job.status'))
84
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
- else
86
- nodes = []
87
- end
88
-
89
- if File.exists?(File.join(dir, 'std.out'))
90
- outt = File.mtime File.join(dir, 'std.out')
91
- errt = File.mtime File.join(dir, 'std.err')
92
- time_diff = Time.now - [outt, errt].max
93
- end
94
-
95
- fdep = File.join(dir, 'dependencies.list')
96
- deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
-
98
- if done || error || aborted || running || queued || jobid || search
99
- select = false
100
- select = true if done && exit_status == 0
101
- select = true if error && exit_status && exit_status != 0
102
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
- select = true if queued && deps && (running_jobs & deps).any?
104
- select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
- select = true if jobid && jobid.split(",").include?(id)
106
- select = true if search && cmd.match(/#{search}/)
107
- next unless select
108
- end
109
-
110
-
111
- puts Log.color :blue, dir
112
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
- puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
- puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
-
122
- if tail && File.exists?(File.join(dir, 'std.err'))
123
- if exit_status && exit_status != 0
124
- puts Log.color(:magenta, "First error or exception found: ")
125
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
- elsif exit_status
127
- puts Log.color(:magenta, "Completed jobs: ")
128
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
- else
130
- puts Log.color(:magenta, "Log tail: ")
131
- puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
- end
133
- end
134
-
135
- count += 1
196
+ rescue
197
+ end
198
+ end
199
+
200
+
201
+ if tail && File.exists?(File.join(dir, 'std.err'))
202
+ if exit_status && exit_status != 0
203
+ puts Log.color(:magenta, "First error or exception found: ")
204
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
205
+ elsif exit_status
206
+ puts Log.color(:magenta, "Completed jobs: ")
207
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
208
+ else
209
+ puts Log.color(:magenta, "Log tail: ")
210
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
211
+ end
212
+ end
213
+
214
+ count += 1
136
215
 
137
216
  end
138
217
 
@@ -21,11 +21,12 @@ $slurm_options = SOPT.get <<EOF
21
21
  -CS--contain_and_sync Contain and sync to default locations
22
22
  -ci--copy_image When using a container directory, copy image there
23
23
  -t--tail Tail the logs
24
+ -SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
24
25
  -q--queue* Queue
25
26
  -t--task_cpus* Tasks
26
27
  -W--workflows* Additional workflows
27
28
  -tm--time* Time
28
- -R--rules* Orchestration rules
29
+ -OR--orchestration_rules* Orchestration rules
29
30
  -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
31
  EOF
31
32
 
@@ -43,5 +44,5 @@ class Step
43
44
  end
44
45
  end
45
46
 
46
- ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
47
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
47
48
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -20,6 +20,7 @@ $slurm_options = SOPT.get <<EOF
20
20
  -CS--contain_and_sync Contain and sync to default locations
21
21
  -ci--copy_image When using a container directory, copy image there
22
22
  -t--tail Tail the logs
23
+ -SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
23
24
  -q--queue* Queue
24
25
  -t--task_cpus* Tasks
25
26
  -W--workflows* Additional workflows
@@ -35,7 +35,7 @@ file = case file
35
35
  fields = options[:fields]
36
36
  raise ParameterException, "Please specify the fields to slice" if fields.nil?
37
37
 
38
- options[:header_hash] = options["header_hash"]
38
+ options[:header_hash] ||= options["header_hash"]
39
39
 
40
40
  case
41
41
  when options[:tokyocabinet]
@@ -45,8 +45,8 @@ when options[:tokyocabinet_bd]
45
45
  tsv = Persist.open_tokyocabinet(file, false, nil, TokyoCabinet::BDB)
46
46
  puts tsv.summary
47
47
  else
48
- stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :keys => fields, :unnamed => true) do |*p|
49
- p * "\t"
48
+ stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :fields => fields.split(","), :unnamed => true) do |k,fields,names|
49
+ [k,fields].flatten * "\t"
50
50
  end
51
51
  puts stream.read
52
52
  exit 0
@@ -86,7 +86,7 @@ messages = info[:messages]
86
86
  backtrace = info[:backtrace]
87
87
  pid = info[:pid]
88
88
  exception = info[:exception]
89
- rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :pid, :archived_info]
89
+ rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :archived_info]
90
90
 
91
91
 
92
92
  puts Log.color(:magenta, "File") << ": " << step.path
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
20
20
  puts
21
21
  if workflow.nil?
22
22
  puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
23
- exit -1
23
+ exit! -1
24
24
  end
25
25
 
26
26
  if task.nil?
@@ -203,10 +203,11 @@ The `recursive_clean` cleans all the job dependency steps recursively.
203
203
  -prec--prepare_cpus* Number of dependencies prepared in parallel
204
204
  -rwt--remote_workflow_tasks* Load a yaml file describing remote workflow tasks
205
205
  -od--override_deps* Override deps using 'Workflow#task=<path>' array_separated
206
+ -PERF--procpath_performance* Measure performance using procpath
206
207
  EOF
207
208
 
208
209
  workflow = ARGV.shift
209
- usage and exit -1 if workflow.nil?
210
+ usage and exit! -1 if workflow.nil?
210
211
 
211
212
  task = ARGV.shift
212
213
 
@@ -232,7 +233,8 @@ else
232
233
  remote_workflows = {}
233
234
  end
234
235
 
235
- Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
236
+ #Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
237
+ Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
236
238
 
237
239
  workflow = Workflow.require_workflow workflow
238
240
 
@@ -406,6 +408,23 @@ begin
406
408
  exit 0
407
409
  end
408
410
 
411
+ if options[:procpath_performance]
412
+ require 'rbbt/util/procpath'
413
+ current_pid = job.info[:pid]
414
+ job.fork
415
+ job.soft_grace
416
+ sleep 2 if job.info[:pid] == current_pid
417
+ if job.info[:pid] != current_pid
418
+ pid = job.info[:pid]
419
+ begin
420
+ ProcPath.monitor(pid, options[:procpath_performance])
421
+ rescue Errno::ECHILD
422
+ Log.warn "Procpath didn't find process #{pid} to monitor. Maybe it finished already"
423
+ rescue
424
+ Log.warn "Procpath failed: #{$!.message}"
425
+ end
426
+ end
427
+ end
409
428
 
410
429
  if do_fork
411
430
  ENV["RBBT_NO_PROGRESS"] = "true"
@@ -422,7 +441,6 @@ begin
422
441
  res = job
423
442
  end
424
443
 
425
-
426
444
  if options.delete(:printpath)
427
445
  job.join
428
446
  raise job.messages.last if (job.error? || job.aborted?) && job.messages
@@ -486,7 +504,7 @@ rescue ParameterException
486
504
  puts
487
505
  report_options saved_job_options
488
506
  puts
489
- exit -1
507
+ exit! -1
490
508
  end
491
509
 
492
510
  if options.delete(:list_job_files)
@@ -538,7 +556,7 @@ when Step
538
556
  io.abort if io.respond_to? :abort
539
557
  io.join if io.respond_to? :join
540
558
  ensure
541
- exit -1
559
+ exit! -1
542
560
  end
543
561
  rescue Exception
544
562
  Log.exception $!
@@ -547,9 +565,11 @@ when Step
547
565
  io.abort if io.respond_to? :abort
548
566
  io.join if io.respond_to? :join
549
567
  ensure
550
- exit -1
568
+ exit! -1
551
569
  end
552
570
  end
571
+ elsif detach
572
+ exit! 0
553
573
  else
554
574
  res.join
555
575
  out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/workflow'
4
+
5
+ require 'rbbt-util'
6
+ require 'rbbt-util'
7
+ require 'rbbt/util/simpleopt'
8
+
9
+ $0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
10
+
11
+ options = SOPT.setup <<EOF
12
+ Examine the info of a job result
13
+
14
+ $ rbbt workflow info <job-result> <key> <value>
15
+
16
+ -h--help Help
17
+ -f--force Write info even if key is already present
18
+ -r--recursive Write info for all dependencies as well
19
+ -p--check_pid Check that recursive jobs where created by the same process
20
+ EOF
21
+
22
+ SOPT.usage if options[:help]
23
+
24
+ file, key, value = ARGV
25
+
26
+ force, recursive, check_pid = options.values_at :force, :recursive, :check_pid
27
+
28
+ def get_step(file)
29
+ file = file.sub(/\.(info|files)/,'')
30
+ step = Workflow.load_step file
31
+ step
32
+ end
33
+
34
+ raise ParameterException if key.nil? || value.nil?
35
+
36
+ if %w(DELETE nil).include? value
37
+ value = nil
38
+ force = true
39
+ end
40
+
41
+ step = get_step file
42
+
43
+ step.set_info key, value if force || ! step.info.include?(key)
44
+
45
+ pid = step.info[:pid]
46
+ host = step.info[:pid_hostname]
47
+
48
+ step.rec_dependencies.each do |dep|
49
+ dep.set_info key, value if (force || ! dep.info.include?(key)) && (!check_pid || dep.info[:pid].to_s == pid and dep.info[:pid_hostname] == host)
50
+ rescue
51
+ Log.warn "Could no set info #{key} for #{dep.path}: #{$!.message}"
52
+ end if recursive