rbbt-util 5.29.0 → 5.30.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,15 +20,18 @@ $ rbbt mnl [options]
20
20
  -j--job* Job ids
21
21
  -s--search* Regular expression
22
22
  -t--tail* Show the last lines of the STDERR
23
+ -SBP--sbatch_parameters show sbatch parameters
24
+ -PERF--procpath_performance show Procpath performance summary
25
+ -sacct--sacct_peformance show sacct performance summary
23
26
  EOF
24
27
 
25
28
  if options[:help]
26
- if defined? rbbt_usage
27
- rbbt_usage
28
- else
29
- puts SOPT.doc
30
- end
31
- exit 0
29
+ if defined? rbbt_usage
30
+ rbbt_usage
31
+ else
32
+ puts SOPT.doc
33
+ end
34
+ exit 0
32
35
  end
33
36
 
34
37
  Log.severity = 4
@@ -38,101 +41,177 @@ workdir = File.expand_path('~/rbbt-slurm')
38
41
  Path.setup(workdir)
39
42
 
40
43
  running_jobs = begin
41
- CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
44
+ squeue_txt = CMD.cmd('squeue').read
45
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
42
46
  rescue
43
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
- $norunningjobs = true
45
- []
47
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
48
+ squeue_txt = nil
49
+ $norunningjobs = true
50
+ []
46
51
  end
47
52
 
53
+ if squeue_txt
54
+ job_nodes = {}
55
+ squeue_txt.split("\n").each do |line|
56
+ parts = line.strip.split(/\s+/)
57
+ job_nodes[parts.first] = parts.last.split(",")
58
+ end
59
+ else
60
+ job_nodes = nil
61
+ end
62
+
48
63
  count = 0
49
64
  workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
- dir = File.dirname(fcmd)
51
-
52
- if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
- cmd = m[1]
54
- else
55
- cmd = nil
65
+ dir = File.dirname(fcmd)
66
+
67
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
68
+ cmd = m[1]
69
+ else
70
+ cmd = nil
71
+ end
72
+
73
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
74
+ exe = m[1].sub('step_path=$(','')
75
+ else
76
+ exe = nil
77
+ end
78
+
79
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
80
+ container_home = m[1]
81
+ else
82
+ container_home = nil
83
+ end
84
+
85
+
86
+ if File.exists?(fid = File.join(dir, 'job.id'))
87
+ id = Open.read(fid).chomp
88
+ else
89
+ id = nil
90
+ end
91
+
92
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
93
+ exit_status = Open.read(fstatus).to_i
94
+ else
95
+ exit_status = nil
96
+ end
97
+
98
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
99
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
100
+ elsif job_nodes[id]
101
+ nodes = job_nodes[id]
102
+ else
103
+ nodes = []
104
+ end
105
+
106
+ if File.exists?(File.join(dir, 'std.out'))
107
+ outt = File.mtime File.join(dir, 'std.out')
108
+ errt = File.mtime File.join(dir, 'std.err')
109
+ time_diff = Time.now - [outt, errt].max
110
+ end
111
+
112
+ fdep = File.join(dir, 'dependencies.list')
113
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
114
+
115
+ fcadep = File.join(dir, 'canfail_dependencies.list')
116
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
117
+
118
+ if done || error || aborted || running || queued || jobid || search
119
+ select = false
120
+ select = true if done && exit_status == 0
121
+ select = true if error && exit_status && exit_status != 0
122
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
123
+ select = true if queued && deps && (running_jobs & deps).any?
124
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
125
+ select = true if jobid && jobid.split(",").include?(id)
126
+ select = true if search && cmd.match(/#{search}/)
127
+ next unless select
128
+ end
129
+
130
+
131
+ puts Log.color :blue, dir
132
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
133
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
134
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
135
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
136
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
137
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
138
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
139
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
140
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
141
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
142
+
143
+ if options[:sbatch_parameters]
144
+ puts Log.color(:magenta, "SBATCH parameters: ")
145
+ text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
146
+ lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
147
+ puts Log.color :yellow, lines * "\n"
148
+ end
149
+
150
+ fprocpath = File.join(dir, 'procpath.sqlite3')
151
+ if options[:procpath_performance] && Open.exists?(fprocpath)
152
+ puts Log.color(:magenta, "Procpath summary: ")
153
+ require 'rbbt/tsv/csv'
154
+ meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
155
+ perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
156
+
157
+ page_size = meta["page_size"].first.to_f
158
+ clock_ticks = meta["clock_ticks"].first.to_f
159
+
160
+ cpu_average = {}
161
+ rss_average = {}
162
+ perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
163
+ time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
164
+ time = time.to_f
165
+
166
+ cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
167
+ cpu_average[stat_pid] ||= {}
168
+ cpu_average[stat_pid][time] ||= []
169
+ cpu_average[stat_pid][time] << cpu.to_f
170
+ rss_average[time] ||= []
171
+ rss_average[time] << rss.to_f * page_size
172
+ end
173
+
174
+ ticks = 0
175
+ cpu_average.each do |stat_pid, cpu_average_pid|
176
+ start = cpu_average_pid.keys.sort.first
177
+ eend = cpu_average_pid.keys.sort.last
178
+ ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
179
+ end
180
+ start = rss_average.keys.sort.first
181
+ eend = rss_average.keys.sort.last
182
+ time_elapsed = eend - start
183
+ puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
184
+ puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
185
+
186
+ end
187
+
188
+ if options[:sacct_peformance]
189
+ begin
190
+ tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
191
+ values = tsv[tsv.keys.first]
192
+ if values.compact.any?
193
+ puts Log.color(:magenta, "SACCT performance: ")
194
+ puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
56
195
  end
57
-
58
- if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
- exe = m[1]
60
- else
61
- exe = nil
62
- end
63
-
64
- if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
- container_home = m[1]
66
- else
67
- container_home = nil
68
- end
69
-
70
-
71
- if File.exists?(fid = File.join(dir, 'job.id'))
72
- id = Open.read(fid).chomp
73
- else
74
- id = nil
75
- end
76
-
77
- if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
- exit_status = Open.read(fstatus).to_i
79
- else
80
- exit_status = nil
81
- end
82
-
83
- if File.exists?(fstatus = File.join(dir, 'job.status'))
84
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
- else
86
- nodes = []
87
- end
88
-
89
- if File.exists?(File.join(dir, 'std.out'))
90
- outt = File.mtime File.join(dir, 'std.out')
91
- errt = File.mtime File.join(dir, 'std.err')
92
- time_diff = Time.now - [outt, errt].max
93
- end
94
-
95
- fdep = File.join(dir, 'dependencies.list')
96
- deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
-
98
- if done || error || aborted || running || queued || jobid || search
99
- select = false
100
- select = true if done && exit_status == 0
101
- select = true if error && exit_status && exit_status != 0
102
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
- select = true if queued && deps && (running_jobs & deps).any?
104
- select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
- select = true if jobid && jobid.split(",").include?(id)
106
- select = true if search && cmd.match(/#{search}/)
107
- next unless select
108
- end
109
-
110
-
111
- puts Log.color :blue, dir
112
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
- puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
- puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
-
122
- if tail && File.exists?(File.join(dir, 'std.err'))
123
- if exit_status && exit_status != 0
124
- puts Log.color(:magenta, "First error or exception found: ")
125
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
- elsif exit_status
127
- puts Log.color(:magenta, "Completed jobs: ")
128
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
- else
130
- puts Log.color(:magenta, "Log tail: ")
131
- puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
- end
133
- end
134
-
135
- count += 1
196
+ rescue
197
+ end
198
+ end
199
+
200
+
201
+ if tail && File.exists?(File.join(dir, 'std.err'))
202
+ if exit_status && exit_status != 0
203
+ puts Log.color(:magenta, "First error or exception found: ")
204
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
205
+ elsif exit_status
206
+ puts Log.color(:magenta, "Completed jobs: ")
207
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
208
+ else
209
+ puts Log.color(:magenta, "Log tail: ")
210
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
211
+ end
212
+ end
213
+
214
+ count += 1
136
215
 
137
216
  end
138
217
 
@@ -21,11 +21,12 @@ $slurm_options = SOPT.get <<EOF
21
21
  -CS--contain_and_sync Contain and sync to default locations
22
22
  -ci--copy_image When using a container directory, copy image there
23
23
  -t--tail Tail the logs
24
+ -SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
24
25
  -q--queue* Queue
25
26
  -t--task_cpus* Tasks
26
27
  -W--workflows* Additional workflows
27
28
  -tm--time* Time
28
- -R--rules* Orchestration rules
29
+ -OR--orchestration_rules* Orchestration rules
29
30
  -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
31
  EOF
31
32
 
@@ -43,5 +44,5 @@ class Step
43
44
  end
44
45
  end
45
46
 
46
- ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
47
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
47
48
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -20,6 +20,7 @@ $slurm_options = SOPT.get <<EOF
20
20
  -CS--contain_and_sync Contain and sync to default locations
21
21
  -ci--copy_image When using a container directory, copy image there
22
22
  -t--tail Tail the logs
23
+ -SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
23
24
  -q--queue* Queue
24
25
  -t--task_cpus* Tasks
25
26
  -W--workflows* Additional workflows
@@ -35,7 +35,7 @@ file = case file
35
35
  fields = options[:fields]
36
36
  raise ParameterException, "Please specify the fields to slice" if fields.nil?
37
37
 
38
- options[:header_hash] = options["header_hash"]
38
+ options[:header_hash] ||= options["header_hash"]
39
39
 
40
40
  case
41
41
  when options[:tokyocabinet]
@@ -45,8 +45,8 @@ when options[:tokyocabinet_bd]
45
45
  tsv = Persist.open_tokyocabinet(file, false, nil, TokyoCabinet::BDB)
46
46
  puts tsv.summary
47
47
  else
48
- stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :keys => fields, :unnamed => true) do |*p|
49
- p * "\t"
48
+ stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :fields => fields.split(","), :unnamed => true) do |k,fields,names|
49
+ [k,fields].flatten * "\t"
50
50
  end
51
51
  puts stream.read
52
52
  exit 0
@@ -86,7 +86,7 @@ messages = info[:messages]
86
86
  backtrace = info[:backtrace]
87
87
  pid = info[:pid]
88
88
  exception = info[:exception]
89
- rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :pid, :archived_info]
89
+ rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :archived_info]
90
90
 
91
91
 
92
92
  puts Log.color(:magenta, "File") << ": " << step.path
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
20
20
  puts
21
21
  if workflow.nil?
22
22
  puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
23
- exit -1
23
+ exit! -1
24
24
  end
25
25
 
26
26
  if task.nil?
@@ -203,10 +203,11 @@ The `recursive_clean` cleans all the job dependency steps recursively.
203
203
  -prec--prepare_cpus* Number of dependencies prepared in parallel
204
204
  -rwt--remote_workflow_tasks* Load a yaml file describing remote workflow tasks
205
205
  -od--override_deps* Override deps using 'Workflow#task=<path>' array_separated
206
+ -PERF--procpath_performance* Measure performance using procpath
206
207
  EOF
207
208
 
208
209
  workflow = ARGV.shift
209
- usage and exit -1 if workflow.nil?
210
+ usage and exit! -1 if workflow.nil?
210
211
 
211
212
  task = ARGV.shift
212
213
 
@@ -232,7 +233,8 @@ else
232
233
  remote_workflows = {}
233
234
  end
234
235
 
235
- Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
236
+ #Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
237
+ Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
236
238
 
237
239
  workflow = Workflow.require_workflow workflow
238
240
 
@@ -406,6 +408,23 @@ begin
406
408
  exit 0
407
409
  end
408
410
 
411
+ if options[:procpath_performance]
412
+ require 'rbbt/util/procpath'
413
+ current_pid = job.info[:pid]
414
+ job.fork
415
+ job.soft_grace
416
+ sleep 2 if job.info[:pid] == current_pid
417
+ if job.info[:pid] != current_pid
418
+ pid = job.info[:pid]
419
+ begin
420
+ ProcPath.monitor(pid, options[:procpath_performance])
421
+ rescue Errno::ECHILD
422
+ Log.warn "Procpath didn't find process #{pid} to monitor. Maybe it finished already"
423
+ rescue
424
+ Log.warn "Procpath failed: #{$!.message}"
425
+ end
426
+ end
427
+ end
409
428
 
410
429
  if do_fork
411
430
  ENV["RBBT_NO_PROGRESS"] = "true"
@@ -422,7 +441,6 @@ begin
422
441
  res = job
423
442
  end
424
443
 
425
-
426
444
  if options.delete(:printpath)
427
445
  job.join
428
446
  raise job.messages.last if (job.error? || job.aborted?) && job.messages
@@ -486,7 +504,7 @@ rescue ParameterException
486
504
  puts
487
505
  report_options saved_job_options
488
506
  puts
489
- exit -1
507
+ exit! -1
490
508
  end
491
509
 
492
510
  if options.delete(:list_job_files)
@@ -538,7 +556,7 @@ when Step
538
556
  io.abort if io.respond_to? :abort
539
557
  io.join if io.respond_to? :join
540
558
  ensure
541
- exit -1
559
+ exit! -1
542
560
  end
543
561
  rescue Exception
544
562
  Log.exception $!
@@ -547,9 +565,11 @@ when Step
547
565
  io.abort if io.respond_to? :abort
548
566
  io.join if io.respond_to? :join
549
567
  ensure
550
- exit -1
568
+ exit! -1
551
569
  end
552
570
  end
571
+ elsif detach
572
+ exit! 0
553
573
  else
554
574
  res.join
555
575
  out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/workflow'
4
+
5
+ require 'rbbt-util'
6
+ require 'rbbt-util'
7
+ require 'rbbt/util/simpleopt'
8
+
9
+ $0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
10
+
11
+ options = SOPT.setup <<EOF
12
+ Examine the info of a job result
13
+
14
+ $ rbbt workflow info <job-result> <key> <value>
15
+
16
+ -h--help Help
17
+ -f--force Write info even if key is already present
18
+ -r--recursive Write info for all dependencies as well
19
+ -p--check_pid Check that recursive jobs where created by the same process
20
+ EOF
21
+
22
+ SOPT.usage if options[:help]
23
+
24
+ file, key, value = ARGV
25
+
26
+ force, recursive, check_pid = options.values_at :force, :recursive, :check_pid
27
+
28
+ def get_step(file)
29
+ file = file.sub(/\.(info|files)/,'')
30
+ step = Workflow.load_step file
31
+ step
32
+ end
33
+
34
+ raise ParameterException if key.nil? || value.nil?
35
+
36
+ if %w(DELETE nil).include? value
37
+ value = nil
38
+ force = true
39
+ end
40
+
41
+ step = get_step file
42
+
43
+ step.set_info key, value if force || ! step.info.include?(key)
44
+
45
+ pid = step.info[:pid]
46
+ host = step.info[:pid_hostname]
47
+
48
+ step.rec_dependencies.each do |dep|
49
+ dep.set_info key, value if (force || ! dep.info.include?(key)) && (!check_pid || dep.info[:pid].to_s == pid and dep.info[:pid_hostname] == host)
50
+ rescue
51
+ Log.warn "Could no set info #{key} for #{dep.path}: #{$!.message}"
52
+ end if recursive