rbbt-util 5.29.0 → 5.30.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rbbt/tsv.rb CHANGED
@@ -113,8 +113,8 @@ module TSV
113
113
 
114
114
  data.entity_options = entity_options
115
115
 
116
- if Path === source and data.identifiers
117
- data.identifiers = Path.setup(data.identifiers, source.pkgdir, source.resource)
116
+ if Path === source && data.identifiers
117
+ Path.setup(data.identifiers, source.pkgdir, source.resource)
118
118
  end
119
119
 
120
120
  if data.respond_to? :persistence_path
@@ -124,6 +124,10 @@ module TSV
124
124
  data.clear
125
125
  data.annotate h
126
126
  end
127
+
128
+ data.read
129
+
130
+ data
127
131
  end
128
132
 
129
133
  def self.parse_header(stream, options = {})
data/lib/rbbt/util/cmd.rb CHANGED
@@ -217,7 +217,7 @@ module CMD
217
217
  end
218
218
  end
219
219
 
220
- def self.cmd_log(*args)
220
+ def self.cmd_pid(*args)
221
221
  all_args = *args
222
222
 
223
223
  all_args << {} unless Hash === all_args.last
@@ -248,4 +248,9 @@ module CMD
248
248
  nil
249
249
  end
250
250
 
251
+ def self.cmd_log(*args)
252
+ cmd_pid(*args)
253
+ nil
254
+ end
255
+
251
256
  end
@@ -287,7 +287,9 @@ module Misc
287
287
  when Symbol
288
288
  obj.to_s
289
289
  when (defined?(Path) and Path)
290
- if Step === obj.resource
290
+ if defined?(Step) && Open.exists?(Step.info_file(obj))
291
+ obj2str(Workflow.load_step(obj))
292
+ elsif defined?(Step) && Step === obj.resource
291
293
  "Step file: " + obj
292
294
  else
293
295
  if obj.exists?
@@ -322,7 +324,11 @@ module Misc
322
324
  remove_long_items(obj)
323
325
  when File
324
326
  if obj.respond_to? :filename and obj.filename
325
- "<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
327
+ if defined?(Step) && Open.exists?(Step.info_file(obj.filename))
328
+ obj2str(Workflow.load_step(obj.filename))
329
+ else
330
+ "<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
331
+ end
326
332
  else
327
333
  "<IO:" << obj.path << "--" << mtime_str(obj.path) << ">"
328
334
  end
@@ -330,7 +336,11 @@ module Misc
330
336
  "<IO:" << obj.short_path << ">"
331
337
  when IO
332
338
  if obj.respond_to? :filename and obj.filename
333
- "<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
339
+ if defined?(Step) && Open.exists?(Step.info_file(obj.filename))
340
+ obj2str(Workflow.load_step(obj.filename))
341
+ else
342
+ "<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
343
+ end
334
344
  else
335
345
 
336
346
  if obj.respond_to? :obj2str
@@ -242,48 +242,6 @@ module Misc
242
242
 
243
243
  return options
244
244
 
245
- options = {}
246
- string.split(/#/).each do |str|
247
- if str.match(/(.*)=(.*)/)
248
- option, value = $1, $2
249
- else
250
- option, value = str, true
251
- end
252
-
253
- option = option.sub(":",'').to_sym if option.chars.first == ':'
254
- value = value.sub(":",'').to_sym if String === value and value.chars.first == ':'
255
-
256
- if value == true
257
- options[option] = option.to_s.chars.first != '!'
258
- else
259
- options[option] = Thread.start do
260
- $SAFE = 0;
261
- case
262
- when value =~ /^(?:true|T)$/i
263
- true
264
- when value =~ /^(?:false|F)$/i
265
- false
266
- when Symbol === value
267
- value
268
- when (String === value and value =~ /^\/(.*)\/$/)
269
- Regexp.new /#{$1}/
270
- else
271
- begin
272
- Kernel.const_get value
273
- rescue
274
- begin
275
- raise if value =~ /[a-z]/ and defined? value
276
- eval(value)
277
- rescue Exception
278
- value
279
- end
280
- end
281
- end
282
- end.value
283
- end
284
- end
285
-
286
- options
287
245
  end
288
246
 
289
247
  end
@@ -0,0 +1,49 @@
1
+ require 'rbbt/util/cmd'
2
# Thin wrapper around the `procpath` command-line tool: records metrics of a
# running process into a database file and renders SVG plots from it.
module ProcPath
  # Register the tool with CMD together with the command that installs it.
  CMD.tool :procpath do
    'pip install procpath'
  end

  # Records metrics for +pid+ into +path+ and returns once that process ends.
  #
  # pid     - id of the process to watch. Process.wait is used to detect its
  #           end, so presumably it has to be a child of the calling process
  #           (NOTE(review): confirm with callers).
  # path    - database file passed to `procpath record --database-file`.
  # options - Hash; only "interval" (default 30), "recnum" and "reevalnum"
  #           are forwarded to procpath.
  def self.record(pid, path, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "interval" => 30

    cmd_options = %w(interval recnum reevalnum).each_with_object({}){|key,acc| acc[key] = options[key] }

    Log.debug "ProcPath recording #{pid} in #{path} (#{Misc.fingerprint options})"
    procpath_thread = Thread.new do
      procpath_pid = nil
      begin
        procpath_pid = CMD.cmd_pid(:procpath, "record --database-file '#{path}' '$..children[?(@.stat.pid == #{pid})]'", cmd_options.merge(:nofail => true, :add_option_dashes => true))
      rescue Exception
        # Exception rather than StandardError on purpose: the Interrupt raised
        # into this thread below is not a StandardError.
        Log.exception $!
        # The pid is only known once cmd_pid has returned; when the interrupt
        # arrives earlier there is nothing to signal from here.
        # NOTE(review): confirm how cmd_pid cleans up its child in that case.
        begin
          Process.kill "INT", procpath_pid if procpath_pid
        rescue Errno::ESRCH
          # The recorder process is already gone.
        end
      end
    end

    # The Interrupt below is expected; do not let Ruby report it as a crash.
    procpath_thread.report_on_exception = false

    Process.wait pid.to_i
    procpath_thread.raise Interrupt
  end

  # Renders a plot from the database at +path+ into the file +output+.
  #
  # options - Hash of `procpath plot` options. Defaults: "query-name" => 'rss',
  #           'epsilon' => 0.5, "moving-average-window" => 10. Only the keys
  #           listed in the method body are forwarded.
  def self.plot(path, output, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "query-name" => 'rss', 'epsilon' => 0.5, "moving-average-window" => 10

    # The forwarded key has to be "moving-average-window", matching both the
    # default above and the procpath option name; the previous
    # "monitor-average-window" entry meant the window was never passed on.
    forwarded = %w(query-name epsilon moving-average-window title logarithmic after before custom-query-file custom-value-expr)
    cmd_options = forwarded.each_with_object({}){|key,acc| acc[key] = options[key] }
    CMD.cmd_log(:procpath, "plot --database-file '#{path}' --plot-file '#{output}' ", cmd_options.merge(:nofail => true, :add_option_dashes => true))
  end

  # Records +pid+ until it ends and then plots its cpu and rss usage.
  #
  # path - "<database>" or "<database>#<options>", where the part after '#' is
  #        parsed with Misc.string2hash. Output goes to <database>.sqlite3,
  #        <database>.cpu.svg and <database>.rss.svg.
  def self.monitor(pid, path)
    database, options_str = path.split("#")
    options = options_str.nil? ? {} : Misc.string2hash(options_str)

    database = File.expand_path database
    Log.low "ProcPath monitor #{pid} in #{database} (#{Misc.fingerprint options})"

    sqlite_file = database + '.sqlite3'
    ProcPath.record(pid, sqlite_file, options)
    ProcPath.plot(sqlite_file, database + '.cpu.svg', options.merge("query-name" => 'cpu'))
    ProcPath.plot(sqlite_file, database + '.rss.svg', options.merge("query-name" => 'rss'))
  end
end
49
+
@@ -16,6 +16,10 @@ end
16
16
 
17
17
  module Workflow
18
18
 
19
+ def self.job_path?(path)
20
+ path.split("/")[-4] == "jobs"
21
+ end
22
+
19
23
  def log(status, message = nil, &block)
20
24
  Step.log(status, message, nil, &block)
21
25
  end
@@ -301,8 +305,9 @@ module Workflow
301
305
 
302
306
  def setup_override_dependency(dep, workflow, task_name)
303
307
  dep = Step === dep ? dep : Workflow.load_step(dep)
308
+ dep.workflow = workflow
304
309
  dep.info[:name] = dep.name
305
- dep.original_task_name ||= dep.task_name
310
+ dep.original_task_name ||= dep.task_name if dep.workflow
306
311
  begin
307
312
  workflow = Kernel.const_get workflow if String === workflow
308
313
  dep.task = workflow.tasks[task_name] if dep.task.nil? && workflow.tasks.include?(task_name)
@@ -94,17 +94,21 @@ class Step
94
94
  Log.debug "Saving job input #{name} (#{type}) into #{path}"
95
95
  case
96
96
  when Step === value
97
- Open.link(value.path, path)
97
+ Open.ln_s(value.path, path)
98
98
  when type.to_s == "file"
99
99
  if String === value && File.exists?(value)
100
- Open.link(value, path)
100
+ Open.ln_s(value, path)
101
101
  else
102
102
  Open.write(path + '.yaml', value.to_yaml)
103
103
  end
104
104
  when Array === value
105
- Open.write(path, value * "\n")
105
+ Open.write(path, value.collect{|v| Step === v ? v.path : v.to_s} * "\n")
106
106
  when IO === value
107
- Open.write(path, value)
107
+ if value.filename && String === value.filename && File.exists?(value.filename)
108
+ Open.ln_s(value.filename, path)
109
+ else
110
+ Open.write(path, value)
111
+ end
108
112
  else
109
113
  Open.write(path, value.to_s)
110
114
  end
@@ -117,13 +121,17 @@ class Step
117
121
  task_name = Symbol === job.overriden ? job.overriden : job.task_name
118
122
  workflow = job.workflow
119
123
  workflow = Kernel.const_get workflow if String === workflow
120
- task_info = workflow.task_info(task_name)
121
- input_types = task_info[:input_types]
122
- task_inputs = task_info[:inputs]
123
- input_defaults = task_info[:input_defaults]
124
+ if workflow
125
+ task_info = workflow.task_info(task_name)
126
+ input_types = task_info[:input_types]
127
+ task_inputs = task_info[:inputs]
128
+ input_defaults = task_info[:input_defaults]
129
+ else
130
+ task_info = input_types = task_inputs = input_defaults = {}
131
+ end
124
132
 
125
133
  inputs = {}
126
- real_inputs = job.real_inputs || job.info[:real_inputs]
134
+ real_inputs = job.real_inputs || job.info[:real_inputs]
127
135
  job.recursive_inputs.zip(job.recursive_inputs.fields).each do |value,name|
128
136
  next unless task_inputs.include? name.to_sym
129
137
  next unless real_inputs.include? name.to_sym
@@ -497,8 +505,8 @@ class Step
497
505
 
498
506
  def running?
499
507
  return false if ! (started? || status == :ending)
500
- pid = info[:pid]
501
- return nil if pid.nil?
508
+ return nil unless Open.exist?(self.pid_file)
509
+ pid = Open.read(self.pid_file).to_i
502
510
 
503
511
  return false if done? or error? or aborted?
504
512
 
@@ -522,8 +530,7 @@ class Step
522
530
  end
523
531
 
524
532
  def nopid?
525
- pid = info[:pid] || Open.exists?(pid_file)
526
- ! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
533
+ ! Open.exists?(pid_file) && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
527
534
  end
528
535
 
529
536
  def aborted?
@@ -103,7 +103,7 @@ class Step
103
103
  end
104
104
 
105
105
  job.dup_inputs unless status == 'done' or job.started?
106
- job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started?
106
+ job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started? || ! Workflow.job_path?(job.path)
107
107
 
108
108
  canfail = ComputeDependency === job && job.canfail?
109
109
  end
@@ -130,7 +130,6 @@ class Step
130
130
  (inputs.flatten.select{|i| Step === i} + inputs.flatten.select{|dep| Path === dep && Step === dep.resource}.collect{|dep| dep.resource})
131
131
  end
132
132
 
133
-
134
133
  def execute_dependency(dependency, log = true)
135
134
  task_name = self.task_name
136
135
  canfail_paths = self.canfail_paths
@@ -122,7 +122,6 @@ class Step
122
122
  reject{|dependency| (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dependency) || Open.remote?(dependency.path) }.
123
123
  reject{|dependency| dependency.error? }.
124
124
  #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
125
- #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
126
125
  select{|dependency| dependency.updatable? }.
127
126
  collect{|dependency| Workflow.relocate_dependency(self, dependency)}
128
127
  end
@@ -374,7 +373,6 @@ class Step
374
373
  Log.exception $!
375
374
  ensure
376
375
  Step.purge_stream_cache
377
- set_info :pid, nil
378
376
  Open.rm pid_file if Open.exist?(pid_file)
379
377
  end
380
378
  end
@@ -389,7 +387,6 @@ class Step
389
387
  _clean_finished
390
388
  rescue
391
389
  stop_dependencies
392
- set_info :pid, nil
393
390
  Open.rm pid_file if Open.exist?(pid_file)
394
391
  end
395
392
  end
@@ -450,7 +447,7 @@ class Step
450
447
  ensure
451
448
  no_load = false unless IO === result
452
449
  Open.rm pid_file if Open.exist?(pid_file) unless no_load
453
- set_info :pid, nil unless no_load
450
+ #set_info :pid, nil unless no_load
454
451
  end
455
452
  end
456
453
 
@@ -560,7 +557,7 @@ class Step
560
557
  RbbtSemaphore.post_semaphore(semaphore) if semaphore
561
558
  Kernel.exit! -1
562
559
  end
563
- set_info :pid, nil
560
+ #set_info :pid, nil
564
561
  ensure
565
562
  RbbtSemaphore.post_semaphore(semaphore) if semaphore
566
563
  end
@@ -57,7 +57,7 @@ module Task
57
57
  puts Log.color(:magenta, "Input select options")
58
58
  puts
59
59
  selects.collect{|p| p}.uniq.each do |input,options|
60
- puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| o.to_s} * ", ") << "\n"
60
+ puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| Array === o ? o.first.to_s : o.to_s} * ", ") << "\n"
61
61
  puts unless Log.compact
62
62
  end
63
63
  puts
@@ -22,11 +22,14 @@ class Step
22
22
  end
23
23
 
24
24
  def self.prov_report_msg(status, name, path, info = nil)
25
- parts = path.sub(/\{.*/,'').sub(/#{Regexp.quote(name)}$/,'').split "/"
25
+ parts = path.sub(/\{.*/,'').split "/"
26
26
 
27
+ parts.pop
28
+
27
29
  task = Log.color(:yellow, parts.pop)
28
30
  workflow = Log.color(:magenta, parts.pop)
29
- if status.to_s == 'noinfo' and parts.last != 'jobs'
31
+ #if status.to_s == 'noinfo' && parts.last != 'jobs'
32
+ if ! Workflow.job_path?(path)
30
33
  task, status, workflow = Log.color(:yellow, info[:task_name]), Log.color(:green, "file"), Log.color(:magenta, "-")
31
34
  end
32
35
 
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
# Command-line script that lists SLURM job directories under ~/rbbt-slurm
# matching the selected status (done / error / aborted), prints a summary of
# each and erases the directory unless --dry_run is given.
options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt mnl [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-SBP--sbatch_parameters show sbatch parameters
-dr--dry_run Do not erase anything
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

Log.severity = 4
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run

# With no status filter at all, default to failed and aborted jobs. The check
# includes `done` so that --done alone really means "done jobs only" instead
# of also erasing failed and aborted ones.
aborted = error = true if done.nil? && aborted.nil? && error.nil?

workdir = File.expand_path('~/rbbt-slurm')
Path.setup(workdir)

# Ids of the jobs currently listed by squeue; empty when squeue is unavailable.
running_jobs = begin
                 squeue_txt = CMD.cmd('squeue').read
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Maps the first column of each squeue line to its last column split on commas.
# Kept as an empty Hash when squeue failed so that lookups below do not crash.
job_nodes = {}
if squeue_txt
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
end

count = 0
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)

  # Read the batch script once; every field below is extracted from it.
  command_text = Open.read(fcmd)

  cmd = (m = command_text.match(/#CMD: (.*)/)) ? m[1] : nil
  exe = (m = command_text.match(/# Run command\n(.*?)\n/im)) ? m[1] : nil
  container_home = (m = command_text.match(/^CONTAINER_DIR=(.*)/)) ? m[1] : nil

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes come from the last field of the last line of job.status, falling
  # back to what squeue reported for this job id.
  fstatus = File.join(dir, 'job.status')
  nodes = if File.exist?(fstatus)
            last_line = Open.read(fstatus).split("\n").last
            last_field = last_line && last_line.split(/\s+/).last
            last_field ? last_field.split(",") : []
          elsif job_nodes[id]
            job_nodes[id]
          else
            []
          end

  # Seconds since std.out / std.err were last written; nil without std.out.
  time_diff = nil
  if File.exist?(File.join(dir, 'std.out'))
    mtimes = %w(std.out std.err).collect{|name| File.join(dir, name) }.select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }
    time_diff = Time.now - mtimes.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  # At least one status filter is always active at this point, so every job
  # goes through the selection below.
  select = false
  select = true if done && exit_status && exit_status.to_i == 0
  select = true if error && exit_status && exit_status.to_i != 0
  select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
  select = select && jobid.split(",").include?(id) if jobid
  # A job without a "#CMD:" line can never match a search expression.
  select = select && cmd && cmd.match(/#{search}/) if search
  next unless select

  job_label = if exit_status
                (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })"
              elsif running_jobs.include?(id) || $norunningjobs
                Log.color(:green, id)
              else
                Log.color(:red, id)
              end

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << job_label
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(File.join(dir, 'std.out')).to_s << ((id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")

  if sbatch_parameters
    puts Log.color(:magenta, "SBATCH parameters: ")
    puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_text).read.strip
  end

  if tail && File.exist?(File.join(dir, 'std.err'))
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
165
+