rbbt-util 5.29.0 → 5.30.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc/orchestrate.rb +96 -8
- data/lib/rbbt/hpc/slurm.rb +57 -24
- data/lib/rbbt/persist.rb +4 -0
- data/lib/rbbt/persist/tsv/adapter.rb +44 -13
- data/lib/rbbt/tsv.rb +6 -2
- data/lib/rbbt/util/cmd.rb +6 -1
- data/lib/rbbt/util/misc/inspect.rb +13 -3
- data/lib/rbbt/util/misc/options.rb +0 -42
- data/lib/rbbt/util/procpath.rb +49 -0
- data/lib/rbbt/workflow/accessor.rb +6 -1
- data/lib/rbbt/workflow/step/accessor.rb +20 -13
- data/lib/rbbt/workflow/step/dependencies.rb +1 -2
- data/lib/rbbt/workflow/step/run.rb +2 -5
- data/lib/rbbt/workflow/usage.rb +1 -1
- data/lib/rbbt/workflow/util/provenance.rb +5 -2
- data/share/rbbt_commands/slurm/clean +165 -0
- data/share/rbbt_commands/slurm/list +174 -95
- data/share/rbbt_commands/slurm/orchestrate +3 -2
- data/share/rbbt_commands/slurm/task +1 -0
- data/share/rbbt_commands/tsv/slice +3 -3
- data/share/rbbt_commands/workflow/info +1 -1
- data/share/rbbt_commands/workflow/task +27 -7
- data/share/rbbt_commands/workflow/write_info +52 -0
- data/test/rbbt/test_workflow.rb +7 -7
- data/test/rbbt/util/test_procpath.rb +23 -0
- metadata +7 -2
data/lib/rbbt/tsv.rb
CHANGED
@@ -113,8 +113,8 @@ module TSV
|
|
113
113
|
|
114
114
|
data.entity_options = entity_options
|
115
115
|
|
116
|
-
if Path === source
|
117
|
-
|
116
|
+
if Path === source && data.identifiers
|
117
|
+
Path.setup(data.identifiers, source.pkgdir, source.resource)
|
118
118
|
end
|
119
119
|
|
120
120
|
if data.respond_to? :persistence_path
|
@@ -124,6 +124,10 @@ module TSV
|
|
124
124
|
data.clear
|
125
125
|
data.annotate h
|
126
126
|
end
|
127
|
+
|
128
|
+
data.read
|
129
|
+
|
130
|
+
data
|
127
131
|
end
|
128
132
|
|
129
133
|
def self.parse_header(stream, options = {})
|
data/lib/rbbt/util/cmd.rb
CHANGED
@@ -217,7 +217,7 @@ module CMD
|
|
217
217
|
end
|
218
218
|
end
|
219
219
|
|
220
|
-
def self.
|
220
|
+
def self.cmd_pid(*args)
|
221
221
|
all_args = *args
|
222
222
|
|
223
223
|
all_args << {} unless Hash === all_args.last
|
@@ -248,4 +248,9 @@ module CMD
|
|
248
248
|
nil
|
249
249
|
end
|
250
250
|
|
251
|
+
def self.cmd_log(*args)
|
252
|
+
cmd_pid(*args)
|
253
|
+
nil
|
254
|
+
end
|
255
|
+
|
251
256
|
end
|
@@ -287,7 +287,9 @@ module Misc
|
|
287
287
|
when Symbol
|
288
288
|
obj.to_s
|
289
289
|
when (defined?(Path) and Path)
|
290
|
-
if Step
|
290
|
+
if defined?(Step) && Open.exists?(Step.info_file(obj))
|
291
|
+
obj2str(Workflow.load_step(obj))
|
292
|
+
elsif defined?(Step) && Step === obj.resource
|
291
293
|
"Step file: " + obj
|
292
294
|
else
|
293
295
|
if obj.exists?
|
@@ -322,7 +324,11 @@ module Misc
|
|
322
324
|
remove_long_items(obj)
|
323
325
|
when File
|
324
326
|
if obj.respond_to? :filename and obj.filename
|
325
|
-
|
327
|
+
if defined?(Step) && Open.exists?(Step.info_file(obj.filename))
|
328
|
+
obj2str(Workflow.load_step(obj.filename))
|
329
|
+
else
|
330
|
+
"<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
|
331
|
+
end
|
326
332
|
else
|
327
333
|
"<IO:" << obj.path << "--" << mtime_str(obj.path) << ">"
|
328
334
|
end
|
@@ -330,7 +336,11 @@ module Misc
|
|
330
336
|
"<IO:" << obj.short_path << ">"
|
331
337
|
when IO
|
332
338
|
if obj.respond_to? :filename and obj.filename
|
333
|
-
|
339
|
+
if defined?(Step) && Open.exists?(Step.info_file(obj.filename))
|
340
|
+
obj2str(Workflow.load_step(obj.filename))
|
341
|
+
else
|
342
|
+
"<IO:" << obj.filename << "--" << mtime_str(obj.filename) << ">"
|
343
|
+
end
|
334
344
|
else
|
335
345
|
|
336
346
|
if obj.respond_to? :obj2str
|
@@ -242,48 +242,6 @@ module Misc
|
|
242
242
|
|
243
243
|
return options
|
244
244
|
|
245
|
-
options = {}
|
246
|
-
string.split(/#/).each do |str|
|
247
|
-
if str.match(/(.*)=(.*)/)
|
248
|
-
option, value = $1, $2
|
249
|
-
else
|
250
|
-
option, value = str, true
|
251
|
-
end
|
252
|
-
|
253
|
-
option = option.sub(":",'').to_sym if option.chars.first == ':'
|
254
|
-
value = value.sub(":",'').to_sym if String === value and value.chars.first == ':'
|
255
|
-
|
256
|
-
if value == true
|
257
|
-
options[option] = option.to_s.chars.first != '!'
|
258
|
-
else
|
259
|
-
options[option] = Thread.start do
|
260
|
-
$SAFE = 0;
|
261
|
-
case
|
262
|
-
when value =~ /^(?:true|T)$/i
|
263
|
-
true
|
264
|
-
when value =~ /^(?:false|F)$/i
|
265
|
-
false
|
266
|
-
when Symbol === value
|
267
|
-
value
|
268
|
-
when (String === value and value =~ /^\/(.*)\/$/)
|
269
|
-
Regexp.new /#{$1}/
|
270
|
-
else
|
271
|
-
begin
|
272
|
-
Kernel.const_get value
|
273
|
-
rescue
|
274
|
-
begin
|
275
|
-
raise if value =~ /[a-z]/ and defined? value
|
276
|
-
eval(value)
|
277
|
-
rescue Exception
|
278
|
-
value
|
279
|
-
end
|
280
|
-
end
|
281
|
-
end
|
282
|
-
end.value
|
283
|
-
end
|
284
|
-
end
|
285
|
-
|
286
|
-
options
|
287
245
|
end
|
288
246
|
|
289
247
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rbbt/util/cmd'
|
2
|
+
# Helpers that drive the `procpath` command line tool through CMD to record
# process metrics into a SQLite database and to render plots from it.
module ProcPath
  # Registers the :procpath tool with CMD; the block yields the install command.
  # NOTE(review): presumably CMD uses it when the tool is missing -- confirm in CMD.tool.
  CMD.tool :procpath do
    'pip install procpath'
  end

  # Runs `procpath record` for the process tree rooted at +pid+, writing to the
  # database file +path+, and blocks until +pid+ exits.
  #
  # pid     - pid of the process to follow (converted with #to_i for Process.wait).
  # path    - SQLite database file handed to --database-file.
  # options - Hash; only "interval" (default 30), "recnum" and "reevalnum" are
  #           forwarded to procpath.
  #
  # NOTE(review): Process.wait only works when +pid+ is a child of the current
  # process -- verify against callers.
  def self.record(pid, path, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "interval" => 30

    # Forward only the options that `procpath record` understands
    cmd_options = %w(interval recnum reevalnum).each_with_object({}){|k,acc| acc[k] = options[k] }

    Log.debug "ProcPath recording #{pid} in #{path} (#{Misc.fingerprint options})"
    procpath_thread = Thread.new do
      procpath_pid = nil
      begin
        procpath_pid = CMD.cmd_pid(:procpath, "record --database-file '#{path}' '$..children[?(@.stat.pid == #{pid})]'", cmd_options.merge(:nofail => true, :add_option_dashes => true))
      rescue Exception
        # Exception (not StandardError) is rescued on purpose: the Interrupt
        # raised below to stop the recording is not a StandardError.
        Log.exception $!
        # The interrupt normally arrives while cmd_pid is still running, so the
        # assignment above may never have happened; killing a nil pid would
        # raise a TypeError.
        # NOTE(review): cmd_pid appears to return nil in this release, so the
        # recorder may need to be stopped through another handle -- confirm.
        Process.kill "INT", procpath_pid if procpath_pid
      end
    end

    # Failures inside the recorder thread are logged above; silence Ruby's own report
    procpath_thread.report_on_exception = false

    Process.wait pid.to_i
    procpath_thread.raise Interrupt
  end

  # Renders a plot from the database +path+ into the file +output+ using
  # `procpath plot`.
  #
  # options - Hash of procpath plot options; defaults: "query-name" => 'rss',
  #           'epsilon' => 0.5, "moving-average-window" => 10.
  def self.plot(path, output, options = {})
    IndiferentHash.setup(options)
    options = Misc.add_defaults options, "query-name" => 'rss', 'epsilon' => 0.5, "moving-average-window" => 10

    # The whitelist must name "moving-average-window" (the key given a default
    # above); otherwise the default is dropped before reaching procpath.
    cmd_options = %w(query-name epsilon moving-average-window title logarithmic after before custom-query-file custom-value-expr).each_with_object({}){|k,acc| acc[k] = options[k] }
    CMD.cmd_log(:procpath, "plot --database-file '#{path}' --plot-file '#{output}' ", cmd_options.merge(:nofail => true, :add_option_dashes => true))
  end

  # Records +pid+ until it exits and then renders CPU and RSS plots.
  #
  # path - "<database>[#<options>]": everything before the first '#' is the
  #        file prefix; the text after it is parsed with Misc.string2hash.
  #
  # Writes <database>.sqlite3, <database>.cpu.svg and <database>.rss.svg.
  def self.monitor(pid, path)
    database, options_str = path.split("#")
    options = options_str.nil? ? {} : Misc.string2hash(options_str)

    database = File.expand_path database
    Log.low "ProcPath monitor #{pid} in #{database} (#{Misc.fingerprint options})"

    ProcPath.record(pid, database + '.sqlite3', options)
    ProcPath.plot(database + '.sqlite3', database + '.cpu.svg', options.merge("query-name" => 'cpu'))
    ProcPath.plot(database + '.sqlite3', database + '.rss.svg', options.merge("query-name" => 'rss'))
  end
end
|
49
|
+
|
@@ -16,6 +16,10 @@ end
|
|
16
16
|
|
17
17
|
module Workflow
|
18
18
|
|
19
|
+
def self.job_path?(path)
|
20
|
+
path.split("/")[-4] == "jobs"
|
21
|
+
end
|
22
|
+
|
19
23
|
def log(status, message = nil, &block)
|
20
24
|
Step.log(status, message, nil, &block)
|
21
25
|
end
|
@@ -301,8 +305,9 @@ module Workflow
|
|
301
305
|
|
302
306
|
def setup_override_dependency(dep, workflow, task_name)
|
303
307
|
dep = Step === dep ? dep : Workflow.load_step(dep)
|
308
|
+
dep.workflow = workflow
|
304
309
|
dep.info[:name] = dep.name
|
305
|
-
dep.original_task_name ||= dep.task_name
|
310
|
+
dep.original_task_name ||= dep.task_name if dep.workflow
|
306
311
|
begin
|
307
312
|
workflow = Kernel.const_get workflow if String === workflow
|
308
313
|
dep.task = workflow.tasks[task_name] if dep.task.nil? && workflow.tasks.include?(task_name)
|
@@ -94,17 +94,21 @@ class Step
|
|
94
94
|
Log.debug "Saving job input #{name} (#{type}) into #{path}"
|
95
95
|
case
|
96
96
|
when Step === value
|
97
|
-
Open.
|
97
|
+
Open.ln_s(value.path, path)
|
98
98
|
when type.to_s == "file"
|
99
99
|
if String === value && File.exists?(value)
|
100
|
-
Open.
|
100
|
+
Open.ln_s(value, path)
|
101
101
|
else
|
102
102
|
Open.write(path + '.yaml', value.to_yaml)
|
103
103
|
end
|
104
104
|
when Array === value
|
105
|
-
Open.write(path, value * "\n")
|
105
|
+
Open.write(path, value.collect{|v| Step === v ? v.path : v.to_s} * "\n")
|
106
106
|
when IO === value
|
107
|
-
|
107
|
+
if value.filename && String === value.filename && File.exists?(value.filename)
|
108
|
+
Open.ln_s(value.filename, path)
|
109
|
+
else
|
110
|
+
Open.write(path, value)
|
111
|
+
end
|
108
112
|
else
|
109
113
|
Open.write(path, value.to_s)
|
110
114
|
end
|
@@ -117,13 +121,17 @@ class Step
|
|
117
121
|
task_name = Symbol === job.overriden ? job.overriden : job.task_name
|
118
122
|
workflow = job.workflow
|
119
123
|
workflow = Kernel.const_get workflow if String === workflow
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
+
if workflow
|
125
|
+
task_info = workflow.task_info(task_name)
|
126
|
+
input_types = task_info[:input_types]
|
127
|
+
task_inputs = task_info[:inputs]
|
128
|
+
input_defaults = task_info[:input_defaults]
|
129
|
+
else
|
130
|
+
task_info = input_types = task_inputs = input_defaults = {}
|
131
|
+
end
|
124
132
|
|
125
133
|
inputs = {}
|
126
|
-
real_inputs = job.real_inputs ||
|
134
|
+
real_inputs = job.real_inputs || job.info[:real_inputs]
|
127
135
|
job.recursive_inputs.zip(job.recursive_inputs.fields).each do |value,name|
|
128
136
|
next unless task_inputs.include? name.to_sym
|
129
137
|
next unless real_inputs.include? name.to_sym
|
@@ -497,8 +505,8 @@ class Step
|
|
497
505
|
|
498
506
|
def running?
|
499
507
|
return false if ! (started? || status == :ending)
|
500
|
-
|
501
|
-
|
508
|
+
return nil unless Open.exist?(self.pid_file)
|
509
|
+
pid = Open.read(self.pid_file).to_i
|
502
510
|
|
503
511
|
return false if done? or error? or aborted?
|
504
512
|
|
@@ -522,8 +530,7 @@ class Step
|
|
522
530
|
end
|
523
531
|
|
524
532
|
def nopid?
|
525
|
-
|
526
|
-
! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
533
|
+
! Open.exists?(pid_file) && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
527
534
|
end
|
528
535
|
|
529
536
|
def aborted?
|
@@ -103,7 +103,7 @@ class Step
|
|
103
103
|
end
|
104
104
|
|
105
105
|
job.dup_inputs unless status == 'done' or job.started?
|
106
|
-
job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started?
|
106
|
+
job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started? || ! Workflow.job_path?(job.path)
|
107
107
|
|
108
108
|
canfail = ComputeDependency === job && job.canfail?
|
109
109
|
end
|
@@ -130,7 +130,6 @@ class Step
|
|
130
130
|
(inputs.flatten.select{|i| Step === i} + inputs.flatten.select{|dep| Path === dep && Step === dep.resource}.collect{|dep| dep.resource})
|
131
131
|
end
|
132
132
|
|
133
|
-
|
134
133
|
def execute_dependency(dependency, log = true)
|
135
134
|
task_name = self.task_name
|
136
135
|
canfail_paths = self.canfail_paths
|
@@ -122,7 +122,6 @@ class Step
|
|
122
122
|
reject{|dependency| (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dependency) || Open.remote?(dependency.path) }.
|
123
123
|
reject{|dependency| dependency.error? }.
|
124
124
|
#select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
|
125
|
-
#select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
|
126
125
|
select{|dependency| dependency.updatable? }.
|
127
126
|
collect{|dependency| Workflow.relocate_dependency(self, dependency)}
|
128
127
|
end
|
@@ -374,7 +373,6 @@ class Step
|
|
374
373
|
Log.exception $!
|
375
374
|
ensure
|
376
375
|
Step.purge_stream_cache
|
377
|
-
set_info :pid, nil
|
378
376
|
Open.rm pid_file if Open.exist?(pid_file)
|
379
377
|
end
|
380
378
|
end
|
@@ -389,7 +387,6 @@ class Step
|
|
389
387
|
_clean_finished
|
390
388
|
rescue
|
391
389
|
stop_dependencies
|
392
|
-
set_info :pid, nil
|
393
390
|
Open.rm pid_file if Open.exist?(pid_file)
|
394
391
|
end
|
395
392
|
end
|
@@ -450,7 +447,7 @@ class Step
|
|
450
447
|
ensure
|
451
448
|
no_load = false unless IO === result
|
452
449
|
Open.rm pid_file if Open.exist?(pid_file) unless no_load
|
453
|
-
set_info :pid, nil unless no_load
|
450
|
+
#set_info :pid, nil unless no_load
|
454
451
|
end
|
455
452
|
end
|
456
453
|
|
@@ -560,7 +557,7 @@ class Step
|
|
560
557
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
561
558
|
Kernel.exit! -1
|
562
559
|
end
|
563
|
-
set_info :pid, nil
|
560
|
+
#set_info :pid, nil
|
564
561
|
ensure
|
565
562
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
566
563
|
end
|
data/lib/rbbt/workflow/usage.rb
CHANGED
@@ -57,7 +57,7 @@ module Task
|
|
57
57
|
puts Log.color(:magenta, "Input select options")
|
58
58
|
puts
|
59
59
|
selects.collect{|p| p}.uniq.each do |input,options|
|
60
|
-
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| o.to_s} * ", ") << "\n"
|
60
|
+
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| Array === o ? o.first.to_s : o.to_s} * ", ") << "\n"
|
61
61
|
puts unless Log.compact
|
62
62
|
end
|
63
63
|
puts
|
@@ -22,11 +22,14 @@ class Step
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.prov_report_msg(status, name, path, info = nil)
|
25
|
-
parts = path.sub(/\{.*/,'').
|
25
|
+
parts = path.sub(/\{.*/,'').split "/"
|
26
26
|
|
27
|
+
parts.pop
|
28
|
+
|
27
29
|
task = Log.color(:yellow, parts.pop)
|
28
30
|
workflow = Log.color(:magenta, parts.pop)
|
29
|
-
if status.to_s == 'noinfo'
|
31
|
+
#if status.to_s == 'noinfo' && parts.last != 'jobs'
|
32
|
+
if ! Workflow.job_path?(path)
|
30
33
|
task, status, workflow = Log.color(:yellow, info[:task_name]), Log.color(:green, "file"), Log.color(:magenta, "-")
|
31
34
|
end
|
32
35
|
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Erases SLURM job directories found under ~/rbbt-slurm (located through their
# command.slurm file) for jobs that finished, failed or were aborted.
# By default it erases error and aborted jobs; --dry_run only lists them.

require 'rbbt-util'
require 'rbbt/util/simpleopt'

#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

options = SOPT.setup <<EOF

Clean error or aborted jobs

$ rbbt slurm clean [options]

-h--help Print this help
-d--done Done jobs only
-e--error Error jobs only
-a--aborted SLURM aborted jobs
-j--job* Job ids
-s--search* Regular expression
-t--tail* Show the last lines of the STDERR
-SBP--sbatch_parameters show sbatch parameters
-dr--dry_run Do not erase anything
EOF

if options[:help]
  if defined? rbbt_usage
    rbbt_usage
  else
    puts SOPT.doc
  end
  exit 0
end

Log.severity = 4
done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run

workdir = File.expand_path('~/rbbt-slurm')
Path.setup(workdir)

# Ids of the jobs currently listed by squeue (first column of each line)
running_jobs = begin
                 squeue_txt = CMD.cmd('squeue').read
                 squeue_txt.split("\n").collect{|l| l.to_i.to_s}
               rescue
                 Log.warn "Cannot determine if jobs are running; jobs without an exit status will not be erased"
                 squeue_txt = nil
                 $norunningjobs = true
                 []
               end

# Map job id => node list, taken from the first and last squeue columns
if squeue_txt
  job_nodes = {}
  squeue_txt.split("\n").each do |line|
    parts = line.strip.split(/\s+/)
    next if parts.empty?
    job_nodes[parts.first] = parts.last.split(",")
  end
else
  job_nodes = nil
end

# With neither --error nor --aborted, erase both kinds.
# NOTE(review): this also triggers when only --done is given, so `-d` erases
# done, error and aborted jobs -- confirm whether that is intended.
aborted = error = true if aborted.nil? && error.nil?

count = 0
workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
  dir = File.dirname(fcmd)

  # Read the batch script once; the fields below are all extracted from it
  command_txt = Open.read(fcmd)

  m = command_txt.match(/#CMD: (.*)/)
  cmd = m ? m[1] : nil

  m = command_txt.match(/# Run command\n(.*?)\n/im)
  exe = m ? m[1] : nil

  m = command_txt.match(/^CONTAINER_DIR=(.*)/)
  container_home = m ? m[1] : nil

  fid = File.join(dir, 'job.id')
  id = File.exist?(fid) ? Open.read(fid).chomp : nil

  fexit = File.join(dir, 'exit.status')
  exit_status = File.exist?(fexit) ? Open.read(fexit).to_i : nil

  # Nodes come from the last field of the last line of job.status, falling
  # back to squeue; both sources may be missing or empty
  nodes = nil
  fjob_status = File.join(dir, 'job.status')
  if File.exist?(fjob_status)
    last_line = Open.read(fjob_status).split("\n").last
    last_field = last_line.split(/\s+/).last if last_line
    nodes = last_field.split(",") if last_field
  end
  nodes = job_nodes[id] if nodes.nil? && job_nodes
  nodes ||= []

  # Seconds since the job last wrote to its output; std.err may not exist yet
  fout = File.join(dir, 'std.out')
  ferr = File.join(dir, 'std.err')
  time_diff = nil
  if File.exist?(fout)
    log_times = [fout, ferr].select{|f| File.exist?(f) }.collect{|f| File.mtime(f) }
    time_diff = Time.now - log_times.max
  end

  fdep = File.join(dir, 'dependencies.list')
  deps = Open.read(fdep).split("\n") if File.exist?(fdep)

  fcadep = File.join(dir, 'canfail_dependencies.list')
  cadeps = Open.read(fcadep).split("\n") if File.exist?(fcadep)

  # Decide whether this job directory must be erased. A job counts as aborted
  # only when squeue could be consulted: if the queue state is unknown, a
  # missing exit status may just mean the job is still running.
  finished = ! exit_status.nil?
  selected = false
  selected = true if done && finished && exit_status == 0
  selected = true if error && finished && exit_status != 0
  selected = true if aborted && ! finished && ! $norunningjobs && ! running_jobs.include?(id)
  selected = (selected && jobid.split(",").include?(id)) if jobid
  selected = (selected && ! cmd.nil? && ! cmd.match(/#{search}/).nil?) if search
  next unless selected

  if exit_status
    job_label = (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })"
  elsif running_jobs.include?(id) || $norunningjobs
    job_label = Log.color(:green, id)
  else
    job_label = Log.color(:red, id)
  end

  puts Log.color(:yellow, "**ERASING**")
  puts Log.color :blue, dir
  puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
  puts Log.color(:magenta, "Done: ") << File.mtime(fexit).to_s if File.exist?(fexit)
  puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
  puts Log.color(:magenta, "CMD: ") << (cmd ? Log.color(:yellow, cmd) : "Missing")
  puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
  puts Log.color(:magenta, "Job ID: ") << job_label
  puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
  puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
  puts Log.color(:magenta, "Nodes: ") << nodes * ", "
  puts Log.color(:magenta, "Output: ") << File.exist?(fout).to_s << ((id.nil? || time_diff.nil?) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")

  if sbatch_parameters
    puts Log.color(:magenta, "SBATCH parameters: ")
    puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => command_txt).read.strip
  end

  if tail && File.exist?(ferr)
    if exit_status && exit_status != 0
      puts Log.color(:magenta, "First error or exception found: ")
      puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{ferr} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
    elsif exit_status
      puts Log.color(:magenta, "Completed jobs: ")
      puts CMD.cmd("grep -i -w 'Completed step' #{ferr} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
    else
      puts Log.color(:magenta, "Log tail: ")
      puts CMD.cmd("tail -n #{tail.to_i} #{ferr}").read
    end
  end

  count += 1

  Open.rm_rf dir unless dry_run
end

puts
puts "Found #{count} jobs"
|
165
|
+
|