rbbt-util 5.30.13 → 5.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +3 -0
- data/lib/rbbt/hpc/batch.rb +623 -0
- data/lib/rbbt/hpc/lsf.rb +119 -0
- data/lib/rbbt/hpc/orchestrate.rb +12 -11
- data/lib/rbbt/hpc/slurm.rb +62 -567
- data/lib/rbbt/resource/path.rb +3 -1
- data/lib/rbbt/tsv/accessor.rb +5 -2
- data/lib/rbbt/tsv/dumper.rb +1 -0
- data/lib/rbbt/tsv/parallel/traverse.rb +1 -1
- data/lib/rbbt/tsv/stream.rb +5 -6
- data/lib/rbbt/util/log.rb +22 -1
- data/lib/rbbt/util/misc/development.rb +2 -2
- data/lib/rbbt/util/misc/options.rb +5 -0
- data/lib/rbbt/workflow/step/accessor.rb +1 -1
- data/lib/rbbt/workflow/usage.rb +13 -13
- data/share/config.ru +3 -3
- data/share/rbbt_commands/{slurm → hpc}/clean +91 -18
- data/share/rbbt_commands/{slurm → hpc}/list +100 -30
- data/share/rbbt_commands/hpc/orchestrate +81 -0
- data/share/rbbt_commands/hpc/tail +81 -0
- data/share/rbbt_commands/hpc/task +80 -0
- data/test/rbbt/hpc/test_batch.rb +65 -0
- data/test/rbbt/hpc/test_slurm.rb +30 -0
- data/test/rbbt/util/misc/test_development.rb +11 -0
- data/test/test_helper.rb +3 -1
- metadata +16 -7
- data/share/rbbt_commands/slurm/orchestrate +0 -48
- data/share/rbbt_commands/slurm/task +0 -46
data/lib/rbbt/resource/path.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/util/misc/indiferent_hash'
|
|
3
3
|
require 'yaml'
|
4
4
|
|
5
5
|
module Path
|
6
|
-
attr_accessor :resource, :pkgdir, :original, :search_paths, :search_order, :libdir
|
6
|
+
attr_accessor :resource, :pkgdir, :original, :search_paths, :search_order, :libdir, :where
|
7
7
|
|
8
8
|
def self.setup(string, pkgdir = nil, resource = nil, search_paths = nil, search_order = nil, libdir = nil)
|
9
9
|
return string if string.nil?
|
@@ -99,6 +99,7 @@ module Path
|
|
99
99
|
|
100
100
|
paths = paths.each do |p|
|
101
101
|
p.original = File.join(found.original, p.sub(/^#{found}/, ''))
|
102
|
+
p.where = where
|
102
103
|
end if found.original and pattern
|
103
104
|
|
104
105
|
location_paths[where] = paths
|
@@ -256,6 +257,7 @@ module Path
|
|
256
257
|
end
|
257
258
|
|
258
259
|
res.original = self.original || self
|
260
|
+
res.where = where
|
259
261
|
|
260
262
|
res
|
261
263
|
end
|
data/lib/rbbt/tsv/accessor.rb
CHANGED
@@ -713,8 +713,11 @@ module TSV
|
|
713
713
|
break
|
714
714
|
end
|
715
715
|
|
716
|
-
filename =
|
717
|
-
filename
|
716
|
+
filename = @filename
|
717
|
+
filename = "No filename" if filename.nil? || filename.empty?
|
718
|
+
filename.find if Path === filename
|
719
|
+
filename = File.basename(filename) + " [" + File.basename(persistence_path) + "]" if respond_to?(:persistence_path) and persistence_path
|
720
|
+
|
718
721
|
with_unnamed do
|
719
722
|
<<-EOF
|
720
723
|
Filename = #{filename}
|
data/lib/rbbt/tsv/dumper.rb
CHANGED
@@ -610,7 +610,7 @@ module TSV
|
|
610
610
|
thread = Thread.new do
|
611
611
|
begin
|
612
612
|
traverse_run(obj, threads, cpus, options, &block)
|
613
|
-
into.close if into.respond_to?(:close) and not (into.respond_to?
|
613
|
+
into.close if into.respond_to?(:close) and not (into.respond_to?(:closed?) and into.closed?)
|
614
614
|
rescue Exception
|
615
615
|
abort_stream obj
|
616
616
|
abort_stream into
|
data/lib/rbbt/tsv/stream.rb
CHANGED
@@ -294,20 +294,19 @@ module TSV
|
|
294
294
|
end
|
295
295
|
|
296
296
|
|
297
|
-
def self.reorder_stream_tsv(stream, key_field, fields=nil, zipped = true)
|
297
|
+
def self.reorder_stream_tsv(stream, key_field, fields=nil, zipped = true, bar = nil)
|
298
298
|
parser = TSV::Parser.new TSV.get_stream(stream), :key_field => key_field, :fields => fields
|
299
299
|
dumper_options = parser.options
|
300
300
|
dumper = TSV::Dumper.new dumper_options
|
301
301
|
dumper.init
|
302
302
|
case parser.type
|
303
303
|
when :single
|
304
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
304
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
305
305
|
key = keys.first
|
306
306
|
[key, [values]]
|
307
307
|
end
|
308
308
|
when :double
|
309
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
310
|
-
raise [keys, values].inspect if keys.include? 'gain'
|
309
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
311
310
|
res = []
|
312
311
|
keys.each_with_index do |key,i|
|
313
312
|
vs = zipped ? values.collect{|l| l.length == 1 ? l : [l[i]] } : values
|
@@ -317,12 +316,12 @@ module TSV
|
|
317
316
|
res
|
318
317
|
end
|
319
318
|
when :list
|
320
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
319
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
321
320
|
key = keys === Array ? keys.first : keys
|
322
321
|
[key, values]
|
323
322
|
end
|
324
323
|
when :flat
|
325
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
324
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
326
325
|
key = keys === Array ? keys.first : keys
|
327
326
|
[key, values]
|
328
327
|
end
|
data/lib/rbbt/util/log.rb
CHANGED
@@ -341,10 +341,31 @@ module Log
|
|
341
341
|
end unless stack.nil?
|
342
342
|
end
|
343
343
|
|
344
|
-
def self.tsv(tsv)
|
344
|
+
def self.tsv(tsv, example = false)
|
345
345
|
STDERR.puts Log.color :magenta, "TSV log: " << Log.last_caller(caller).gsub('`',"'")
|
346
346
|
STDERR.puts Log.color(:blue, "=> "<< Misc.fingerprint(tsv), true)
|
347
347
|
STDERR.puts Log.color(:cyan, "=> " << tsv.summary)
|
348
|
+
if example && ! tsv.empty?
|
349
|
+
key = case example
|
350
|
+
when TrueClass, :first, "first"
|
351
|
+
tsv.keys.first
|
352
|
+
when :random, "random"
|
353
|
+
tsv.keys.shuffle.first
|
354
|
+
else
|
355
|
+
example
|
356
|
+
end
|
357
|
+
|
358
|
+
values = tsv[key]
|
359
|
+
values = [values] if tsv.type == :flat || tsv.type == :single
|
360
|
+
if values.nil?
|
361
|
+
STDERR.puts Log.color(:blue, "Key (#{tsv.key_field}) not present: ") + key
|
362
|
+
else
|
363
|
+
STDERR.puts Log.color(:blue, "Key (#{tsv.key_field}): ") + key
|
364
|
+
tsv.fields.zip(values).each do |field,value|
|
365
|
+
STDERR.puts Log.color(:magenta, field + ": ") + (Array === value ? value * ", " : value.to_s)
|
366
|
+
end
|
367
|
+
end
|
368
|
+
end
|
348
369
|
end
|
349
370
|
|
350
371
|
def self.stack(stack)
|
@@ -428,9 +428,9 @@ def self.add_libdir(dir=nil)
|
|
428
428
|
end
|
429
429
|
end
|
430
430
|
|
431
|
-
|
432
|
-
def self.ssh_run(server, script)
|
431
|
+
def self.ssh_run(server, script = nil, &block)
|
433
432
|
Log.debug "Run ssh script in #{server}:\n#{script}"
|
433
|
+
|
434
434
|
CMD.cmd("ssh '#{server}' 'shopt -s expand_aliases; bash -l -c \"ruby\"' ", :in => script, :log => true).read
|
435
435
|
end
|
436
436
|
|
@@ -135,9 +135,14 @@ module Misc
|
|
135
135
|
end
|
136
136
|
|
137
137
|
new_options
|
138
|
+
|
139
|
+
options.replace new_options
|
138
140
|
end
|
139
141
|
|
140
142
|
def self.process_options(hash, *keys)
|
143
|
+
defaults = keys.pop if Hash === keys.last
|
144
|
+
hahs = Misc.add_defaults hash, defaults if defaults
|
145
|
+
|
141
146
|
if keys.length == 1
|
142
147
|
hash.include?(keys.first.to_sym) ? hash.delete(keys.first.to_sym) : hash.delete(keys.first.to_s)
|
143
148
|
else
|
@@ -99,7 +99,7 @@ class Step
|
|
99
99
|
if String === value && File.exists?(value)
|
100
100
|
Open.ln_s(value, path)
|
101
101
|
else
|
102
|
-
Open.write(path + '.yaml', value.to_yaml)
|
102
|
+
Open.write(path + '.yaml', value.to_s.to_yaml)
|
103
103
|
end
|
104
104
|
when Array === value
|
105
105
|
Open.write(path, value.collect{|v| Step === v ? v.path : v.to_s} * "\n")
|
data/lib/rbbt/workflow/usage.rb
CHANGED
@@ -6,19 +6,6 @@ module Task
|
|
6
6
|
puts "\n" << Misc.format_paragraph(description.strip) << "\n" if description and not description.empty?
|
7
7
|
puts
|
8
8
|
|
9
|
-
case
|
10
|
-
when (input_types.values & [:array]).any?
|
11
|
-
puts Log.color(:green, Misc.format_paragraph("Lists are specified as arguments using ',' or '|'. When specified as files the '\\n'
|
12
|
-
also works in addition to the others. You may use the '--array_separator' option
|
13
|
-
the change this default. Whenever a file is specified it may also accept STDIN using
|
14
|
-
the '-' character."))
|
15
|
-
puts
|
16
|
-
|
17
|
-
when (input_types.values & [:text, :tsv]).any?
|
18
|
-
puts Log.color(:green, Misc.format_paragraph("Whenever a file is specified it may also accept STDIN using the '-' character."))
|
19
|
-
puts
|
20
|
-
end
|
21
|
-
|
22
9
|
selects = []
|
23
10
|
if inputs.any?
|
24
11
|
inputs.zip(input_types.values_at(*inputs)).select{|i,t| t.to_sym == :select && input_options[i] && input_options[i][:select_options] }.each{|i,t| selects << [i, input_options[i][:select_options]] }
|
@@ -50,6 +37,19 @@ module Task
|
|
50
37
|
puts
|
51
38
|
end
|
52
39
|
|
40
|
+
case
|
41
|
+
when (input_types.values & [:array]).any?
|
42
|
+
puts Log.color(:green, Misc.format_paragraph("Lists are specified as arguments using ',' or '|'. When specified as files the '\\n'
|
43
|
+
also works in addition to the others. You may use the '--array_separator' option
|
44
|
+
the change this default. Whenever a file is specified it may also accept STDIN using
|
45
|
+
the '-' character."))
|
46
|
+
puts
|
47
|
+
|
48
|
+
when (input_types.values & [:text, :tsv]).any?
|
49
|
+
puts Log.color(:green, Misc.format_paragraph("Whenever a file is specified it may also accept STDIN using the '-' character."))
|
50
|
+
puts
|
51
|
+
end
|
52
|
+
|
53
53
|
puts Log.color(:magenta, "Returns: ") << Log.color(:blue, result_type.to_s) << "\n"
|
54
54
|
puts
|
55
55
|
|
data/share/config.ru
CHANGED
@@ -43,6 +43,9 @@ app_eval app, Rbbt.etc['app.d/remote_workflow_tasks.rb'].find_all
|
|
43
43
|
#{{{ BASE
|
44
44
|
app_eval app, Rbbt.etc['app.d/base.rb'].find
|
45
45
|
|
46
|
+
#{{{ SINATRA
|
47
|
+
load_file Rbbt.lib['sinatra.rb'].find_all
|
48
|
+
|
46
49
|
#{{{ RESOURCES
|
47
50
|
load_file Rbbt.etc['app.d/resources.rb'].find
|
48
51
|
|
@@ -70,9 +73,6 @@ load_file Rbbt.etc['app.d/preload.rb'].find_all
|
|
70
73
|
#{{{ PRELOAD
|
71
74
|
load_file Rbbt.etc['app.d/semaphores.rb'].find_all
|
72
75
|
|
73
|
-
#{{{ SINATRA
|
74
|
-
load_file Rbbt.lib['sinatra.rb'].find_all
|
75
|
-
|
76
76
|
Entity.entity_list_cache = Rbbt.var.sinatra.app[app_name].find.entity_lists
|
77
77
|
Entity.entity_map_cache = Rbbt.var.sinatra.app[app_name].find.entity_maps
|
78
78
|
Entity.entity_property_cache = Rbbt.var.sinatra.app[app_name].find.entity_properties
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'rbbt-util'
|
4
4
|
require 'rbbt/util/simpleopt'
|
5
|
+
require 'rbbt/hpc'
|
5
6
|
|
6
7
|
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
8
|
|
@@ -9,16 +10,17 @@ options = SOPT.setup <<EOF
|
|
9
10
|
|
10
11
|
Clean error or aborted jobs
|
11
12
|
|
12
|
-
$ rbbt
|
13
|
+
$ rbbt slurm clean [options]
|
13
14
|
|
14
15
|
-h--help Print this help
|
15
16
|
-d--done Done jobs only
|
16
17
|
-e--error Error jobs only
|
17
18
|
-a--aborted SLURM aboted jobs
|
19
|
+
-q--queued Queued jobs only
|
18
20
|
-j--job* Job ids
|
19
21
|
-s--search* Regular expression
|
20
22
|
-t--tail* Show the last lines of the STDERR
|
21
|
-
-
|
23
|
+
-BP--batch_parameters show batch parameters
|
22
24
|
-dr--dry_run Do not erase anything
|
23
25
|
EOF
|
24
26
|
|
@@ -31,14 +33,47 @@ if options[:help]
|
|
31
33
|
exit 0
|
32
34
|
end
|
33
35
|
|
36
|
+
batch_system = options.delete :batch_system
|
37
|
+
batch_system ||= 'auto'
|
38
|
+
|
39
|
+
HPC::BATCH_MODULE = case batch_system.to_s.downcase
|
40
|
+
when 'slurm'
|
41
|
+
HPC::SLURM
|
42
|
+
when 'lsf'
|
43
|
+
HPC::LSF
|
44
|
+
when 'auto'
|
45
|
+
case $previous_commands.last
|
46
|
+
when 'slurm'
|
47
|
+
HPC::SLURM
|
48
|
+
when 'lsf'
|
49
|
+
HPC::LSF
|
50
|
+
else
|
51
|
+
case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
|
52
|
+
when 'slurm'
|
53
|
+
HPC::SLURM
|
54
|
+
when 'lsf'
|
55
|
+
HPC::LSF
|
56
|
+
else
|
57
|
+
case ENV["BATCH_SYSTEM"].to_s.downcase
|
58
|
+
when 'slurm'
|
59
|
+
HPC::SLURM
|
60
|
+
when 'lsf'
|
61
|
+
HPC::LSF
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
68
|
+
|
34
69
|
Log.severity = 4
|
35
|
-
done, error, aborted, jobid, search, tail,
|
70
|
+
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run
|
36
71
|
|
37
|
-
workdir = File.expand_path('~/rbbt-
|
72
|
+
workdir = File.expand_path('~/rbbt-batch')
|
38
73
|
Path.setup(workdir)
|
39
74
|
|
40
75
|
running_jobs = begin
|
41
|
-
squeue_txt =
|
76
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
42
77
|
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
43
78
|
rescue
|
44
79
|
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
@@ -58,27 +93,35 @@ else
|
|
58
93
|
end
|
59
94
|
|
60
95
|
count = 0
|
61
|
-
workdir.glob("**/command.
|
96
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
62
97
|
dir = File.dirname(fcmd)
|
98
|
+
command_txt = Open.read(fcmd)
|
63
99
|
|
64
|
-
if m =
|
100
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
65
101
|
cmd = m[1]
|
66
102
|
else
|
67
103
|
cmd = nil
|
68
104
|
end
|
69
105
|
|
70
|
-
if m =
|
106
|
+
if m = command_txt.match(/# Run command\n(.*?)\n/im)
|
71
107
|
exe = m[1]
|
72
108
|
else
|
73
109
|
exe = nil
|
74
110
|
end
|
75
111
|
|
76
|
-
if m =
|
112
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
77
113
|
container_home = m[1]
|
78
114
|
else
|
79
115
|
container_home = nil
|
80
116
|
end
|
81
117
|
|
118
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
119
|
+
job_batch_system = m[1].downcase
|
120
|
+
else
|
121
|
+
job_batch_system = nil
|
122
|
+
end
|
123
|
+
|
124
|
+
different_system = job_batch_system != batch_system
|
82
125
|
|
83
126
|
if File.exists?(fid = File.join(dir, 'job.id'))
|
84
127
|
id = Open.read(fid).chomp
|
@@ -93,7 +136,16 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
93
136
|
end
|
94
137
|
|
95
138
|
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
96
|
-
|
139
|
+
fstatus_txt = Open.read(fstatus)
|
140
|
+
begin
|
141
|
+
if job_batch_system == "lsf"
|
142
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
143
|
+
else
|
144
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
145
|
+
end
|
146
|
+
rescue
|
147
|
+
nodes = []
|
148
|
+
end
|
97
149
|
elsif job_nodes[id]
|
98
150
|
nodes = job_nodes[id]
|
99
151
|
else
|
@@ -113,20 +165,36 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
113
165
|
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
114
166
|
|
115
167
|
aborted = error = true if aborted.nil? && error.nil?
|
116
|
-
if done || error || aborted || running || queued || jobid || search
|
168
|
+
#if done || error || aborted || running || queued || jobid || search
|
169
|
+
# select = false
|
170
|
+
# select = true if done && exit_status && exit_status.to_i == 0
|
171
|
+
# select = true if error && exit_status && exit_status.to_i != 0
|
172
|
+
# select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
173
|
+
# select = select && jobid.split(",").include?(id) if jobid
|
174
|
+
# select = select && cmd.match(/#{search}/) if search
|
175
|
+
# next unless select
|
176
|
+
#end
|
177
|
+
|
178
|
+
if done || error || aborted || queued || jobid
|
117
179
|
select = false
|
118
|
-
select = true if done && exit_status
|
119
|
-
select = true if error && exit_status && exit_status
|
180
|
+
select = true if done && exit_status == 0
|
181
|
+
select = true if error && exit_status && exit_status != 0
|
120
182
|
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
-
|
183
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
184
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
185
|
+
select = true if jobid && jobid.split(",").include?(id)
|
122
186
|
select = select && cmd.match(/#{search}/) if search
|
123
187
|
next unless select
|
188
|
+
elsif search
|
189
|
+
select = false
|
190
|
+
select = true if search && cmd.match(/#{search}/)
|
191
|
+
next unless select
|
124
192
|
end
|
125
193
|
|
126
194
|
|
127
195
|
puts Log.color(:yellow, "**ERASING**")
|
128
196
|
puts Log.color :blue, dir
|
129
|
-
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.
|
197
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
130
198
|
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
131
199
|
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
132
200
|
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
@@ -137,9 +205,14 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
137
205
|
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
138
206
|
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
139
207
|
|
140
|
-
if options[:
|
141
|
-
puts Log.color(:magenta, "
|
142
|
-
|
208
|
+
if options[:batch_parameters]
|
209
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
210
|
+
case job_batch_system
|
211
|
+
when 'slurm'
|
212
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
213
|
+
when 'lsf'
|
214
|
+
puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
215
|
+
end
|
143
216
|
end
|
144
217
|
|
145
218
|
if tail && File.exists?(File.join(dir, 'std.err'))
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'rbbt-util'
|
4
4
|
require 'rbbt/util/simpleopt'
|
5
|
+
require 'rbbt/hpc'
|
5
6
|
|
6
7
|
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
8
|
|
@@ -9,7 +10,7 @@ options = SOPT.setup <<EOF
|
|
9
10
|
|
10
11
|
Queue a job in Marenostrum
|
11
12
|
|
12
|
-
$ rbbt
|
13
|
+
$ rbbt slurm list [options]
|
13
14
|
|
14
15
|
-h--help Print this help
|
15
16
|
-d--done Done jobs only
|
@@ -21,9 +22,10 @@ $ rbbt mnl [options]
|
|
21
22
|
-s--search* Regular expression
|
22
23
|
-t--tail* Show the last lines of the STDERR
|
23
24
|
-p--progress Report progress of job and the dependencies
|
24
|
-
-
|
25
|
-
-
|
25
|
+
-BP--batch_parameters show batch parameters
|
26
|
+
-BPP--batch_procpath show Procpath performance summary
|
26
27
|
-sacct--sacct_peformance show sacct performance summary
|
28
|
+
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
|
27
29
|
EOF
|
28
30
|
|
29
31
|
if options[:help]
|
@@ -35,14 +37,48 @@ if options[:help]
|
|
35
37
|
exit 0
|
36
38
|
end
|
37
39
|
|
38
|
-
|
40
|
+
batch_system = options.delete :batch_system
|
41
|
+
batch_system ||= 'auto'
|
42
|
+
|
43
|
+
HPC::BATCH_MODULE = case batch_system.to_s.downcase
|
44
|
+
when 'slurm'
|
45
|
+
HPC::SLURM
|
46
|
+
when 'lsf'
|
47
|
+
HPC::LSF
|
48
|
+
when 'auto'
|
49
|
+
case $previous_commands.last
|
50
|
+
when 'slurm'
|
51
|
+
HPC::SLURM
|
52
|
+
when 'lsf'
|
53
|
+
HPC::LSF
|
54
|
+
else
|
55
|
+
case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
|
56
|
+
when 'slurm'
|
57
|
+
HPC::SLURM
|
58
|
+
when 'lsf'
|
59
|
+
HPC::LSF
|
60
|
+
else
|
61
|
+
case ENV["BATCH_SYSTEM"].to_s.downcase
|
62
|
+
when 'slurm'
|
63
|
+
HPC::SLURM
|
64
|
+
when 'lsf'
|
65
|
+
HPC::LSF
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
72
|
+
|
73
|
+
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
|
74
|
+
|
39
75
|
done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress
|
40
76
|
|
41
|
-
workdir = File.expand_path('~/rbbt-
|
77
|
+
workdir = File.expand_path('~/rbbt-batch')
|
42
78
|
Path.setup(workdir)
|
43
79
|
|
44
80
|
running_jobs = begin
|
45
|
-
squeue_txt =
|
81
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
46
82
|
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
47
83
|
rescue
|
48
84
|
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
@@ -62,35 +98,48 @@ else
|
|
62
98
|
end
|
63
99
|
|
64
100
|
count = 0
|
65
|
-
workdir.glob("**/command.
|
101
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
66
102
|
dir = File.dirname(fcmd)
|
103
|
+
command_txt = Open.read(fcmd)
|
67
104
|
|
68
|
-
if m =
|
105
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
69
106
|
cmd = m[1]
|
70
107
|
else
|
71
108
|
cmd = nil
|
72
109
|
end
|
73
110
|
|
74
|
-
if m =
|
111
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
112
|
+
job_batch_system = m[1].downcase
|
113
|
+
else
|
114
|
+
job_batch_system = nil
|
115
|
+
end
|
116
|
+
|
117
|
+
different_system = job_batch_system != batch_system
|
118
|
+
|
119
|
+
if m = command_txt.match(/#MANIFEST: (.*)/)
|
75
120
|
manifest = m[1]
|
76
121
|
else
|
77
122
|
manifest = nil
|
78
123
|
end
|
79
124
|
|
125
|
+
if m = command_txt.match(/#STEP_PATH: (.*)/)
|
126
|
+
step_path = m[1]
|
127
|
+
else
|
128
|
+
step_path = nil
|
129
|
+
end
|
80
130
|
|
81
|
-
if m =
|
82
|
-
exe = m[1]
|
131
|
+
if m = command_txt.match(/#EXEC_CMD: (.*)/)
|
132
|
+
exe = m[1]
|
83
133
|
else
|
84
134
|
exe = nil
|
85
135
|
end
|
86
136
|
|
87
|
-
if m =
|
137
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
88
138
|
container_home = m[1]
|
89
139
|
else
|
90
140
|
container_home = nil
|
91
141
|
end
|
92
142
|
|
93
|
-
|
94
143
|
if File.exists?(fid = File.join(dir, 'job.id'))
|
95
144
|
id = Open.read(fid).chomp
|
96
145
|
else
|
@@ -104,11 +153,20 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
104
153
|
end
|
105
154
|
|
106
155
|
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
107
|
-
|
156
|
+
fstatus_txt = Open.read(fstatus)
|
157
|
+
begin
|
158
|
+
if job_batch_system == "lsf"
|
159
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
160
|
+
else
|
161
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
162
|
+
end
|
163
|
+
rescue
|
164
|
+
nodes = []
|
165
|
+
end
|
108
166
|
elsif job_nodes[id]
|
109
|
-
|
167
|
+
nodes = job_nodes[id].reject{|n| n.include? "("}
|
110
168
|
else
|
111
|
-
|
169
|
+
nodes = []
|
112
170
|
end
|
113
171
|
|
114
172
|
if File.exists?(File.join(dir, 'exit.status'))
|
@@ -137,7 +195,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
137
195
|
select = true if done && exit_status == 0
|
138
196
|
select = true if error && exit_status && exit_status != 0
|
139
197
|
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
140
|
-
is_running = exit_status.nil? && running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)
|
198
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
141
199
|
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
142
200
|
select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
143
201
|
select = true if jobid && jobid.split(",").include?(id)
|
@@ -151,29 +209,39 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
151
209
|
|
152
210
|
|
153
211
|
puts Log.color :blue, dir
|
154
|
-
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.
|
212
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
155
213
|
puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err'))
|
156
214
|
puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest)
|
215
|
+
puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
|
157
216
|
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
158
217
|
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
159
218
|
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
160
219
|
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
161
|
-
|
220
|
+
if different_system
|
221
|
+
puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
|
222
|
+
else
|
223
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
224
|
+
end
|
162
225
|
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
163
226
|
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
164
227
|
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
165
228
|
puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
|
166
229
|
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exists?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
167
230
|
|
168
|
-
if options[:
|
169
|
-
puts Log.color(:magenta, "
|
170
|
-
|
231
|
+
if options[:batch_parameters]
|
232
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
233
|
+
case job_batch_system
|
234
|
+
when 'slurm'
|
235
|
+
text = CMD.cmd('grep "^#SBATCH" |tail -n +5', :in => Open.read(fcmd)).read.strip
|
236
|
+
when 'lsf'
|
237
|
+
text = CMD.cmd('grep "^#BSUB" |tail -n +5', :in => Open.read(fcmd)).read.strip
|
238
|
+
end
|
171
239
|
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
172
240
|
puts Log.color :yellow, lines * "\n"
|
173
241
|
end
|
174
242
|
|
175
243
|
fprocpath = File.join(dir, 'procpath.sqlite3')
|
176
|
-
if options[:
|
244
|
+
if options[:batch_procpath] && Open.exists?(fprocpath)
|
177
245
|
puts Log.color(:magenta, "Procpath summary: ")
|
178
246
|
require 'rbbt/tsv/csv'
|
179
247
|
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
@@ -215,13 +283,15 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
215
283
|
|
216
284
|
if options[:sacct_peformance]
|
217
285
|
begin
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
286
|
+
raise "sacct not supported for LSF" unless batch_system == 'slurm'
|
287
|
+
tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
|
288
|
+
values = tsv[tsv.keys.first]
|
289
|
+
if values.compact.any?
|
290
|
+
puts Log.color(:magenta, "SACCT performance: ")
|
291
|
+
puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
|
292
|
+
end
|
224
293
|
rescue
|
294
|
+
Log.warn $!.message
|
225
295
|
end
|
226
296
|
end
|
227
297
|
|
@@ -247,7 +317,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
247
317
|
step = Step.new step_path
|
248
318
|
step.load_dependencies_from_info
|
249
319
|
(step.rec_dependencies + [step]).reverse.each do |j|
|
250
|
-
next if j.done?
|
320
|
+
next if j.done?
|
251
321
|
next unless j.file(:progress).exists?
|
252
322
|
bar = Log::ProgressBar.new
|
253
323
|
bar.load(j.file(:progress).yaml)
|