rbbt-util 5.30.13 → 5.31.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +3 -0
- data/lib/rbbt/hpc/batch.rb +623 -0
- data/lib/rbbt/hpc/lsf.rb +119 -0
- data/lib/rbbt/hpc/orchestrate.rb +12 -11
- data/lib/rbbt/hpc/slurm.rb +62 -567
- data/lib/rbbt/resource/path.rb +3 -1
- data/lib/rbbt/tsv/accessor.rb +5 -2
- data/lib/rbbt/tsv/dumper.rb +1 -0
- data/lib/rbbt/tsv/parallel/traverse.rb +1 -1
- data/lib/rbbt/tsv/stream.rb +5 -6
- data/lib/rbbt/util/log.rb +22 -1
- data/lib/rbbt/util/misc/development.rb +2 -2
- data/lib/rbbt/util/misc/options.rb +5 -0
- data/lib/rbbt/workflow/step/accessor.rb +1 -1
- data/lib/rbbt/workflow/usage.rb +13 -13
- data/share/config.ru +3 -3
- data/share/rbbt_commands/{slurm → hpc}/clean +91 -18
- data/share/rbbt_commands/{slurm → hpc}/list +100 -30
- data/share/rbbt_commands/hpc/orchestrate +81 -0
- data/share/rbbt_commands/hpc/tail +81 -0
- data/share/rbbt_commands/hpc/task +80 -0
- data/test/rbbt/hpc/test_batch.rb +65 -0
- data/test/rbbt/hpc/test_slurm.rb +30 -0
- data/test/rbbt/util/misc/test_development.rb +11 -0
- data/test/test_helper.rb +3 -1
- metadata +16 -7
- data/share/rbbt_commands/slurm/orchestrate +0 -48
- data/share/rbbt_commands/slurm/task +0 -46
data/lib/rbbt/resource/path.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/util/misc/indiferent_hash'
|
|
3
3
|
require 'yaml'
|
4
4
|
|
5
5
|
module Path
|
6
|
-
attr_accessor :resource, :pkgdir, :original, :search_paths, :search_order, :libdir
|
6
|
+
attr_accessor :resource, :pkgdir, :original, :search_paths, :search_order, :libdir, :where
|
7
7
|
|
8
8
|
def self.setup(string, pkgdir = nil, resource = nil, search_paths = nil, search_order = nil, libdir = nil)
|
9
9
|
return string if string.nil?
|
@@ -99,6 +99,7 @@ module Path
|
|
99
99
|
|
100
100
|
paths = paths.each do |p|
|
101
101
|
p.original = File.join(found.original, p.sub(/^#{found}/, ''))
|
102
|
+
p.where = where
|
102
103
|
end if found.original and pattern
|
103
104
|
|
104
105
|
location_paths[where] = paths
|
@@ -256,6 +257,7 @@ module Path
|
|
256
257
|
end
|
257
258
|
|
258
259
|
res.original = self.original || self
|
260
|
+
res.where = where
|
259
261
|
|
260
262
|
res
|
261
263
|
end
|
data/lib/rbbt/tsv/accessor.rb
CHANGED
@@ -713,8 +713,11 @@ module TSV
|
|
713
713
|
break
|
714
714
|
end
|
715
715
|
|
716
|
-
filename =
|
717
|
-
filename
|
716
|
+
filename = @filename
|
717
|
+
filename = "No filename" if filename.nil? || filename.empty?
|
718
|
+
filename.find if Path === filename
|
719
|
+
filename = File.basename(filename) + " [" + File.basename(persistence_path) + "]" if respond_to?(:persistence_path) and persistence_path
|
720
|
+
|
718
721
|
with_unnamed do
|
719
722
|
<<-EOF
|
720
723
|
Filename = #{filename}
|
data/lib/rbbt/tsv/dumper.rb
CHANGED
@@ -610,7 +610,7 @@ module TSV
|
|
610
610
|
thread = Thread.new do
|
611
611
|
begin
|
612
612
|
traverse_run(obj, threads, cpus, options, &block)
|
613
|
-
into.close if into.respond_to?(:close) and not (into.respond_to?
|
613
|
+
into.close if into.respond_to?(:close) and not (into.respond_to?(:closed?) and into.closed?)
|
614
614
|
rescue Exception
|
615
615
|
abort_stream obj
|
616
616
|
abort_stream into
|
data/lib/rbbt/tsv/stream.rb
CHANGED
@@ -294,20 +294,19 @@ module TSV
|
|
294
294
|
end
|
295
295
|
|
296
296
|
|
297
|
-
def self.reorder_stream_tsv(stream, key_field, fields=nil, zipped = true)
|
297
|
+
def self.reorder_stream_tsv(stream, key_field, fields=nil, zipped = true, bar = nil)
|
298
298
|
parser = TSV::Parser.new TSV.get_stream(stream), :key_field => key_field, :fields => fields
|
299
299
|
dumper_options = parser.options
|
300
300
|
dumper = TSV::Dumper.new dumper_options
|
301
301
|
dumper.init
|
302
302
|
case parser.type
|
303
303
|
when :single
|
304
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
304
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
305
305
|
key = keys.first
|
306
306
|
[key, [values]]
|
307
307
|
end
|
308
308
|
when :double
|
309
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
310
|
-
raise [keys, values].inspect if keys.include? 'gain'
|
309
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
311
310
|
res = []
|
312
311
|
keys.each_with_index do |key,i|
|
313
312
|
vs = zipped ? values.collect{|l| l.length == 1 ? l : [l[i]] } : values
|
@@ -317,12 +316,12 @@ module TSV
|
|
317
316
|
res
|
318
317
|
end
|
319
318
|
when :list
|
320
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
319
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
321
320
|
key = keys === Array ? keys.first : keys
|
322
321
|
[key, values]
|
323
322
|
end
|
324
323
|
when :flat
|
325
|
-
TSV.traverse parser, :into => dumper do |keys,values|
|
324
|
+
TSV.traverse parser, :into => dumper, :bar => bar do |keys,values|
|
326
325
|
key = keys === Array ? keys.first : keys
|
327
326
|
[key, values]
|
328
327
|
end
|
data/lib/rbbt/util/log.rb
CHANGED
@@ -341,10 +341,31 @@ module Log
|
|
341
341
|
end unless stack.nil?
|
342
342
|
end
|
343
343
|
|
344
|
-
def self.tsv(tsv)
|
344
|
+
def self.tsv(tsv, example = false)
|
345
345
|
STDERR.puts Log.color :magenta, "TSV log: " << Log.last_caller(caller).gsub('`',"'")
|
346
346
|
STDERR.puts Log.color(:blue, "=> "<< Misc.fingerprint(tsv), true)
|
347
347
|
STDERR.puts Log.color(:cyan, "=> " << tsv.summary)
|
348
|
+
if example && ! tsv.empty?
|
349
|
+
key = case example
|
350
|
+
when TrueClass, :first, "first"
|
351
|
+
tsv.keys.first
|
352
|
+
when :random, "random"
|
353
|
+
tsv.keys.shuffle.first
|
354
|
+
else
|
355
|
+
example
|
356
|
+
end
|
357
|
+
|
358
|
+
values = tsv[key]
|
359
|
+
values = [values] if tsv.type == :flat || tsv.type == :single
|
360
|
+
if values.nil?
|
361
|
+
STDERR.puts Log.color(:blue, "Key (#{tsv.key_field}) not present: ") + key
|
362
|
+
else
|
363
|
+
STDERR.puts Log.color(:blue, "Key (#{tsv.key_field}): ") + key
|
364
|
+
tsv.fields.zip(values).each do |field,value|
|
365
|
+
STDERR.puts Log.color(:magenta, field + ": ") + (Array === value ? value * ", " : value.to_s)
|
366
|
+
end
|
367
|
+
end
|
368
|
+
end
|
348
369
|
end
|
349
370
|
|
350
371
|
def self.stack(stack)
|
@@ -428,9 +428,9 @@ def self.add_libdir(dir=nil)
|
|
428
428
|
end
|
429
429
|
end
|
430
430
|
|
431
|
-
|
432
|
-
def self.ssh_run(server, script)
|
431
|
+
def self.ssh_run(server, script = nil, &block)
|
433
432
|
Log.debug "Run ssh script in #{server}:\n#{script}"
|
433
|
+
|
434
434
|
CMD.cmd("ssh '#{server}' 'shopt -s expand_aliases; bash -l -c \"ruby\"' ", :in => script, :log => true).read
|
435
435
|
end
|
436
436
|
|
@@ -135,9 +135,14 @@ module Misc
|
|
135
135
|
end
|
136
136
|
|
137
137
|
new_options
|
138
|
+
|
139
|
+
options.replace new_options
|
138
140
|
end
|
139
141
|
|
140
142
|
def self.process_options(hash, *keys)
|
143
|
+
defaults = keys.pop if Hash === keys.last
|
144
|
+
hahs = Misc.add_defaults hash, defaults if defaults
|
145
|
+
|
141
146
|
if keys.length == 1
|
142
147
|
hash.include?(keys.first.to_sym) ? hash.delete(keys.first.to_sym) : hash.delete(keys.first.to_s)
|
143
148
|
else
|
@@ -99,7 +99,7 @@ class Step
|
|
99
99
|
if String === value && File.exists?(value)
|
100
100
|
Open.ln_s(value, path)
|
101
101
|
else
|
102
|
-
Open.write(path + '.yaml', value.to_yaml)
|
102
|
+
Open.write(path + '.yaml', value.to_s.to_yaml)
|
103
103
|
end
|
104
104
|
when Array === value
|
105
105
|
Open.write(path, value.collect{|v| Step === v ? v.path : v.to_s} * "\n")
|
data/lib/rbbt/workflow/usage.rb
CHANGED
@@ -6,19 +6,6 @@ module Task
|
|
6
6
|
puts "\n" << Misc.format_paragraph(description.strip) << "\n" if description and not description.empty?
|
7
7
|
puts
|
8
8
|
|
9
|
-
case
|
10
|
-
when (input_types.values & [:array]).any?
|
11
|
-
puts Log.color(:green, Misc.format_paragraph("Lists are specified as arguments using ',' or '|'. When specified as files the '\\n'
|
12
|
-
also works in addition to the others. You may use the '--array_separator' option
|
13
|
-
the change this default. Whenever a file is specified it may also accept STDIN using
|
14
|
-
the '-' character."))
|
15
|
-
puts
|
16
|
-
|
17
|
-
when (input_types.values & [:text, :tsv]).any?
|
18
|
-
puts Log.color(:green, Misc.format_paragraph("Whenever a file is specified it may also accept STDIN using the '-' character."))
|
19
|
-
puts
|
20
|
-
end
|
21
|
-
|
22
9
|
selects = []
|
23
10
|
if inputs.any?
|
24
11
|
inputs.zip(input_types.values_at(*inputs)).select{|i,t| t.to_sym == :select && input_options[i] && input_options[i][:select_options] }.each{|i,t| selects << [i, input_options[i][:select_options]] }
|
@@ -50,6 +37,19 @@ module Task
|
|
50
37
|
puts
|
51
38
|
end
|
52
39
|
|
40
|
+
case
|
41
|
+
when (input_types.values & [:array]).any?
|
42
|
+
puts Log.color(:green, Misc.format_paragraph("Lists are specified as arguments using ',' or '|'. When specified as files the '\\n'
|
43
|
+
also works in addition to the others. You may use the '--array_separator' option
|
44
|
+
the change this default. Whenever a file is specified it may also accept STDIN using
|
45
|
+
the '-' character."))
|
46
|
+
puts
|
47
|
+
|
48
|
+
when (input_types.values & [:text, :tsv]).any?
|
49
|
+
puts Log.color(:green, Misc.format_paragraph("Whenever a file is specified it may also accept STDIN using the '-' character."))
|
50
|
+
puts
|
51
|
+
end
|
52
|
+
|
53
53
|
puts Log.color(:magenta, "Returns: ") << Log.color(:blue, result_type.to_s) << "\n"
|
54
54
|
puts
|
55
55
|
|
data/share/config.ru
CHANGED
@@ -43,6 +43,9 @@ app_eval app, Rbbt.etc['app.d/remote_workflow_tasks.rb'].find_all
|
|
43
43
|
#{{{ BASE
|
44
44
|
app_eval app, Rbbt.etc['app.d/base.rb'].find
|
45
45
|
|
46
|
+
#{{{ SINATRA
|
47
|
+
load_file Rbbt.lib['sinatra.rb'].find_all
|
48
|
+
|
46
49
|
#{{{ RESOURCES
|
47
50
|
load_file Rbbt.etc['app.d/resources.rb'].find
|
48
51
|
|
@@ -70,9 +73,6 @@ load_file Rbbt.etc['app.d/preload.rb'].find_all
|
|
70
73
|
#{{{ PRELOAD
|
71
74
|
load_file Rbbt.etc['app.d/semaphores.rb'].find_all
|
72
75
|
|
73
|
-
#{{{ SINATRA
|
74
|
-
load_file Rbbt.lib['sinatra.rb'].find_all
|
75
|
-
|
76
76
|
Entity.entity_list_cache = Rbbt.var.sinatra.app[app_name].find.entity_lists
|
77
77
|
Entity.entity_map_cache = Rbbt.var.sinatra.app[app_name].find.entity_maps
|
78
78
|
Entity.entity_property_cache = Rbbt.var.sinatra.app[app_name].find.entity_properties
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'rbbt-util'
|
4
4
|
require 'rbbt/util/simpleopt'
|
5
|
+
require 'rbbt/hpc'
|
5
6
|
|
6
7
|
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
8
|
|
@@ -9,16 +10,17 @@ options = SOPT.setup <<EOF
|
|
9
10
|
|
10
11
|
Clean error or aborted jobs
|
11
12
|
|
12
|
-
$ rbbt
|
13
|
+
$ rbbt slurm clean [options]
|
13
14
|
|
14
15
|
-h--help Print this help
|
15
16
|
-d--done Done jobs only
|
16
17
|
-e--error Error jobs only
|
17
18
|
-a--aborted SLURM aboted jobs
|
19
|
+
-q--queued Queued jobs only
|
18
20
|
-j--job* Job ids
|
19
21
|
-s--search* Regular expression
|
20
22
|
-t--tail* Show the last lines of the STDERR
|
21
|
-
-
|
23
|
+
-BP--batch_parameters show batch parameters
|
22
24
|
-dr--dry_run Do not erase anything
|
23
25
|
EOF
|
24
26
|
|
@@ -31,14 +33,47 @@ if options[:help]
|
|
31
33
|
exit 0
|
32
34
|
end
|
33
35
|
|
36
|
+
batch_system = options.delete :batch_system
|
37
|
+
batch_system ||= 'auto'
|
38
|
+
|
39
|
+
HPC::BATCH_MODULE = case batch_system.to_s.downcase
|
40
|
+
when 'slurm'
|
41
|
+
HPC::SLURM
|
42
|
+
when 'lsf'
|
43
|
+
HPC::LSF
|
44
|
+
when 'auto'
|
45
|
+
case $previous_commands.last
|
46
|
+
when 'slurm'
|
47
|
+
HPC::SLURM
|
48
|
+
when 'lsf'
|
49
|
+
HPC::LSF
|
50
|
+
else
|
51
|
+
case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
|
52
|
+
when 'slurm'
|
53
|
+
HPC::SLURM
|
54
|
+
when 'lsf'
|
55
|
+
HPC::LSF
|
56
|
+
else
|
57
|
+
case ENV["BATCH_SYSTEM"].to_s.downcase
|
58
|
+
when 'slurm'
|
59
|
+
HPC::SLURM
|
60
|
+
when 'lsf'
|
61
|
+
HPC::LSF
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
68
|
+
|
34
69
|
Log.severity = 4
|
35
|
-
done, error, aborted, jobid, search, tail,
|
70
|
+
done, error, aborted, queued, jobid, search, tail, batch_parameters, dry_run = options.values_at :done, :error, :aborted, :queued, :job, :search, :tail, :batch_parameters, :dry_run
|
36
71
|
|
37
|
-
workdir = File.expand_path('~/rbbt-
|
72
|
+
workdir = File.expand_path('~/rbbt-batch')
|
38
73
|
Path.setup(workdir)
|
39
74
|
|
40
75
|
running_jobs = begin
|
41
|
-
squeue_txt =
|
76
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
42
77
|
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
43
78
|
rescue
|
44
79
|
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
@@ -58,27 +93,35 @@ else
|
|
58
93
|
end
|
59
94
|
|
60
95
|
count = 0
|
61
|
-
workdir.glob("**/command.
|
96
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
62
97
|
dir = File.dirname(fcmd)
|
98
|
+
command_txt = Open.read(fcmd)
|
63
99
|
|
64
|
-
if m =
|
100
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
65
101
|
cmd = m[1]
|
66
102
|
else
|
67
103
|
cmd = nil
|
68
104
|
end
|
69
105
|
|
70
|
-
if m =
|
106
|
+
if m = command_txt.match(/# Run command\n(.*?)\n/im)
|
71
107
|
exe = m[1]
|
72
108
|
else
|
73
109
|
exe = nil
|
74
110
|
end
|
75
111
|
|
76
|
-
if m =
|
112
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
77
113
|
container_home = m[1]
|
78
114
|
else
|
79
115
|
container_home = nil
|
80
116
|
end
|
81
117
|
|
118
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
119
|
+
job_batch_system = m[1].downcase
|
120
|
+
else
|
121
|
+
job_batch_system = nil
|
122
|
+
end
|
123
|
+
|
124
|
+
different_system = job_batch_system != batch_system
|
82
125
|
|
83
126
|
if File.exists?(fid = File.join(dir, 'job.id'))
|
84
127
|
id = Open.read(fid).chomp
|
@@ -93,7 +136,16 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
93
136
|
end
|
94
137
|
|
95
138
|
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
96
|
-
|
139
|
+
fstatus_txt = Open.read(fstatus)
|
140
|
+
begin
|
141
|
+
if job_batch_system == "lsf"
|
142
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
143
|
+
else
|
144
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
145
|
+
end
|
146
|
+
rescue
|
147
|
+
nodes = []
|
148
|
+
end
|
97
149
|
elsif job_nodes[id]
|
98
150
|
nodes = job_nodes[id]
|
99
151
|
else
|
@@ -113,20 +165,36 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
113
165
|
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
114
166
|
|
115
167
|
aborted = error = true if aborted.nil? && error.nil?
|
116
|
-
if done || error || aborted || running || queued || jobid || search
|
168
|
+
#if done || error || aborted || running || queued || jobid || search
|
169
|
+
# select = false
|
170
|
+
# select = true if done && exit_status && exit_status.to_i == 0
|
171
|
+
# select = true if error && exit_status && exit_status.to_i != 0
|
172
|
+
# select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
173
|
+
# select = select && jobid.split(",").include?(id) if jobid
|
174
|
+
# select = select && cmd.match(/#{search}/) if search
|
175
|
+
# next unless select
|
176
|
+
#end
|
177
|
+
|
178
|
+
if done || error || aborted || queued || jobid
|
117
179
|
select = false
|
118
|
-
select = true if done && exit_status
|
119
|
-
select = true if error && exit_status && exit_status
|
180
|
+
select = true if done && exit_status == 0
|
181
|
+
select = true if error && exit_status && exit_status != 0
|
120
182
|
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
-
|
183
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
184
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
185
|
+
select = true if jobid && jobid.split(",").include?(id)
|
122
186
|
select = select && cmd.match(/#{search}/) if search
|
123
187
|
next unless select
|
188
|
+
elsif search
|
189
|
+
select = false
|
190
|
+
select = true if search && cmd.match(/#{search}/)
|
191
|
+
next unless select
|
124
192
|
end
|
125
193
|
|
126
194
|
|
127
195
|
puts Log.color(:yellow, "**ERASING**")
|
128
196
|
puts Log.color :blue, dir
|
129
|
-
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.
|
197
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
130
198
|
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
131
199
|
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
132
200
|
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
@@ -137,9 +205,14 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
137
205
|
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
138
206
|
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
139
207
|
|
140
|
-
if options[:
|
141
|
-
puts Log.color(:magenta, "
|
142
|
-
|
208
|
+
if options[:batch_parameters]
|
209
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
210
|
+
case job_batch_system
|
211
|
+
when 'slurm'
|
212
|
+
puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
213
|
+
when 'lsf'
|
214
|
+
puts Log.color :blue, CMD.cmd('grep "^#BSUB" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
215
|
+
end
|
143
216
|
end
|
144
217
|
|
145
218
|
if tail && File.exists?(File.join(dir, 'std.err'))
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'rbbt-util'
|
4
4
|
require 'rbbt/util/simpleopt'
|
5
|
+
require 'rbbt/hpc'
|
5
6
|
|
6
7
|
#$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
7
8
|
|
@@ -9,7 +10,7 @@ options = SOPT.setup <<EOF
|
|
9
10
|
|
10
11
|
Queue a job in Marenostrum
|
11
12
|
|
12
|
-
$ rbbt
|
13
|
+
$ rbbt slurm list [options]
|
13
14
|
|
14
15
|
-h--help Print this help
|
15
16
|
-d--done Done jobs only
|
@@ -21,9 +22,10 @@ $ rbbt mnl [options]
|
|
21
22
|
-s--search* Regular expression
|
22
23
|
-t--tail* Show the last lines of the STDERR
|
23
24
|
-p--progress Report progress of job and the dependencies
|
24
|
-
-
|
25
|
-
-
|
25
|
+
-BP--batch_parameters show batch parameters
|
26
|
+
-BPP--batch_procpath show Procpath performance summary
|
26
27
|
-sacct--sacct_peformance show sacct performance summary
|
28
|
+
-bs--batch_system* Batch system to use: auto, lsf, slurm (default is auto-detect)
|
27
29
|
EOF
|
28
30
|
|
29
31
|
if options[:help]
|
@@ -35,14 +37,48 @@ if options[:help]
|
|
35
37
|
exit 0
|
36
38
|
end
|
37
39
|
|
38
|
-
|
40
|
+
batch_system = options.delete :batch_system
|
41
|
+
batch_system ||= 'auto'
|
42
|
+
|
43
|
+
HPC::BATCH_MODULE = case batch_system.to_s.downcase
|
44
|
+
when 'slurm'
|
45
|
+
HPC::SLURM
|
46
|
+
when 'lsf'
|
47
|
+
HPC::LSF
|
48
|
+
when 'auto'
|
49
|
+
case $previous_commands.last
|
50
|
+
when 'slurm'
|
51
|
+
HPC::SLURM
|
52
|
+
when 'lsf'
|
53
|
+
HPC::LSF
|
54
|
+
else
|
55
|
+
case Rbbt::Config.get(:batch_system, :batch, :batch_system, :hpc, :HPC, :BATCH).to_s.downcase
|
56
|
+
when 'slurm'
|
57
|
+
HPC::SLURM
|
58
|
+
when 'lsf'
|
59
|
+
HPC::LSF
|
60
|
+
else
|
61
|
+
case ENV["BATCH_SYSTEM"].to_s.downcase
|
62
|
+
when 'slurm'
|
63
|
+
HPC::SLURM
|
64
|
+
when 'lsf'
|
65
|
+
HPC::LSF
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
raise ParameterException.new("Could not detect batch_system: #{Misc.fingerprint batch_system}") if HPC::BATCH_MODULE.nil?
|
72
|
+
|
73
|
+
batch_system = HPC::BATCH_MODULE.to_s.split("::").last.downcase
|
74
|
+
|
39
75
|
done, error, running, queued, aborted, jobid, search, tail, progress = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail, :progress
|
40
76
|
|
41
|
-
workdir = File.expand_path('~/rbbt-
|
77
|
+
workdir = File.expand_path('~/rbbt-batch')
|
42
78
|
Path.setup(workdir)
|
43
79
|
|
44
80
|
running_jobs = begin
|
45
|
-
squeue_txt =
|
81
|
+
squeue_txt = HPC::BATCH_MODULE.job_status
|
46
82
|
squeue_txt.split("\n").collect{|l| l.to_i.to_s}
|
47
83
|
rescue
|
48
84
|
Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
|
@@ -62,35 +98,48 @@ else
|
|
62
98
|
end
|
63
99
|
|
64
100
|
count = 0
|
65
|
-
workdir.glob("**/command.
|
101
|
+
workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
66
102
|
dir = File.dirname(fcmd)
|
103
|
+
command_txt = Open.read(fcmd)
|
67
104
|
|
68
|
-
if m =
|
105
|
+
if m = command_txt.match(/#CMD: (.*)/)
|
69
106
|
cmd = m[1]
|
70
107
|
else
|
71
108
|
cmd = nil
|
72
109
|
end
|
73
110
|
|
74
|
-
if m =
|
111
|
+
if m = command_txt.match(/^BATCH_SYSTEM=(.*)/)
|
112
|
+
job_batch_system = m[1].downcase
|
113
|
+
else
|
114
|
+
job_batch_system = nil
|
115
|
+
end
|
116
|
+
|
117
|
+
different_system = job_batch_system != batch_system
|
118
|
+
|
119
|
+
if m = command_txt.match(/#MANIFEST: (.*)/)
|
75
120
|
manifest = m[1]
|
76
121
|
else
|
77
122
|
manifest = nil
|
78
123
|
end
|
79
124
|
|
125
|
+
if m = command_txt.match(/#STEP_PATH: (.*)/)
|
126
|
+
step_path = m[1]
|
127
|
+
else
|
128
|
+
step_path = nil
|
129
|
+
end
|
80
130
|
|
81
|
-
if m =
|
82
|
-
exe = m[1]
|
131
|
+
if m = command_txt.match(/#EXEC_CMD: (.*)/)
|
132
|
+
exe = m[1]
|
83
133
|
else
|
84
134
|
exe = nil
|
85
135
|
end
|
86
136
|
|
87
|
-
if m =
|
137
|
+
if m = command_txt.match(/^CONTAINER_DIR=(.*)/)
|
88
138
|
container_home = m[1]
|
89
139
|
else
|
90
140
|
container_home = nil
|
91
141
|
end
|
92
142
|
|
93
|
-
|
94
143
|
if File.exists?(fid = File.join(dir, 'job.id'))
|
95
144
|
id = Open.read(fid).chomp
|
96
145
|
else
|
@@ -104,11 +153,20 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
104
153
|
end
|
105
154
|
|
106
155
|
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
107
|
-
|
156
|
+
fstatus_txt = Open.read(fstatus)
|
157
|
+
begin
|
158
|
+
if job_batch_system == "lsf"
|
159
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/)[5].split(",")
|
160
|
+
else
|
161
|
+
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
162
|
+
end
|
163
|
+
rescue
|
164
|
+
nodes = []
|
165
|
+
end
|
108
166
|
elsif job_nodes[id]
|
109
|
-
|
167
|
+
nodes = job_nodes[id].reject{|n| n.include? "("}
|
110
168
|
else
|
111
|
-
|
169
|
+
nodes = []
|
112
170
|
end
|
113
171
|
|
114
172
|
if File.exists?(File.join(dir, 'exit.status'))
|
@@ -137,7 +195,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
137
195
|
select = true if done && exit_status == 0
|
138
196
|
select = true if error && exit_status && exit_status != 0
|
139
197
|
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
140
|
-
is_running = exit_status.nil? && running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)
|
198
|
+
is_running = exit_status.nil? && ( (running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)) || different_system )
|
141
199
|
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
142
200
|
select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
143
201
|
select = true if jobid && jobid.split(",").include?(id)
|
@@ -151,29 +209,39 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
151
209
|
|
152
210
|
|
153
211
|
puts Log.color :blue, dir
|
154
|
-
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.
|
212
|
+
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.batch')).to_s
|
155
213
|
puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err'))
|
156
214
|
puts Log.color(:magenta, "Manifest: ") << Log.color(:yellow, manifest)
|
215
|
+
puts Log.color(:magenta, "Step path: ") << Log.color(:yellow, step_path)
|
157
216
|
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
158
217
|
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
159
218
|
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
160
219
|
puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
|
161
|
-
|
220
|
+
if different_system
|
221
|
+
puts Log.color(:magenta, "Job ID (#{Log.color(:red, job_batch_system)}): ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id) )
|
222
|
+
else
|
223
|
+
puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
|
224
|
+
end
|
162
225
|
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
163
226
|
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
164
227
|
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
165
228
|
puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
|
166
229
|
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exists?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
167
230
|
|
168
|
-
if options[:
|
169
|
-
puts Log.color(:magenta, "
|
170
|
-
|
231
|
+
if options[:batch_parameters]
|
232
|
+
puts Log.color(:magenta, "BATCH parameters: ")
|
233
|
+
case job_batch_system
|
234
|
+
when 'slurm'
|
235
|
+
text = CMD.cmd('grep "^#SBATCH" |tail -n +5', :in => Open.read(fcmd)).read.strip
|
236
|
+
when 'lsf'
|
237
|
+
text = CMD.cmd('grep "^#BSUB" |tail -n +5', :in => Open.read(fcmd)).read.strip
|
238
|
+
end
|
171
239
|
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
172
240
|
puts Log.color :yellow, lines * "\n"
|
173
241
|
end
|
174
242
|
|
175
243
|
fprocpath = File.join(dir, 'procpath.sqlite3')
|
176
|
-
if options[:
|
244
|
+
if options[:batch_procpath] && Open.exists?(fprocpath)
|
177
245
|
puts Log.color(:magenta, "Procpath summary: ")
|
178
246
|
require 'rbbt/tsv/csv'
|
179
247
|
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
@@ -215,13 +283,15 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
215
283
|
|
216
284
|
if options[:sacct_peformance]
|
217
285
|
begin
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
286
|
+
raise "sacct not supported for LSF" unless batch_system == 'slurm'
|
287
|
+
tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
|
288
|
+
values = tsv[tsv.keys.first]
|
289
|
+
if values.compact.any?
|
290
|
+
puts Log.color(:magenta, "SACCT performance: ")
|
291
|
+
puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
|
292
|
+
end
|
224
293
|
rescue
|
294
|
+
Log.warn $!.message
|
225
295
|
end
|
226
296
|
end
|
227
297
|
|
@@ -247,7 +317,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
247
317
|
step = Step.new step_path
|
248
318
|
step.load_dependencies_from_info
|
249
319
|
(step.rec_dependencies + [step]).reverse.each do |j|
|
250
|
-
next if j.done?
|
320
|
+
next if j.done?
|
251
321
|
next unless j.file(:progress).exists?
|
252
322
|
bar = Log::ProgressBar.new
|
253
323
|
bar.load(j.file(:progress).yaml)
|