rbbt-util 5.29.4 → 5.30.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc/orchestrate.rb +1 -1
- data/lib/rbbt/hpc/slurm.rb +21 -10
- data/lib/rbbt/persist/tsv/adapter.rb +44 -13
- data/lib/rbbt/tsv.rb +3 -2
- data/lib/rbbt/util/cmd.rb +6 -1
- data/lib/rbbt/util/misc/options.rb +0 -42
- data/lib/rbbt/util/procpath.rb +49 -0
- data/lib/rbbt/workflow.rb +1 -1
- data/lib/rbbt/workflow/step/accessor.rb +3 -4
- data/lib/rbbt/workflow/step/run.rb +2 -4
- data/lib/rbbt/workflow/usage.rb +1 -1
- data/lib/rbbt/workflow/util/archive.rb +1 -1
- data/lib/rbbt/workflow/util/provenance.rb +2 -1
- data/share/rbbt_commands/migrate +1 -1
- data/share/rbbt_commands/slurm/list +82 -8
- data/share/rbbt_commands/slurm/orchestrate +1 -0
- data/share/rbbt_commands/slurm/task +1 -0
- data/share/rbbt_commands/tsv/slice +3 -3
- data/share/rbbt_commands/workflow/info +1 -1
- data/share/rbbt_commands/workflow/task +18 -1
- data/share/rbbt_commands/workflow/write_info +52 -0
- data/test/rbbt/util/test_procpath.rb +23 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d45adac7949e3fea0d710418d93837cf0aa715bcedb2212c6e68f8dd749382ae
|
4
|
+
data.tar.gz: 9a9857c6b1565b9ed55f18f50fb1b242a1477dd6868111e10252ecc0c286ca44
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9dac4b1211fd40894f00b1a84f4f10abad83f2169579ec327a8da86f05b4274cc11e2192902638eb78b9de8f39351993f17a3b08dd5390a0f2c04ef385a0e1a2
|
7
|
+
data.tar.gz: 01b9d69b003088e78f2665b57e9ed8acc8104143bc826aabe49059e77fd0e20279d9393245f7bb20efcab807180273f31d2285a62113947cf151571a96f2d949
|
data/lib/rbbt/hpc/orchestrate.rb
CHANGED
@@ -83,7 +83,7 @@ module HPC
|
|
83
83
|
|
84
84
|
deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
|
85
85
|
if job.canfail_paths.include? dep.path
|
86
|
-
[deps].flatten.collect{|id| ['canfail', id] * ":"}
|
86
|
+
[deps].flatten.compact.collect{|id| ['canfail', id] * ":"}
|
87
87
|
else
|
88
88
|
deps
|
89
89
|
end
|
data/lib/rbbt/hpc/slurm.rb
CHANGED
@@ -33,7 +33,8 @@ module HPC
|
|
33
33
|
group = File.basename(File.dirname(ENV['HOME']))
|
34
34
|
|
35
35
|
if contain_and_sync
|
36
|
-
|
36
|
+
random_file = TmpFile.random_name
|
37
|
+
contain = "/scratch/tmp/rbbt-#{user}/#{random_file}" if contain.nil?
|
37
38
|
sync = "~/.rbbt/var/jobs" if sync.nil?
|
38
39
|
wipe_container = "post" if wipe_container.nil?
|
39
40
|
end
|
@@ -243,7 +244,7 @@ EOF
|
|
243
244
|
end
|
244
245
|
|
245
246
|
if contain
|
246
|
-
rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}
|
247
|
+
rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/workdir')
|
247
248
|
end
|
248
249
|
end
|
249
250
|
|
@@ -251,6 +252,10 @@ EOF
|
|
251
252
|
cmd =<<-EOF
|
252
253
|
#{exec_cmd} \\
|
253
254
|
#{rbbt_cmd}
|
255
|
+
EOF
|
256
|
+
annotate_cmd =<<-EOF
|
257
|
+
#{exec_cmd} \\
|
258
|
+
workflow write_info --recursive --force=false --check_pid "$step_path" slurm_job $SLURM_JOB_ID
|
254
259
|
EOF
|
255
260
|
|
256
261
|
header +=<<-EOF
|
@@ -260,11 +265,14 @@ EOF
|
|
260
265
|
run +=<<-EOF
|
261
266
|
|
262
267
|
# Run command
|
263
|
-
#{cmd}
|
268
|
+
step_path=$(#{cmd})
|
264
269
|
|
265
270
|
# Save exit status
|
266
271
|
exit_status=$?
|
267
272
|
|
273
|
+
# Annotate info with SLURM job_info
|
274
|
+
#{annotate_cmd}
|
275
|
+
|
268
276
|
EOF
|
269
277
|
|
270
278
|
# CODA
|
@@ -286,7 +294,7 @@ EOF
|
|
286
294
|
sync = sync.strip
|
287
295
|
source = File.join(File.expand_path(contain), source)
|
288
296
|
else
|
289
|
-
source = File.join(File.expand_path(contain), '
|
297
|
+
source = File.join(File.expand_path(contain), 'workdir/var/jobs')
|
290
298
|
end
|
291
299
|
|
292
300
|
target = File.expand_path(sync)
|
@@ -516,7 +524,11 @@ EOF
|
|
516
524
|
dry_run = options.delete :dry_run
|
517
525
|
tail = options.delete :tail
|
518
526
|
dependencies = options.delete :slurm_dependencies
|
527
|
+
procpath = options.delete :SLURM_procpath
|
528
|
+
|
519
529
|
options[:jobname] = job.clean_name
|
530
|
+
log_level = options.delete :log
|
531
|
+
log_level ||= Log.severity
|
520
532
|
|
521
533
|
workflow = job.workflow
|
522
534
|
|
@@ -541,15 +553,14 @@ EOF
|
|
541
553
|
inputs_dir = File.join(tmp_directory, 'inputs_dir')
|
542
554
|
saved = Step.save_job_inputs(job, inputs_dir)
|
543
555
|
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
else
|
548
|
-
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
|
549
|
-
end
|
556
|
+
cmd = ['workflow', 'task', workflow.to_s, task.to_s, '--printpath', '--log', log_level.to_s]
|
557
|
+
|
558
|
+
cmd << "--procpath_performance='#{tmp_directory}/procpath##{procpath.gsub(',', '#')}'" if procpath
|
550
559
|
|
551
560
|
cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
|
552
561
|
|
562
|
+
cmd << "--load_inputs='#{inputs_dir}'" if saved && saved.any?
|
563
|
+
|
553
564
|
template = self.template(cmd, options)
|
554
565
|
jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
|
555
566
|
|
@@ -104,9 +104,6 @@ module Persist
|
|
104
104
|
write(true) if closed? || ! write?
|
105
105
|
res = begin
|
106
106
|
yield
|
107
|
-
rescue Exception
|
108
|
-
Log.exception $!
|
109
|
-
raise $!
|
110
107
|
ensure
|
111
108
|
close
|
112
109
|
end
|
@@ -115,7 +112,6 @@ module Persist
|
|
115
112
|
end
|
116
113
|
|
117
114
|
def read_and_close
|
118
|
-
#return yield if @locked
|
119
115
|
if read? || write?
|
120
116
|
begin
|
121
117
|
return yield
|
@@ -134,6 +130,41 @@ module Persist
|
|
134
130
|
end
|
135
131
|
end
|
136
132
|
|
133
|
+
def read_lock
|
134
|
+
read if closed?
|
135
|
+
if read?
|
136
|
+
return yield
|
137
|
+
end
|
138
|
+
|
139
|
+
lock do
|
140
|
+
close
|
141
|
+
read true
|
142
|
+
begin
|
143
|
+
yield
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
def write_lock
|
149
|
+
write if closed?
|
150
|
+
if write?
|
151
|
+
begin
|
152
|
+
return yield
|
153
|
+
ensure
|
154
|
+
close
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
lock do
|
159
|
+
close
|
160
|
+
write true
|
161
|
+
begin
|
162
|
+
yield
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
|
137
168
|
def merge!(hash)
|
138
169
|
hash.each do |key,values|
|
139
170
|
self[key] = values
|
@@ -141,38 +172,38 @@ module Persist
|
|
141
172
|
end
|
142
173
|
|
143
174
|
def range(*args)
|
144
|
-
self.
|
175
|
+
self.read_lock do
|
145
176
|
super(*args)
|
146
177
|
end
|
147
178
|
end
|
148
179
|
|
149
180
|
def include?(*args)
|
150
|
-
self.
|
181
|
+
self.read_lock do
|
151
182
|
super(*args) #- TSV::ENTRY_KEYS.to_a
|
152
183
|
end
|
153
184
|
end
|
154
185
|
|
155
186
|
def [](*args)
|
156
|
-
self.
|
187
|
+
self.read_lock do
|
157
188
|
super(*args) #- TSV::ENTRY_KEYS.to_a
|
158
189
|
end
|
159
190
|
end
|
160
191
|
|
161
192
|
def []=(*args)
|
162
|
-
self.
|
193
|
+
self.write_lock do
|
163
194
|
super(*args) #- TSV::ENTRY_KEYS.to_a
|
164
195
|
end
|
165
196
|
end
|
166
197
|
|
167
198
|
def keys(*args)
|
168
|
-
self.
|
199
|
+
self.read_lock do
|
169
200
|
super(*args)
|
170
201
|
end
|
171
202
|
end
|
172
203
|
|
173
204
|
|
174
205
|
def prefix(key)
|
175
|
-
self.
|
206
|
+
self.read_lock do
|
176
207
|
range(key, 1, key + MAX_CHAR, 1)
|
177
208
|
end
|
178
209
|
end
|
@@ -184,13 +215,13 @@ module Persist
|
|
184
215
|
|
185
216
|
|
186
217
|
def size(*args)
|
187
|
-
self.
|
218
|
+
self.read_lock do
|
188
219
|
super(*args)
|
189
220
|
end
|
190
221
|
end
|
191
222
|
|
192
223
|
def each(*args, &block)
|
193
|
-
self.
|
224
|
+
self.read_lock do
|
194
225
|
super(*args, &block)
|
195
226
|
end
|
196
227
|
end
|
@@ -208,7 +239,7 @@ module Persist
|
|
208
239
|
end
|
209
240
|
|
210
241
|
def values_at(*keys)
|
211
|
-
self.
|
242
|
+
self.read_lock do
|
212
243
|
keys.collect do |k|
|
213
244
|
self[k]
|
214
245
|
end
|
data/lib/rbbt/tsv.rb
CHANGED
@@ -113,11 +113,12 @@ module TSV
|
|
113
113
|
|
114
114
|
data.entity_options = entity_options
|
115
115
|
|
116
|
-
if Path === source
|
117
|
-
|
116
|
+
if Path === source && data.identifiers
|
117
|
+
Path.setup(data.identifiers, source.pkgdir, source.resource)
|
118
118
|
end
|
119
119
|
|
120
120
|
if data.respond_to? :persistence_path
|
121
|
+
data.read
|
121
122
|
data
|
122
123
|
else
|
123
124
|
h = data.dup
|
data/lib/rbbt/util/cmd.rb
CHANGED
@@ -217,7 +217,7 @@ module CMD
|
|
217
217
|
end
|
218
218
|
end
|
219
219
|
|
220
|
-
def self.
|
220
|
+
def self.cmd_pid(*args)
|
221
221
|
all_args = *args
|
222
222
|
|
223
223
|
all_args << {} unless Hash === all_args.last
|
@@ -248,4 +248,9 @@ module CMD
|
|
248
248
|
nil
|
249
249
|
end
|
250
250
|
|
251
|
+
def self.cmd_log(*args)
|
252
|
+
cmd_pid(*args)
|
253
|
+
nil
|
254
|
+
end
|
255
|
+
|
251
256
|
end
|
@@ -242,48 +242,6 @@ module Misc
|
|
242
242
|
|
243
243
|
return options
|
244
244
|
|
245
|
-
options = {}
|
246
|
-
string.split(/#/).each do |str|
|
247
|
-
if str.match(/(.*)=(.*)/)
|
248
|
-
option, value = $1, $2
|
249
|
-
else
|
250
|
-
option, value = str, true
|
251
|
-
end
|
252
|
-
|
253
|
-
option = option.sub(":",'').to_sym if option.chars.first == ':'
|
254
|
-
value = value.sub(":",'').to_sym if String === value and value.chars.first == ':'
|
255
|
-
|
256
|
-
if value == true
|
257
|
-
options[option] = option.to_s.chars.first != '!'
|
258
|
-
else
|
259
|
-
options[option] = Thread.start do
|
260
|
-
$SAFE = 0;
|
261
|
-
case
|
262
|
-
when value =~ /^(?:true|T)$/i
|
263
|
-
true
|
264
|
-
when value =~ /^(?:false|F)$/i
|
265
|
-
false
|
266
|
-
when Symbol === value
|
267
|
-
value
|
268
|
-
when (String === value and value =~ /^\/(.*)\/$/)
|
269
|
-
Regexp.new /#{$1}/
|
270
|
-
else
|
271
|
-
begin
|
272
|
-
Kernel.const_get value
|
273
|
-
rescue
|
274
|
-
begin
|
275
|
-
raise if value =~ /[a-z]/ and defined? value
|
276
|
-
eval(value)
|
277
|
-
rescue Exception
|
278
|
-
value
|
279
|
-
end
|
280
|
-
end
|
281
|
-
end
|
282
|
-
end.value
|
283
|
-
end
|
284
|
-
end
|
285
|
-
|
286
|
-
options
|
287
245
|
end
|
288
246
|
|
289
247
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rbbt/util/cmd'
|
2
|
+
module ProcPath
|
3
|
+
CMD.tool :procpath do
|
4
|
+
'pip install procpath'
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.record(pid, path, options = {})
|
8
|
+
IndiferentHash.setup(options)
|
9
|
+
options = Misc.add_defaults options, "interval" => 30
|
10
|
+
|
11
|
+
cmd_options = %w(interval recnum reevalnum).inject({}){|acc,k| acc[k] = options[k]; acc}
|
12
|
+
|
13
|
+
Log.debug "ProcPath recording #{pid} in #{path} (#{Misc.fingerprint options})"
|
14
|
+
procpath_thread = Thread.new do
|
15
|
+
begin
|
16
|
+
procpath_pid = CMD.cmd_pid(:procpath, "record --database-file '#{path}' '$..children[?(@.stat.pid == #{pid})]'", cmd_options.merge(:nofail => true, :add_option_dashes => true))
|
17
|
+
rescue Exception
|
18
|
+
Log.exceptions $!
|
19
|
+
Process.kill "INT", procpath_pid
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
procpath_thread.report_on_exception = false
|
24
|
+
|
25
|
+
Process.wait pid.to_i
|
26
|
+
procpath_thread.raise Interrupt
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.plot(path, output, options = {})
|
30
|
+
IndiferentHash.setup(options)
|
31
|
+
options = Misc.add_defaults options, "query-name" => 'rss', 'epsilon' => 0.5, "moving-average-window" => 10
|
32
|
+
|
33
|
+
cmd_options = %w(query-name epsilon monitor-average-window title logarithmic after before custom-query-file custom-value-expr).inject({}){|acc,k| acc[k] = options[k]; acc}
|
34
|
+
CMD.cmd_log(:procpath, "plot --database-file '#{path}' --plot-file '#{output}' ", cmd_options.merge(:nofail => true, :add_option_dashes => true))
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.monitor(pid, path)
|
38
|
+
database, options_str = path.split("#")
|
39
|
+
options = options_str.nil? ? {} : Misc.string2hash(options_str)
|
40
|
+
|
41
|
+
database = File.expand_path database
|
42
|
+
Log.low "ProcPath monitor #{pid} in #{database} (#{Misc.fingerprint options})"
|
43
|
+
|
44
|
+
ProcPath.record(pid, database + '.sqlite3', options)
|
45
|
+
ProcPath.plot(database + '.sqlite3', database + '.cpu.svg', options.merge("query-name" => 'cpu'))
|
46
|
+
ProcPath.plot(database + '.sqlite3', database + '.rss.svg', options.merge("query-name" => 'rss'))
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
data/lib/rbbt/workflow.rb
CHANGED
@@ -190,7 +190,7 @@ module Workflow
|
|
190
190
|
return Misc.string2const Misc.camel_case(wf_name)
|
191
191
|
end
|
192
192
|
|
193
|
-
Log.
|
193
|
+
Log.high{"Loading workflow #{wf_name}"}
|
194
194
|
require_local_workflow(wf_name) or
|
195
195
|
(Workflow.autoinstall and `rbbt workflow install #{Misc.snake_case(wf_name)} || rbbt workflow install #{wf_name}` and require_local_workflow(wf_name)) or raise("Workflow not found or could not be loaded: #{ wf_name }")
|
196
196
|
begin
|
@@ -505,8 +505,8 @@ class Step
|
|
505
505
|
|
506
506
|
def running?
|
507
507
|
return false if ! (started? || status == :ending)
|
508
|
-
|
509
|
-
|
508
|
+
return nil unless Open.exist?(self.pid_file)
|
509
|
+
pid = Open.read(self.pid_file).to_i
|
510
510
|
|
511
511
|
return false if done? or error? or aborted?
|
512
512
|
|
@@ -530,8 +530,7 @@ class Step
|
|
530
530
|
end
|
531
531
|
|
532
532
|
def nopid?
|
533
|
-
|
534
|
-
! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
533
|
+
! Open.exists?(pid_file) && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
535
534
|
end
|
536
535
|
|
537
536
|
def aborted?
|
@@ -373,7 +373,6 @@ class Step
|
|
373
373
|
Log.exception $!
|
374
374
|
ensure
|
375
375
|
Step.purge_stream_cache
|
376
|
-
set_info :pid, nil
|
377
376
|
Open.rm pid_file if Open.exist?(pid_file)
|
378
377
|
end
|
379
378
|
end
|
@@ -388,7 +387,6 @@ class Step
|
|
388
387
|
_clean_finished
|
389
388
|
rescue
|
390
389
|
stop_dependencies
|
391
|
-
set_info :pid, nil
|
392
390
|
Open.rm pid_file if Open.exist?(pid_file)
|
393
391
|
end
|
394
392
|
end
|
@@ -449,7 +447,7 @@ class Step
|
|
449
447
|
ensure
|
450
448
|
no_load = false unless IO === result
|
451
449
|
Open.rm pid_file if Open.exist?(pid_file) unless no_load
|
452
|
-
set_info :pid, nil unless no_load
|
450
|
+
#set_info :pid, nil unless no_load
|
453
451
|
end
|
454
452
|
end
|
455
453
|
|
@@ -559,7 +557,7 @@ class Step
|
|
559
557
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
560
558
|
Kernel.exit! -1
|
561
559
|
end
|
562
|
-
set_info :pid, nil
|
560
|
+
#set_info :pid, nil
|
563
561
|
ensure
|
564
562
|
RbbtSemaphore.post_semaphore(semaphore) if semaphore
|
565
563
|
end
|
data/lib/rbbt/workflow/usage.rb
CHANGED
@@ -57,7 +57,7 @@ module Task
|
|
57
57
|
puts Log.color(:magenta, "Input select options")
|
58
58
|
puts
|
59
59
|
selects.collect{|p| p}.uniq.each do |input,options|
|
60
|
-
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| o.to_s} * ", ") << "\n"
|
60
|
+
puts Log.color(:blue, input.to_s + ": ") << Misc.format_paragraph(options.collect{|o| Array === o ? o.first.to_s : o.to_s} * ", ") << "\n"
|
61
61
|
puts unless Log.compact
|
62
62
|
end
|
63
63
|
puts
|
@@ -78,6 +78,7 @@ class Step
|
|
78
78
|
name = info[:name] || File.basename(path)
|
79
79
|
status = :unsync if status == :done and not Open.exist?(path)
|
80
80
|
status = :notfound if status == :noinfo and not Open.exist?(path)
|
81
|
+
|
81
82
|
str = " " * offset
|
82
83
|
str << prov_report_msg(status, name, path, info)
|
83
84
|
step.dependencies.reverse.each do |dep|
|
@@ -90,7 +91,7 @@ class Step
|
|
90
91
|
if expand_repeats
|
91
92
|
str << Log.color(:green, Log.uncolor(prov_report(dep, offset+1, task)))
|
92
93
|
else
|
93
|
-
str << Log.color(:green, " " * (offset + 1) + Log.uncolor(prov_report_msg(status, name, path, info)))
|
94
|
+
str << Log.color(:green, " " * (offset + 1) + Log.uncolor(prov_report_msg(dep.status, dep.info[:name], dep.path, dep.info)))
|
94
95
|
end
|
95
96
|
end
|
96
97
|
end if step.dependencies
|
data/share/rbbt_commands/migrate
CHANGED
@@ -21,6 +21,8 @@ $ rbbt mnl [options]
|
|
21
21
|
-s--search* Regular expression
|
22
22
|
-t--tail* Show the last lines of the STDERR
|
23
23
|
-SBP--sbatch_parameters show sbatch parameters
|
24
|
+
-PERF--procpath_performance show Procpath performance summary
|
25
|
+
-sacct--sacct_peformance show sacct performance summary
|
24
26
|
EOF
|
25
27
|
|
26
28
|
if options[:help]
|
@@ -69,7 +71,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
69
71
|
end
|
70
72
|
|
71
73
|
if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
|
72
|
-
exe = m[1]
|
74
|
+
exe = m[1].sub('step_path=$(','')
|
73
75
|
else
|
74
76
|
exe = nil
|
75
77
|
end
|
@@ -96,15 +98,24 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
96
98
|
if File.exists?(fstatus = File.join(dir, 'job.status'))
|
97
99
|
nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
|
98
100
|
elsif job_nodes[id]
|
99
|
-
nodes = job_nodes[id]
|
101
|
+
nodes = job_nodes[id].reject{|n| n.include? "("}
|
100
102
|
else
|
101
103
|
nodes = []
|
102
104
|
end
|
103
105
|
|
106
|
+
if File.exists?(File.join(dir, 'exit.status'))
|
107
|
+
now = File.ctime(File.join(dir, 'exit.status'))
|
108
|
+
else
|
109
|
+
now = Time.now
|
110
|
+
end
|
111
|
+
|
104
112
|
if File.exists?(File.join(dir, 'std.out'))
|
113
|
+
cerrt = File.ctime File.join(dir, 'std.err')
|
114
|
+
coutt = File.ctime File.join(dir, 'std.out')
|
105
115
|
outt = File.mtime File.join(dir, 'std.out')
|
106
116
|
errt = File.mtime File.join(dir, 'std.err')
|
107
|
-
time_diff =
|
117
|
+
time_diff = now - [outt, errt].max
|
118
|
+
time_elapsed = now - [cerrt, coutt].min
|
108
119
|
end
|
109
120
|
|
110
121
|
fdep = File.join(dir, 'dependencies.list')
|
@@ -113,14 +124,19 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
113
124
|
fcadep = File.join(dir, 'canfail_dependencies.list')
|
114
125
|
cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
|
115
126
|
|
116
|
-
if done || error || aborted || running || queued || jobid
|
127
|
+
if done || error || aborted || running || queued || jobid
|
117
128
|
select = false
|
118
129
|
select = true if done && exit_status == 0
|
119
130
|
select = true if error && exit_status && exit_status != 0
|
120
131
|
select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
|
121
|
-
|
122
|
-
select = true if
|
132
|
+
is_running = exit_status.nil? && running_jobs.include?(id) && (!deps || (running_jobs & deps).empty?)
|
133
|
+
select = true if queued && deps && (running_jobs & deps).any? || queued && is_running && nodes.empty?
|
134
|
+
select = true if running && nodes.any? && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
|
123
135
|
select = true if jobid && jobid.split(",").include?(id)
|
136
|
+
select &= search && cmd.match(/#{search}/) if search
|
137
|
+
next unless select
|
138
|
+
elsif search
|
139
|
+
select = false
|
124
140
|
select = true if search && cmd.match(/#{search}/)
|
125
141
|
next unless select
|
126
142
|
end
|
@@ -128,6 +144,7 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
128
144
|
|
129
145
|
puts Log.color :blue, dir
|
130
146
|
puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
|
147
|
+
puts Log.color(:magenta, "Started: ") << File.ctime(File.join(dir, 'std.err')).to_s if File.exist?(File.join(dir, 'std.err'))
|
131
148
|
puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
|
132
149
|
puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
|
133
150
|
puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
|
@@ -136,13 +153,70 @@ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
|
|
136
153
|
puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
|
137
154
|
puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
|
138
155
|
puts Log.color(:magenta, "Nodes: ") << nodes * ", "
|
139
|
-
puts Log.color(:magenta, "
|
156
|
+
puts Log.color(:magenta, "Time elapsed: ") << Misc.format_seconds(time_elapsed) if time_elapsed
|
157
|
+
puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? || File.exists?(File.join(dir, 'exit.status')) ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
|
140
158
|
|
141
159
|
if options[:sbatch_parameters]
|
142
160
|
puts Log.color(:magenta, "SBATCH parameters: ")
|
143
|
-
|
161
|
+
text = CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
|
162
|
+
lines = text.split("\n").collect{|line| header, _sep, value = line.partition(/\s+/); Log.color(:yellow, header + ": ") + value}
|
163
|
+
puts Log.color :yellow, lines * "\n"
|
144
164
|
end
|
145
165
|
|
166
|
+
fprocpath = File.join(dir, 'procpath.sqlite3')
|
167
|
+
if options[:procpath_performance] && Open.exists?(fprocpath)
|
168
|
+
puts Log.color(:magenta, "Procpath summary: ")
|
169
|
+
require 'rbbt/tsv/csv'
|
170
|
+
meta = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from meta;' "))
|
171
|
+
perf = TSV.csv(CMD.cmd("sqlite3 -header -csv #{fprocpath} 'select * from record;' "))
|
172
|
+
|
173
|
+
page_size = meta["page_size"].first.to_f
|
174
|
+
clock_ticks = meta["clock_ticks"].first.to_f
|
175
|
+
|
176
|
+
cpu_average = {}
|
177
|
+
rss_average = {}
|
178
|
+
perf.through :key, ["ts", 'stat_pid', "stat_utime", "stat_stime", "stat_cutime", "stat_cstime", "stat_rss"] do |k, values|
|
179
|
+
time, stat_pid, ucpu, scpu, ccpu, cscpu, rss = values
|
180
|
+
time = time.to_f
|
181
|
+
|
182
|
+
cpu = Misc.sum([ucpu, scpu].collect{|v| v.to_f})
|
183
|
+
cpu_average[stat_pid] ||= {}
|
184
|
+
cpu_average[stat_pid][time] ||= []
|
185
|
+
cpu_average[stat_pid][time] << cpu.to_f
|
186
|
+
rss_average[time] ||= []
|
187
|
+
rss_average[time] << rss.to_f * page_size
|
188
|
+
end
|
189
|
+
|
190
|
+
ticks = 0
|
191
|
+
cpu_average.each do |stat_pid, cpu_average_pid|
|
192
|
+
start = cpu_average_pid.keys.sort.first
|
193
|
+
eend = cpu_average_pid.keys.sort.last
|
194
|
+
ticks += Misc.sum(cpu_average_pid[eend]) - Misc.sum(cpu_average_pid[start])
|
195
|
+
end
|
196
|
+
start = rss_average.keys.sort.first
|
197
|
+
eend = rss_average.keys.sort.last
|
198
|
+
time_elapsed = eend - start
|
199
|
+
ticks = 1 if ticks == 0
|
200
|
+
time_elapsed = 1 if time_elapsed == 0
|
201
|
+
puts Log.color(:yellow, "CPU average: ") + "%.2f" % ( ticks / clock_ticks / time_elapsed * 100).to_s
|
202
|
+
puts Log.color(:yellow, "RSS average: ") + "%.2f GB" % Misc.mean(rss_average.collect{|t,l| Misc.sum(l) / (1024 * 1024 * 1024)}).to_s
|
203
|
+
puts Log.color(:yellow, "Time: ") + Misc.format_seconds((eend - start))
|
204
|
+
|
205
|
+
end
|
206
|
+
|
207
|
+
if options[:sacct_peformance]
|
208
|
+
begin
|
209
|
+
tsv = TSV.open(CMD.cmd("sacct -j #{id} -o 'jobid,AveRSS,MaxRSS,MaxDiskRead,MaxDiskWrite' -P|grep 'JobID\\|\.batch'"), :header_hash => '', :sep => "|", :type => :list)
|
210
|
+
values = tsv[tsv.keys.first]
|
211
|
+
if values.compact.any?
|
212
|
+
puts Log.color(:magenta, "SACCT performance: ")
|
213
|
+
puts values.zip(values.fields).collect{|v,t| Log.color(:yellow, t + ": ") + v.to_s } * "\n"
|
214
|
+
end
|
215
|
+
rescue
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
|
146
220
|
if tail && File.exists?(File.join(dir, 'std.err'))
|
147
221
|
if exit_status && exit_status != 0
|
148
222
|
puts Log.color(:magenta, "First error or exception found: ")
|
@@ -21,6 +21,7 @@ $slurm_options = SOPT.get <<EOF
|
|
21
21
|
-CS--contain_and_sync Contain and sync to default locations
|
22
22
|
-ci--copy_image When using a container directory, copy image there
|
23
23
|
-t--tail Tail the logs
|
24
|
+
-SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
|
24
25
|
-q--queue* Queue
|
25
26
|
-t--task_cpus* Tasks
|
26
27
|
-W--workflows* Additional workflows
|
@@ -20,6 +20,7 @@ $slurm_options = SOPT.get <<EOF
|
|
20
20
|
-CS--contain_and_sync Contain and sync to default locations
|
21
21
|
-ci--copy_image When using a container directory, copy image there
|
22
22
|
-t--tail Tail the logs
|
23
|
+
-SPERF--SLURM_procpath* Save Procpath performance for SLURM job; specify only options
|
23
24
|
-q--queue* Queue
|
24
25
|
-t--task_cpus* Tasks
|
25
26
|
-W--workflows* Additional workflows
|
@@ -35,7 +35,7 @@ file = case file
|
|
35
35
|
fields = options[:fields]
|
36
36
|
raise ParameterException, "Please specify the fields to slice" if fields.nil?
|
37
37
|
|
38
|
-
options[:header_hash]
|
38
|
+
options[:header_hash] ||= options["header_hash"]
|
39
39
|
|
40
40
|
case
|
41
41
|
when options[:tokyocabinet]
|
@@ -45,8 +45,8 @@ when options[:tokyocabinet_bd]
|
|
45
45
|
tsv = Persist.open_tokyocabinet(file, false, nil, TokyoCabinet::BDB)
|
46
46
|
puts tsv.summary
|
47
47
|
else
|
48
|
-
stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :
|
49
|
-
|
48
|
+
stream = TSV.traverse file, options.merge(:into => :stream, :type => :list, :fields => fields.split(","), :unnamed => true) do |k,fields,names|
|
49
|
+
[k,fields].flatten * "\t"
|
50
50
|
end
|
51
51
|
puts stream.read
|
52
52
|
exit 0
|
@@ -86,7 +86,7 @@ messages = info[:messages]
|
|
86
86
|
backtrace = info[:backtrace]
|
87
87
|
pid = info[:pid]
|
88
88
|
exception = info[:exception]
|
89
|
-
rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :
|
89
|
+
rest = info.keys - [:inputs, :dependencies, :status, :time_elapsed, :messages, :backtrace, :exception, :archived_info]
|
90
90
|
|
91
91
|
|
92
92
|
puts Log.color(:magenta, "File") << ": " << step.path
|
@@ -203,6 +203,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
|
|
203
203
|
-prec--prepare_cpus* Number of dependencies prepared in parallel
|
204
204
|
-rwt--remote_workflow_tasks* Load a yaml file describing remote workflow tasks
|
205
205
|
-od--override_deps* Override deps using 'Workflow#task=<path>' array_separated
|
206
|
+
-PERF--procpath_performance* Measure performance using procpath
|
206
207
|
EOF
|
207
208
|
|
208
209
|
workflow = ARGV.shift
|
@@ -407,6 +408,23 @@ begin
|
|
407
408
|
exit 0
|
408
409
|
end
|
409
410
|
|
411
|
+
if options[:procpath_performance]
|
412
|
+
require 'rbbt/util/procpath'
|
413
|
+
current_pid = job.info[:pid]
|
414
|
+
job.fork
|
415
|
+
job.soft_grace
|
416
|
+
sleep 2 if job.info[:pid] == current_pid
|
417
|
+
if job.info[:pid] != current_pid
|
418
|
+
pid = job.info[:pid]
|
419
|
+
begin
|
420
|
+
ProcPath.monitor(pid, options[:procpath_performance])
|
421
|
+
rescue Errno::ECHILD
|
422
|
+
Log.warn "Procpath didn't find process #{pid} to monitor. Maybe it finished already"
|
423
|
+
rescue
|
424
|
+
Log.warn "Procpath failed: #{$!.message}"
|
425
|
+
end
|
426
|
+
end
|
427
|
+
end
|
410
428
|
|
411
429
|
if do_fork
|
412
430
|
ENV["RBBT_NO_PROGRESS"] = "true"
|
@@ -423,7 +441,6 @@ begin
|
|
423
441
|
res = job
|
424
442
|
end
|
425
443
|
|
426
|
-
|
427
444
|
if options.delete(:printpath)
|
428
445
|
job.join
|
429
446
|
raise job.messages.last if (job.error? || job.aborted?) && job.messages
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rbbt/workflow'
|
4
|
+
|
5
|
+
require 'rbbt-util'
|
6
|
+
require 'rbbt-util'
|
7
|
+
require 'rbbt/util/simpleopt'
|
8
|
+
|
9
|
+
$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
|
10
|
+
|
11
|
+
options = SOPT.setup <<EOF
|
12
|
+
Examine the info of a job result
|
13
|
+
|
14
|
+
$ rbbt workflow info <job-result> <key> <value>
|
15
|
+
|
16
|
+
-h--help Help
|
17
|
+
-f--force Write info even if key is already present
|
18
|
+
-r--recursive Write info for all dependencies as well
|
19
|
+
-p--check_pid Check that recursive jobs where created by the same process
|
20
|
+
EOF
|
21
|
+
|
22
|
+
SOPT.usage if options[:help]
|
23
|
+
|
24
|
+
file, key, value = ARGV
|
25
|
+
|
26
|
+
force, recursive, check_pid = options.values_at :force, :recursive, :check_pid
|
27
|
+
|
28
|
+
def get_step(file)
|
29
|
+
file = file.sub(/\.(info|files)/,'')
|
30
|
+
step = Workflow.load_step file
|
31
|
+
step
|
32
|
+
end
|
33
|
+
|
34
|
+
raise ParameterException if key.nil? || value.nil?
|
35
|
+
|
36
|
+
if %w(DELETE nil).include? value
|
37
|
+
value = nil
|
38
|
+
force = true
|
39
|
+
end
|
40
|
+
|
41
|
+
step = get_step file
|
42
|
+
|
43
|
+
step.set_info key, value if force || ! step.info.include?(key)
|
44
|
+
|
45
|
+
pid = step.info[:pid]
|
46
|
+
host = step.info[:pid_hostname]
|
47
|
+
|
48
|
+
step.rec_dependencies.each do |dep|
|
49
|
+
dep.set_info key, value if (force || ! dep.info.include?(key)) && (!check_pid || dep.info[:pid].to_s == pid and dep.info[:pid_hostname] == host)
|
50
|
+
rescue
|
51
|
+
Log.warn "Could no set info #{key} for #{dep.path}: #{$!.message}"
|
52
|
+
end if recursive
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/util/procpath'
|
3
|
+
|
4
|
+
class TestProcPath < Test::Unit::TestCase
|
5
|
+
def test_record_and_plot
|
6
|
+
Log.with_severity 0 do
|
7
|
+
pid = Process.fork do
|
8
|
+
a = ""
|
9
|
+
(0..1000).each do
|
10
|
+
a << (0..rand(10000).to_i).to_a.collect{|i| "TEST #{i}" } * " "
|
11
|
+
sleep 0.1
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
TmpFile.with_file(nil, false) do |db|
|
16
|
+
|
17
|
+
ProcPath.record(pid, db, :interval => '1', "recnum" => 100)
|
18
|
+
ProcPath.plot(db, db + '.svg', "moving-average-window" => 1 )
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-util
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.
|
4
|
+
version: 5.30.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-02-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -300,6 +300,7 @@ files:
|
|
300
300
|
- lib/rbbt/util/misc/system.rb
|
301
301
|
- lib/rbbt/util/named_array.rb
|
302
302
|
- lib/rbbt/util/open.rb
|
303
|
+
- lib/rbbt/util/procpath.rb
|
303
304
|
- lib/rbbt/util/python.rb
|
304
305
|
- lib/rbbt/util/semaphore.rb
|
305
306
|
- lib/rbbt/util/simpleDSL.rb
|
@@ -438,6 +439,7 @@ files:
|
|
438
439
|
- share/rbbt_commands/workflow/server
|
439
440
|
- share/rbbt_commands/workflow/task
|
440
441
|
- share/rbbt_commands/workflow/trace
|
442
|
+
- share/rbbt_commands/workflow/write_info
|
441
443
|
- share/unicorn.rb
|
442
444
|
- share/workflow_config.ru
|
443
445
|
- test/rbbt/annotations/test_util.rb
|
@@ -516,6 +518,7 @@ files:
|
|
516
518
|
- test/rbbt/util/test_log.rb
|
517
519
|
- test/rbbt/util/test_misc.rb
|
518
520
|
- test/rbbt/util/test_open.rb
|
521
|
+
- test/rbbt/util/test_procpath.rb
|
519
522
|
- test/rbbt/util/test_python.rb
|
520
523
|
- test/rbbt/util/test_semaphore.rb
|
521
524
|
- test/rbbt/util/test_simpleDSL.rb
|
@@ -563,6 +566,7 @@ test_files:
|
|
563
566
|
- test/rbbt/workflow/test_task.rb
|
564
567
|
- test/rbbt/resource/test_path.rb
|
565
568
|
- test/rbbt/util/test_colorize.rb
|
569
|
+
- test/rbbt/util/test_procpath.rb
|
566
570
|
- test/rbbt/util/misc/test_omics.rb
|
567
571
|
- test/rbbt/util/misc/test_pipes.rb
|
568
572
|
- test/rbbt/util/misc/test_format.rb
|