rbbt-util 5.29.2 → 5.29.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa648dbd0465a8c6a9a62820dc08a7f726dcd45cb541e7ef1c5e06b0e8343680
4
- data.tar.gz: f05f659bfcfe3eef10a1d4ba786d40a7f835a08711ca23af9b5b619ddd61eeda
3
+ metadata.gz: 3ec6302ccfe3f38a074f7f0d10511090c8f4db4186228ad93adb2888e0edbf5e
4
+ data.tar.gz: fcaa50b654461f128b9539fc47ed00b008d3f713e89b8bbe963c2b898c3c168b
5
5
  SHA512:
6
- metadata.gz: b44a34d7ad19eff67c7f8e6f3bfe0f55372e3d8a255247ff01d1aa858a904d2fcf73d45dd966702adf62bdbc3cb1b0dc3df643aaab747a922bfe3b747f975950
7
- data.tar.gz: 5e74bcc37585452e22d876211f0c02c8fc431dbbdcc8cc0d3ef537ea794b722e0af9fc4ae743a014a1384c432f8f280761c74e35a1a8aaf25150067675a9f449
6
+ metadata.gz: 2a2537aef150df77142a593742399d28bb96334decc0e33b69e9cbac2853085487100aa90cd711b342bac7eda536d15c080be22891d5b9f38bfa4601282ae5de
7
+ data.tar.gz: d79cc4afa294d63cebd79f73b759bc46c3c4e0285db7be7406a594aa49e5b572b0c4f91b98c73b5f64c31c908d58d8dee43853ec18b1ead9e814b4370de0d60d
@@ -60,9 +60,14 @@ module HPC
60
60
  return if job.done?
61
61
  return unless job.path.split("/")[-4] == "jobs"
62
62
  seen[:orchestration_target_job] ||= job
63
+
63
64
  options.delete "recursive_clean"
65
+ options.delete "clean_task"
66
+ options.delete "clean"
64
67
  options.delete "tail"
65
68
  options.delete "printfile"
69
+ options.delete "detach"
70
+
66
71
  rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
67
72
  rules ||= {}
68
73
  IndiferentHash.setup(rules)
@@ -75,7 +80,13 @@ module HPC
75
80
  skip_dep = job_rules["chain_tasks"] &&
76
81
  job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
77
82
  job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
78
- seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
83
+
84
+ deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
85
+ if job.canfail_paths.include? dep.path
86
+ [deps].flatten.collect{|id| ['canfail', id] * ":"}
87
+ else
88
+ deps
89
+ end
79
90
  end.flatten.compact.uniq
80
91
 
81
92
  skip = true if job_rules[:skip]
@@ -58,11 +58,11 @@ module HPC
58
58
  when FalseClass
59
59
  '--' << o << "=false"
60
60
  else
61
- ['--' << o, "'#{v}'"] * " "
61
+ ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
62
62
  end
63
63
  end * " "
64
64
 
65
- rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
65
+ rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
66
66
 
67
67
  time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
68
68
 
@@ -76,6 +76,7 @@ module HPC
76
76
  fjob = File.join(slurm_basedir, 'job.id')
77
77
  fexit = File.join(slurm_basedir, 'exit.status')
78
78
  fsync = File.join(slurm_basedir, 'sync.log')
79
+ fsyncexit = File.join(slurm_basedir, 'sync.status')
79
80
  fcmd = File.join(slurm_basedir, 'command.slurm')
80
81
 
81
82
  #{{{ GENERATE TEMPLATE
@@ -107,10 +108,6 @@ module HPC
107
108
  EOF
108
109
  end
109
110
 
110
- header +=<<-EOF
111
- #CMD: #{rbbt_cmd}
112
- EOF
113
-
114
111
  # ENV
115
112
  env = ""
116
113
  env +=<<-EOF
@@ -246,7 +243,7 @@ EOF
246
243
  end
247
244
 
248
245
  if contain
249
- rbbt_cmd << " " << %(--workdir_all='#{contain}')
246
+ rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/.rbbt/var/jobs')
250
247
  end
251
248
  end
252
249
 
@@ -256,6 +253,10 @@ EOF
256
253
  #{rbbt_cmd}
257
254
  EOF
258
255
 
256
+ header +=<<-EOF
257
+ #CMD: #{rbbt_cmd}
258
+ EOF
259
+
259
260
  run +=<<-EOF
260
261
 
261
262
  # Run command
@@ -273,10 +274,10 @@ EOF
273
274
  coda +=<<-EOF
274
275
  singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
275
276
  EOF
276
- else
277
- coda +=<<-EOF
278
- rbbt system clean all -q &>> #{fsync}
279
- EOF
277
+ # else
278
+ # coda +=<<-EOF
279
+ #rbbt system clean all -q &>> #{fsync}
280
+ #EOF
280
281
  end
281
282
 
282
283
  if sync.include?("=>")
@@ -295,6 +296,7 @@ EOF
295
296
  mkdir -p "$(dirname '#{target}')"
296
297
  rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
297
298
  sync_es="$?"
299
+ echo $sync_es > #{fsyncexit}
298
300
  find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
299
301
  EOF
300
302
 
@@ -320,23 +322,24 @@ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem
320
322
  EOF
321
323
  else
322
324
  coda +=<<-EOF
323
- #{exec_cmd} system clean
325
+ ##{exec_cmd} system clean
324
326
  if [ $exit_status == '0' -a $sync_es == '0' ]; then
325
327
  rm -Rfv #{contain} &>> #{fsync}
326
328
  else
327
329
  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
328
330
  fi
329
- unset sync_es
330
331
  EOF
331
332
 
332
333
  end
333
334
  end
334
335
  end
336
+
335
337
  coda +=<<-EOF
336
338
 
337
339
  # Write exit status to file
338
340
  echo $exit_status > #{fexit}
339
341
  EOF
342
+
340
343
  if sync
341
344
  coda +=<<-EOF
342
345
  if [ "$sync_es" == '0' ]; then
@@ -362,6 +365,10 @@ EOF
362
365
  slurm_basedir = options[:slurm_basedir]
363
366
  dependencies = options.delete :slurm_dependencies
364
367
  dependencies = [] if dependencies.nil?
368
+
369
+ canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
370
+ dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
371
+
365
372
  Open.mkdir slurm_basedir
366
373
 
367
374
  dry_run = options.delete :dry_run
@@ -370,6 +377,7 @@ EOF
370
377
  ferr = File.join(slurm_basedir, 'std.err')
371
378
  fjob = File.join(slurm_basedir, 'job.id')
372
379
  fdep = File.join(slurm_basedir, 'dependencies.list')
380
+ fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
373
381
  fexit = File.join(slurm_basedir, 'exit.status')
374
382
  fsync = File.join(slurm_basedir, 'sync.log')
375
383
  fcmd = File.join(slurm_basedir, 'command.slurm')
@@ -401,8 +409,21 @@ EOF
401
409
  Open.rm fexit
402
410
  Open.rm fout
403
411
  Open.rm ferr
412
+
404
413
  Open.write(fdep, dependencies * "\n") if dependencies.any?
405
- dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
414
+ Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
415
+
416
+
417
+ dep_str = '--dependency='
418
+ normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
419
+ canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
420
+
421
+ if normal_dep_str.nil? && canfail_dep_str.nil?
422
+ dep_str = ""
423
+ else
424
+ dep_str += [normal_dep_str, canfail_dep_str].compact * ","
425
+ end
426
+
406
427
  job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
407
428
  Log.debug "SBATCH job id: #{job}"
408
429
  Open.write(fjob, job.to_s)
@@ -527,7 +548,7 @@ EOF
527
548
  cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
528
549
  end
529
550
 
530
- cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
551
+ cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
531
552
 
532
553
  template = self.template(cmd, options)
533
554
  jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
@@ -110,6 +110,8 @@ module Persist
110
110
  def self.load_file(path, type)
111
111
  begin
112
112
  case (type || :marshal).to_sym
113
+ when :path
114
+ path
113
115
  when :nil
114
116
  nil
115
117
  when :boolean
@@ -167,6 +169,8 @@ module Persist
167
169
  end
168
170
 
169
171
  case (type || :marshal).to_sym
172
+ when :path
173
+ nil
170
174
  when :nil
171
175
  nil
172
176
  when :boolean
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
+ options = SOPT.setup <<EOF
9
+
10
+ Clean error or aborted jobs
11
+
12
+ $ rbbt mnl [options]
13
+
14
+ -h--help Print this help
15
+ -d--done Done jobs only
16
+ -e--error Error jobs only
17
+ -a--aborted SLURM aboted jobs
18
+ -j--job* Job ids
19
+ -s--search* Regular expression
20
+ -t--tail* Show the last lines of the STDERR
21
+ -SBP--sbatch_parameters show sbatch parameters
22
+ -dr--dry_run Do not erase anything
23
+ EOF
24
+
25
+ if options[:help]
26
+ if defined? rbbt_usage
27
+ rbbt_usage
28
+ else
29
+ puts SOPT.doc
30
+ end
31
+ exit 0
32
+ end
33
+
34
+ Log.severity = 4
35
+ done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run
36
+
37
+ workdir = File.expand_path('~/rbbt-slurm')
38
+ Path.setup(workdir)
39
+
40
+ running_jobs = begin
41
+ squeue_txt = CMD.cmd('squeue').read
42
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
43
+ rescue
44
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
45
+ squeue_txt = nil
46
+ $norunningjobs = true
47
+ []
48
+ end
49
+
50
+ if squeue_txt
51
+ job_nodes = {}
52
+ squeue_txt.split("\n").each do |line|
53
+ parts = line.strip.split(/\s+/)
54
+ job_nodes[parts.first] = parts.last.split(",")
55
+ end
56
+ else
57
+ job_nodes = nil
58
+ end
59
+
60
+ count = 0
61
+ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
62
+ dir = File.dirname(fcmd)
63
+
64
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
65
+ cmd = m[1]
66
+ else
67
+ cmd = nil
68
+ end
69
+
70
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
71
+ exe = m[1]
72
+ else
73
+ exe = nil
74
+ end
75
+
76
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
77
+ container_home = m[1]
78
+ else
79
+ container_home = nil
80
+ end
81
+
82
+
83
+ if File.exists?(fid = File.join(dir, 'job.id'))
84
+ id = Open.read(fid).chomp
85
+ else
86
+ id = nil
87
+ end
88
+
89
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
90
+ exit_status = Open.read(fstatus).to_i
91
+ else
92
+ exit_status = nil
93
+ end
94
+
95
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
96
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
97
+ elsif job_nodes[id]
98
+ nodes = job_nodes[id]
99
+ else
100
+ nodes = []
101
+ end
102
+
103
+ if File.exists?(File.join(dir, 'std.out'))
104
+ outt = File.mtime File.join(dir, 'std.out')
105
+ errt = File.mtime File.join(dir, 'std.err')
106
+ time_diff = Time.now - [outt, errt].max
107
+ end
108
+
109
+ fdep = File.join(dir, 'dependencies.list')
110
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
111
+
112
+ fcadep = File.join(dir, 'canfail_dependencies.list')
113
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
114
+
115
+ aborted = error = true if aborted.nil? && error.nil?
116
+ if done || error || aborted || running || queued || jobid || search
117
+ select = false
118
+ select = true if done && exit_status && exit_status.to_i == 0
119
+ select = true if error && exit_status && exit_status.to_i != 0
120
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
121
+ select = select && jobid.split(",").include?(id) if jobid
122
+ select = select && cmd.match(/#{search}/) if search
123
+ next unless select
124
+ end
125
+
126
+
127
+ puts Log.color(:yellow, "**ERASING**")
128
+ puts Log.color :blue, dir
129
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
130
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
131
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
132
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
133
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
134
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
135
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
136
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
137
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
138
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
139
+
140
+ if options[:sbatch_parameters]
141
+ puts Log.color(:magenta, "SBATCH parameters: ")
142
+ puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
143
+ end
144
+
145
+ if tail && File.exists?(File.join(dir, 'std.err'))
146
+ if exit_status && exit_status != 0
147
+ puts Log.color(:magenta, "First error or exception found: ")
148
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
149
+ elsif exit_status
150
+ puts Log.color(:magenta, "Completed jobs: ")
151
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
152
+ else
153
+ puts Log.color(:magenta, "Log tail: ")
154
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
155
+ end
156
+ end
157
+
158
+ count += 1
159
+
160
+ Open.rm_rf dir unless dry_run
161
+ end
162
+
163
+ puts
164
+ puts "Found #{count} jobs"
165
+
@@ -20,15 +20,16 @@ $ rbbt mnl [options]
20
20
  -j--job* Job ids
21
21
  -s--search* Regular expression
22
22
  -t--tail* Show the last lines of the STDERR
23
+ -SBP--sbatch_parameters show sbatch parameters
23
24
  EOF
24
25
 
25
26
  if options[:help]
26
- if defined? rbbt_usage
27
- rbbt_usage
28
- else
29
- puts SOPT.doc
30
- end
31
- exit 0
27
+ if defined? rbbt_usage
28
+ rbbt_usage
29
+ else
30
+ puts SOPT.doc
31
+ end
32
+ exit 0
32
33
  end
33
34
 
34
35
  Log.severity = 4
@@ -38,101 +39,124 @@ workdir = File.expand_path('~/rbbt-slurm')
38
39
  Path.setup(workdir)
39
40
 
40
41
  running_jobs = begin
41
- CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
42
+ squeue_txt = CMD.cmd('squeue').read
43
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
42
44
  rescue
43
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
- $norunningjobs = true
45
- []
45
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
46
+ squeue_txt = nil
47
+ $norunningjobs = true
48
+ []
46
49
  end
47
50
 
51
+ if squeue_txt
52
+ job_nodes = {}
53
+ squeue_txt.split("\n").each do |line|
54
+ parts = line.strip.split(/\s+/)
55
+ job_nodes[parts.first] = parts.last.split(",")
56
+ end
57
+ else
58
+ job_nodes = nil
59
+ end
60
+
48
61
  count = 0
49
62
  workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
- dir = File.dirname(fcmd)
51
-
52
- if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
- cmd = m[1]
54
- else
55
- cmd = nil
56
- end
57
-
58
- if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
- exe = m[1]
60
- else
61
- exe = nil
62
- end
63
-
64
- if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
- container_home = m[1]
66
- else
67
- container_home = nil
68
- end
69
-
70
-
71
- if File.exists?(fid = File.join(dir, 'job.id'))
72
- id = Open.read(fid).chomp
73
- else
74
- id = nil
75
- end
76
-
77
- if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
- exit_status = Open.read(fstatus).to_i
79
- else
80
- exit_status = nil
81
- end
82
-
83
- if File.exists?(fstatus = File.join(dir, 'job.status'))
84
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
- else
86
- nodes = []
87
- end
88
-
89
- if File.exists?(File.join(dir, 'std.out'))
90
- outt = File.mtime File.join(dir, 'std.out')
91
- errt = File.mtime File.join(dir, 'std.err')
92
- time_diff = Time.now - [outt, errt].max
93
- end
94
-
95
- fdep = File.join(dir, 'dependencies.list')
96
- deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
-
98
- if done || error || aborted || running || queued || jobid || search
99
- select = false
100
- select = true if done && exit_status == 0
101
- select = true if error && exit_status && exit_status != 0
102
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
- select = true if queued && deps && (running_jobs & deps).any?
104
- select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
- select = true if jobid && jobid.split(",").include?(id)
106
- select = true if search && cmd.match(/#{search}/)
107
- next unless select
108
- end
109
-
110
-
111
- puts Log.color :blue, dir
112
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
- puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
- puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
-
122
- if tail && File.exists?(File.join(dir, 'std.err'))
123
- if exit_status && exit_status != 0
124
- puts Log.color(:magenta, "First error or exception found: ")
125
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
- elsif exit_status
127
- puts Log.color(:magenta, "Completed jobs: ")
128
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
- else
130
- puts Log.color(:magenta, "Log tail: ")
131
- puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
- end
133
- end
134
-
135
- count += 1
63
+ dir = File.dirname(fcmd)
64
+
65
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
66
+ cmd = m[1]
67
+ else
68
+ cmd = nil
69
+ end
70
+
71
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
72
+ exe = m[1]
73
+ else
74
+ exe = nil
75
+ end
76
+
77
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
78
+ container_home = m[1]
79
+ else
80
+ container_home = nil
81
+ end
82
+
83
+
84
+ if File.exists?(fid = File.join(dir, 'job.id'))
85
+ id = Open.read(fid).chomp
86
+ else
87
+ id = nil
88
+ end
89
+
90
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
91
+ exit_status = Open.read(fstatus).to_i
92
+ else
93
+ exit_status = nil
94
+ end
95
+
96
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
97
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
98
+ elsif job_nodes[id]
99
+ nodes = job_nodes[id]
100
+ else
101
+ nodes = []
102
+ end
103
+
104
+ if File.exists?(File.join(dir, 'std.out'))
105
+ outt = File.mtime File.join(dir, 'std.out')
106
+ errt = File.mtime File.join(dir, 'std.err')
107
+ time_diff = Time.now - [outt, errt].max
108
+ end
109
+
110
+ fdep = File.join(dir, 'dependencies.list')
111
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
112
+
113
+ fcadep = File.join(dir, 'canfail_dependencies.list')
114
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
115
+
116
+ if done || error || aborted || running || queued || jobid || search
117
+ select = false
118
+ select = true if done && exit_status == 0
119
+ select = true if error && exit_status && exit_status != 0
120
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
121
+ select = true if queued && deps && (running_jobs & deps).any?
122
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
123
+ select = true if jobid && jobid.split(",").include?(id)
124
+ select = true if search && cmd.match(/#{search}/)
125
+ next unless select
126
+ end
127
+
128
+
129
+ puts Log.color :blue, dir
130
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
131
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
132
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
133
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
134
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
135
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
136
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
137
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
138
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
139
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
140
+
141
+ if options[:sbatch_parameters]
142
+ puts Log.color(:magenta, "SBATCH parameters: ")
143
+ puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
144
+ end
145
+
146
+ if tail && File.exists?(File.join(dir, 'std.err'))
147
+ if exit_status && exit_status != 0
148
+ puts Log.color(:magenta, "First error or exception found: ")
149
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
150
+ elsif exit_status
151
+ puts Log.color(:magenta, "Completed jobs: ")
152
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
153
+ else
154
+ puts Log.color(:magenta, "Log tail: ")
155
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
156
+ end
157
+ end
158
+
159
+ count += 1
136
160
 
137
161
  end
138
162
 
@@ -25,7 +25,7 @@ $slurm_options = SOPT.get <<EOF
25
25
  -t--task_cpus* Tasks
26
26
  -W--workflows* Additional workflows
27
27
  -tm--time* Time
28
- -R--orchestration_rules* Orchestration rules
28
+ -OR--orchestration_rules* Orchestration rules
29
29
  -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
30
  EOF
31
31
 
@@ -43,5 +43,5 @@ class Step
43
43
  end
44
44
  end
45
45
 
46
- ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
46
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
47
47
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
20
20
  puts
21
21
  if workflow.nil?
22
22
  puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
23
- exit -1
23
+ exit! -1
24
24
  end
25
25
 
26
26
  if task.nil?
@@ -206,7 +206,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
206
206
  EOF
207
207
 
208
208
  workflow = ARGV.shift
209
- usage and exit -1 if workflow.nil?
209
+ usage and exit! -1 if workflow.nil?
210
210
 
211
211
  task = ARGV.shift
212
212
 
@@ -232,7 +232,8 @@ else
232
232
  remote_workflows = {}
233
233
  end
234
234
 
235
- Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
235
+ #Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
236
+ Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
236
237
 
237
238
  workflow = Workflow.require_workflow workflow
238
239
 
@@ -486,7 +487,7 @@ rescue ParameterException
486
487
  puts
487
488
  report_options saved_job_options
488
489
  puts
489
- exit -1
490
+ exit! -1
490
491
  end
491
492
 
492
493
  if options.delete(:list_job_files)
@@ -538,7 +539,7 @@ when Step
538
539
  io.abort if io.respond_to? :abort
539
540
  io.join if io.respond_to? :join
540
541
  ensure
541
- exit -1
542
+ exit! -1
542
543
  end
543
544
  rescue Exception
544
545
  Log.exception $!
@@ -547,9 +548,11 @@ when Step
547
548
  io.abort if io.respond_to? :abort
548
549
  io.join if io.respond_to? :join
549
550
  ensure
550
- exit -1
551
+ exit! -1
551
552
  end
552
553
  end
554
+ elsif detach
555
+ exit! 0
553
556
  else
554
557
  res.join
555
558
  out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-util
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.29.2
4
+ version: 5.29.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-18 00:00:00.000000000 Z
11
+ date: 2021-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -378,6 +378,7 @@ files:
378
378
  - share/rbbt_commands/resource/produce
379
379
  - share/rbbt_commands/resource/read
380
380
  - share/rbbt_commands/rsync
381
+ - share/rbbt_commands/slurm/clean
381
382
  - share/rbbt_commands/slurm/list
382
383
  - share/rbbt_commands/slurm/orchestrate
383
384
  - share/rbbt_commands/slurm/task