rbbt-util 5.29.2 → 5.29.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa648dbd0465a8c6a9a62820dc08a7f726dcd45cb541e7ef1c5e06b0e8343680
4
- data.tar.gz: f05f659bfcfe3eef10a1d4ba786d40a7f835a08711ca23af9b5b619ddd61eeda
3
+ metadata.gz: 3ec6302ccfe3f38a074f7f0d10511090c8f4db4186228ad93adb2888e0edbf5e
4
+ data.tar.gz: fcaa50b654461f128b9539fc47ed00b008d3f713e89b8bbe963c2b898c3c168b
5
5
  SHA512:
6
- metadata.gz: b44a34d7ad19eff67c7f8e6f3bfe0f55372e3d8a255247ff01d1aa858a904d2fcf73d45dd966702adf62bdbc3cb1b0dc3df643aaab747a922bfe3b747f975950
7
- data.tar.gz: 5e74bcc37585452e22d876211f0c02c8fc431dbbdcc8cc0d3ef537ea794b722e0af9fc4ae743a014a1384c432f8f280761c74e35a1a8aaf25150067675a9f449
6
+ metadata.gz: 2a2537aef150df77142a593742399d28bb96334decc0e33b69e9cbac2853085487100aa90cd711b342bac7eda536d15c080be22891d5b9f38bfa4601282ae5de
7
+ data.tar.gz: d79cc4afa294d63cebd79f73b759bc46c3c4e0285db7be7406a594aa49e5b572b0c4f91b98c73b5f64c31c908d58d8dee43853ec18b1ead9e814b4370de0d60d
@@ -60,9 +60,14 @@ module HPC
60
60
  return if job.done?
61
61
  return unless job.path.split("/")[-4] == "jobs"
62
62
  seen[:orchestration_target_job] ||= job
63
+
63
64
  options.delete "recursive_clean"
65
+ options.delete "clean_task"
66
+ options.delete "clean"
64
67
  options.delete "tail"
65
68
  options.delete "printfile"
69
+ options.delete "detach"
70
+
66
71
  rules = YAML.load(Open.read(options[:orchestration_rules])) if options[:orchestration_rules]
67
72
  rules ||= {}
68
73
  IndiferentHash.setup(rules)
@@ -75,7 +80,13 @@ module HPC
75
80
  skip_dep = job_rules["chain_tasks"] &&
76
81
  job_rules["chain_tasks"][job.workflow.to_s] && job_rules["chain_tasks"][job.workflow.to_s].include?(job.task_name.to_s) &&
77
82
  job_rules["chain_tasks"][dep.workflow.to_s] && job_rules["chain_tasks"][dep.workflow.to_s].include?(dep.task_name.to_s)
78
- seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
83
+
84
+ deps = seen[dep.path] ||= self.orchestrate_job(dep, options, skip_dep, seen)
85
+ if job.canfail_paths.include? dep.path
86
+ [deps].flatten.collect{|id| ['canfail', id] * ":"}
87
+ else
88
+ deps
89
+ end
79
90
  end.flatten.compact.uniq
80
91
 
81
92
  skip = true if job_rules[:skip]
@@ -58,11 +58,11 @@ module HPC
58
58
  when FalseClass
59
59
  '--' << o << "=false"
60
60
  else
61
- ['--' << o, "'#{v}'"] * " "
61
+ ['--' << o, "'#{v.to_s.gsub("'", '\'')}'"] * " "
62
62
  end
63
63
  end * " "
64
64
 
65
- rbbt_cmd << " --config_keys='#{config_keys}'" if config_keys and not config_keys.empty?
65
+ rbbt_cmd << " --config_keys='#{config_keys.gsub("'", '\'')}'" if config_keys and not config_keys.empty?
66
66
 
67
67
  time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
68
68
 
@@ -76,6 +76,7 @@ module HPC
76
76
  fjob = File.join(slurm_basedir, 'job.id')
77
77
  fexit = File.join(slurm_basedir, 'exit.status')
78
78
  fsync = File.join(slurm_basedir, 'sync.log')
79
+ fsyncexit = File.join(slurm_basedir, 'sync.status')
79
80
  fcmd = File.join(slurm_basedir, 'command.slurm')
80
81
 
81
82
  #{{{ GENERATE TEMPLATE
@@ -107,10 +108,6 @@ module HPC
107
108
  EOF
108
109
  end
109
110
 
110
- header +=<<-EOF
111
- #CMD: #{rbbt_cmd}
112
- EOF
113
-
114
111
  # ENV
115
112
  env = ""
116
113
  env +=<<-EOF
@@ -246,7 +243,7 @@ EOF
246
243
  end
247
244
 
248
245
  if contain
249
- rbbt_cmd << " " << %(--workdir_all='#{contain}')
246
+ rbbt_cmd << " " << %(--workdir_all='#{contain.gsub("'", '\'')}/.rbbt/var/jobs')
250
247
  end
251
248
  end
252
249
 
@@ -256,6 +253,10 @@ EOF
256
253
  #{rbbt_cmd}
257
254
  EOF
258
255
 
256
+ header +=<<-EOF
257
+ #CMD: #{rbbt_cmd}
258
+ EOF
259
+
259
260
  run +=<<-EOF
260
261
 
261
262
  # Run command
@@ -273,10 +274,10 @@ EOF
273
274
  coda +=<<-EOF
274
275
  singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rbbt system clean all -q &>> #{fsync}
275
276
  EOF
276
- else
277
- coda +=<<-EOF
278
- rbbt system clean all -q &>> #{fsync}
279
- EOF
277
+ # else
278
+ # coda +=<<-EOF
279
+ #rbbt system clean all -q &>> #{fsync}
280
+ #EOF
280
281
  end
281
282
 
282
283
  if sync.include?("=>")
@@ -295,6 +296,7 @@ EOF
295
296
  mkdir -p "$(dirname '#{target}')"
296
297
  rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{target}/" &>> #{fsync}
297
298
  sync_es="$?"
299
+ echo $sync_es > #{fsyncexit}
298
300
  find '#{target}' -type l -ls | awk '$13 ~ /^#{target.gsub('/','\/')}/ { sub("#{source}", "#{target}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
299
301
  EOF
300
302
 
@@ -320,23 +322,24 @@ singularity exec -e -C -H "$CONTAINER_DIR" "$SINGULARITY_IMG" rm -v /dev/shm/sem
320
322
  EOF
321
323
  else
322
324
  coda +=<<-EOF
323
- #{exec_cmd} system clean
325
+ ##{exec_cmd} system clean
324
326
  if [ $exit_status == '0' -a $sync_es == '0' ]; then
325
327
  rm -Rfv #{contain} &>> #{fsync}
326
328
  else
327
329
  echo "ERROR: Process failed or results could not sync correctly. Contain directory not purged" &>> #{fsync}
328
330
  fi
329
- unset sync_es
330
331
  EOF
331
332
 
332
333
  end
333
334
  end
334
335
  end
336
+
335
337
  coda +=<<-EOF
336
338
 
337
339
  # Write exit status to file
338
340
  echo $exit_status > #{fexit}
339
341
  EOF
342
+
340
343
  if sync
341
344
  coda +=<<-EOF
342
345
  if [ "$sync_es" == '0' ]; then
@@ -362,6 +365,10 @@ EOF
362
365
  slurm_basedir = options[:slurm_basedir]
363
366
  dependencies = options.delete :slurm_dependencies
364
367
  dependencies = [] if dependencies.nil?
368
+
369
+ canfail_dependencies = dependencies.select{|dep| dep =~ /^canfail:(\d+)/ }.collect{|dep| dep.partition(":").last}
370
+ dependencies = dependencies.reject{|dep| dep =~ /^canfail:(\d+)/ }
371
+
365
372
  Open.mkdir slurm_basedir
366
373
 
367
374
  dry_run = options.delete :dry_run
@@ -370,6 +377,7 @@ EOF
370
377
  ferr = File.join(slurm_basedir, 'std.err')
371
378
  fjob = File.join(slurm_basedir, 'job.id')
372
379
  fdep = File.join(slurm_basedir, 'dependencies.list')
380
+ fcfdep = File.join(slurm_basedir, 'canfail_dependencies.list')
373
381
  fexit = File.join(slurm_basedir, 'exit.status')
374
382
  fsync = File.join(slurm_basedir, 'sync.log')
375
383
  fcmd = File.join(slurm_basedir, 'command.slurm')
@@ -401,8 +409,21 @@ EOF
401
409
  Open.rm fexit
402
410
  Open.rm fout
403
411
  Open.rm ferr
412
+
404
413
  Open.write(fdep, dependencies * "\n") if dependencies.any?
405
- dep_str = dependencies.any? ? "--dependency=afterok:" + dependencies * ":" : ''
414
+ Open.write(fcfdep, canfail_dependencies * "\n") if canfail_dependencies.any?
415
+
416
+
417
+ dep_str = '--dependency='
418
+ normal_dep_str = dependencies.any? ? "afterok:" + dependencies * ":" : nil
419
+ canfail_dep_str = canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil
420
+
421
+ if normal_dep_str.nil? && canfail_dep_str.nil?
422
+ dep_str = ""
423
+ else
424
+ dep_str += [normal_dep_str, canfail_dep_str].compact * ","
425
+ end
426
+
406
427
  job = CMD.cmd("sbatch #{dep_str} '#{fcmd}'").read.scan(/\d+/).first.to_i
407
428
  Log.debug "SBATCH job id: #{job}"
408
429
  Open.write(fjob, job.to_s)
@@ -527,7 +548,7 @@ EOF
527
548
  cmd = ['workflow', 'task', workflow.to_s, task.to_s, '-pf', '--log', (options[:log] || Log.severity).to_s]
528
549
  end
529
550
 
530
- cmd << "--override_deps='#{override_deps}'" if override_deps and not override_deps.empty?
551
+ cmd << "--override_deps='#{override_deps.gsub("'", '\'')}'" if override_deps and not override_deps.empty?
531
552
 
532
553
  template = self.template(cmd, options)
533
554
  jobid = self.issue_template(template, options.merge(:slurm_basedir => slurm_basedir, :dry_run => dry_run, :slurm_dependencies => dependencies))
@@ -110,6 +110,8 @@ module Persist
110
110
  def self.load_file(path, type)
111
111
  begin
112
112
  case (type || :marshal).to_sym
113
+ when :path
114
+ path
113
115
  when :nil
114
116
  nil
115
117
  when :boolean
@@ -167,6 +169,8 @@ module Persist
167
169
  end
168
170
 
169
171
  case (type || :marshal).to_sym
172
+ when :path
173
+ nil
170
174
  when :nil
171
175
  nil
172
176
  when :boolean
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
+ options = SOPT.setup <<EOF
9
+
10
+ Clean error or aborted jobs
11
+
12
+ $ rbbt mnl [options]
13
+
14
+ -h--help Print this help
15
+ -d--done Done jobs only
16
+ -e--error Error jobs only
17
+ -a--aborted SLURM aboted jobs
18
+ -j--job* Job ids
19
+ -s--search* Regular expression
20
+ -t--tail* Show the last lines of the STDERR
21
+ -SBP--sbatch_parameters show sbatch parameters
22
+ -dr--dry_run Do not erase anything
23
+ EOF
24
+
25
+ if options[:help]
26
+ if defined? rbbt_usage
27
+ rbbt_usage
28
+ else
29
+ puts SOPT.doc
30
+ end
31
+ exit 0
32
+ end
33
+
34
+ Log.severity = 4
35
+ done, error, aborted, jobid, search, tail, sbatch_parameters, dry_run = options.values_at :done, :error, :aborted, :job, :search, :tail, :sbatch_parameters, :dry_run
36
+
37
+ workdir = File.expand_path('~/rbbt-slurm')
38
+ Path.setup(workdir)
39
+
40
+ running_jobs = begin
41
+ squeue_txt = CMD.cmd('squeue').read
42
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
43
+ rescue
44
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
45
+ squeue_txt = nil
46
+ $norunningjobs = true
47
+ []
48
+ end
49
+
50
+ if squeue_txt
51
+ job_nodes = {}
52
+ squeue_txt.split("\n").each do |line|
53
+ parts = line.strip.split(/\s+/)
54
+ job_nodes[parts.first] = parts.last.split(",")
55
+ end
56
+ else
57
+ job_nodes = nil
58
+ end
59
+
60
+ count = 0
61
+ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
62
+ dir = File.dirname(fcmd)
63
+
64
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
65
+ cmd = m[1]
66
+ else
67
+ cmd = nil
68
+ end
69
+
70
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
71
+ exe = m[1]
72
+ else
73
+ exe = nil
74
+ end
75
+
76
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
77
+ container_home = m[1]
78
+ else
79
+ container_home = nil
80
+ end
81
+
82
+
83
+ if File.exists?(fid = File.join(dir, 'job.id'))
84
+ id = Open.read(fid).chomp
85
+ else
86
+ id = nil
87
+ end
88
+
89
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
90
+ exit_status = Open.read(fstatus).to_i
91
+ else
92
+ exit_status = nil
93
+ end
94
+
95
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
96
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
97
+ elsif job_nodes[id]
98
+ nodes = job_nodes[id]
99
+ else
100
+ nodes = []
101
+ end
102
+
103
+ if File.exists?(File.join(dir, 'std.out'))
104
+ outt = File.mtime File.join(dir, 'std.out')
105
+ errt = File.mtime File.join(dir, 'std.err')
106
+ time_diff = Time.now - [outt, errt].max
107
+ end
108
+
109
+ fdep = File.join(dir, 'dependencies.list')
110
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
111
+
112
+ fcadep = File.join(dir, 'canfail_dependencies.list')
113
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
114
+
115
+ aborted = error = true if aborted.nil? && error.nil?
116
+ if done || error || aborted || running || queued || jobid || search
117
+ select = false
118
+ select = true if done && exit_status && exit_status.to_i == 0
119
+ select = true if error && exit_status && exit_status.to_i != 0
120
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
121
+ select = select && jobid.split(",").include?(id) if jobid
122
+ select = select && cmd.match(/#{search}/) if search
123
+ next unless select
124
+ end
125
+
126
+
127
+ puts Log.color(:yellow, "**ERASING**")
128
+ puts Log.color :blue, dir
129
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
130
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
131
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
132
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
133
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
134
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
135
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
136
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
137
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
138
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
139
+
140
+ if options[:sbatch_parameters]
141
+ puts Log.color(:magenta, "SBATCH parameters: ")
142
+ puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
143
+ end
144
+
145
+ if tail && File.exists?(File.join(dir, 'std.err'))
146
+ if exit_status && exit_status != 0
147
+ puts Log.color(:magenta, "First error or exception found: ")
148
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
149
+ elsif exit_status
150
+ puts Log.color(:magenta, "Completed jobs: ")
151
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
152
+ else
153
+ puts Log.color(:magenta, "Log tail: ")
154
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
155
+ end
156
+ end
157
+
158
+ count += 1
159
+
160
+ Open.rm_rf dir unless dry_run
161
+ end
162
+
163
+ puts
164
+ puts "Found #{count} jobs"
165
+
@@ -20,15 +20,16 @@ $ rbbt mnl [options]
20
20
  -j--job* Job ids
21
21
  -s--search* Regular expression
22
22
  -t--tail* Show the last lines of the STDERR
23
+ -SBP--sbatch_parameters show sbatch parameters
23
24
  EOF
24
25
 
25
26
  if options[:help]
26
- if defined? rbbt_usage
27
- rbbt_usage
28
- else
29
- puts SOPT.doc
30
- end
31
- exit 0
27
+ if defined? rbbt_usage
28
+ rbbt_usage
29
+ else
30
+ puts SOPT.doc
31
+ end
32
+ exit 0
32
33
  end
33
34
 
34
35
  Log.severity = 4
@@ -38,101 +39,124 @@ workdir = File.expand_path('~/rbbt-slurm')
38
39
  Path.setup(workdir)
39
40
 
40
41
  running_jobs = begin
41
- CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
42
+ squeue_txt = CMD.cmd('squeue').read
43
+ squeue_txt.split("\n").collect{|l| l.to_i.to_s}
42
44
  rescue
43
- Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
- $norunningjobs = true
45
- []
45
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
46
+ squeue_txt = nil
47
+ $norunningjobs = true
48
+ []
46
49
  end
47
50
 
51
+ if squeue_txt
52
+ job_nodes = {}
53
+ squeue_txt.split("\n").each do |line|
54
+ parts = line.strip.split(/\s+/)
55
+ job_nodes[parts.first] = parts.last.split(",")
56
+ end
57
+ else
58
+ job_nodes = nil
59
+ end
60
+
48
61
  count = 0
49
62
  workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
- dir = File.dirname(fcmd)
51
-
52
- if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
- cmd = m[1]
54
- else
55
- cmd = nil
56
- end
57
-
58
- if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
- exe = m[1]
60
- else
61
- exe = nil
62
- end
63
-
64
- if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
- container_home = m[1]
66
- else
67
- container_home = nil
68
- end
69
-
70
-
71
- if File.exists?(fid = File.join(dir, 'job.id'))
72
- id = Open.read(fid).chomp
73
- else
74
- id = nil
75
- end
76
-
77
- if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
- exit_status = Open.read(fstatus).to_i
79
- else
80
- exit_status = nil
81
- end
82
-
83
- if File.exists?(fstatus = File.join(dir, 'job.status'))
84
- nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
- else
86
- nodes = []
87
- end
88
-
89
- if File.exists?(File.join(dir, 'std.out'))
90
- outt = File.mtime File.join(dir, 'std.out')
91
- errt = File.mtime File.join(dir, 'std.err')
92
- time_diff = Time.now - [outt, errt].max
93
- end
94
-
95
- fdep = File.join(dir, 'dependencies.list')
96
- deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
-
98
- if done || error || aborted || running || queued || jobid || search
99
- select = false
100
- select = true if done && exit_status == 0
101
- select = true if error && exit_status && exit_status != 0
102
- select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
- select = true if queued && deps && (running_jobs & deps).any?
104
- select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
- select = true if jobid && jobid.split(",").include?(id)
106
- select = true if search && cmd.match(/#{search}/)
107
- next unless select
108
- end
109
-
110
-
111
- puts Log.color :blue, dir
112
- puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
- puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
- puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
- puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
- puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
- puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
- puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
- puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
- puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
-
122
- if tail && File.exists?(File.join(dir, 'std.err'))
123
- if exit_status && exit_status != 0
124
- puts Log.color(:magenta, "First error or exception found: ")
125
- puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
- elsif exit_status
127
- puts Log.color(:magenta, "Completed jobs: ")
128
- puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
- else
130
- puts Log.color(:magenta, "Log tail: ")
131
- puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
- end
133
- end
134
-
135
- count += 1
63
+ dir = File.dirname(fcmd)
64
+
65
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
66
+ cmd = m[1]
67
+ else
68
+ cmd = nil
69
+ end
70
+
71
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
72
+ exe = m[1]
73
+ else
74
+ exe = nil
75
+ end
76
+
77
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
78
+ container_home = m[1]
79
+ else
80
+ container_home = nil
81
+ end
82
+
83
+
84
+ if File.exists?(fid = File.join(dir, 'job.id'))
85
+ id = Open.read(fid).chomp
86
+ else
87
+ id = nil
88
+ end
89
+
90
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
91
+ exit_status = Open.read(fstatus).to_i
92
+ else
93
+ exit_status = nil
94
+ end
95
+
96
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
97
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
98
+ elsif job_nodes[id]
99
+ nodes = job_nodes[id]
100
+ else
101
+ nodes = []
102
+ end
103
+
104
+ if File.exists?(File.join(dir, 'std.out'))
105
+ outt = File.mtime File.join(dir, 'std.out')
106
+ errt = File.mtime File.join(dir, 'std.err')
107
+ time_diff = Time.now - [outt, errt].max
108
+ end
109
+
110
+ fdep = File.join(dir, 'dependencies.list')
111
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
112
+
113
+ fcadep = File.join(dir, 'canfail_dependencies.list')
114
+ cadeps = Open.read(fcadep).split("\n") if File.exists?(fcadep)
115
+
116
+ if done || error || aborted || running || queued || jobid || search
117
+ select = false
118
+ select = true if done && exit_status == 0
119
+ select = true if error && exit_status && exit_status != 0
120
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
121
+ select = true if queued && deps && (running_jobs & deps).any?
122
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
123
+ select = true if jobid && jobid.split(",").include?(id)
124
+ select = true if search && cmd.match(/#{search}/)
125
+ next unless select
126
+ end
127
+
128
+
129
+ puts Log.color :blue, dir
130
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
131
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
132
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
133
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
134
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
135
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
136
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
137
+ puts Log.color(:magenta, "Dependencies (can fail): ") << cadeps * ", " if cadeps
138
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
139
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
140
+
141
+ if options[:sbatch_parameters]
142
+ puts Log.color(:magenta, "SBATCH parameters: ")
143
+ puts Log.color :blue, CMD.cmd('grep "^#SBATCH" |tail -n +6', :in => Open.read(fcmd)).read.strip
144
+ end
145
+
146
+ if tail && File.exists?(File.join(dir, 'std.err'))
147
+ if exit_status && exit_status != 0
148
+ puts Log.color(:magenta, "First error or exception found: ")
149
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
150
+ elsif exit_status
151
+ puts Log.color(:magenta, "Completed jobs: ")
152
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
153
+ else
154
+ puts Log.color(:magenta, "Log tail: ")
155
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
156
+ end
157
+ end
158
+
159
+ count += 1
136
160
 
137
161
  end
138
162
 
@@ -25,7 +25,7 @@ $slurm_options = SOPT.get <<EOF
25
25
  -t--task_cpus* Tasks
26
26
  -W--workflows* Additional workflows
27
27
  -tm--time* Time
28
- -R--orchestration_rules* Orchestration rules
28
+ -OR--orchestration_rules* Orchestration rules
29
29
  -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
30
  EOF
31
31
 
@@ -43,5 +43,5 @@ class Step
43
43
  end
44
44
  end
45
45
 
46
- ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
46
+ ARGV.concat ["-W", $slurm_options[:workflows], '--detach'] if $slurm_options[:workflows]
47
47
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -20,7 +20,7 @@ def usage(workflow = nil, task = nil, exception=nil, abridge = false)
20
20
  puts
21
21
  if workflow.nil?
22
22
  puts "No workflow specified. Use `rbbt workflow list` to list available workflows."
23
- exit -1
23
+ exit! -1
24
24
  end
25
25
 
26
26
  if task.nil?
@@ -206,7 +206,7 @@ The `recursive_clean` cleans all the job dependency steps recursively.
206
206
  EOF
207
207
 
208
208
  workflow = ARGV.shift
209
- usage and exit -1 if workflow.nil?
209
+ usage and exit! -1 if workflow.nil?
210
210
 
211
211
  task = ARGV.shift
212
212
 
@@ -232,7 +232,8 @@ else
232
232
  remote_workflows = {}
233
233
  end
234
234
 
235
- Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
235
+ #Workflow.workdir = Path.setup(File.expand_path(options.delete(:workdir_all))) if options[:workdir_all]
236
+ Workflow.workdir.search_paths.merge!({:workdir => File.expand_path(options.delete(:workdir_all)), :default => :workdir }) if options[:workdir_all]
236
237
 
237
238
  workflow = Workflow.require_workflow workflow
238
239
 
@@ -486,7 +487,7 @@ rescue ParameterException
486
487
  puts
487
488
  report_options saved_job_options
488
489
  puts
489
- exit -1
490
+ exit! -1
490
491
  end
491
492
 
492
493
  if options.delete(:list_job_files)
@@ -538,7 +539,7 @@ when Step
538
539
  io.abort if io.respond_to? :abort
539
540
  io.join if io.respond_to? :join
540
541
  ensure
541
- exit -1
542
+ exit! -1
542
543
  end
543
544
  rescue Exception
544
545
  Log.exception $!
@@ -547,9 +548,11 @@ when Step
547
548
  io.abort if io.respond_to? :abort
548
549
  io.join if io.respond_to? :join
549
550
  ensure
550
- exit -1
551
+ exit! -1
551
552
  end
552
553
  end
554
+ elsif detach
555
+ exit! 0
553
556
  else
554
557
  res.join
555
558
  out.puts Open.read(res.path) if Open.exist?(res.path) || Open.remote?(res.path) || Open.ssh?(res.path)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-util
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.29.2
4
+ version: 5.29.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-18 00:00:00.000000000 Z
11
+ date: 2021-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -378,6 +378,7 @@ files:
378
378
  - share/rbbt_commands/resource/produce
379
379
  - share/rbbt_commands/resource/read
380
380
  - share/rbbt_commands/rsync
381
+ - share/rbbt_commands/slurm/clean
381
382
  - share/rbbt_commands/slurm/list
382
383
  - share/rbbt_commands/slurm/orchestrate
383
384
  - share/rbbt_commands/slurm/task