rbbt-util 5.28.10 → 5.29.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -92,12 +92,18 @@ class Step
92
92
  (job.done? && job.dirty?) || (job.error? && job.dirty?) ||
93
93
  (!(job.noinfo? || job.done? || job.error? || job.aborted? || job.running?))
94
94
 
95
- job.clean unless job.resumable? && (job.updated? && ! job.dirty?)
95
+ if ! (job.resumable? && (job.updated? && ! job.dirty?))
96
+ Log.high "About to clean -- status: #{status}, present #{File.exists?(job.path)}, " +
97
+ %w(done? error? recoverable_error? noinfo? updated? dirty? aborted? running? resumable?).
98
+ collect{|v| [v, job.send(v)]*": "} * ", " if RBBT_DEBUG_CLEAN
99
+
100
+ job.clean
101
+ end
96
102
  job.set_info :status, :cleaned
97
103
  end
98
104
 
99
105
  job.dup_inputs unless status == 'done' or job.started?
100
- job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started?
106
+ job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started? || ! Workflow.job_path?(job.path)
101
107
 
102
108
  canfail = ComputeDependency === job && job.canfail?
103
109
  end
@@ -121,10 +127,9 @@ class Step
121
127
  end
122
128
 
123
129
  def input_dependencies
124
- inputs.flatten.select{|i| Step === i}
130
+ (inputs.flatten.select{|i| Step === i} + inputs.flatten.select{|dep| Path === dep && Step === dep.resource}.collect{|dep| dep.resource})
125
131
  end
126
132
 
127
-
128
133
  def execute_dependency(dependency, log = true)
129
134
  task_name = self.task_name
130
135
  canfail_paths = self.canfail_paths
@@ -112,7 +112,7 @@ class Step
112
112
  end
113
113
 
114
114
  def updatable?
115
- (ENV["RBBT_UPDATE_ALL_JOBS"] == 'true' || ( ENV["RBBT_UPDATE"] == "true" && Open.exists?(info_file)) && status != :noinfo && ! (relocated? && done?))
115
+ (ENV["RBBT_UPDATE_ALL_JOBS"] == 'true' || ( ENV["RBBT_UPDATE"] == "true" && Open.exists?(info_file)) && status != :noinfo && ! (relocated? && done?)) || (ENV["RBBT_UPDATE"] && ! (done? && ! Open.exists?(info_file)))
116
116
  end
117
117
 
118
118
  def dependency_checks
@@ -122,13 +122,12 @@ class Step
122
122
  reject{|dependency| (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dependency) || Open.remote?(dependency.path) }.
123
123
  reject{|dependency| dependency.error? }.
124
124
  #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
125
- #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
126
125
  select{|dependency| dependency.updatable? }.
127
126
  collect{|dependency| Workflow.relocate_dependency(self, dependency)}
128
127
  end
129
128
 
130
129
  def input_checks
131
- inputs.select{|i| Step === i }.
130
+ (inputs.select{|i| Step === i } + inputs.select{|i| Path === i && Step === i.resource}.collect{|i| i.resource}).
132
131
  select{|dependency| dependency.updatable? }
133
132
  end
134
133
 
@@ -154,25 +153,28 @@ class Step
154
153
  canfail_paths = self.canfail_paths
155
154
  this_mtime = Open.mtime(self.path) if Open.exists?(self.path)
156
155
 
157
- checks.each do |dep|
158
- next unless dep.updatable?
159
- dep_done = dep.done?
156
+ outdated_time = checks.select{|dep| dep.updatable? && dep.done? && Persist.newer?(path, dep.path) }
157
+ outdated_dep = checks.reject{|dep| dep.done? || (dep.error? && ! dep.recoverable_error? && canfail_paths.include?(dep.path)) }
160
158
 
161
- begin
162
- if this_mtime && dep_done && Open.exists?(dep.path) && (Open.mtime(dep.path) > this_mtime + 1)
163
- outdated_time << dep
164
- end
165
- rescue
166
- end
159
+ #checks.each do |dep|
160
+ # next unless dep.updatable?
161
+ # dep_done = dep.done?
167
162
 
168
- # Is this pointless? this would mean some dep got updated after a later
169
- # dep but but before this one.
170
- #if (! dep.done? && ! canfail_paths.include?(dep.path)) || ! dep.updated?
163
+ # begin
164
+ # if this_mtime && dep_done && Open.exists?(dep.path) && (Open.mtime(dep.path) > this_mtime + 1)
165
+ # outdated_time << dep
166
+ # end
167
+ # rescue
168
+ # end
171
169
 
172
- if (! dep_done && ! canfail_paths.include?(dep.path))
173
- outdated_dep << dep
174
- end
175
- end
170
+ # # Is this pointless? this would mean some dep got updated after a later
171
+ # # dep but but before this one.
172
+ # #if (! dep.done? && ! canfail_paths.include?(dep.path)) || ! dep.updated?
173
+
174
+ # if (! dep_done && ! canfail_paths.include?(dep.path))
175
+ # outdated_dep << dep
176
+ # end
177
+ #end
176
178
 
177
179
  Log.high "Some newer files found: #{Misc.fingerprint outdated_time}" if outdated_time.any?
178
180
  Log.high "Some outdated files found: #{Misc.fingerprint outdated_dep}" if outdated_dep.any?
@@ -215,7 +217,7 @@ class Step
215
217
  no_load = :stream if no_load
216
218
 
217
219
  Open.write(pid_file, Process.pid.to_s) unless Open.exists?(path) or Open.exists?(pid_file)
218
- result_type = @task.result_type
220
+ result_type = @task.result_type if @task
219
221
  result_type = info[:result_type] if result_type.nil?
220
222
  result = Persist.persist "Job", result_type, :file => path, :check => persist_checks, :no_load => no_load do
221
223
  if Step === Step.log_relay_step and not self == Step.log_relay_step
@@ -26,15 +26,25 @@ module Workflow
26
26
  workload
27
27
  end
28
28
 
29
+ def self.workload(jobs)
30
+ jobs.inject({}) do |acc,job|
31
+ Orchestrator.job_workload(job).each do |j,d|
32
+ acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
33
+ end
34
+ acc
35
+ end
36
+ end
37
+
29
38
  def self.job_rules(rules, job)
30
39
  workflow = job.workflow.to_s
31
40
  task_name = job.task_name.to_s
41
+ defaults = rules["defaults"] || {}
32
42
 
33
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow]
34
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow][task_name]
43
+ return IndiferentHash.setup(defaults) unless rules[workflow]
44
+ return IndiferentHash.setup(defaults) unless rules[workflow][task_name]
35
45
 
36
46
  job_rules = IndiferentHash.setup(rules[workflow][task_name])
37
- rules["defaults"].each{|k,v| job_rules[k] = v if job_rules[k].nil? } if rules["defaults"]
47
+ defaults.each{|k,v| job_rules[k] = v if job_rules[k].nil? } if defaults
38
48
  job_rules
39
49
  end
40
50
 
@@ -169,12 +179,7 @@ module Workflow
169
179
  def process(rules, jobs)
170
180
  begin
171
181
 
172
- workload = jobs.inject({}) do |acc,job|
173
- Orchestrator.job_workload(job).each do |j,d|
174
- acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
175
- end
176
- acc
177
- end
182
+ workload = Orchestrator.workload(jobs)
178
183
  all_jobs = workload.keys
179
184
 
180
185
  top_level_jobs = jobs.collect{|job| job.path }
@@ -22,11 +22,14 @@ class Step
22
22
  end
23
23
 
24
24
  def self.prov_report_msg(status, name, path, info = nil)
25
- parts = path.sub(/\{.*/,'').sub(/#{Regexp.quote(name)}$/,'').split "/"
25
+ parts = path.sub(/\{.*/,'').split "/"
26
26
 
27
+ parts.pop
28
+
27
29
  task = Log.color(:yellow, parts.pop)
28
30
  workflow = Log.color(:magenta, parts.pop)
29
- if status.to_s == 'noinfo' and parts.last != 'jobs'
31
+ #if status.to_s == 'noinfo' && parts.last != 'jobs'
32
+ if ! Workflow.job_path?(path)
30
33
  task, status, workflow = Log.color(:yellow, info[:task_name]), Log.color(:green, "file"), Log.color(:magenta, "-")
31
34
  end
32
35
 
@@ -66,7 +69,7 @@ class Step
66
69
  str << "\n"
67
70
  end
68
71
 
69
- def self.prov_report(step, offset = 0, task = nil, seen = [])
72
+ def self.prov_report(step, offset = 0, task = nil, seen = [], expand_repeats = false)
70
73
  info = step.info || {}
71
74
  info[:task_name] = task
72
75
  path = step.path
@@ -82,9 +85,13 @@ class Step
82
85
  new = ! seen.include?(path)
83
86
  if new
84
87
  seen << path
85
- str << prov_report(dep, offset + 1, task, seen)
88
+ str << prov_report(dep, offset + 1, task, seen, expand_repeats)
86
89
  else
87
- str << Log.color(:green, Log.uncolor(prov_report(dep, offset+1, task)))
90
+ if expand_repeats
91
+ str << Log.color(:green, Log.uncolor(prov_report(dep, offset+1, task)))
92
+ else
93
+ str << Log.color(:green, " " * (offset + 1) + Log.uncolor(prov_report_msg(status, name, path, info)))
94
+ end
88
95
  end
89
96
  end if step.dependencies
90
97
  str
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
+ options = SOPT.setup <<EOF
9
+
10
+ Queue a job in Marenostrum
11
+
12
+ $ rbbt mnl [options]
13
+
14
+ -h--help Print this help
15
+ -d--done Done jobs only
16
+ -e--error Error jobs only
17
+ -a--aborted SLURM aboted jobs
18
+ -r--running Running jobs only
19
+ -q--queued Queued jobs only
20
+ -j--job* Job ids
21
+ -s--search* Regular expression
22
+ -t--tail* Show the last lines of the STDERR
23
+ EOF
24
+
25
+ if options[:help]
26
+ if defined? rbbt_usage
27
+ rbbt_usage
28
+ else
29
+ puts SOPT.doc
30
+ end
31
+ exit 0
32
+ end
33
+
34
+ Log.severity = 4
35
+ done, error, running, queued, aborted, jobid, search, tail = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail
36
+
37
+ workdir = File.expand_path('~/rbbt-slurm')
38
+ Path.setup(workdir)
39
+
40
+ running_jobs = begin
41
+ CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
42
+ rescue
43
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
+ $norunningjobs = true
45
+ []
46
+ end
47
+
48
+ count = 0
49
+ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
+ dir = File.dirname(fcmd)
51
+
52
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
+ cmd = m[1]
54
+ else
55
+ cmd = nil
56
+ end
57
+
58
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
+ exe = m[1]
60
+ else
61
+ exe = nil
62
+ end
63
+
64
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
+ container_home = m[1]
66
+ else
67
+ container_home = nil
68
+ end
69
+
70
+
71
+ if File.exists?(fid = File.join(dir, 'job.id'))
72
+ id = Open.read(fid).chomp
73
+ else
74
+ id = nil
75
+ end
76
+
77
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
+ exit_status = Open.read(fstatus).to_i
79
+ else
80
+ exit_status = nil
81
+ end
82
+
83
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
84
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
+ else
86
+ nodes = []
87
+ end
88
+
89
+ if File.exists?(File.join(dir, 'std.out'))
90
+ outt = File.mtime File.join(dir, 'std.out')
91
+ errt = File.mtime File.join(dir, 'std.err')
92
+ time_diff = Time.now - [outt, errt].max
93
+ end
94
+
95
+ fdep = File.join(dir, 'dependencies.list')
96
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
+
98
+ if done || error || aborted || running || queued || jobid || search
99
+ select = false
100
+ select = true if done && exit_status == 0
101
+ select = true if error && exit_status && exit_status != 0
102
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
+ select = true if queued && deps && (running_jobs & deps).any?
104
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
+ select = true if jobid && jobid.split(",").include?(id)
106
+ select = true if search && cmd.match(/#{search}/)
107
+ next unless select
108
+ end
109
+
110
+
111
+ puts Log.color :blue, dir
112
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
+
122
+ if tail && File.exists?(File.join(dir, 'std.err'))
123
+ if exit_status && exit_status != 0
124
+ puts Log.color(:magenta, "First error or exception found: ")
125
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
+ elsif exit_status
127
+ puts Log.color(:magenta, "Completed jobs: ")
128
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
+ else
130
+ puts Log.color(:magenta, "Log tail: ")
131
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
+ end
133
+ end
134
+
135
+ count += 1
136
+
137
+ end
138
+
139
+ puts
140
+ puts "Found #{count} jobs"
141
+
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/util/simpleopt'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/workflow/usage'
6
+ require 'rbbt/hpc'
7
+ require 'rbbt/hpc/orchestrate'
8
+ require 'time'
9
+
10
+ $slurm_options = SOPT.get <<EOF
11
+ -dr--dry_run Print only the template
12
+ -cj--clean_job Clean job
13
+ --drbbt* Use development version of rbbt
14
+ -sing--singularity Use Singularity
15
+ -ug--user_group* Use alternative user group for group project directory
16
+ -c--contain* Contain in directory (using Singularity)
17
+ -s--sync* Contain in directory and sync jobs
18
+ -e--exclusive Make exclusive use of the node
19
+ -hm--highmem Make use of highmem cores
20
+ -wc--wipe_container* Wipe the jobs from the contain directory
21
+ -CS--contain_and_sync Contain and sync to default locations
22
+ -ci--copy_image When using a container directory, copy image there
23
+ -t--tail Tail the logs
24
+ -q--queue* Queue
25
+ -t--task_cpus* Tasks
26
+ -W--workflows* Additional workflows
27
+ -tm--time* Time
28
+ -R--rules* Orchestration rules
29
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
+ EOF
31
+
32
+ class Step
33
+ def run(*args)
34
+ if done?
35
+ self.load
36
+ else
37
+ begin
38
+ Log.debug "Issuing SLURM job for #{self.path}"
39
+ HPC::SLURM.orchestrate_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
40
+ rescue HPC::SBATCH
41
+ end
42
+ end
43
+ end
44
+ end
45
+
46
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
47
+ load Rbbt.share.rbbt_commands.workflow.task.find
@@ -9,8 +9,9 @@ require 'time'
9
9
  $slurm_options = SOPT.get <<EOF
10
10
  -dr--dry_run Print only the template
11
11
  -cj--clean_job Clean job
12
- --drbbt Use development version of rbbt
12
+ --drbbt* Use development version of rbbt
13
13
  -sing--singularity Use Singularity
14
+ -ug--user_group* Use alternative user group for group project directory
14
15
  -c--contain* Contain in directory (using Singularity)
15
16
  -s--sync* Contain in directory and sync jobs
16
17
  -e--exclusive Make exclusive use of the node
@@ -21,8 +22,9 @@ $slurm_options = SOPT.get <<EOF
21
22
  -t--tail Tail the logs
22
23
  -q--queue* Queue
23
24
  -t--task_cpus* Tasks
25
+ -W--workflows* Additional workflows
24
26
  -tm--time* Time
25
- -S--server* SLURM login node
27
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
26
28
  EOF
27
29
 
28
30
  class Step
@@ -30,9 +32,14 @@ class Step
30
32
  if done?
31
33
  self.load
32
34
  else
33
- Marenostrum::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
35
+ begin
36
+ Log.debug "Issuing SLURM job for #{self.path}"
37
+ HPC::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
38
+ rescue HPC::SBATCH
39
+ end
34
40
  end
35
41
  end
36
42
  end
37
43
 
44
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
38
45
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -49,27 +49,27 @@ def pid_msg(pid)
49
49
  end
50
50
 
51
51
 
52
- def status_msg(status)
53
- color = case status.to_sym
54
- when :error, :aborted, :missing, :dead, :broken
55
- :red
56
- when :streaming, :started
57
- :cyan
58
- when :done
59
- :green
60
- when :noinfo
61
- :blue
62
- when :dependencies, :waiting, :setup
63
- :yellow
64
- else
65
- if status.to_s.index ">"
66
- :cyan
67
- else
68
- :cyan
69
- end
70
- end
71
- Log.color(color, status.to_s)
72
- end
52
+ #def status_msg(status)
53
+ # color = case status.to_sym
54
+ # when :error, :aborted, :missing, :dead, :broken
55
+ # :red
56
+ # when :streaming, :started
57
+ # :cyan
58
+ # when :done
59
+ # :green
60
+ # when :noinfo, :notfound
61
+ # :blue
62
+ # when :dependencies, :waiting, :setup
63
+ # :yellow
64
+ # else
65
+ # if status.to_s.index ">"
66
+ # :cyan
67
+ # else
68
+ # :cyan
69
+ # end
70
+ # end
71
+ # Log.color(color, status.to_s)
72
+ #end
73
73
 
74
74
  def input_msg(file, inputs)
75
75
 
@@ -218,7 +218,7 @@ workflows.sort.each do |workflow,tasks|
218
218
  status << Log.color(:red, " (dead)")
219
219
  end
220
220
  end
221
- str << " #{ status_msg status }"
221
+ str << " #{ Step.prov_status_msg status }"
222
222
  str << " (dirty)" if status == 'done' && Workflow.load_step(file).dirty?
223
223
 
224
224
  if inputs and inputs.any?