rbbt-util 5.28.10 → 5.29.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -92,12 +92,18 @@ class Step
92
92
  (job.done? && job.dirty?) || (job.error? && job.dirty?) ||
93
93
  (!(job.noinfo? || job.done? || job.error? || job.aborted? || job.running?))
94
94
 
95
- job.clean unless job.resumable? && (job.updated? && ! job.dirty?)
95
+ if ! (job.resumable? && (job.updated? && ! job.dirty?))
96
+ Log.high "About to clean -- status: #{status}, present #{File.exists?(job.path)}, " +
97
+ %w(done? error? recoverable_error? noinfo? updated? dirty? aborted? running? resumable?).
98
+ collect{|v| [v, job.send(v)]*": "} * ", " if RBBT_DEBUG_CLEAN
99
+
100
+ job.clean
101
+ end
96
102
  job.set_info :status, :cleaned
97
103
  end
98
104
 
99
105
  job.dup_inputs unless status == 'done' or job.started?
100
- job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started?
106
+ job.init_info(status == 'noinfo') unless status == 'waiting' || status == 'done' || job.started? || ! Workflow.job_path?(job.path)
101
107
 
102
108
  canfail = ComputeDependency === job && job.canfail?
103
109
  end
@@ -121,10 +127,9 @@ class Step
121
127
  end
122
128
 
123
129
  def input_dependencies
124
- inputs.flatten.select{|i| Step === i}
130
+ (inputs.flatten.select{|i| Step === i} + inputs.flatten.select{|dep| Path === dep && Step === dep.resource}.collect{|dep| dep.resource})
125
131
  end
126
132
 
127
-
128
133
  def execute_dependency(dependency, log = true)
129
134
  task_name = self.task_name
130
135
  canfail_paths = self.canfail_paths
@@ -112,7 +112,7 @@ class Step
112
112
  end
113
113
 
114
114
  def updatable?
115
- (ENV["RBBT_UPDATE_ALL_JOBS"] == 'true' || ( ENV["RBBT_UPDATE"] == "true" && Open.exists?(info_file)) && status != :noinfo && ! (relocated? && done?))
115
+ (ENV["RBBT_UPDATE_ALL_JOBS"] == 'true' || ( ENV["RBBT_UPDATE"] == "true" && Open.exists?(info_file)) && status != :noinfo && ! (relocated? && done?)) || (ENV["RBBT_UPDATE"] && ! (done? && ! Open.exists?(info_file)))
116
116
  end
117
117
 
118
118
  def dependency_checks
@@ -122,13 +122,12 @@ class Step
122
122
  reject{|dependency| (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dependency) || Open.remote?(dependency.path) }.
123
123
  reject{|dependency| dependency.error? }.
124
124
  #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
125
- #select{|dependency| Open.exists?(dependency.path) || ((Open.exists?(dependency.info_file) && (dependency.status == :cleaned) || dependency.status == :waiting)) }.
126
125
  select{|dependency| dependency.updatable? }.
127
126
  collect{|dependency| Workflow.relocate_dependency(self, dependency)}
128
127
  end
129
128
 
130
129
  def input_checks
131
- inputs.select{|i| Step === i }.
130
+ (inputs.select{|i| Step === i } + inputs.select{|i| Path === i && Step === i.resource}.collect{|i| i.resource}).
132
131
  select{|dependency| dependency.updatable? }
133
132
  end
134
133
 
@@ -154,25 +153,28 @@ class Step
154
153
  canfail_paths = self.canfail_paths
155
154
  this_mtime = Open.mtime(self.path) if Open.exists?(self.path)
156
155
 
157
- checks.each do |dep|
158
- next unless dep.updatable?
159
- dep_done = dep.done?
156
+ outdated_time = checks.select{|dep| dep.updatable? && dep.done? && Persist.newer?(path, dep.path) }
157
+ outdated_dep = checks.reject{|dep| dep.done? || (dep.error? && ! dep.recoverable_error? && canfail_paths.include?(dep.path)) }
160
158
 
161
- begin
162
- if this_mtime && dep_done && Open.exists?(dep.path) && (Open.mtime(dep.path) > this_mtime + 1)
163
- outdated_time << dep
164
- end
165
- rescue
166
- end
159
+ #checks.each do |dep|
160
+ # next unless dep.updatable?
161
+ # dep_done = dep.done?
167
162
 
168
- # Is this pointless? this would mean some dep got updated after a later
169
- # dep but but before this one.
170
- #if (! dep.done? && ! canfail_paths.include?(dep.path)) || ! dep.updated?
163
+ # begin
164
+ # if this_mtime && dep_done && Open.exists?(dep.path) && (Open.mtime(dep.path) > this_mtime + 1)
165
+ # outdated_time << dep
166
+ # end
167
+ # rescue
168
+ # end
171
169
 
172
- if (! dep_done && ! canfail_paths.include?(dep.path))
173
- outdated_dep << dep
174
- end
175
- end
170
+ # # Is this pointless? this would mean some dep got updated after a later
171
+ # # dep but but before this one.
172
+ # #if (! dep.done? && ! canfail_paths.include?(dep.path)) || ! dep.updated?
173
+
174
+ # if (! dep_done && ! canfail_paths.include?(dep.path))
175
+ # outdated_dep << dep
176
+ # end
177
+ #end
176
178
 
177
179
  Log.high "Some newer files found: #{Misc.fingerprint outdated_time}" if outdated_time.any?
178
180
  Log.high "Some outdated files found: #{Misc.fingerprint outdated_dep}" if outdated_dep.any?
@@ -215,7 +217,7 @@ class Step
215
217
  no_load = :stream if no_load
216
218
 
217
219
  Open.write(pid_file, Process.pid.to_s) unless Open.exists?(path) or Open.exists?(pid_file)
218
- result_type = @task.result_type
220
+ result_type = @task.result_type if @task
219
221
  result_type = info[:result_type] if result_type.nil?
220
222
  result = Persist.persist "Job", result_type, :file => path, :check => persist_checks, :no_load => no_load do
221
223
  if Step === Step.log_relay_step and not self == Step.log_relay_step
@@ -26,15 +26,25 @@ module Workflow
26
26
  workload
27
27
  end
28
28
 
29
+ def self.workload(jobs)
30
+ jobs.inject({}) do |acc,job|
31
+ Orchestrator.job_workload(job).each do |j,d|
32
+ acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
33
+ end
34
+ acc
35
+ end
36
+ end
37
+
29
38
  def self.job_rules(rules, job)
30
39
  workflow = job.workflow.to_s
31
40
  task_name = job.task_name.to_s
41
+ defaults = rules["defaults"] || {}
32
42
 
33
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow]
34
- return IndiferentHash.setup(rules["defaults"]) unless rules[workflow][task_name]
43
+ return IndiferentHash.setup(defaults) unless rules[workflow]
44
+ return IndiferentHash.setup(defaults) unless rules[workflow][task_name]
35
45
 
36
46
  job_rules = IndiferentHash.setup(rules[workflow][task_name])
37
- rules["defaults"].each{|k,v| job_rules[k] = v if job_rules[k].nil? } if rules["defaults"]
47
+ defaults.each{|k,v| job_rules[k] = v if job_rules[k].nil? } if defaults
38
48
  job_rules
39
49
  end
40
50
 
@@ -169,12 +179,7 @@ module Workflow
169
179
  def process(rules, jobs)
170
180
  begin
171
181
 
172
- workload = jobs.inject({}) do |acc,job|
173
- Orchestrator.job_workload(job).each do |j,d|
174
- acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
175
- end
176
- acc
177
- end
182
+ workload = Orchestrator.workload(jobs)
178
183
  all_jobs = workload.keys
179
184
 
180
185
  top_level_jobs = jobs.collect{|job| job.path }
@@ -22,11 +22,14 @@ class Step
22
22
  end
23
23
 
24
24
  def self.prov_report_msg(status, name, path, info = nil)
25
- parts = path.sub(/\{.*/,'').sub(/#{Regexp.quote(name)}$/,'').split "/"
25
+ parts = path.sub(/\{.*/,'').split "/"
26
26
 
27
+ parts.pop
28
+
27
29
  task = Log.color(:yellow, parts.pop)
28
30
  workflow = Log.color(:magenta, parts.pop)
29
- if status.to_s == 'noinfo' and parts.last != 'jobs'
31
+ #if status.to_s == 'noinfo' && parts.last != 'jobs'
32
+ if ! Workflow.job_path?(path)
30
33
  task, status, workflow = Log.color(:yellow, info[:task_name]), Log.color(:green, "file"), Log.color(:magenta, "-")
31
34
  end
32
35
 
@@ -66,7 +69,7 @@ class Step
66
69
  str << "\n"
67
70
  end
68
71
 
69
- def self.prov_report(step, offset = 0, task = nil, seen = [])
72
+ def self.prov_report(step, offset = 0, task = nil, seen = [], expand_repeats = false)
70
73
  info = step.info || {}
71
74
  info[:task_name] = task
72
75
  path = step.path
@@ -82,9 +85,13 @@ class Step
82
85
  new = ! seen.include?(path)
83
86
  if new
84
87
  seen << path
85
- str << prov_report(dep, offset + 1, task, seen)
88
+ str << prov_report(dep, offset + 1, task, seen, expand_repeats)
86
89
  else
87
- str << Log.color(:green, Log.uncolor(prov_report(dep, offset+1, task)))
90
+ if expand_repeats
91
+ str << Log.color(:green, Log.uncolor(prov_report(dep, offset+1, task)))
92
+ else
93
+ str << Log.color(:green, " " * (offset + 1) + Log.uncolor(prov_report_msg(status, name, path, info)))
94
+ end
88
95
  end
89
96
  end if step.dependencies
90
97
  str
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt-util'
4
+ require 'rbbt/util/simpleopt'
5
+
6
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
7
+
8
+ options = SOPT.setup <<EOF
9
+
10
+ Queue a job in Marenostrum
11
+
12
+ $ rbbt mnl [options]
13
+
14
+ -h--help Print this help
15
+ -d--done Done jobs only
16
+ -e--error Error jobs only
17
+ -a--aborted SLURM aboted jobs
18
+ -r--running Running jobs only
19
+ -q--queued Queued jobs only
20
+ -j--job* Job ids
21
+ -s--search* Regular expression
22
+ -t--tail* Show the last lines of the STDERR
23
+ EOF
24
+
25
+ if options[:help]
26
+ if defined? rbbt_usage
27
+ rbbt_usage
28
+ else
29
+ puts SOPT.doc
30
+ end
31
+ exit 0
32
+ end
33
+
34
+ Log.severity = 4
35
+ done, error, running, queued, aborted, jobid, search, tail = options.values_at :done, :error, :running, :queued, :aborted, :job, :search, :tail
36
+
37
+ workdir = File.expand_path('~/rbbt-slurm')
38
+ Path.setup(workdir)
39
+
40
+ running_jobs = begin
41
+ CMD.cmd('squeue').read.split("\n").collect{|l| l.to_i.to_s}
42
+ rescue
43
+ Log.warn "Cannot determine if jobs are running, they will seem to be all alive (Job ID in green)"
44
+ $norunningjobs = true
45
+ []
46
+ end
47
+
48
+ count = 0
49
+ workdir.glob("**/command.slurm").sort_by{|f| File.mtime(f)}.each do |fcmd|
50
+ dir = File.dirname(fcmd)
51
+
52
+ if m = Open.read(fcmd).match(/#CMD: (.*)/)
53
+ cmd = m[1]
54
+ else
55
+ cmd = nil
56
+ end
57
+
58
+ if m = Open.read(fcmd).match(/# Run command\n(.*?)\n/im)
59
+ exe = m[1]
60
+ else
61
+ exe = nil
62
+ end
63
+
64
+ if m = Open.read(fcmd).match(/^CONTAINER_DIR=(.*)/)
65
+ container_home = m[1]
66
+ else
67
+ container_home = nil
68
+ end
69
+
70
+
71
+ if File.exists?(fid = File.join(dir, 'job.id'))
72
+ id = Open.read(fid).chomp
73
+ else
74
+ id = nil
75
+ end
76
+
77
+ if File.exists?(fstatus = File.join(dir, 'exit.status'))
78
+ exit_status = Open.read(fstatus).to_i
79
+ else
80
+ exit_status = nil
81
+ end
82
+
83
+ if File.exists?(fstatus = File.join(dir, 'job.status'))
84
+ nodes = Open.read(fstatus).split("\n").last.split(/\s+/).last.split(",")
85
+ else
86
+ nodes = []
87
+ end
88
+
89
+ if File.exists?(File.join(dir, 'std.out'))
90
+ outt = File.mtime File.join(dir, 'std.out')
91
+ errt = File.mtime File.join(dir, 'std.err')
92
+ time_diff = Time.now - [outt, errt].max
93
+ end
94
+
95
+ fdep = File.join(dir, 'dependencies.list')
96
+ deps = Open.read(fdep).split("\n") if File.exists?(fdep)
97
+
98
+ if done || error || aborted || running || queued || jobid || search
99
+ select = false
100
+ select = true if done && exit_status == 0
101
+ select = true if error && exit_status && exit_status != 0
102
+ select = true if aborted && (exit_status.nil? && ! running_jobs.include?(id))
103
+ select = true if queued && deps && (running_jobs & deps).any?
104
+ select = true if running && (exit_status.nil? && running_jobs.include?(id)) && (!deps || (running_jobs & deps).empty?)
105
+ select = true if jobid && jobid.split(",").include?(id)
106
+ select = true if search && cmd.match(/#{search}/)
107
+ next unless select
108
+ end
109
+
110
+
111
+ puts Log.color :blue, dir
112
+ puts Log.color(:magenta, "Creation: ") << File.mtime(File.join(dir, 'command.slurm')).to_s
113
+ puts Log.color(:magenta, "Done: ") << File.mtime(File.join(dir, 'exit.status')).to_s if File.exist?(File.join(dir, 'exit.status'))
114
+ puts Log.color(:magenta, "Exec: ") << (exe || "Missing")
115
+ puts Log.color(:magenta, "CMD: ") << (Log.color(:yellow, cmd) || "Missing")
116
+ puts Log.color(:magenta, "HOME: ") << Log.color(:yellow, container_home) if container_home
117
+ puts Log.color(:magenta, "Job ID: ") << (exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : (running_jobs.include?(id) || $norunningjobs ? Log.color(:green, id) : Log.color(:red, id) ))
118
+ puts Log.color(:magenta, "Dependencies: ") << deps * ", " if deps
119
+ puts Log.color(:magenta, "Nodes: ") << nodes * ", "
120
+ puts Log.color(:magenta, "Output: ") << File.exists?(File.join(dir, 'std.out')).to_s << (id.nil? ? "" : " (last update " + Misc.format_seconds(time_diff) + " ago)")
121
+
122
+ if tail && File.exists?(File.join(dir, 'std.err'))
123
+ if exit_status && exit_status != 0
124
+ puts Log.color(:magenta, "First error or exception found: ")
125
+ puts CMD.cmd("grep -i -w 'error\\|[a-z]*exception' #{File.join(dir, 'std.err')} -A #{tail.to_i} |head -n #{tail.to_i}", :no_fail => true).read
126
+ elsif exit_status
127
+ puts Log.color(:magenta, "Completed jobs: ")
128
+ puts CMD.cmd("grep -i -w 'Completed step' #{File.join(dir, 'std.err')} | grep -v 'Retrying dep.' | tail -n #{tail.to_i}", :no_fail => true).read
129
+ else
130
+ puts Log.color(:magenta, "Log tail: ")
131
+ puts CMD.cmd("tail -n #{tail.to_i} #{File.join(dir, 'std.err')}").read
132
+ end
133
+ end
134
+
135
+ count += 1
136
+
137
+ end
138
+
139
+ puts
140
+ puts "Found #{count} jobs"
141
+
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rbbt/util/simpleopt'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/workflow/usage'
6
+ require 'rbbt/hpc'
7
+ require 'rbbt/hpc/orchestrate'
8
+ require 'time'
9
+
10
+ $slurm_options = SOPT.get <<EOF
11
+ -dr--dry_run Print only the template
12
+ -cj--clean_job Clean job
13
+ --drbbt* Use development version of rbbt
14
+ -sing--singularity Use Singularity
15
+ -ug--user_group* Use alternative user group for group project directory
16
+ -c--contain* Contain in directory (using Singularity)
17
+ -s--sync* Contain in directory and sync jobs
18
+ -e--exclusive Make exclusive use of the node
19
+ -hm--highmem Make use of highmem cores
20
+ -wc--wipe_container* Wipe the jobs from the contain directory
21
+ -CS--contain_and_sync Contain and sync to default locations
22
+ -ci--copy_image When using a container directory, copy image there
23
+ -t--tail Tail the logs
24
+ -q--queue* Queue
25
+ -t--task_cpus* Tasks
26
+ -W--workflows* Additional workflows
27
+ -tm--time* Time
28
+ -R--rules* Orchestration rules
29
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
30
+ EOF
31
+
32
+ class Step
33
+ def run(*args)
34
+ if done?
35
+ self.load
36
+ else
37
+ begin
38
+ Log.debug "Issuing SLURM job for #{self.path}"
39
+ HPC::SLURM.orchestrate_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
40
+ rescue HPC::SBATCH
41
+ end
42
+ end
43
+ end
44
+ end
45
+
46
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
47
+ load Rbbt.share.rbbt_commands.workflow.task.find
@@ -9,8 +9,9 @@ require 'time'
9
9
  $slurm_options = SOPT.get <<EOF
10
10
  -dr--dry_run Print only the template
11
11
  -cj--clean_job Clean job
12
- --drbbt Use development version of rbbt
12
+ --drbbt* Use development version of rbbt
13
13
  -sing--singularity Use Singularity
14
+ -ug--user_group* Use alternative user group for group project directory
14
15
  -c--contain* Contain in directory (using Singularity)
15
16
  -s--sync* Contain in directory and sync jobs
16
17
  -e--exclusive Make exclusive use of the node
@@ -21,8 +22,9 @@ $slurm_options = SOPT.get <<EOF
21
22
  -t--tail Tail the logs
22
23
  -q--queue* Queue
23
24
  -t--task_cpus* Tasks
25
+ -W--workflows* Additional workflows
24
26
  -tm--time* Time
25
- -S--server* SLURM login node
27
+ -rmb--remove_slurm_basedir Remove the SLURM working directory (command, STDIN, exit status, ...)
26
28
  EOF
27
29
 
28
30
  class Step
@@ -30,9 +32,14 @@ class Step
30
32
  if done?
31
33
  self.load
32
34
  else
33
- Marenostrum::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
35
+ begin
36
+ Log.debug "Issuing SLURM job for #{self.path}"
37
+ HPC::SLURM.run_job(self, SOPT::GOT_OPTIONS.merge($slurm_options))
38
+ rescue HPC::SBATCH
39
+ end
34
40
  end
35
41
  end
36
42
  end
37
43
 
44
+ ARGV.concat ["-W", $slurm_options[:workflows]] if $slurm_options[:workflows]
38
45
  load Rbbt.share.rbbt_commands.workflow.task.find
@@ -49,27 +49,27 @@ def pid_msg(pid)
49
49
  end
50
50
 
51
51
 
52
- def status_msg(status)
53
- color = case status.to_sym
54
- when :error, :aborted, :missing, :dead, :broken
55
- :red
56
- when :streaming, :started
57
- :cyan
58
- when :done
59
- :green
60
- when :noinfo
61
- :blue
62
- when :dependencies, :waiting, :setup
63
- :yellow
64
- else
65
- if status.to_s.index ">"
66
- :cyan
67
- else
68
- :cyan
69
- end
70
- end
71
- Log.color(color, status.to_s)
72
- end
52
+ #def status_msg(status)
53
+ # color = case status.to_sym
54
+ # when :error, :aborted, :missing, :dead, :broken
55
+ # :red
56
+ # when :streaming, :started
57
+ # :cyan
58
+ # when :done
59
+ # :green
60
+ # when :noinfo, :notfound
61
+ # :blue
62
+ # when :dependencies, :waiting, :setup
63
+ # :yellow
64
+ # else
65
+ # if status.to_s.index ">"
66
+ # :cyan
67
+ # else
68
+ # :cyan
69
+ # end
70
+ # end
71
+ # Log.color(color, status.to_s)
72
+ #end
73
73
 
74
74
  def input_msg(file, inputs)
75
75
 
@@ -218,7 +218,7 @@ workflows.sort.each do |workflow,tasks|
218
218
  status << Log.color(:red, " (dead)")
219
219
  end
220
220
  end
221
- str << " #{ status_msg status }"
221
+ str << " #{ Step.prov_status_msg status }"
222
222
  str << " (dirty)" if status == 'done' && Workflow.load_step(file).dirty?
223
223
 
224
224
  if inputs and inputs.any?