scout-gear 10.9.0 → 10.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +25 -0
- data/VERSION +1 -1
- data/bin/scout +4 -1
- data/lib/scout/knowledge_base/registry.rb +2 -3
- data/lib/scout/workflow/definition.rb +11 -0
- data/lib/scout/workflow/deployment/local.rb +288 -0
- data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
- data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
- data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
- data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
- data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
- data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
- data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
- data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
- data/lib/scout/workflow/deployment/scheduler.rb +73 -0
- data/lib/scout/workflow/deployment.rb +10 -1
- data/lib/scout/workflow/exceptions.rb +2 -0
- data/lib/scout/workflow/step/config.rb +3 -0
- data/lib/scout/workflow/step/info.rb +2 -2
- data/lib/scout/workflow/step/progress.rb +52 -0
- data/lib/scout/workflow/step.rb +30 -1
- data/lib/scout/workflow/task.rb +2 -0
- data/scout-gear.gemspec +23 -4
- data/scout_commands/batch/list +1 -1
- data/scout_commands/workflow/cmd +5 -13
- data/scout_commands/workflow/info +1 -1
- data/scout_commands/workflow/task +61 -25
- data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
- data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
- data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
- data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
- data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
- data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
- data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
- data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
- data/test/scout/workflow/deployment/test_trace.rb +1 -1
- data/test/scout/workflow/step/test_progress.rb +27 -0
- data/test/scout/workflow/task/test_inputs.rb +17 -0
- data/test/test_helper.rb +2 -1
- metadata +22 -3
- data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
|
@@ -1,292 +0,0 @@
|
|
|
1
|
-
module Workflow
|
|
2
|
-
class Orchestrator
|
|
3
|
-
|
|
4
|
-
# Raised by Orchestrator#process when a scheduling pass finds neither
# candidates to start nor jobs still holding resources.
class NoWork < Exception; end
|
|
5
|
-
|
|
6
|
-
# Builds the workload map for +job+: every pending job reachable from it,
# mapped to the flattened list of pending jobs it has to wait for.
#
# A job is treated as finished when both `done?` and `updated?` are true;
# finished dependencies are neither expanded nor listed as prerequisites.
# Regular dependencies are visited before input dependencies.
#
# @param job      the job to expand; must respond to `done?`, `updated?`,
#                 `dependencies` and `input_dependencies`
# @param workload [Hash] accumulator shared across the recursion so that a
#                 dependency reachable through several paths is expanded only
#                 once; callers normally omit it
# @return [Hash] job => Array of pending transitive prerequisites. Always
#   contains +job+ itself (with an empty list when it is already finished).
def self.job_workload(job, workload = {})
  # Already expanded through another path: its entry is complete, reuse it.
  return workload if workload.key?(job)

  workload[job] = []
  return workload if job.done? && job.updated?

  # Both kinds of dependency are handled identically.
  %i[dependencies input_dependencies].each do |kind|
    job.public_send(kind).each do |dep|
      next if dep.done? && dep.updated?

      job_workload(dep, workload)
      # Inherit everything the dependency waits for, then the dependency itself.
      workload[job] += workload[dep]
      workload[job] << dep
      workload[job].uniq!
    end
  end

  workload
end
|
|
28
|
-
|
|
29
|
-
# Merges the per-job workloads of +jobs+ into a single map.
#
# Entries are de-duplicated by `path`: the first job seen with a given path
# keeps its entry and later jobs with the same path are ignored.
#
# @param jobs [Enumerable] top-level jobs
# @return [Hash] job => Array of pending prerequisites
def self.workload(jobs)
  # Paths already present in the result; avoids rebuilding the list of
  # paths for every candidate entry.
  seen_paths = {}

  jobs.each_with_object({}) do |job, merged|
    Orchestrator.job_workload(job).each do |member, pending|
      path = member.path
      next if seen_paths.key?(path)

      seen_paths[path] = true
      merged[member] = pending
    end
  end
end
|
|
37
|
-
|
|
38
|
-
# Looks up the rules that apply to +job+.
#
# Rules are addressed as rules[workflow name][task name]. When either level
# is missing, the "defaults" entry (or an empty hash) is returned. Otherwise
# the task rules are returned with every default filled in for keys whose
# value is nil.
#
# NOTE(review): defaults are written into the task-rules hash itself, so the
# caller's +rules+ presumably ends up modified - confirm against
# IndiferentHash.setup.
#
# @param rules [Hash] full rule set
# @param job   job responding to `workflow.name` and `task_name`
# @return [Hash] the effective rules for the job
def self.job_rules(rules, job)
  IndiferentHash.setup(rules)
  workflow_name = job.workflow.name
  task = job.task_name.to_s
  defaults = rules["defaults"] || {}

  by_workflow = rules[workflow_name]
  return IndiferentHash.setup(defaults) unless by_workflow

  by_workflow = IndiferentHash.setup(by_workflow)
  by_task = by_workflow[task]
  return IndiferentHash.setup(defaults) unless by_task

  effective = IndiferentHash.setup(by_task)
  defaults.each do |key, value|
    effective[key] = value if effective[key].nil?
  end
  effective
end
|
|
52
|
-
|
|
53
|
-
# Drops candidates whose `path` was already seen, keeping the first
# occurrence of each path and the original order.
#
# @param candidates [Array] jobs responding to `path`
# @return [Array] candidates with unique paths
def self.purge_duplicates(candidates)
  known_paths = Set.new
  # Set#add? returns nil when the path is already present, which filters
  # the job out.
  candidates.select { |job| known_paths.add?(job.path) }
end
|
|
64
|
-
|
|
65
|
-
# Computes the resources requested by +job+.
#
# Starts from the "resources" entry of the job's rules and fills in missing
# (nil/false) entries from rules["default_resources"] or, failing that, from
# rules["defaults"]["resources"]. When nothing is configured at all the job
# is assumed to need a single cpu.
#
# @param rules [Hash] full rule set
# @param job   job passed on to job_rules
# @return [Hash] resource name => amount
def self.job_resources(rules, job)
  applicable = job_rules(rules, job) || {}
  requested = applicable["resources"] || {}
  IndiferentHash.setup(requested)

  general_defaults = rules["defaults"]
  fallback = rules["default_resources"] ||
             (general_defaults && general_defaults["resources"]) ||
             {}

  fallback.each do |name, amount|
    requested[name] ||= amount
  end

  requested.empty? ? { :cpus => 1 } : requested
end
|
|
79
|
-
|
|
80
|
-
# Orders candidates so that the jobs requesting the most resources come
# first. The weight of a job is the sum of all its resource amounts.
#
# @param candidates [Array] jobs to order
# @param rules      [Hash] rule set passed on to job_resources
# @return [Array] candidates sorted by decreasing total resources
def self.sort_candidates(candidates, rules)
  candidates.sort_by do |job|
    # Negated so that sort_by (ascending) yields the heaviest job first.
    -job_resources(rules, job).values.sum
  end
end
|
|
86
|
-
|
|
87
|
-
# Selects the jobs that can be started now: jobs whose pending list is
# empty and that are neither done nor running.
#
# With an empty rule set, jobs in a non-recoverable error state are also
# excluded; with rules present they are kept. The result is de-duplicated
# by path and ordered with sort_candidates.
#
# @param workload [Hash] job => Array of pending prerequisites
# @param rules    [Hash] rule set
# @return [Array] jobs ready to be considered for execution
def self.candidates(workload, rules)
  skip_fatal_errors = rules.empty?

  unblocked = workload.select { |_job, pending| pending.empty? }.keys

  startable = unblocked.reject do |job|
    job.done? ||
      job.running? ||
      (skip_fatal_errors && job.error? && !job.recoverable_error?)
  end

  sort_candidates(purge_duplicates(startable), rules)
end
|
|
107
|
-
|
|
108
|
-
# Class-level shortcut: creates an orchestrator with the default timer and
# resources and hands every argument over to the instance method #process.
def self.process(*args)
  orchestrator = new
  orchestrator.process(*args)
end
|
|
111
|
-
|
|
112
|
-
# available_resources: resource name => limit; check_resources refuses to start
#                      a job that would push a requested total past it.
# resources_requested: resource name => running total, raised in check_resources
#                      and lowered in release_resources.
# resources_used:      job => resources hash recorded when the job was started.
# timer:               value passed to sleep between scheduling passes in #process.
attr_accessor :available_resources, :resources_requested, :resources_used, :timer
|
|
113
|
-
|
|
114
|
-
# Sets up an orchestrator with empty bookkeeping.
#
# @param timer               value later passed to sleep between scheduling passes
# @param available_resources [Hash, nil] resource name => limit; when nil the
#   limit is one cpu per processor reported by Etc.nprocessors
def initialize(timer = 5, available_resources = nil)
  limits = available_resources.nil? ? { :cpus => Etc.nprocessors } : available_resources

  @timer               = timer
  @available_resources = IndiferentHash.setup(limits)
  @resources_requested = IndiferentHash.setup({})
  @resources_used      = IndiferentHash.setup({})
end
|
|
121
|
-
|
|
122
|
-
# Gives back the resources recorded for +job+, if any.
#
# Every recorded amount except the one stored under the 'size' key is
# subtracted from the requested totals, then the job's record is removed.
#
# @param job the job whose resources are released
# @return the removed resources record, or nil when the job held nothing
def release_resources(job)
  held = resources_used[job]
  return unless held

  Log.debug "Orchestrator releasing resouces from #{job.path}"
  held.each do |resource, amount|
    next if resource == 'size'

    resources_requested[resource] -= amount.to_i
  end

  resources_used.delete job
end
|
|
132
|
-
|
|
133
|
-
# Runs the given block if the resources requested by +job+ fit within the
# available limits; otherwise logs which resources are exhausted and does
# nothing.
#
# When the job fits, its resources are recorded in resources_used and added
# to the resources_requested totals before the block is called.
#
# Amounts are converted with to_i both when checking the limit and when
# updating the totals, so string amounts such as "4" are accepted.
#
# @param rules [Hash] rule set passed on to Orchestrator.job_resources
# @param job   the job about to be started
# @yield called only when the job fits
# @return the block's value when the job is started
def check_resources(rules, job)
  resources = Orchestrator.job_resources(rules, job)

  limit_resources = resources.select do |resource, value|
    limit = available_resources[resource]
    # Resources without a configured limit never block a job.
    limit && ((resources_requested[resource] || 0) + value.to_i) > limit
  end.collect { |resource, _value| resource }

  if limit_resources.any?
    Log.debug "Orchestrator waiting on #{job.path} due to #{limit_resources * ", "}"
  else
    resources_used[job] = resources
    resources.each do |resource, value|
      resources_requested[resource] ||= 0
      resources_requested[resource] += value.to_i
    end
    Log.low "Orchestrator producing #{job.path} with resources #{resources}"

    return yield
  end
end
|
|
151
|
-
|
|
152
|
-
# Forks +job+ under the configuration and log severity given by its rules.
#
# Inside a Scout::Config.with_config block, every entry of the job's
# :config_keys rule is passed to Scout::Config.process_config, then the job
# is forked with the severity from the :log rule (or the current
# Log.severity when the rule is absent).
#
# @param rules [Hash] rule set passed on to Orchestrator.job_rules
# @param job   job responding to `fork`
def run_with_rules(rules, job)
  settings = Orchestrator.job_rules(rules, job)

  Scout::Config.with_config do
    config_keys = settings && settings[:config_keys]
    if config_keys
      config_keys.each { |entry| Scout::Config.process_config entry }
    end

    severity = settings ? settings[:log] : nil
    severity = Log.severity if severity.nil?

    Log.with_severity(severity) { job.fork }
  end
end
|
|
167
|
-
|
|
168
|
-
# Cleans the dependencies of +job+ whose rules ask for erasure.
#
# A dependency is erased only when it is not a top-level job, its "erase"
# rule reads 'true', and every job in +all_jobs+ that lists it as a
# dependency is done. Each such parent archives its dependencies, copies
# its linked files directory and drops the dependency from its list before
# the dependency itself is cleaned.
#
# @param job            job whose dependencies are examined
# @param rules          [Hash] rule set passed on to Orchestrator.job_rules
# @param all_jobs       [Array] every job of the current workload
# @param top_level_jobs [Array] paths of the jobs requested by the caller
def erase_job_dependencies(job, rules, all_jobs, top_level_jobs)
  job.dependencies.each do |dep|
    dep_path = dep.path
    next if top_level_jobs.include?(dep_path)
    next unless Orchestrator.job_rules(rules, dep)["erase"].to_s == 'true'

    parents = all_jobs.select do |parent|
      recorded = parent.info[:dependencies]
      known_paths =
        if recorded.nil?
          parent.dependencies.collect { |d| d.path }
        else
          # Recorded entries are either bare paths or arrays ending in the path.
          recorded.collect { |entry| Array === entry ? entry.last : entry }
        end
      known_paths.include?(dep_path)
    end

    # Some parent still needs the dependency: leave it alone.
    next unless parents.all? { |parent| parent.done? }

    parents.each do |parent|
      Log.high "Erasing #{dep_path} from #{parent.path}"
      parent.archive_deps
      parent.copy_linked_files_dir
      parent.dependencies = parent.dependencies - [dep]
    end

    dep.clean
  end
end
|
|
190
|
-
|
|
191
|
-
# Runs +jobs+ and their pending dependencies until the workload is empty.
#
# Can be called as process(rules, jobs) or process(jobs); in the second form
# the rules default to an empty hash. A single Step is wrapped in an array.
#
# Each scheduling pass looks at the jobs currently holding resources plus
# the new candidates and, per job:
# * errored/aborted: a job with a recoverable error is cleaned and the whole
#   run restarted once (via TryAgain); a second failure or a non-recoverable
#   error is only logged. Its resources are released in every case.
# * done: resources are released and erasable dependencies removed.
# * running: left alone.
# * otherwise: started through check_resources / run_with_rules.
# Finished, errored and aborted jobs are then dropped from the workload and
# from the pending lists of the remaining jobs. The loop sleeps +timer+
# between passes, but not after the pass that empties the workload.
#
# @param rules [Hash] rule set (or the jobs, when +jobs+ is omitted)
# @param jobs  [Array, Step, nil] top-level jobs
# @return [Array] every job of the initial workload, after joining each one
# @raise [NoWork] when a pass finds no candidates and no jobs holding resources
def process(rules, jobs = nil)
  jobs, rules = rules, {} if jobs.nil?
  jobs = [jobs] if Step === jobs
  failed_jobs = []
  begin
    workload = Orchestrator.workload(jobs)
    all_jobs = workload.keys

    # Start from a clean slate, except for up-to-date results and for
    # non-recoverable errors.
    all_jobs.each { |job| job.clean unless (job.done? && job.updated?) || (job.error? && !job.recoverable_error?) }

    top_level_jobs = jobs.collect { |job| job.path }
    while workload.any?
      candidates = resources_used.keys + Orchestrator.candidates(workload, rules)
      candidates.uniq!
      raise NoWork, "No candidates and no running jobs" if candidates.empty?

      candidates.each do |job|
        case
        when (job.error? || job.aborted?)
          begin
            if job.recoverable_error?
              if failed_jobs.include?(job)
                Log.warn "Failed twice #{job.path} with recoverable error"
                next
              else
                # First failure: remember it, reset the job and restart the run.
                failed_jobs << job
                job.clean
                raise TryAgain
              end
            else
              Log.warn "Non-recoverable error in #{job.path}"
              next
            end
          ensure
            # Runs for every exit above (next and raise alike).
            Log.warn "Releases resources from failed job: #{job.path}"
            release_resources(job)
          end
        when job.done?
          Log.debug "Orchestrator done #{job.path}"
          release_resources(job)
          erase_job_dependencies(job, rules, all_jobs, top_level_jobs)
        when job.running?
          next
        else
          check_resources(rules, job) do
            run_with_rules(rules, job)
          end
        end
      end

      new_workload = {}
      workload.each do |k, v|
        next if k.done? || k.error? || k.aborted?

        new_workload[k] = v.reject { |d| d.done? || d.error? || d.aborted? }
      end
      workload = new_workload

      # Nothing left to poll for: do not delay the return by one more period.
      sleep timer unless workload.empty?
    end

    all_jobs.each do |s|
      begin
        s.join
      rescue => error
        Log.warn "Job #{s.short_path} ended with exception #{error.class.to_s}: #{error.message}"
      end
    end
  rescue TryAgain
    retry
  end
end
|
|
265
|
-
end
|
|
266
|
-
|
|
267
|
-
# Produces the recursive dependencies of +jobs+ that belong to +tasks+.
#
# Jobs that are already done or running are skipped. A dependency is
# selected when +tasks+ includes its task name (as is, or as a string) or
# its full task name. The selection is handed to a new Orchestrator.
#
# @param jobs          [Array, Object] a job or a list of jobs
# @param tasks         [#include?] task names to produce
# @param produce_cpus  cpu limit for the orchestrator (converted with to_i)
# @param produce_timer timer passed to the orchestrator
# @return [Array] the selected dependencies, in visiting order
def self.produce_dependencies(jobs, tasks, produce_cpus = Etc.nprocessors, produce_timer = 5)
  jobs = [jobs] unless Array === jobs
  produce_list = []
  jobs.each do |job|
    next if job.done? || job.running?

    # Distinct name for the inner variable: it used to shadow +job+.
    job.rec_dependencies.each do |dep|
      produce_list << dep if tasks.include?(dep.task_name) ||
                             tasks.include?(dep.task_name.to_s) ||
                             tasks.include?(dep.full_task_name)
    end
  end

  orchestrator = Orchestrator.new produce_timer, cpus: produce_cpus.to_i
  orchestrator.process({}, produce_list)
  produce_list
end
|
|
283
|
-
|
|
284
|
-
# Produces +jobs+ with a new Orchestrator, ignoring the case where there is
# nothing to do.
#
# @param jobs          [Array, Object] a job or a list of jobs
# @param produce_cpus  cpu limit for the orchestrator (converted with to_i)
# @param produce_timer timer for the orchestrator (converted with to_i)
# @return whatever Orchestrator#process returns, or nil when it raises
#   Orchestrator::NoWork
def self.produce(jobs, produce_cpus: Etc.nprocessors, produce_timer: 1)
  job_list = Array === jobs ? jobs : [jobs]
  runner = Orchestrator.new(produce_timer.to_i, cpus: produce_cpus.to_i)
  runner.process({}, job_list)
rescue Orchestrator::NoWork
  nil
end
|
|
292
|
-
end
|