scout-gear 10.9.0 → 10.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/.vimproject +25 -0
  3. data/VERSION +1 -1
  4. data/bin/scout +4 -1
  5. data/lib/scout/knowledge_base/registry.rb +2 -3
  6. data/lib/scout/workflow/definition.rb +11 -0
  7. data/lib/scout/workflow/deployment/local.rb +288 -0
  8. data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
  9. data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
  10. data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
  11. data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
  12. data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
  13. data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
  14. data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
  15. data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
  16. data/lib/scout/workflow/deployment/scheduler.rb +73 -0
  17. data/lib/scout/workflow/deployment.rb +10 -1
  18. data/lib/scout/workflow/exceptions.rb +2 -0
  19. data/lib/scout/workflow/step/config.rb +3 -0
  20. data/lib/scout/workflow/step/info.rb +2 -2
  21. data/lib/scout/workflow/step/progress.rb +52 -0
  22. data/lib/scout/workflow/step.rb +30 -1
  23. data/lib/scout/workflow/task.rb +2 -0
  24. data/scout-gear.gemspec +23 -4
  25. data/scout_commands/batch/list +1 -1
  26. data/scout_commands/workflow/cmd +5 -13
  27. data/scout_commands/workflow/info +1 -1
  28. data/scout_commands/workflow/task +61 -25
  29. data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
  30. data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
  31. data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
  32. data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
  33. data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
  34. data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
  35. data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
  36. data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
  37. data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
  38. data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
  39. data/test/scout/workflow/deployment/test_trace.rb +1 -1
  40. data/test/scout/workflow/step/test_progress.rb +27 -0
  41. data/test/scout/workflow/task/test_inputs.rb +17 -0
  42. data/test/test_helper.rb +2 -1
  43. metadata +22 -3
  44. data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
--- a/data/lib/scout/workflow/deployment/orchestrator.rb
+++ /dev/null
@@ -1,292 +0,0 @@
-module Workflow
-  class Orchestrator
-
-    class NoWork < Exception; end
-
-    def self.job_workload(job)
-      workload = {job => []}
-      return workload if job.done? && job.updated?
-
-      job.dependencies.each do |dep|
-        next if dep.done? && dep.updated?
-        workload.merge!(job_workload(dep))
-        workload[job] += workload[dep]
-        workload[job] << dep
-        workload[job].uniq!
-      end
-
-      job.input_dependencies.each do |dep|
-        next if dep.done? && dep.updated?
-        workload.merge!(job_workload(dep))
-        workload[job] += workload[dep]
-        workload[job] << dep
-        workload[job].uniq!
-      end
-
-      workload
-    end
-
-    def self.workload(jobs)
-      jobs.inject({}) do |acc,job|
-        Orchestrator.job_workload(job).each do |j,d|
-          acc[j] = d unless acc.keys.collect{|k| k.path }.include? j.path
-        end
-        acc
-      end
-    end
-
-    def self.job_rules(rules, job)
-      IndiferentHash.setup(rules)
-      workflow = job.workflow.name
-      task_name = job.task_name.to_s
-      defaults = rules["defaults"] || {}
-
-      return IndiferentHash.setup(defaults) unless rules[workflow]
-      workflow_rules = IndiferentHash.setup(rules[workflow])
-      return IndiferentHash.setup(defaults) unless workflow_rules[task_name]
-      job_rules = IndiferentHash.setup(workflow_rules[task_name])
-
-      defaults.each{|k,v| job_rules[k] = v if job_rules[k].nil? } if defaults
-      job_rules
-    end
-
-    def self.purge_duplicates(candidates)
-      seen = Set.new
-      candidates.select do |job|
-        if seen.include? job.path
-          false
-        else
-          seen << job.path
-          true
-        end
-      end
-    end
-
-    def self.job_resources(rules, job)
-      resources = (job_rules(rules, job) || {})["resources"] || {}
-
-      IndiferentHash.setup(resources)
-
-      default_resources = rules["default_resources"]
-      default_resources ||= rules["defaults"]["resources"] if rules["defaults"]
-      default_resources ||= {}
-
-      default_resources.each{|k,v| resources[k] ||= v } if default_resources
-
-      resources = {:cpus => 1} if resources.empty?
-      resources
-    end
-
-    def self.sort_candidates(candidates, rules)
-      seen = Set.new
-      candidates.sort_by do |job|
-        - job_resources(rules, job).values.inject(0){|acc,e| acc += e}
-      end
-    end
-
-    def self.candidates(workload, rules)
-      if rules.empty?
-        candidates = workload.
-          select{|k,v| v.empty? }.
-          collect{|k,v| k }.
-          reject{|k| k.done? || k.running? || (k.error? && ! k.recoverable_error?) }
-      else
-        candidates = workload. #select{|k,v| Orchestrator.job_rules(rules, k) }.
-          select{|k,v| v.empty? }.
-          collect{|k,v| k }.
-          reject{|k| k.done? || k.running? }
-      end
-
-      #top_level = workload.keys - workload.values.flatten
-
-      candidates = purge_duplicates candidates
-      candidates = sort_candidates candidates, rules
-
-      candidates
-    end
-
-    def self.process(*args)
-      self.new.process(*args)
-    end
-
-    attr_accessor :available_resources, :resources_requested, :resources_used, :timer
-
-    def initialize(timer = 5, available_resources = nil)
-      available_resources = {:cpus => Etc.nprocessors } if available_resources.nil?
-      @timer = timer
-      @available_resources = IndiferentHash.setup(available_resources)
-      @resources_requested = IndiferentHash.setup({})
-      @resources_used = IndiferentHash.setup({})
-    end
-
-    def release_resources(job)
-      if resources_used[job]
-        Log.debug "Orchestrator releasing resouces from #{job.path}"
-        resources_used[job].each do |resource,value|
-          next if resource == 'size'
-          resources_requested[resource] -= value.to_i
-        end
-        resources_used.delete job
-      end
-    end
-
-    def check_resources(rules, job)
-      resources = Orchestrator.job_resources(rules, job)
-
-      limit_resources = resources.select{|resource,value| available_resources[resource] && ((resources_requested[resource] || 0) + value) > available_resources[resource] }.collect{|resource,v| resource }
-      if limit_resources.any?
-        Log.debug "Orchestrator waiting on #{job.path} due to #{limit_resources * ", "}"
-      else
-
-        resources_used[job] = resources
-        resources.each do |resource,value|
-          resources_requested[resource] ||= 0
-          resources_requested[resource] += value.to_i
-        end
-        Log.low "Orchestrator producing #{job.path} with resources #{resources}"
-
-        return yield
-      end
-    end
-
-    def run_with_rules(rules, job)
-      job_rules = Orchestrator.job_rules(rules, job)
-
-      Scout::Config.with_config do
-        job_rules[:config_keys].each do |config|
-          Scout::Config.process_config config
-        end if job_rules && job_rules[:config_keys]
-
-        log = job_rules[:log] if job_rules
-        log = Log.severity if log.nil?
-        Log.with_severity log do
-          job.fork
-        end
-      end
-    end
-
-    def erase_job_dependencies(job, rules, all_jobs, top_level_jobs)
-      job.dependencies.each do |dep|
-        next if top_level_jobs.include? dep.path
-        next unless Orchestrator.job_rules(rules, dep)["erase"].to_s == 'true'
-
-        dep_path = dep.path
-        parents = all_jobs.select do |parent|
-          paths = parent.info[:dependencies].nil? ? parent.dependencies.collect{|d| d.path } : parent.info[:dependencies].collect{|d| Array === d ? d.last : d }
-          paths.include? dep_path
-        end
-
-        next unless parents.reject{|parent| parent.done? }.empty?
-
-        parents.each do |parent|
-          Log.high "Erasing #{dep.path} from #{parent.path}"
-          parent.archive_deps
-          parent.copy_linked_files_dir
-          parent.dependencies = parent.dependencies - [dep]
-        end
-        dep.clean
-      end
-    end
-
-    def process(rules, jobs = nil)
-      jobs, rules = rules, {} if jobs.nil?
-      jobs = [jobs] if Step === jobs
-      failed_jobs = []
-      begin
-
-        workload = Orchestrator.workload(jobs)
-        all_jobs = workload.keys
-
-        all_jobs.each{|job| job.clean unless (job.done? && job.updated?) || (job.error? && ! job.recoverable_error?) }
-
-        top_level_jobs = jobs.collect{|job| job.path }
-        while workload.any?
-
-          candidates = resources_used.keys + Orchestrator.candidates(workload, rules)
-          candidates.uniq!
-          raise NoWork, "No candidates and no running jobs" if candidates.empty?
-
-          candidates.each do |job|
-            case
-            when (job.error? || job.aborted?)
-              begin
-                if job.recoverable_error?
-                  if failed_jobs.include?(job)
-                    Log.warn "Failed twice #{job.path} with recoverable error"
-                    next
-                  else
-                    failed_jobs << job
-                    job.clean
-                    raise TryAgain
-                  end
-                else
-                  Log.warn "Non-recoverable error in #{job.path}"
-                  next
-                end
-              ensure
-                Log.warn "Releases resources from failed job: #{job.path}"
-                release_resources(job)
-              end
-            when job.done?
-              Log.debug "Orchestrator done #{job.path}"
-              release_resources(job)
-              erase_job_dependencies(job, rules, all_jobs, top_level_jobs)
-
-            when job.running?
-              next
-
-            else
-              check_resources(rules, job) do
-                run_with_rules(rules, job)
-              end
-            end
-          end
-
-          new_workload = {}
-          workload.each do |k,v|
-            next if k.done? || k.error? || k.aborted?
-            #new_workload[k] = v.reject{|d| d.done? || ((d.error? || d.aborted?) && ! d.recoverable_error?)}
-            new_workload[k] = v.reject{|d| d.done? || d.error? || d.aborted?}
-          end
-          workload = new_workload
-          sleep timer
-        end
-        all_jobs.each{|s|
-          begin
-            s.join
-          rescue
-            Log.warn "Job #{s.short_path} ended with exception #{$!.class.to_s}: #{$!.message}"
-          end
-        }
-      rescue TryAgain
-        retry
-      end
-    end
-  end
-
-  def self.produce_dependencies(jobs, tasks, produce_cpus = Etc.nprocessors, produce_timer = 5)
-    jobs = [jobs] unless Array === jobs
-    produce_list = []
-    jobs.each do |job|
-      next if job.done? || job.running?
-      job.rec_dependencies.each do |job|
-        produce_list << job if tasks.include?(job.task_name) ||
-          tasks.include?(job.task_name.to_s) ||
-          tasks.include?(job.full_task_name)
-      end
-    end
-
-    orchestrator = Orchestrator.new produce_timer, cpus: produce_cpus.to_i
-    orchestrator.process({}, produce_list)
-    produce_list
-  end
-
-  def self.produce(jobs, produce_cpus: Etc.nprocessors, produce_timer: 1)
-    jobs = [jobs] unless Array === jobs
-    orchestrator = Orchestrator.new produce_timer.to_i, cpus: produce_cpus.to_i
-    begin
-      orchestrator.process({}, jobs)
-    rescue Orchestrator::NoWork
-    end
-  end
-end
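
For orientation, the entry points of the removed Workflow::Orchestrator shown above were its class-level helpers produce and process. The sketch below is based only on the code in this hunk; ExampleWorkflow, :example_task, the inputs, and the top-level require path are assumed for illustration and do not appear in the diff.

  require 'scout/workflow'  # assumed top-level require for the gem

  # Hypothetical workflow and job; task name and inputs are illustrative only.
  jobs = [ExampleWorkflow.job(:example_task, nil, :value => 1)]

  # Produce the jobs and their pending dependencies, capping parallelism at
  # 4 CPUs and polling every second (matches the produce signature above).
  Workflow::Orchestrator.produce(jobs, produce_cpus: 4, produce_timer: 1)

  # Rule-driven processing: per-workflow/per-task resource rules, with the
  # "defaults" entry applied when no specific rule matches (see job_rules).
  rules = {
    "defaults" => { "resources" => { "cpus" => 1 } },
    "ExampleWorkflow" => { "example_task" => { "resources" => { "cpus" => 2 } } },
  }
  Workflow::Orchestrator.new(5, :cpus => 4).process(rules, jobs)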