scout-gear 10.11.4 → 10.11.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +17 -2
- data/VERSION +1 -1
- data/bin/scout +10 -10
- data/lib/scout/association/fields.rb +15 -15
- data/lib/scout/association/index.rb +6 -6
- data/lib/scout/association/item.rb +18 -8
- data/lib/scout/association.rb +4 -4
- data/lib/scout/entity/identifiers.rb +5 -5
- data/lib/scout/entity/property.rb +2 -2
- data/lib/scout/entity.rb +1 -1
- data/lib/scout/knowledge_base/description.rb +10 -10
- data/lib/scout/knowledge_base/entity.rb +6 -6
- data/lib/scout/knowledge_base/list.rb +1 -1
- data/lib/scout/knowledge_base/query.rb +4 -4
- data/lib/scout/knowledge_base/registry.rb +6 -6
- data/lib/scout/knowledge_base/traverse.rb +7 -40
- data/lib/scout/persist/engine/fix_width_table.rb +6 -6
- data/lib/scout/persist/engine/packed_index.rb +2 -2
- data/lib/scout/persist/engine/sharder.rb +4 -4
- data/lib/scout/persist/engine/tkrzw.rb +1 -1
- data/lib/scout/persist/engine/tokyocabinet.rb +2 -2
- data/lib/scout/persist/tsv/adapter/fix_width_table.rb +1 -1
- data/lib/scout/persist/tsv/adapter/packed_index.rb +1 -1
- data/lib/scout/persist/tsv/adapter/tkrzw.rb +1 -1
- data/lib/scout/persist/tsv/adapter/tokyocabinet.rb +3 -3
- data/lib/scout/persist/tsv/serialize.rb +3 -3
- data/lib/scout/persist/tsv.rb +1 -1
- data/lib/scout/semaphore.rb +100 -17
- data/lib/scout/tsv/annotation/repo.rb +4 -4
- data/lib/scout/tsv/annotation.rb +2 -2
- data/lib/scout/tsv/attach.rb +7 -7
- data/lib/scout/tsv/change_id/translate.rb +1 -1
- data/lib/scout/tsv/csv.rb +3 -3
- data/lib/scout/tsv/dumper.rb +8 -8
- data/lib/scout/tsv/index.rb +1 -1
- data/lib/scout/tsv/open.rb +3 -3
- data/lib/scout/tsv/stream.rb +2 -2
- data/lib/scout/tsv/traverse.rb +4 -4
- data/lib/scout/tsv/util/filter.rb +9 -9
- data/lib/scout/tsv/util/process.rb +2 -2
- data/lib/scout/tsv/util/reorder.rb +2 -2
- data/lib/scout/tsv/util/select.rb +3 -3
- data/lib/scout/tsv/util/unzip.rb +2 -2
- data/lib/scout/tsv/util.rb +1 -1
- data/lib/scout/tsv.rb +2 -2
- data/lib/scout/work_queue/socket.rb +3 -2
- data/lib/scout/work_queue/worker.rb +4 -4
- data/lib/scout/work_queue.rb +7 -7
- data/lib/scout/workflow/definition.rb +18 -16
- data/lib/scout/workflow/deployment/local.rb +81 -62
- data/lib/scout/workflow/deployment/orchestrator/batches.rb +66 -5
- data/lib/scout/workflow/deployment/orchestrator/chains.rb +47 -30
- data/lib/scout/workflow/deployment/orchestrator/rules.rb +3 -3
- data/lib/scout/workflow/deployment/orchestrator/workload.rb +11 -22
- data/lib/scout/workflow/deployment/scheduler/job.rb +34 -36
- data/lib/scout/workflow/deployment/scheduler/lfs.rb +1 -1
- data/lib/scout/workflow/deployment/scheduler/pbs.rb +4 -4
- data/lib/scout/workflow/deployment/scheduler/slurm.rb +2 -2
- data/lib/scout/workflow/deployment/scheduler.rb +23 -12
- data/lib/scout/workflow/deployment/trace.rb +2 -2
- data/lib/scout/workflow/documentation.rb +4 -4
- data/lib/scout/workflow/export.rb +1 -1
- data/lib/scout/workflow/path.rb +2 -2
- data/lib/scout/workflow/step/children.rb +1 -1
- data/lib/scout/workflow/step/dependencies.rb +36 -3
- data/lib/scout/workflow/step/info.rb +5 -19
- data/lib/scout/workflow/step/inputs.rb +1 -1
- data/lib/scout/workflow/step/progress.rb +2 -2
- data/lib/scout/workflow/step/provenance.rb +4 -4
- data/lib/scout/workflow/step/status.rb +23 -9
- data/lib/scout/workflow/step.rb +21 -19
- data/lib/scout/workflow/task/dependencies.rb +10 -3
- data/lib/scout/workflow/task/info.rb +3 -3
- data/lib/scout/workflow/task/inputs.rb +8 -8
- data/lib/scout/workflow/task.rb +37 -22
- data/lib/scout/workflow/usage.rb +13 -13
- data/lib/scout/workflow/util.rb +1 -1
- data/lib/scout/workflow.rb +6 -6
- data/scout-gear.gemspec +4 -3
- data/scout_commands/alias +1 -1
- data/scout_commands/batch/clean +12 -12
- data/scout_commands/batch/list +26 -25
- data/scout_commands/batch/tail +9 -5
- data/scout_commands/cat +1 -1
- data/scout_commands/doc +2 -2
- data/scout_commands/entity +4 -4
- data/scout_commands/find +1 -1
- data/scout_commands/kb/config +1 -1
- data/scout_commands/kb/entities +1 -1
- data/scout_commands/kb/list +1 -1
- data/scout_commands/kb/query +2 -2
- data/scout_commands/kb/register +1 -1
- data/scout_commands/kb/show +1 -1
- data/scout_commands/kb/traverse +1 -1
- data/scout_commands/log +6 -6
- data/scout_commands/resource/produce +2 -2
- data/scout_commands/resource/sync +1 -1
- data/scout_commands/system/clean +7 -7
- data/scout_commands/system/status +4 -4
- data/scout_commands/template +1 -1
- data/scout_commands/update +1 -1
- data/scout_commands/workflow/cmd +2 -1
- data/scout_commands/workflow/example +123 -0
- data/scout_commands/workflow/info +10 -1
- data/scout_commands/workflow/install +1 -1
- data/scout_commands/workflow/list +2 -2
- data/scout_commands/workflow/process +2 -2
- data/scout_commands/workflow/prov +3 -3
- data/scout_commands/workflow/task +36 -11
- data/scout_commands/workflow/trace +1 -1
- data/scout_commands/workflow/write_info +2 -2
- data/share/templates/command +1 -1
- data/test/scout/association/test_item.rb +5 -0
- data/test/scout/entity/test_property.rb +3 -3
- data/test/scout/knowledge_base/test_description.rb +1 -1
- data/test/scout/knowledge_base/test_traverse.rb +2 -2
- data/test/scout/persist/engine/test_packed_index.rb +6 -6
- data/test/scout/persist/test_tsv.rb +4 -4
- data/test/scout/persist/tsv/adapter/test_packed_index.rb +4 -4
- data/test/scout/persist/tsv/adapter/test_sharder.rb +23 -23
- data/test/scout/persist/tsv/adapter/test_tokyocabinet.rb +1 -1
- data/test/scout/persist/tsv/test_serialize.rb +1 -1
- data/test/scout/test_association.rb +1 -1
- data/test/scout/test_tsv.rb +2 -2
- data/test/scout/test_workflow.rb +2 -2
- data/test/scout/tsv/test_annotation.rb +4 -4
- data/test/scout/tsv/test_index.rb +1 -1
- data/test/scout/tsv/test_open.rb +2 -2
- data/test/scout/tsv/test_parser.rb +2 -2
- data/test/scout/tsv/test_stream.rb +1 -1
- data/test/scout/tsv/test_transformer.rb +1 -1
- data/test/scout/tsv/util/test_filter.rb +1 -1
- data/test/scout/tsv/util/test_melt.rb +1 -1
- data/test/scout/tsv/util/test_reorder.rb +1 -1
- data/test/scout/work_queue/test_socket.rb +3 -3
- data/test/scout/work_queue/test_worker.rb +2 -2
- data/test/scout/workflow/deployment/orchestrator/test_batches.rb +13 -3
- data/test/scout/workflow/deployment/orchestrator/test_chains.rb +15 -13
- data/test/scout/workflow/deployment/orchestrator/test_workload.rb +1 -1
- data/test/scout/workflow/deployment/test_local.rb +2 -2
- data/test/scout/workflow/deployment/test_scheduler.rb +1 -2
- data/test/scout/workflow/step/test_children.rb +1 -1
- data/test/scout/workflow/step/test_dependencies.rb +36 -1
- data/test/scout/workflow/step/test_info.rb +3 -35
- data/test/scout/workflow/step/test_load.rb +1 -1
- data/test/scout/workflow/step/test_provenance.rb +1 -1
- data/test/scout/workflow/step/test_status.rb +33 -1
- data/test/scout/workflow/task/test_dependencies.rb +9 -7
- data/test/scout/workflow/task/test_inputs.rb +1 -1
- data/test/scout/workflow/test_definition.rb +1 -1
- data/test/scout/workflow/test_documentation.rb +1 -1
- data/test/scout/workflow/test_entity.rb +2 -2
- data/test/scout/workflow/test_step.rb +13 -13
- data/test/scout/workflow/test_usage.rb +1 -1
- data/test/test_helper.rb +1 -1
- metadata +3 -2
These first hunks carry `class Workflow::LocalExecutor` context, matching `data/lib/scout/workflow/deployment/local.rb` in the file list: `process_batches` gains a `bar:` option with a progress bar, retries recoverable errors once before marking a job as failed, and releases resources even when a job errors out.

```diff
@@ -42,86 +42,95 @@ class Workflow::LocalExecutor
     @available_resources = IndiferentHash.setup(available_resources)
     @resources_requested = IndiferentHash.setup({})
     @resources_used = IndiferentHash.setup({})
+    Log.info "LocalExecutor initiated #{Log.fingerprint available_resources}"
   end
 
-  def process_batches(batches)
+  def process_batches(batches, bar: true)
     retry_jobs = []
     failed_jobs = []
 
-
+    bar = {desc: "Processing batches"} if TrueClass === bar
+    bar = {bar: bar} if Log::ProgressBar === bar
+    Log::ProgressBar.with_bar batches.length, bar do |bar|
+      bar.init if bar
 
-
-    top_level_jobs = candidates.collect{|batch| batch[:top_level] }
+      while (missing_batches = batches.reject{|b| Workflow::Orchestrator.done_batch?(b) }).any?
 
-
+        bar.pos batches.select{|b| Workflow::Orchestrator.done_batch?(b) }.length if bar
 
-
-
-      if exception
-        Log.warn 'Some work failed'
-        raise exception
-      else
-        raise 'Some work failed'
-      end
-    end
+        candidates = Workflow::LocalExecutor.candidates(batches)
+        top_level_jobs = candidates.collect{|batch| batch[:top_level] }
 
-
-    begin
+        raise NoWork, "No candidates and no running jobs #{Log.fingerprint batches}" if resources_used.empty? && top_level_jobs.empty?
 
-
+        if candidates.reject{|batch| failed_jobs.include? batch[:top_level] }.empty? && resources_used.empty? && top_level_jobs.empty?
+          exception = failed_jobs.collect(&:get_exception).compact.first
+          if exception
+            Log.warn 'Some work failed'
+            raise exception
+          else
+            raise 'Some work failed'
+          end
+        end
 
-
-
-
-
-
-
-
+        candidates.each do |batch|
+          begin
+
+            job = batch[:top_level]
+
+            case
+            when (job.error? || job.aborted?)
+              begin
+                if job.recoverable_error?
+                  if retry_jobs.include?(job)
+                    Log.warn "Failed twice #{job.path} with recoverable error"
+                    retry_jobs.delete job
+                    failed_jobs << job
+                    next
+                  else
+                    retry_jobs << job
+                    job.clean
+                    raise TryAgain
+                  end
+                else
                   failed_jobs << job
+                  Log.warn "Non-recoverable error in #{job.path}"
                   next
-                else
-                  retry_jobs << job
-                  job.clean
-                  raise TryAgain
                 end
-
-
-
-                next
+              ensure
+                Log.warn "Releases resources from failed job: #{job.path}"
+                release_resources(job)
               end
-
-            Log.
+            when job.done?
+              Log.debug "Orchestrator done #{job.path}"
               release_resources(job)
+              clear_batch(batches, batch)
+              erase_job_dependencies(job, batches)
+            when job.running?
+              next
+
+            else
+              check_resources(batch) do
+                run_batch(batch)
+              end
             end
-
-
-
+          rescue TryAgain
+            retry
+          end
+        end
+
+        batches.each do |batch|
+          job = batch[:top_level]
+          if job.done? || job.aborted? || job.error?
+            job.join if job.done?
             clear_batch(batches, batch)
+            release_resources(job)
             erase_job_dependencies(job, batches)
-          when job.running?
-            next
-
-          else
-            check_resources(batch) do
-              run_batch(batch)
-            end
           end
-        rescue TryAgain
-          retry
         end
-      end
 
-
-      job = batch[:top_level]
-      if job.done? || job.aborted? || job.error?
-        job.join if job.done?
-        clear_batch(batches, batch)
-        release_resources(job)
-        erase_job_dependencies(job, batches)
-      end
+        sleep timer
       end
-
-      sleep timer
     end
 
     batches.each{|batch|
```
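The retry-once flow above can be hard to follow inside the larger loop. Below is a minimal, self-contained sketch of just that pattern, with a local `TryAgain` stand-in for scout-gear's exception class and strings standing in for jobs:

```ruby
# Sketch of the retry-once pattern used above: the first recoverable failure
# records the job and raises TryAgain, which `retry` turns into a second pass;
# a failure on the second pass moves the job to failed_jobs.
class TryAgain < StandardError; end

retry_jobs = []
failed_jobs = []

['job-a'].each do |job|
  begin
    if retry_jobs.include?(job)
      failed_jobs << job      # already retried once: give up
    else
      retry_jobs << job       # first failure: record it and retry
      raise TryAgain
    end
  rescue TryAgain
    retry                     # re-runs the begin block for the same job
  end
end

p failed_jobs  # => ["job-a"]
```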
```diff
@@ -141,7 +150,16 @@ class Workflow::LocalExecutor
 
   def process(rules, jobs = nil)
     jobs, rules = rules, {} if jobs.nil?
-
+
+    if Step === jobs
+      jobs = [jobs]
+    end
+
+    if jobs.length == 1
+      bar = jobs.first.progress_bar("Process batches for #{jobs.first.short_path}")
+    else
+      bar = true
+    end
 
     batches = Workflow::Orchestrator.job_batches(rules, jobs)
     batches.each do |batch|
```
|
|
|
153
171
|
batch[:rules] = rules
|
|
154
172
|
end
|
|
155
173
|
|
|
156
|
-
process_batches(batches)
|
|
174
|
+
process_batches(batches, bar: bar)
|
|
157
175
|
end
|
|
158
176
|
|
|
159
177
|
def release_resources(job)
|
|
160
178
|
if resources_used[job]
|
|
161
179
|
Log.debug "Orchestrator releasing resouces from #{job.path}"
|
|
162
|
-
resources_used[job].each do |resource,value|
|
|
180
|
+
resources_used[job].each do |resource,value|
|
|
163
181
|
next if resource == 'size'
|
|
164
182
|
resources_requested[resource] -= value.to_i
|
|
165
183
|
end
|
|
```diff
@@ -195,8 +213,9 @@ class Workflow::LocalExecutor
   def run_batch(batch)
     job, job_rules = batch.values_at :top_level, :rules
 
-    rules = batch[:rules]
+    rules = batch[:rules]
     deploy = rules[:deploy] if rules
+    Log.debug "Processing #{deploy} #{job.short_path} #{Log.fingerprint job_rules}"
     case deploy
     when nil, 'local', :local, :serial, 'serial'
       Scout::Config.with_config do
```
```diff
@@ -207,7 +226,7 @@ class Workflow::LocalExecutor
         log = job_rules[:log] if job_rules
         log = Log.severity if log.nil?
         Log.with_severity log do
-          job.fork
+          job.fork(true)
         end
       end
     when 'batch', 'sched', 'slurm', 'pbs', 'lsf'
```
The next hunks show `class Workflow::Orchestrator` context and line up with `data/lib/scout/workflow/deployment/orchestrator/batches.rb` in the file list: rule accumulation now returns the accumulator, batch-dependency consolidation walks the dependency graph iteratively, and new helpers `sort_batches`, `errors_in_batch`, `clean_batches`, and `inspect_batch` are added.

```diff
@@ -69,6 +69,7 @@ class Workflow::Orchestrator
       task_name = job.task_name
       task_rules = task_specific_rules(rules, workflow, task_name)
       acc = accumulate_rules(acc, task_rules.dup)
+      acc
     end
 
     if chain = batch[:chain]
```
```diff
@@ -92,12 +93,20 @@ class Workflow::Orchestrator
       next if batch[:deps].nil?
 
       if batch[:deps].any?
-        batch_dep_jobs = batch[:top_level].rec_dependencies
+        batch_dep_jobs = batch[:top_level].rec_dependencies.to_a
         target = batch[:deps].select do |target|
-
-
+          target_deps = []
+          stack = [target]
+          while stack.any?
+            c = stack.pop
+            target_deps << c
+            stack.concat c[:deps]
+          end
+          (batch[:deps] - target_deps).empty?
         end.first
         next if target.nil?
+        all_target_jobs = ([target] + target[:deps]).collect{|d| d[:jobs] }.flatten
+        next if all_target_jobs.reject{|j| batch_dep_jobs.include? j }.any?
         target[:jobs] = batch[:jobs] + target[:jobs]
         target[:deps] = (target[:deps] + batch[:deps]).uniq - [target]
         target[:top_level] = batch[:top_level]
```
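The `target_deps` accumulation above is an iterative depth-first walk over batch `:deps` links. A runnable sketch with plain hashes standing in for batch records:

```ruby
# Collect a batch and everything reachable through :deps with an explicit
# stack; like the hunk above, it does not deduplicate, which is harmless for
# the (batch[:deps] - target_deps).empty? containment check.
def collect_batch_deps(target)
  target_deps = []
  stack = [target]
  while stack.any?
    c = stack.pop
    target_deps << c
    stack.concat c[:deps]
  end
  target_deps
end

a = { name: 'a', deps: [] }
b = { name: 'b', deps: [a] }
c = { name: 'c', deps: [b, a] }
p collect_batch_deps(c).map { |batch| batch[:name] }  # => ["c", "a", "b", "a"]
```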
```diff
@@ -119,12 +128,64 @@ class Workflow::Orchestrator
     jobs = [jobs] unless Array === jobs
 
     workload = job_workload(jobs)
-
+    job_chain_list = []
 
-
+    jobs.each do |job|
+      job_chains = self.job_chains(rules, job)
+      job_chains.each do |chain,list|
+        list.each do |info|
+          job_chain_list << [chain,info]
+        end
+      end
+    end
+
+    batches = chain_batches(rules, job_chain_list, workload)
     batches = add_batch_deps(batches)
     batches = add_rules_and_consolidate(rules, batches)
 
     batches
   end
+
+  def self.sort_batches(batches)
+    pending = batches.dup
+    sorted = []
+    while pending.any?
+      leaf_nodes = batches.select{|batch| batch[:deps].nil? || (batch[:deps] - sorted).empty? }
+      sorted.concat(leaf_nodes - sorted)
+      pending -= leaf_nodes
+    end
+    sorted
+  end
+
+  def self.errors_in_batch(batch)
+    errors = batch[:jobs].select do |job|
+      job.error? && ! job.recoverable_error?
+    end
+
+    errors.empty? ? false : errors
+  end
+
+  def self.clean_batches(batches)
+    error = []
+    batches.collect do |batch|
+      if failed = Workflow::Orchestrator.errors_in_batch(batch)
+        Log.warn "Batch contains errors #{batch[:top_level].short_path} #{Log.fingerprint failed}"
+        error << batch
+        next
+      elsif (error_deps = error & batch[:deps]).any?
+        if error_deps.reject{|b| b[:top_level].canfail? }.any?
+          Log.warn "Batch depends on batches with errors #{batch[:top_level].short_path} #{Log.fingerprint(error_deps.collect{|d| d[:top_level] })}"
+          error << batch
+          next
+        else
+          batch[:deps] -= error_deps
+        end
+      end
+      batch
+    end.compact
+  end
+
+  def self.inspect_batch(batch)
+    batch.merge(deps: batch[:deps].collect{|b| b[:top_level] })
+  end
 end
```
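The new `sort_batches` is a layered topological sort over the `:deps` field. A runnable sketch with hash stand-ins for batches (note that, as written, a dependency cycle would loop forever):

```ruby
# Repeatedly peel off batches whose :deps are already sorted, producing a
# dependency-respecting order; mirrors the sort_batches added above.
def sort_batches(batches)
  pending = batches.dup
  sorted = []
  while pending.any?
    leaf_nodes = batches.select { |batch| batch[:deps].nil? || (batch[:deps] - sorted).empty? }
    sorted.concat(leaf_nodes - sorted)
    pending -= leaf_nodes
  end
  sorted
end

a = { name: 'a', deps: nil }
b = { name: 'b', deps: [a] }
c = { name: 'c', deps: [a, b] }
p sort_batches([c, b, a]).map { |batch| batch[:name] }  # => ["a", "b", "c"]
```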
This hunk matches `data/lib/scout/workflow/deployment/orchestrator/chains.rb`: a new `add_chain` helper merges chain information, and `job_chains` is rewritten to build per-chain batches recursively with memoization.

```diff
@@ -58,47 +58,64 @@ class Workflow::Orchestrator
     chains
   end
 
+  def self.add_chain(job_chains, match, info)
+    if job_chains[match]
+      current = job_chains[match]
+      new_info = {}
+      new_info[:jobs] = (current[:jobs] + info[:jobs]).uniq
+      if current[:top_level].rec_dependencies.include?(info[:top_level]) ||
+         current[:top_level].input_dependencies.include?(info[:top_level])
+        new_info[:top_level] = current[:top_level]
+      else
+        new_info[:top_level] = info[:top_level]
+      end
+      job_chains[match] = new_info
+    else
+      job_chains[match] = info
+    end
+  end
+
   def self.job_chains(rules, job, computed = {})
-
+    chains = parse_chains(rules)
+    key = Log.fingerprint([job.path, job.object_id, chains])
     return computed[key] if computed.has_key?(key)
 
-
-
-
+    job_chains = check_chains(chains, job)
+    job_batches = {}
+    new_batches = {}
+    job_dependencies(job).each do |dep|
+      dep_chains = check_chains(chains, dep)
+      common_chains = job_chains & dep_chains
 
-
-    new_job_chains = {}
-    dependencies.each do |dep|
-      dep_matches = check_chains(chains, dep)
-      common = matches & dep_matches
+      dep_batches = job_chains(rules, dep, computed)
 
-      dep_chains = job_chains(rules, dep, computed)
       found = []
-
-
-
-
-
-
-
+      common_chains.each do |chain|
+        info = new_batches[chain]
+        info = {top_level: job, jobs: [job]} if info.nil?
+        if dep_batches[chain]
+          found << chain
+          dep_batches[chain].each do |dep_info|
+            info[:jobs] += dep_info[:jobs] - info[:jobs]
+          end
         else
-
+          info[:jobs] << dep
         end
+        new_batches[chain] = info
       end
 
-
-
-
-
-      job_chains << [match, info]
-      end
-    end
-
-    new_job_chains.each do |match, info|
-      info[:jobs].prepend job
-      job_chains << [match, info]
+      dep_batches.each do |chain,list|
+        next if found.include? chain
+        job_batches[chain] ||= []
+        job_batches[chain].concat list
       end
+    end
 
-
+    new_batches.each do |match, info|
+      job_batches[match] ||= []
+      job_batches[match] << info
     end
+
+    computed[key] = job_batches
+  end
 end
```
These three small hunks line up with `data/lib/scout/workflow/deployment/orchestrator/rules.rb` (+3 -3): the "choose max" rule now also covers the `task_cpus` key; the other two changes are whitespace-only.

```diff
@@ -47,7 +47,7 @@ class Workflow::Orchestrator
       case k.to_s
       when "config_keys"
         current[k] = add_config_keys current["config_keys"], value
-      when "cpus"
+      when "task_cpus", 'cpus'
         # choose max
         vals = [current[k], value].compact.map{|v| v.to_i }
         current[k] = vals.max unless vals.empty?
```
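A small sketch of the "choose max" accumulation that now applies to both keys:

```ruby
# When two rule sets both specify cpus (or now task_cpus), the merged value
# is the maximum; nil entries are dropped before comparing.
def accumulate_cpus(current, value)
  vals = [current, value].compact.map { |v| v.to_i }
  vals.max unless vals.empty?
end

p accumulate_cpus(nil, "4")  # => 4
p accumulate_cpus(8, "4")    # => 8
```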
```diff
@@ -183,7 +183,7 @@ class Workflow::Orchestrator
     r = rules_block[:resources] || {}
     r = IndiferentHash.setup r
 
-    r = IndiferentHash.add_defaults r,
+    r = IndiferentHash.add_defaults r,
       cpus: rules_block[:cpus] || rules_block[:task_cpus] || 1,
       time: rules_block[:time]
 
```
```diff
@@ -237,7 +237,7 @@ class Workflow::Orchestrator
       merge_rule_file(acc, file_rules)
     end
   end
-
+
   def self.load_rules_for_job(jobs)
     jobs = [jobs] unless Array === jobs
 
```
This hunk matches `data/lib/scout/workflow/deployment/orchestrator/workload.rb` (+11 -22): a path-indexing walk is replaced by `prepare_for_execution`, which cleans dependencies that failed recoverably, were aborted, or are stale.

```diff
@@ -1,32 +1,21 @@
 class Workflow::Orchestrator
 
-  def self.
-
-    path_jobs = {}
+  def self.prepare_for_execution(job)
+    rec_dependencies = job.rec_dependencies(true)
 
-
+    return if rec_dependencies.empty?
 
-
-      path_jobs[job.path] = job
-    end
+    all_deps = rec_dependencies + [job]
 
-
-
-
-
-
-
-
-      deps = job_dependencies(j)
-      deps.each do |d|
-        path_jobs[d.path] ||= d
+    all_deps.each do |dep|
+      begin
+        dep.clean if (dep.error? && dep.recoverable_error?) ||
+          dep.aborted? || (dep.done? && ! dep.updated?)
+      rescue RbbtException
+        Log.exception $!
+        next
       end
-
-      heap.concat deps.collect(&:path)
-      heap.uniq!
     end
-
-    path_jobs
   end
 
   def self.job_workload(jobs)
```
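The cleanup predicate in `prepare_for_execution` decides which dependencies get re-run. A self-contained sketch of the three conditions, with a Struct standing in for scout-gear's `Step`:

```ruby
# A dependency is cleaned (scheduled for a re-run) when it failed with a
# recoverable error, was aborted, or finished but is no longer up to date.
Dep = Struct.new(:error, :recoverable, :aborted, :done, :updated)

def needs_clean?(dep)
  (dep.error && dep.recoverable) || dep.aborted || (dep.done && !dep.updated)
end

p needs_clean?(Dep.new(true,  true,  false, false, false)) # => true  (recoverable failure)
p needs_clean?(Dep.new(false, false, false, true,  false)) # => true  (done but stale)
p needs_clean?(Dep.new(false, false, false, true,  true))  # => false (done and current)
```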
The remaining hunks show `module SchedulerJob` context, matching `data/lib/scout/workflow/deployment/scheduler/job.rb`. Besides the rewritten `override_deps` construction, most of them are whitespace-only cleanups, which is why many removed and added lines read identically.

```diff
@@ -1,5 +1,5 @@
 module SchedulerJob
-  @batch_base_dir = File.expand_path(File.join('~/scout-batch'))
+  @batch_base_dir = File.expand_path(File.join('~/scout-batch'))
   self.singleton_class.attr_accessor :batch_base_dir
 
   module_function
```
```diff
@@ -27,7 +27,7 @@ module SchedulerJob
 
     singularity_img, singularity_opt_dir, singularity_ruby_inline, singularity_mounts = options.values_at :singularity_img, :singularity_opt_dir, :singularity_ruby_inline, :singularity_mounts
 
-    singularity_cmd = %(singularity exec -e -B "#{File.expand_path singularity_opt_dir}":/singularity_opt/ -B "#{File.expand_path singularity_ruby_inline}":"/.singularity_ruby_inline":rw )
+    singularity_cmd = %(singularity exec -e -B "#{File.expand_path singularity_opt_dir}":/singularity_opt/ -B "#{File.expand_path singularity_ruby_inline}":"/.singularity_ruby_inline":rw )
 
     if singularity_mounts
       singularity_mounts.split(",").each do |mount|
```
```diff
@@ -37,7 +37,7 @@ module SchedulerJob
 
     if contain && options[:hardened]
       singularity_cmd << %( -C -H "#{contain}" \
-        -B "/.singularity_ruby_inline":"#{contain}/.singularity_ruby_inline":rw
+        -B "/.singularity_ruby_inline":"#{contain}/.singularity_ruby_inline":rw
         -B "#{options[:batch_dir]}" \
         -B /scratch/tmp \
         #{ group != user_group ? "-B /gpfs/projects/#{user_group}" : "" } \
```
```diff
@@ -81,17 +81,15 @@ module SchedulerJob
 
     task = job.task_name
 
-    if job.
-      override_deps = job.
-
-
-
-
-
-
-      end.uniq * ","
-      options[:override_deps] = override_deps unless override_deps.empty?
+    if job.overriden?
+      override_deps = job.recursive_overrider_deps.collect do |dep|
+        o_workflow = dep.overriden_workflow || dep.workflow
+        o_workflow = o_workflow.name if o_workflow.respond_to?(:name)
+        o_task_name = dep.overriden_task || dep.task.name
+        name = [o_workflow, o_task_name] * "#"
+        [name, dep.path] * "="
+      end.uniq * ","
+      options[:override_deps] = override_deps unless override_deps.empty?
     end
 
     # Save inputs into inputs_dir (only if provided)
```
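The rewritten block encodes each overriding dependency as `Workflow#task=path` and joins the entries with commas (`Array#*` is Ruby's join). A sketch with made-up workflow, task, and path values:

```ruby
# Build the override_deps string the way the hunk above does; the names here
# are hypothetical, for illustration only.
deps = [
  %w(ExampleWorkflow step1 /tmp/jobs/step1),
  %w(ExampleWorkflow step2 /tmp/jobs/step2),
]
override_deps = deps.collect do |workflow, task_name, path|
  name = [workflow, task_name] * "#"   # "ExampleWorkflow#step1"
  [name, path] * "="                   # "ExampleWorkflow#step1=/tmp/jobs/step1"
end.uniq * ","

puts override_deps
# ExampleWorkflow#step1=/tmp/jobs/step1,ExampleWorkflow#step2=/tmp/jobs/step2
```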
```diff
@@ -192,7 +190,7 @@ workflow task #{workflow} #{task} #{cmds}
     keys_from_config.each do |key|
       next unless batch_options.include? key
       default_value = Scout::Config.get(key, "batch_#{key}", "batch")
-      next if default_value.nil?
+      next if default_value.nil?
       IndiferentHash.add_defaults batch_options, default_value
     end
 
```
```diff
@@ -211,7 +209,7 @@ workflow task #{workflow} #{task} #{cmds}
       batch_options[:contain] = File.join(contain_base, random_file)
     end
 
-    batch_options[:sync] ||= "~/.scout/var/jobs"
+    batch_options[:sync] ||= "~/.scout/var/jobs"
     batch_options[:wipe_container] ||= 'post'
   end
 
```
```diff
@@ -219,23 +217,23 @@ workflow task #{workflow} #{task} #{cmds}
       options[:workdir_all] = batch_options[:contain]
     end
 
-    IndiferentHash.add_defaults batch_options,
+    IndiferentHash.add_defaults batch_options,
       :batch_name => batch_name,
-      :inputs_dir => inputs_dir,
-      :nodes => 1,
+      :inputs_dir => inputs_dir,
+      :nodes => 1,
       :step_path => job.path,
       :task_cpus => 1,
-      :time => '2min',
+      :time => '2min',
       :env => {'JDK_JAVA_OPTIONS' => "-Xms1g -Xmx${MAX_MEMORY}m"},
       :singularity_img => ENV["SINGULARITY_IMG"] || "~/scout.singularity.img",
       :singularity_ruby_inline => ENV["SINGULARITY_RUBY_INLINE"] || "~/.singularity_ruby_inline",
       :singularity_opt_dir => ENV["SINGULARITY_OPT_DIR"] || "~/singularity_opt",
-      :workdir => Dir.pwd
+      :workdir => Dir.pwd
 
     exec_cmd = exec_cmd(job, batch_options)
     scout_cmd = scout_job_exec_cmd(job, options)
 
-    IndiferentHash.add_defaults batch_options,
+    IndiferentHash.add_defaults batch_options,
       :exec_cmd => exec_cmd,
       :scout_cmd => scout_cmd
 
```
```diff
@@ -344,7 +342,7 @@ batch_erase_contain_dir()
 function batch_sync_contain_dir(){
     mkdir -p "$(dirname '#{sync}')"
     rsync -avztAXHP --copy-unsafe-links "#{source}/" "#{sync}/" 2>1 >> '#{options[:fsync]}'
-    sync_es="$?"
+    sync_es="$?"
     echo $sync_es > '#{options[:fsexit]}'
     find '#{sync}' -type l -ls | awk '$13 ~ /^#{sync.gsub('/','\/')}/ { sub("#{source}", "#{sync}", $13); print $11, $13 }' | while read A B; do rm $A; ln -s $B $A; done
 }
```
```diff
@@ -353,7 +351,7 @@ function batch_sync_contain_dir(){
 
     if options[:env]
       prepare_environment +=<<-EOF
-# Set ENV variables
+# Set ENV variables
 #{options[:env].collect{|n,v| "export #{n}=\"#{v}\"" } * "\n"}
       EOF
     end
```
```diff
@@ -384,7 +382,7 @@ for tmpd in persist_locks produce_locks R_sockets sensiblewrite sensiblewrit
   mkdir -p "#{contain}/.scout/tmp/$tmpd"
 done
 
-# Copy environment
+# Copy environment
 cp ~/.scout/etc/environment #{contain}/.scout/etc/
 
 # Set search_paths
```
```diff
@@ -406,7 +404,7 @@ echo "user_scratch: #{scratch_group_dir}/#{user}/{PKGDIR}/{TOPLEVEL}/{SUBPATH}"
     exec_cmd, job_cmd, task_cpus = options.values_at :exec_cmd, :scout_cmd, :task_cpus
 
     script=<<-EOF
-step_path=$(
+step_path=$(
 #{exec_cmd} #{job_cmd} --printpath
 )
 exit_status=$?
```
```diff
@@ -426,7 +424,7 @@ fi
 
     if options[:sync]
       sync_environment +=<<-EOF
-if [ $exit_status == '0' ]; then
+if [ $exit_status == '0' ]; then
   batch_sync_contain_dir
 else
   sync_es=$exit_status
```
```diff
@@ -441,8 +439,8 @@ fi
     cleanup_environment = ""
 
     cleanup_environment +=<<-EOF if options[:purge_deps]
-if [ $exit_status == '0' ]; then
-  #{options[:exec_cmd]} workflow forget_deps --purge --recursive_purge "$step_path" 2>1 >> '#{options[:fsync]}'
+if [ $exit_status == '0' ]; then
+  #{options[:exec_cmd]} workflow forget_deps --purge --recursive_purge "$step_path" 2>1 >> '#{options[:fsync]}'
 fi
     EOF
 
```
```diff
@@ -453,7 +451,7 @@ batch_erase_contain_dir
     EOF
     elsif options[:wipe_container] == 'post' || options[:wipe_container] == 'both'
       cleanup_environment +=<<-EOF
-if [ $sync_es == '0' -a $empty_contain_dir == 'true' ]; then
+if [ $sync_es == '0' -a $empty_contain_dir == 'true' ]; then
   batch_erase_contain_dir
 fi
     EOF
```
```diff
@@ -512,7 +510,7 @@ exit $exit_status
 env > #{batch_options[:fenv]}
 
 # #{Log.color :green, "2. Execute"}
-#{execute}
+#{execute}
 
 # #{Log.color :green, "3. Sync and cleanup environment"}
 #{sync_environment}
```
```diff
@@ -555,13 +553,13 @@ env > #{batch_options[:fenv]}
   def run_job(job, options = {})
     system = self.to_s.split("::").last
 
-    batch_base_dir, clean_batch_job, remove_batch_dir, procpath, tail, batch_dependencies, dry_run, orchestration_rules_file = IndiferentHash.process_options options,
+    batch_base_dir, clean_batch_job, remove_batch_dir, procpath, tail, batch_dependencies, dry_run, orchestration_rules_file = IndiferentHash.process_options options,
       :batch_base_dir, :clean_batch_job, :remove_batch_dir, :batch_procpath, :tail, :batch_dependencies, :dry_run, :orchestration_rules,
       :batch_base_dir => SchedulerJob.batch_base_dir
 
     if (batch_job = job.info[:batch_job]) && job_queued(batch_job)
       Log.info "Job #{job.short_path} already queued in #{batch_job}"
-      return batch_job, batch_dir_for_id(batch_base_dir, batch_job)
+      return batch_job, batch_dir_for_id(batch_base_dir, batch_job)
     end
 
     if job.running?
```
```diff
@@ -570,7 +568,7 @@ env > #{batch_options[:fenv]}
       if job.info[:batch_job]
         return job.info[:batch_job], batch_dir_for_id(batch_base_dir, batch_job)
       else
-        return
+        return
       end
     end
 
```
```diff
@@ -582,8 +580,8 @@ env > #{batch_options[:fenv]}
     workflows_to_load = job.rec_dependencies.select{|d| Step === d}.collect{|d| d.workflow }.compact.collect(&:to_s) - [workflow.to_s]
 
     TmpFile.with_file(nil, remove_batch_dir, :tmpdir => batch_base_dir, :prefix => "#{system}_scout_job-#{workflow.to_s}-#{task_name}-") do |batch_dir|
-      IndiferentHash.add_defaults options,
-        :batch_dir => batch_dir,
+      IndiferentHash.add_defaults options,
+        :batch_dir => batch_dir,
         :inputs_dir => File.join(batch_dir, "inputs_dir"),
         :workflows => workflows_to_load.any? ? workflows_to_load.uniq * "," : nil
```