scout-gear 10.8.4 → 10.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +38 -0
- data/README.md +352 -0
- data/VERSION +1 -1
- data/bin/scout +4 -1
- data/doc/Association.md +288 -0
- data/doc/Entity.md +296 -0
- data/doc/KnowledgeBase.md +433 -0
- data/doc/Persist.md +356 -0
- data/doc/Semaphore.md +171 -0
- data/doc/TSV.md +449 -0
- data/doc/WorkQueue.md +359 -0
- data/doc/Workflow.md +586 -0
- data/lib/scout/association.rb +4 -2
- data/lib/scout/entity/identifiers.rb +1 -1
- data/lib/scout/entity/object.rb +1 -1
- data/lib/scout/entity/property.rb +5 -5
- data/lib/scout/entity.rb +1 -1
- data/lib/scout/knowledge_base/description.rb +1 -1
- data/lib/scout/knowledge_base/list.rb +7 -2
- data/lib/scout/knowledge_base/registry.rb +4 -5
- data/lib/scout/knowledge_base.rb +20 -2
- data/lib/scout/monitor.rb +10 -6
- data/lib/scout/persist/engine/packed_index.rb +2 -2
- data/lib/scout/persist/engine/sharder.rb +1 -1
- data/lib/scout/persist/tsv.rb +1 -0
- data/lib/scout/semaphore.rb +1 -1
- data/lib/scout/tsv/dumper.rb +3 -3
- data/lib/scout/tsv/open.rb +1 -0
- data/lib/scout/tsv/parser.rb +1 -1
- data/lib/scout/tsv/transformer.rb +1 -0
- data/lib/scout/tsv/util.rb +2 -2
- data/lib/scout/work_queue/socket.rb +1 -1
- data/lib/scout/work_queue/worker.rb +7 -5
- data/lib/scout/workflow/definition.rb +11 -0
- data/lib/scout/workflow/deployment/local.rb +288 -0
- data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
- data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
- data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
- data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
- data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
- data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
- data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
- data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
- data/lib/scout/workflow/deployment/scheduler.rb +73 -0
- data/lib/scout/workflow/deployment.rb +10 -1
- data/lib/scout/workflow/entity.rb +22 -1
- data/lib/scout/workflow/exceptions.rb +2 -0
- data/lib/scout/workflow/step/config.rb +6 -3
- data/lib/scout/workflow/step/file.rb +4 -0
- data/lib/scout/workflow/step/info.rb +10 -4
- data/lib/scout/workflow/step/progress.rb +52 -0
- data/lib/scout/workflow/step.rb +39 -5
- data/lib/scout/workflow/task/inputs.rb +1 -1
- data/lib/scout/workflow/task.rb +2 -0
- data/lib/scout/workflow/usage.rb +3 -2
- data/lib/scout/workflow/util.rb +22 -0
- data/scout-gear.gemspec +37 -7
- data/scout_commands/batch/list +1 -1
- data/scout_commands/cat +86 -0
- data/scout_commands/doc +3 -1
- data/scout_commands/entity +151 -0
- data/scout_commands/system/status +238 -0
- data/scout_commands/workflow/cmd +5 -13
- data/scout_commands/workflow/info +23 -10
- data/scout_commands/workflow/install +1 -1
- data/scout_commands/workflow/task +61 -25
- data/test/scout/entity/test_property.rb +1 -1
- data/test/scout/knowledge_base/test_registry.rb +19 -0
- data/test/scout/test_work_queue.rb +1 -1
- data/test/scout/work_queue/test_worker.rb +12 -10
- data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
- data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
- data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
- data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
- data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
- data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
- data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
- data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
- data/test/scout/workflow/deployment/test_trace.rb +1 -1
- data/test/scout/workflow/step/test_progress.rb +27 -0
- data/test/scout/workflow/task/test_inputs.rb +17 -0
- data/test/test_helper.rb +2 -1
- metadata +36 -6
- data/doc/lib/scout/path.md +0 -35
- data/doc/lib/scout/workflow/task.md +0 -13
- data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
|
@@ -12,7 +12,7 @@ class KnowledgeBase
|
|
|
12
12
|
if entity_type.to_s == "simple"
|
|
13
13
|
path = dir.lists[entity_type.to_s][id]
|
|
14
14
|
else
|
|
15
|
-
path = dir.lists[entity_type.to_s][id
|
|
15
|
+
path = dir.lists[entity_type.to_s][id].find_with_extension("tsv")
|
|
16
16
|
end
|
|
17
17
|
else
|
|
18
18
|
path = dir.lists.glob("*/#{id}").first
|
|
@@ -33,6 +33,7 @@ class KnowledgeBase
|
|
|
33
33
|
Open.lock path do
|
|
34
34
|
begin
|
|
35
35
|
if AnnotatedArray === list
|
|
36
|
+
path = path.set_extension('tsv')
|
|
36
37
|
Open.write(path, Annotation.tsv(list, :all).to_s)
|
|
37
38
|
else
|
|
38
39
|
Open.write(path, list * "\n")
|
|
@@ -60,7 +61,11 @@ class KnowledgeBase
|
|
|
60
61
|
list.extend AnnotatedArray
|
|
61
62
|
list
|
|
62
63
|
else
|
|
63
|
-
path.list
|
|
64
|
+
list = path.list
|
|
65
|
+
if entity_type
|
|
66
|
+
Entity.prepare_entity(list, entity_type)
|
|
67
|
+
end
|
|
68
|
+
list
|
|
64
69
|
end
|
|
65
70
|
rescue
|
|
66
71
|
Log.exception $!
|
|
@@ -84,7 +84,7 @@ class KnowledgeBase
|
|
|
84
84
|
key = name.to_s + "_" + fp
|
|
85
85
|
end
|
|
86
86
|
|
|
87
|
-
Persist.memory("Index:"
|
|
87
|
+
Persist.memory("Index:" + [key, dir] * "@") do
|
|
88
88
|
options = options.dup
|
|
89
89
|
|
|
90
90
|
persist_dir = dir
|
|
@@ -148,7 +148,7 @@ class KnowledgeBase
|
|
|
148
148
|
options[:namespace] ||= self.namespace unless self.namespace.nil?
|
|
149
149
|
|
|
150
150
|
key += '.database'
|
|
151
|
-
Persist.memory("Database:"
|
|
151
|
+
Persist.memory("Database:" + [key, dir] * "@") do
|
|
152
152
|
options = options.dup
|
|
153
153
|
|
|
154
154
|
persist_dir = dir
|
|
@@ -175,14 +175,13 @@ class KnowledgeBase
|
|
|
175
175
|
|
|
176
176
|
database = if persist_path.exists? and persist_options[:persist] and not persist_options[:update]
|
|
177
177
|
Log.low "Re-opening database #{ name } from #{ Log.fingerprint persist_path }. #{options}"
|
|
178
|
-
Association.database(file, **options.merge(persist_options: persist_options))
|
|
178
|
+
Association.database(file, **options.merge(persist_options: persist_options).except(:undirected))
|
|
179
179
|
else
|
|
180
180
|
options = IndiferentHash.add_defaults options, registered_options if registered_options
|
|
181
|
-
undirected = IndiferentHash.process_options options, :undirected
|
|
182
181
|
raise "Repo #{ name } not found and not registered" if file.nil?
|
|
183
182
|
Log.medium "Opening database #{ name } from #{ Log.fingerprint file }. #{options}"
|
|
184
183
|
file = file.call if Proc === file
|
|
185
|
-
Association.database(file, **options.merge(persist_options: persist_options))
|
|
184
|
+
Association.database(file, **options.merge(persist_options: persist_options).except(:undirected))
|
|
186
185
|
end
|
|
187
186
|
|
|
188
187
|
database.namespace = self.namespace if self.namespace
|
data/lib/scout/knowledge_base.rb
CHANGED
|
@@ -59,8 +59,26 @@ class KnowledgeBase
|
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
# Build (or fetch) the knowledge base designated by +dir+ and load it.
#
# Accepted values for +dir+:
# * Path     - used directly as the knowledge base directory
# * Symbol   - resolved to Path.setup("var").knowledge_base[<name>]
# * Workflow - the workflow's own knowledge base is used
# * String   - a name listed in Workflow.list, a bare word (handled like a
#              Symbol), or otherwise a directory
#
# Returns the knowledge base after calling #load on it.
# Raises RuntimeError when the workflow provides no knowledge base and
# ArgumentError when +dir+ is none of the supported kinds.
def self.load(dir)
  kb = case dir
       when Path
         KnowledgeBase.new dir
       when Symbol
         KnowledgeBase.new Path.setup("var").knowledge_base[dir.to_s]
       when Workflow
         dir.knowledge_base
       when String
         if Workflow.list.include? dir
           Workflow.require_workflow(dir).knowledge_base
         elsif dir.match?(/\A\w+\z/)
           # The recursive call already returns a loaded knowledge base, so
           # return it directly instead of loading it a second time.
           return load(dir.to_sym)
         else
           KnowledgeBase.new dir
         end
       else
         raise ArgumentError, "Cannot load a knowledge base from #{dir.inspect}"
       end

  raise "No knowledge base available for #{dir.inspect}" if kb.nil?

  kb.load
  kb
end
|
data/lib/scout/monitor.rb
CHANGED
|
@@ -2,14 +2,18 @@ require 'scout'
|
|
|
2
2
|
|
|
3
3
|
module Scout
|
|
4
4
|
|
|
5
|
-
LOCK_DIRS = Scout.tmp.tsv_open_locks.find_all + Scout.tmp.persist_locks.find_all + Scout.tmp.sensiblewrite_locks.find_all +
|
|
6
|
-
Scout.tmp.produce_locks.find_all + Scout.tmp.step_info_locks.find_all
|
|
7
|
-
|
|
8
5
|
SENSIBLE_WRITE_DIRS = Open.sensible_write_dir.find_all
|
|
9
6
|
|
|
10
|
-
|
|
7
|
+
LOCK_DIRS = Path.setup('tmp/tsv_open_locks').find_all +
|
|
8
|
+
Path.setup('tmp/tsv_locks').find_all +
|
|
9
|
+
Path.setup('tmp/persist_locks').find_all +
|
|
10
|
+
Path.setup('tmp/sensible_write_locks').find_all +
|
|
11
|
+
Path.setup('tmp/produce_locks').find_all +
|
|
12
|
+
Path.setup('tmp/step_info_locks').find_all
|
|
13
|
+
|
|
14
|
+
PERSIST_DIRS = Path.setup('share').find_all + Path.setup('var/cache/persistence').find_all
|
|
11
15
|
|
|
12
|
-
JOB_DIRS =
|
|
16
|
+
JOB_DIRS = Path.setup('var/jobs').find_all
|
|
13
17
|
|
|
14
18
|
MUTEX_FOR_THREAD_EXCLUSIVE = Mutex.new
|
|
15
19
|
|
|
@@ -68,7 +72,7 @@ module Scout
|
|
|
68
72
|
lock_info[f][:ppid] = info[:ppid]
|
|
69
73
|
end
|
|
70
74
|
rescue Exception
|
|
71
|
-
Log.
|
|
75
|
+
Log.warn $!.message
|
|
72
76
|
end
|
|
73
77
|
end
|
|
74
78
|
lock_info
|
|
@@ -9,7 +9,7 @@ class PackedIndex
|
|
|
9
9
|
}
|
|
10
10
|
|
|
11
11
|
def self.process_mask(mask)
|
|
12
|
-
str = ""
|
|
12
|
+
str = "".dup
|
|
13
13
|
size = 0
|
|
14
14
|
mask.each do |e|
|
|
15
15
|
if ELEMS.include? e
|
|
@@ -50,7 +50,7 @@ class PackedIndex
|
|
|
50
50
|
@mask = @stream.read(mask_length)
|
|
51
51
|
@offset = @mask.length + 8
|
|
52
52
|
end
|
|
53
|
-
@nil_string = "NIL"
|
|
53
|
+
@nil_string = "NIL" + ("-" * (@item_size - 3))
|
|
54
54
|
end
|
|
55
55
|
|
|
56
56
|
def file
|
|
@@ -37,7 +37,7 @@ class Sharder
|
|
|
37
37
|
databases[shard]
|
|
38
38
|
else
|
|
39
39
|
database = databases[shard] ||= begin
|
|
40
|
-
path = File.join(persistence_path, 'shard-'
|
|
40
|
+
path = File.join(persistence_path, 'shard-' + shard.to_s)
|
|
41
41
|
(writable or File.exist?(path)) ? Persist.open_database(path, (File.exist?(path) ? false : writable), :clean, db_type, @persist_options) : nil
|
|
42
42
|
end
|
|
43
43
|
Log.warn "Database #{ path } missing" if database.nil?
|
data/lib/scout/persist/tsv.rb
CHANGED
data/lib/scout/semaphore.rb
CHANGED
|
@@ -75,7 +75,7 @@ if continue
|
|
|
75
75
|
|
|
76
76
|
def self.with_semaphore(size, file = nil)
|
|
77
77
|
if file.nil?
|
|
78
|
-
file = "/scout-"
|
|
78
|
+
file = "/scout-" + Misc.digest(rand(100000000000).to_s)[0..10] if file.nil?
|
|
79
79
|
else
|
|
80
80
|
file = file.gsub('/', '_') if file
|
|
81
81
|
end
|
data/lib/scout/tsv/dumper.rb
CHANGED
|
@@ -16,7 +16,7 @@ module TSV
|
|
|
16
16
|
if String === preamble
|
|
17
17
|
preamble_str = preamble
|
|
18
18
|
elsif preamble && options.values.compact.any?
|
|
19
|
-
preamble_str = "#: "
|
|
19
|
+
preamble_str = "#: " + IndiferentHash.hash2string(options.merge(serializer: nil))
|
|
20
20
|
else
|
|
21
21
|
preamble_str = nil
|
|
22
22
|
end
|
|
@@ -81,7 +81,7 @@ module TSV
|
|
|
81
81
|
header = Dumper.header(@options.merge(type: @type, sep: @sep, preamble: preamble))
|
|
82
82
|
@mutex.synchronize do
|
|
83
83
|
@initialized = true
|
|
84
|
-
@sin << header
|
|
84
|
+
@sin << header + "\n" if header and ! header.empty?
|
|
85
85
|
end
|
|
86
86
|
end
|
|
87
87
|
|
|
@@ -134,7 +134,7 @@ module TSV
|
|
|
134
134
|
end
|
|
135
135
|
|
|
136
136
|
# Short identifier for this dumper built from the fingerprint of its field
# names (an empty list when all_fields is nil).
def fingerprint
  fields = all_fields || []
  "Dumper:{#{Log.fingerprint(fields)}}"
end
|
|
139
139
|
|
|
140
140
|
def digest_str
|
data/lib/scout/tsv/open.rb
CHANGED
data/lib/scout/tsv/parser.rb
CHANGED
data/lib/scout/tsv/util.rb
CHANGED
|
@@ -165,11 +165,11 @@ Example:
|
|
|
165
165
|
end
|
|
166
166
|
|
|
167
167
|
# Short identifier for this TSV: the fingerprint of its field names (an empty
# list when all_fields is nil) followed by the fingerprint of its keys.
def fingerprint
  fields = all_fields || []
  "TSV:{#{Log.fingerprint(fields)};#{Log.fingerprint(keys)}}"
end
|
|
170
170
|
|
|
171
171
|
# String used for digests: like the fingerprint (fields and keys) but also
# including the fingerprint of the values.
def digest_str
  fields = all_fields || []
  "TSV:{#{Log.fingerprint(fields)};#{Log.fingerprint(keys)};#{Log.fingerprint(values)}}"
end
|
|
174
174
|
|
|
175
175
|
def inspect
|
|
@@ -9,7 +9,7 @@ class WorkQueue
|
|
|
9
9
|
|
|
10
10
|
@serializer = serializer || Marshal
|
|
11
11
|
|
|
12
|
-
@key = "/"
|
|
12
|
+
@key = "/" + rand(1000000000).to_s << '.' << Process.pid.to_s;
|
|
13
13
|
@write_sem = @key + '.in'
|
|
14
14
|
@read_sem = @key + '.out'
|
|
15
15
|
Log.debug "Creating socket semaphores: #{@key}"
|
|
@@ -27,9 +27,9 @@ class WorkQueue
|
|
|
27
27
|
run do
|
|
28
28
|
begin
|
|
29
29
|
if output
|
|
30
|
-
Open.purge_pipes(output.swrite)
|
|
30
|
+
Open.purge_pipes(input.sread, output.swrite)
|
|
31
31
|
else
|
|
32
|
-
Open.purge_pipes
|
|
32
|
+
Open.purge_pipes(input.sread)
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
while obj = input.read
|
|
@@ -43,9 +43,11 @@ class WorkQueue
|
|
|
43
43
|
rescue DoneProcessing
|
|
44
44
|
rescue Interrupt
|
|
45
45
|
rescue Exception
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
begin
|
|
47
|
+
output.write WorkerException.new($!, Process.pid)
|
|
48
|
+
ensure
|
|
49
|
+
exit -1
|
|
50
|
+
end
|
|
49
51
|
end
|
|
50
52
|
exit 0
|
|
51
53
|
end
|
|
@@ -227,4 +227,15 @@ module Workflow
|
|
|
227
227
|
alias export_asynchronous export
|
|
228
228
|
alias export_exec export
|
|
229
229
|
alias export_stream export
|
|
230
|
+
|
|
231
|
+
# Merge another workflow into this one: its export lists are appended to ours
# and its tasks and helpers are merged into ours.
#
# workflow - the workflow whose definitions are absorbed.
#
# Returns the merged helpers.
def include_workflow(workflow)
  # Called only for its effect; the return value is not used here.
  workflow.documentation

  %i[asynchronous_exports synchronous_exports exec_exports stream_exports].each do |kind|
    send("#{kind}=", send(kind) + workflow.public_send(kind))
  end

  tasks.merge! workflow.tasks
  # NOTE(review): this re-points every task, including the ones this workflow
  # already had, at the included workflow - confirm that is intended.
  tasks.each_value { |task| task.workflow = workflow }

  helpers.merge! workflow.helpers
end
|
|
230
241
|
end
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
require_relative 'orchestrator/batches'
|
|
2
|
+
class Workflow::LocalExecutor
|
|
3
|
+
# Raised by process_batches when there are no candidate batches and no resources in use; rescued by LocalExecutor.produce.
class NoWork < Exception; end
|
|
4
|
+
|
|
5
|
+
# Convenience entry point: builds an executor with default settings and
# forwards all arguments to its #process method.
def self.process(*args)
  new.process(*args)
end
|
|
8
|
+
|
|
9
|
+
# Produce one or several jobs with a fresh executor.
#
# jobs          - a job or an Array of jobs.
# rules         - rules passed on to #process.
# produce_cpus  - cpus made available to the executor (converted with to_i).
# produce_timer - timer given to the executor (converted with to_f).
#
# Returns whatever #process returns, or nil when it raises NoWork.
def self.produce(jobs, rules = {}, produce_cpus: Etc.nprocessors, produce_timer: 1)
  job_list = Array === jobs ? jobs : [jobs]
  executor = new(produce_timer.to_f, cpus: produce_cpus.to_i)

  begin
    executor.process(rules, job_list)
  rescue self::NoWork
    nil
  end
end
|
|
17
|
+
|
|
18
|
+
# Produce the recursive dependencies of +jobs+ that belong to the given tasks.
#
# jobs  - a job or an Array of jobs; jobs that are done or running are skipped.
# tasks - task names; Strings are converted to Symbols before matching.
# rules, produce_cpus, produce_timer - forwarded unchanged to produce.
#
# Returns the result of produce for the selected dependencies.
def self.produce_dependencies(jobs, tasks, rules = {}, produce_cpus: Etc.nprocessors, produce_timer: 1)
  job_list = Array === jobs ? jobs : [jobs]
  wanted = tasks.collect { |task| String === task ? task.to_sym : task }

  produce_list = job_list.each_with_object([]) do |job, selected|
    next if job.done? || job.running?

    matching = job.rec_dependencies.select do |dep|
      # NOTE(review): the two job-based checks compare the parent job, not the
      # dependency, and String names were already turned into Symbols above -
      # confirm whether dep was meant here.
      wanted.include?(dep.task_name.to_sym) ||
        wanted.include?(job.task_name.to_s) ||
        wanted.include?(job.full_task_name)
    end

    selected.concat(matching)
  end

  produce(produce_list, rules, produce_cpus: produce_cpus, produce_timer: produce_timer)
end
|
|
36
|
+
|
|
37
|
+
attr_accessor :available_resources, :resources_requested, :resources_used, :timer
|
|
38
|
+
|
|
39
|
+
# timer               - value passed to sleep between scheduling rounds.
# available_resources - Hash of resource limits; when nil it defaults to the
#                       number of processors as :cpus.
def initialize(timer = 5, available_resources = nil)
  @timer = timer

  limits = available_resources.nil? ? { :cpus => Etc.nprocessors } : available_resources
  @available_resources = IndiferentHash.setup(limits)

  # Bookkeeping for what has been requested in total and what each job holds.
  @resources_requested = IndiferentHash.setup({})
  @resources_used = IndiferentHash.setup({})
end
|
|
46
|
+
|
|
47
|
+
# Main scheduling loop. While some batch is not done it picks the candidate
# batches, starts those whose resources fit, retries recoverable failures
# once, releases resources of finished jobs and then sleeps for +timer+.
# After the loop every top-level job is joined and erasable dependencies of
# done jobs are removed.
#
# Raises NoWork when there are no candidates and no resources in use.
def process_batches(batches)
  retried_jobs = []

  while batches.any? { |pending| !Workflow::Orchestrator.done_batch?(pending) }

    runnable = Workflow::LocalExecutor.candidates(batches)
    runnable_jobs = runnable.collect { |batch| batch[:top_level] }

    if resources_used.empty? && runnable_jobs.empty?
      raise NoWork, "No candidates and no running jobs #{Log.fingerprint batches}"
    end

    runnable.each do |batch|
      begin
        job = batch[:top_level]

        if job.error? || job.aborted?
          begin
            unless job.recoverable_error?
              Log.warn "Non-recoverable error in #{job.path}"
              next
            end

            if retried_jobs.include?(job)
              Log.warn "Failed twice #{job.path} with recoverable error"
              next
            end

            # First recoverable failure: clean the job and re-evaluate it.
            retried_jobs << job
            job.clean
            raise TryAgain
          ensure
            Log.warn "Releases resources from failed job: #{job.path}"
            release_resources(job)
          end
        elsif job.done?
          Log.debug "Orchestrator done #{job.path}"
          release_resources(job)
          clear_batch(batches, batch)
          erase_job_dependencies(job, batches)
        elsif job.running?
          next
        else
          check_resources(batch) { run_batch(batch) }
        end
      rescue TryAgain
        retry
      end
    end

    # Settle every batch whose top-level job has finished one way or another.
    batches.each do |batch|
      job = batch[:top_level]
      next unless job.done? || job.aborted? || job.error?

      job.join if job.done?
      clear_batch(batches, batch)
      release_resources(job)
      erase_job_dependencies(job, batches)
    end

    sleep timer
  end

  batches.each do |batch|
    job = batch[:top_level]
    begin
      job.join
    rescue
      Log.warn "Job #{job.short_path} ended with exception #{$!.class.to_s}: #{$!.message}"
    end
  end

  batches.each do |batch|
    job = batch[:top_level]
    erase_job_dependencies(job, batches) if job.done?
  end
end
|
|
127
|
+
|
|
128
|
+
# Turn jobs into batches, attach the resources and rules of each batch and
# run them through process_batches.
#
# rules - orchestration rules; when +jobs+ is nil this argument is taken to
#         be the jobs and the rules default to {}.
# jobs  - a Step or a list of jobs.
#
# Returns the result of process_batches.
def process(rules, jobs = nil)
  jobs, rules = rules, {} if jobs.nil?
  jobs = [jobs] if Step === jobs

  batches = Workflow::Orchestrator.job_batches(rules, jobs)

  batches.each do |batch|
    batch_rules = IndiferentHash.setup(batch[:rules])
    # Jobs requested explicitly lose their :erase rule.
    batch_rules.delete(:erase) if jobs.include?(batch[:top_level])

    resources = Workflow::Orchestrator.normalize_resources_from_rules(batch_rules)
    defaults = batch_rules[:default_resources]
    resources = IndiferentHash.add_defaults(resources, defaults) if defaults

    batch[:resources] = resources
    batch[:rules] = batch_rules
  end

  process_batches(batches)
end
|
|
144
|
+
|
|
145
|
+
# Give back the resources recorded for +job+, if any: every resource except
# 'size' is subtracted from the requested totals and the job's entry is
# removed from resources_used.
#
# Returns the removed entry, or nil when the job held nothing.
def release_resources(job)
  held = resources_used[job]
  return unless held

  Log.debug "Orchestrator releasing resouces from #{job.path}"
  held.each do |resource, value|
    resources_requested[resource] -= value.to_i unless resource == 'size'
  end

  resources_used.delete job
end
|
|
155
|
+
|
|
156
|
+
# Yield only if the resources of +batch+ fit within the available ones.
#
# When some resource would exceed its limit, the wait is logged and nothing
# is reserved. Otherwise the resources are recorded for the job, added to the
# requested totals, and the block is called.
#
# Returns the block's value when it runs.
def check_resources(batch)
  resources, job = batch.values_at(:resources, :top_level)

  exhausted = resources.select do |resource, value|
    value && available_resources[resource] &&
      ((resources_requested[resource] || 0) + value) > available_resources[resource]
  end.keys

  if exhausted.any?
    return Log.debug("Orchestrator waiting on #{job.path} due to #{exhausted * ", "}")
  end

  resources_used[job] = resources
  resources.each do |resource, value|
    resources_requested[resource] = (resources_requested[resource] || 0) + value.to_i
  end
  Log.low "Orchestrator producing #{job.path} with resources #{resources}"

  yield
end
|
|
180
|
+
|
|
181
|
+
# Start the top-level job of +batch+ according to the :deploy rule.
#
# * nil, local, serial            - fork the job locally, after applying the
#                                   :config_keys and :log rules
# * batch, sched, slurm, pbs, lsf - hand the batch to Workflow::Scheduler and
#                                   join the job
# * anything else                 - treat the value as an offsite server name;
#                                   a "-batch" suffix asks for batch mode
#
# The :deploy value may be a String or a Symbol.
def run_batch(batch)
  job, rules = batch.values_at :top_level, :rules

  deploy = rules[:deploy] if rules
  # Symbols were only recognised for the local targets; normalise them so the
  # scheduler and offsite targets accept them too.
  deploy = deploy.to_s if Symbol === deploy

  case deploy
  when nil, 'local', 'serial'
    Scout::Config.with_config do
      if rules && rules[:config_keys]
        rules[:config_keys].split(/,\s*/).each do |config|
          Scout::Config.process_config config
        end
      end

      log = rules[:log] if rules
      log = Log.severity if log.nil?
      Log.with_severity log do
        job.fork
      end
    end
  when 'batch', 'sched', 'slurm', 'pbs', 'lsf'
    job.init_info
    Workflow::Scheduler.process_batches([batch])
    job.join
  else
    require 'scout/offsite'
    if deploy.end_with?('-batch')
      # Strip only the trailing marker; a plain sub would remove the first
      # "-batch" found anywhere in the server name.
      server = deploy.sub(/-batch\z/, '')
      OffsiteStep.setup(job, server: server, batch: true)
    else
      OffsiteStep.setup(job, server: deploy)
    end

    job.produce
    job.join
  end
end
|
|
216
|
+
|
|
217
|
+
# Remove the direct dependencies of +job+ that are marked for erasing.
#
# A dependency is erased when the batch containing it has an :erase rule
# equal to 'true' and every job (across all batches) that depends on it
# recursively is done. Each of those parents archives its dependencies,
# copies its linked files dir and drops the dependency before it is cleaned.
def erase_job_dependencies(job, batches)
  all_jobs = batches.collect { |b| b[:jobs] }.flatten

  job.dependencies.each do |dep|
    batch = batches.find { |b| b[:jobs].include? dep }
    next unless batch
    next unless batch[:rules][:erase].to_s == 'true'

    parents = all_jobs.select { |parent| parent.rec_dependencies.include?(dep) }
    # Keep the dependency while any job that needs it is still pending.
    next unless parents.all?(&:done?)

    parents.each do |parent|
      Log.high "Erasing #{dep.path} from #{parent.path}"
      parent.archive_deps
      parent.copy_linked_files_dir
      parent.dependencies = parent.dependencies - [dep]
    end

    dep.clean
  end
end
|
|
244
|
+
|
|
245
|
+
# Remove +batch+ from the :deps list of every batch that depends on it.
#
# Returns the batches that listed it as a dependency.
def clear_batch(batches, batch)
  dependents = batches.select { |candidate| candidate[:deps].include?(batch) }
  dependents.each { |candidate| candidate[:deps].delete(batch) }
end
|
|
254
|
+
|
|
255
|
+
#{{{ HELPER
|
|
256
|
+
|
|
257
|
+
# Keep only the first batch for each top-level job path, preserving order.
def self.purge_duplicates(batches)
  batches.uniq { |batch| batch[:top_level].path }
end
|
|
269
|
+
|
|
270
|
+
# Order batches from the largest to the smallest total of numeric resource
# values; nil and non-numeric values are ignored.
def self.sort_candidates(batches)
  batches.sort_by do |batch|
    -batch[:resources].values.select { |value| Numeric === value }.reduce(0.0, :+)
  end
end
|
|
276
|
+
|
|
277
|
+
# Batches that can be started now: those with no pending dependencies that
# are not done, without duplicates and ordered by sort_candidates.
def self.candidates(batches)
  ready = batches.select do |batch|
    batch[:deps].empty? && !Workflow::Orchestrator.done_batch?(batch)
  end

  sort_candidates(purge_duplicates(ready))
end
|
|
288
|
+
end
|