scout-gear 10.8.4 → 10.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +38 -0
- data/README.md +352 -0
- data/VERSION +1 -1
- data/bin/scout +4 -1
- data/doc/Association.md +288 -0
- data/doc/Entity.md +296 -0
- data/doc/KnowledgeBase.md +433 -0
- data/doc/Persist.md +356 -0
- data/doc/Semaphore.md +171 -0
- data/doc/TSV.md +449 -0
- data/doc/WorkQueue.md +359 -0
- data/doc/Workflow.md +586 -0
- data/lib/scout/association.rb +4 -2
- data/lib/scout/entity/identifiers.rb +1 -1
- data/lib/scout/entity/object.rb +1 -1
- data/lib/scout/entity/property.rb +5 -5
- data/lib/scout/entity.rb +1 -1
- data/lib/scout/knowledge_base/description.rb +1 -1
- data/lib/scout/knowledge_base/list.rb +7 -2
- data/lib/scout/knowledge_base/registry.rb +4 -5
- data/lib/scout/knowledge_base.rb +20 -2
- data/lib/scout/monitor.rb +10 -6
- data/lib/scout/persist/engine/packed_index.rb +2 -2
- data/lib/scout/persist/engine/sharder.rb +1 -1
- data/lib/scout/persist/tsv.rb +1 -0
- data/lib/scout/semaphore.rb +1 -1
- data/lib/scout/tsv/dumper.rb +3 -3
- data/lib/scout/tsv/open.rb +1 -0
- data/lib/scout/tsv/parser.rb +1 -1
- data/lib/scout/tsv/transformer.rb +1 -0
- data/lib/scout/tsv/util.rb +2 -2
- data/lib/scout/work_queue/socket.rb +1 -1
- data/lib/scout/work_queue/worker.rb +7 -5
- data/lib/scout/workflow/definition.rb +11 -0
- data/lib/scout/workflow/deployment/local.rb +288 -0
- data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
- data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
- data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
- data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
- data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
- data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
- data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
- data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
- data/lib/scout/workflow/deployment/scheduler.rb +73 -0
- data/lib/scout/workflow/deployment.rb +10 -1
- data/lib/scout/workflow/entity.rb +22 -1
- data/lib/scout/workflow/exceptions.rb +2 -0
- data/lib/scout/workflow/step/config.rb +6 -3
- data/lib/scout/workflow/step/file.rb +4 -0
- data/lib/scout/workflow/step/info.rb +10 -4
- data/lib/scout/workflow/step/progress.rb +52 -0
- data/lib/scout/workflow/step.rb +39 -5
- data/lib/scout/workflow/task/inputs.rb +1 -1
- data/lib/scout/workflow/task.rb +2 -0
- data/lib/scout/workflow/usage.rb +3 -2
- data/lib/scout/workflow/util.rb +22 -0
- data/scout-gear.gemspec +37 -7
- data/scout_commands/batch/list +1 -1
- data/scout_commands/cat +86 -0
- data/scout_commands/doc +3 -1
- data/scout_commands/entity +151 -0
- data/scout_commands/system/status +238 -0
- data/scout_commands/workflow/cmd +5 -13
- data/scout_commands/workflow/info +23 -10
- data/scout_commands/workflow/install +1 -1
- data/scout_commands/workflow/task +61 -25
- data/test/scout/entity/test_property.rb +1 -1
- data/test/scout/knowledge_base/test_registry.rb +19 -0
- data/test/scout/test_work_queue.rb +1 -1
- data/test/scout/work_queue/test_worker.rb +12 -10
- data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
- data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
- data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
- data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
- data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
- data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
- data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
- data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
- data/test/scout/workflow/deployment/test_trace.rb +1 -1
- data/test/scout/workflow/step/test_progress.rb +27 -0
- data/test/scout/workflow/task/test_inputs.rb +17 -0
- data/test/test_helper.rb +2 -1
- metadata +36 -6
- data/doc/lib/scout/path.md +0 -35
- data/doc/lib/scout/workflow/task.md +0 -13
- data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
module LSF
|
|
5
|
+
extend SchedulerJob
|
|
6
|
+
|
|
7
|
+
# Name of this batch system. batch_system_variables interpolates it into the
# exported BATCH_SYSTEM shell variable.
def self.system
  'LSF'
end
|
|
10
|
+
|
|
11
|
+
# Returns a bash snippet (String) that computes and exports the memory and
# identification variables for an LSF job:
#
# * MAX_MEMORY_DEFAULT - node memory scaled by the share of processors
#   granted to the job (LSB_MAX_NUM_PROCESSORS)
# * MAX_MEMORY         - LSB_MAX_MEM_RUSAGE when set, else MAX_MEMORY_DEFAULT
# * BATCH_JOB_ID       - the LSF job id
# * BATCH_SYSTEM       - the value of LSF.system
#
# LSF publishes the job id as LSB_JOBID (LSB_ prefix, like the other LSF
# variables read here), so that is the variable exported as BATCH_JOB_ID.
def self.batch_system_variables
  <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $LSB_MAX_NUM_PROCESSORS )"
[ ! -z $LSB_MAX_MEM_RUSAGE ] && let MAX_MEMORY="$LSB_MAX_MEM_RUSAGE" || MAX_MEMORY="$MAX_MEMORY_DEFAULT"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$LSB_JOBID
export BATCH_SYSTEM=#{system}
  EOF
end
|
|
22
|
+
|
|
23
|
+
# Builds the header of an LSF job script: the bash shebang followed by one
# `#BSUB` directive per configured option.
#
# Recognised options (symbol or string keys, read through
# IndiferentHash.process_options):
#   :batch_dir  - directory holding std.out / std.err (required)
#   :batch_name - job name; defaults to the basename of :batch_dir
#   :workdir    - working directory (-cwd)
#   :queue      - queue name (-q)
#   :task_cpus  - number of slots (-n)
#   :time       - wall time; a value without ":" is converted through
#                 Misc.timespan / Misc.format_seconds, then cut to its first
#                 two ":"-separated fields for -W
#   :exclusive  - when truthy, adds `#BSUB -x`
#
# Options that are nil or empty produce no directive (same policy as the PBS
# and SLURM headers) instead of an empty `#BSUB -q ""` line, and a missing
# :time no longer raises.
#
# Returns the header as a String. The caller's hash is not modified.
def self.header(options = {})
  options = options.dup

  queue      = IndiferentHash.process_options options, :queue
  task_cpus  = IndiferentHash.process_options options, :task_cpus
  time       = IndiferentHash.process_options options, :time
  workdir    = IndiferentHash.process_options options, :workdir
  exclusive  = IndiferentHash.process_options options, :exclusive

  batch_dir  = IndiferentHash.process_options options, :batch_dir
  batch_name = IndiferentHash.process_options options, :batch_name
  batch_name ||= File.basename(batch_dir)

  fout = File.join(batch_dir, 'std.out')
  ferr = File.join(batch_dir, 'std.err')

  unless time.nil?
    time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"
    # -W takes only the first two fields of the formatted time
    time = time.split(":").values_at(0, 1) * ":"
  end

  bsub_params = {
    "-J"   => batch_name,
    "-cwd" => workdir,
    "-oo"  => fout,
    "-eo"  => ferr,
    "-q"   => queue,
    "-n"   => task_cpus,
    "-W"   => time,
  }

  header = "#!/bin/bash\n".dup

  bsub_params.each do |flag, value|
    next if value.nil? || value == ""
    header << "#BSUB #{flag} \"#{value}\"" << "\n"
  end

  header << "#BSUB -x" << "\n" if exclusive

  header
end
|
|
59
|
+
|
|
60
|
+
# Submits the job script stored in +batch_dir+ (command.batch) with `bsub`.
#
# Files used inside +batch_dir+:
#   exit.status                - if present, nothing is done (returns nil)
#   job.id                     - if present, its content is returned as the
#                                job id and nothing is submitted
#   dependencies.list          - one job id per line; each becomes post_done(id)
#   canfail_dependencies.list  - one job id per line; each becomes ended(id)
#   std.out                    - if present, nothing is submitted (returns nil)
#
# Can-fail dependencies use `ended(id)`, which LSF satisfies whether the job
# finished in DONE or EXIT state; `done(id)` is only satisfied on success, so
# a failed can-fail dependency would block this job. This matches the
# `afterany` clause the PBS and SLURM modules use for the same list.
#
# batch_dir - directory of the batch job
# dry_run   - when truthy, prints the command that would be run and raises
#             DryRun with +batch_dir+ as message instead of submitting
#
# Returns the Integer job id, or nil when the job was already started/finished.
def self.run_template(batch_dir, dry_run)
  fout   = File.join(batch_dir, 'std.out')
  ferr   = File.join(batch_dir, 'std.err')
  fjob   = File.join(batch_dir, 'job.id')
  fdep   = File.join(batch_dir, 'dependencies.list')
  fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
  fexit  = File.join(batch_dir, 'exit.status')
  fsync  = File.join(batch_dir, 'sync.log')
  fcmd   = File.join(batch_dir, 'command.batch')

  return if Open.exists?(fexit)

  STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
  STDERR.puts Open.read(fcmd)

  if File.exist?(fjob)
    job = Open.read(fjob).to_i
  else
    dependencies = Open.read(fdep).split("\n") if File.exist? fdep
    canfail_dependencies = Open.read(fcfdep).split("\n") if File.exist? fcfdep

    normal_dep_list = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
    canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "ended(#{d})"} : []

    dep_list = normal_dep_list + canfail_dep_list

    if dep_list.any?
      dep_str = '-w "' + dep_list * " && " + '"'
    else
      dep_str = ""
    end

    cmd = "bsub #{dep_str} < '#{fcmd}'"

    if File.exist?(fout)
      return
    elsif dry_run
      STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
      STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout lsf tail '#{batch_dir}'")
      raise DryRun, batch_dir
    else
      # Clear leftovers of a previous attempt before submitting
      Open.rm fsync
      Open.rm fexit
      Open.rm fout
      Open.rm ferr

      # The job id is taken as the first run of digits in bsub's output
      job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
      Log.debug "BSUB job id: #{job}"
      Open.write(fjob, job.to_s)
      job
    end
  end
end
|
|
116
|
+
|
|
117
|
+
# Returns the raw output of `bjobs -w`, restricted to +job+ when one is given.
#
# job - job id to query; nil lists every job bjobs reports
def self.job_status(job = nil)
  return CMD.cmd("bjobs -w").read if job.nil?

  CMD.cmd("bjobs -w #{job}").read
end
|
|
124
|
+
end
|
|
125
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
module PBS
|
|
5
|
+
extend SchedulerJob
|
|
6
|
+
|
|
7
|
+
# Name of this batch system. batch_system_variables interpolates it into the
# exported BATCH_SYSTEM shell variable.
def self.system
  'PBS'
end
|
|
10
|
+
|
|
11
|
+
# Returns a bash snippet (String) that computes and exports the memory and
# identification variables for a PBS job, then changes into PBS_O_WORKDIR:
#
# * MAX_MEMORY_DEFAULT - node memory scaled by the share of processors used
# * MAX_MEMORY         - PBS_MEM_PER_NODE, else PBS_MEM_PER_CPU times the cpu
#                        count, else MAX_MEMORY_DEFAULT
# * BATCH_JOB_ID       - PBS_JOBID
# * BATCH_SYSTEM       - the value of PBS.system
#
# PBS_CPUS_PER_TASK falls back to 1 when unset: an empty expansion would turn
# the `let` arithmetic into a syntax error and leave the variables empty.
def self.batch_system_variables
  <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / ${PBS_CPUS_PER_TASK:-1} )"
MAX_MEMORY="$MAX_MEMORY_DEFAULT"
[ ! -z $PBS_MEM_PER_CPU ] && let MAX_MEMORY="$PBS_MEM_PER_CPU * ${PBS_CPUS_PER_TASK:-1}"
[ ! -z $PBS_MEM_PER_NODE ] && MAX_MEMORY="$PBS_MEM_PER_NODE"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$PBS_JOBID
export BATCH_SYSTEM=#{system}

cd ${PBS_O_WORKDIR}
  EOF
end
|
|
26
|
+
|
|
27
|
+
# Builds the header of a PBS job script: the bash shebang followed by one
# `#PBS` directive per configured option.
#
# Recognised options (symbol or string keys, read through
# IndiferentHash.process_options):
#   :batch_dir   - directory holding std.out / std.err (required; -o / -e)
#   :queue       - queue (-q)
#   :account     - account (-A)
#   :time        - wall time (-l walltime=); a value without ":" is converted
#                  through Misc.timespan / Misc.format_seconds
#   :nodes       - value for -l select=
#   :place       - value for -l place=, default 'scatter'
#   :partition   - value for -l system=
#   :filesystems - String or Array for -l filesystems=, default "home"
#
# `-k doe` is always added. Options that are nil or empty produce no
# directive, and a missing :time is skipped instead of raising. Other
# resource options (cpus, memory, constraints, ...) are not translated into
# PBS directives.
#
# Returns the header as a String. The caller's hash is not modified.
def self.header(options = {})
  options = options.dup

  batch_dir   = IndiferentHash.process_options options, :batch_dir

  queue       = IndiferentHash.process_options options, :queue
  account     = IndiferentHash.process_options options, :account
  time        = IndiferentHash.process_options options, :time
  nodes       = IndiferentHash.process_options options, :nodes

  place       = IndiferentHash.process_options options, :place, :place => 'scatter'
  # Named pbs_system so it does not shadow PBS.system
  pbs_system  = IndiferentHash.process_options options, :partition
  filesystems = IndiferentHash.process_options options, :filesystems

  filesystems = "home" if filesystems.nil?
  filesystems = filesystems * "," if Array === filesystems

  fout = File.join(batch_dir, 'std.out')
  ferr = File.join(batch_dir, 'std.err')

  time = Misc.format_seconds Misc.timespan(time) unless time.nil? || time.include?(":")

  # Keys carry their own separator ("=" or " "), values are appended quoted
  qsub_params = { "-l filesystems=" => filesystems,
                  "-l system=" => pbs_system,
                  "-l select=" => nodes,
                  "-l place=" => place,
                  "-l walltime=" => time,
                  "-q " => queue,
                  "-A " => account,
                  "-o " => fout,
                  "-e " => ferr,
                  "-k doe" => true,
  }

  header = "#!/bin/bash\n".dup

  qsub_params.each do |name, value|
    next if value.nil? || value == ""
    if TrueClass === value
      header << "#PBS #{name}" << "\n"
    elsif Array === value
      value.each do |v|
        header << "#PBS #{name}\"#{v}\"" << "\n"
      end
    else
      header << "#PBS #{name}\"#{value}\"" << "\n"
    end
  end

  header
end
|
|
108
|
+
|
|
109
|
+
# Submits the job script stored in +batch_dir+ (command.batch) with `qsub`.
#
# Files used inside +batch_dir+:
#   exit.status                - if present, nothing is done (returns nil)
#   job.id                     - if present, its content is returned as the
#                                job id and nothing is submitted
#   dependencies.list          - one job id per line; joined into afterok:
#   canfail_dependencies.list  - one job id per line; joined into afterany:
#   std.out                    - if present, nothing is submitted (returns nil)
#
# batch_dir - directory of the batch job
# dry_run   - when truthy, prints how to submit the script and raises DryRun
#             with +batch_dir+ as message instead of submitting
#
# Returns the Integer job id, or nil when the job was already started/finished.
def self.run_template(batch_dir, dry_run)
  fout   = File.join(batch_dir, 'std.out')
  ferr   = File.join(batch_dir, 'std.err')
  fjob   = File.join(batch_dir, 'job.id')
  fdep   = File.join(batch_dir, 'dependencies.list')
  fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
  fexit  = File.join(batch_dir, 'exit.status')
  fsync  = File.join(batch_dir, 'sync.log')
  fcmd   = File.join(batch_dir, 'command.batch')

  return if Open.exists?(fexit)

  Log.info "Issuing PBS file: #{fcmd}"
  Log.debug Open.read(fcmd)

  if File.exist?(fjob)
    job = Open.read(fjob).to_i
  else
    dependencies = Open.read(fdep).split("\n") if File.exist? fdep
    canfail_dependencies = Open.read(fcfdep).split("\n") if File.exist? fcfdep

    normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
    canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

    if normal_dep_str.nil? && canfail_dep_str.nil?
      dep_str = ""
    else
      dep_str = '-W depend=' + [normal_dep_str, canfail_dep_str].compact * ","
    end

    cmd = "qsub #{dep_str} '#{fcmd}'"

    if File.exist?(fout)
      return
    elsif dry_run
      # Hint uses the real submit command (qsub) and the same
      # "scout <system> tail" form printed by the LSF and SLURM modules
      STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "qsub '#{fcmd}'")
      STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout pbs tail '#{batch_dir}'")
      raise DryRun, batch_dir
    else
      # Clear leftovers of a previous attempt before submitting
      Open.rm fsync
      Open.rm fexit
      Open.rm fout
      Open.rm ferr

      # The job id is taken as the first run of digits in qsub's output
      job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
      Log.debug "QSUB job id: #{job}"
      Open.write(fjob, job.to_s)
      job
    end
  end
end
|
|
162
|
+
|
|
163
|
+
# Returns the raw output of `qstat`, restricted to +job+ when one is given.
#
# job - job id to query; nil lists everything qstat reports
#
# When querying a specific job, any StandardError from the command is
# swallowed and an empty String is returned.
def self.job_status(job = nil)
  return CMD.cmd("qstat").read if job.nil?

  begin
    CMD.cmd("qstat #{job}").read
  rescue
    ""
  end
end
|
|
174
|
+
|
|
175
|
+
end
|
|
176
|
+
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
module SLURM
|
|
5
|
+
|
|
6
|
+
extend SchedulerJob
|
|
7
|
+
|
|
8
|
+
# Name of this batch system. batch_system_variables interpolates it into the
# exported BATCH_SYSTEM shell variable.
def self.system
  'SLURM'
end
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Returns a bash snippet (String) that computes and exports the memory and
# identification variables for a SLURM job:
#
# * MAX_MEMORY_DEFAULT - node memory scaled by the share of processors used
# * MAX_MEMORY         - SLURM_MEM_PER_NODE, else SLURM_MEM_PER_CPU times the
#                        cpu count, else MAX_MEMORY_DEFAULT
# * BATCH_JOB_ID       - SLURM_JOB_ID
# * BATCH_SYSTEM       - the value of SLURM.system
#
# SLURM_CPUS_PER_TASK falls back to 1 when unset (SLURM only sets it when
# --cpus-per-task was requested, and header omits that directive when
# :task_cpus is not given): an empty expansion would turn the `let`
# arithmetic into a syntax error and leave the variables empty.
def self.batch_system_variables
  <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / ${SLURM_CPUS_PER_TASK:-1} )"
MAX_MEMORY="$MAX_MEMORY_DEFAULT"
[ ! -z $SLURM_MEM_PER_CPU ] && let MAX_MEMORY="$SLURM_MEM_PER_CPU * ${SLURM_CPUS_PER_TASK:-1}"
[ ! -z $SLURM_MEM_PER_NODE ] && MAX_MEMORY="$SLURM_MEM_PER_NODE"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$SLURM_JOB_ID
export BATCH_SYSTEM=#{system}
  EOF
end
|
|
26
|
+
|
|
27
|
+
# Builds the header of a SLURM job script: the bash shebang followed by one
# `#SBATCH --name="value"` directive per configured option.
#
# Recognised options (symbol or string keys, read through
# IndiferentHash.process_options):
#   :batch_dir   - directory holding std.out / std.err (required)
#   :batch_name  - --job-name
#   :queue       - --qos
#   :account, :partition, :nodes, :licenses, :gres, :mem, :mem_per_cpu
#                - passed through under the matching sbatch option
#   :task_cpus   - --cpus-per-task
#   :time        - --time; a value without ":" is converted through
#                  Misc.timespan / Misc.format_seconds
#   :constraints - --constraint (the sbatch option is singular)
#   :highmem     - when truthy, "highmem" is and-ed ("&") to the constraints
#   :exclusive   - true adds a bare --exclusive; any other non-empty value is
#                  passed as --exclusive="value"
#
# nil, false and empty values produce no directive, so `exclusive: false`
# does not become `--exclusive="false"`. Array values produce one directive
# per element. A missing :time is skipped instead of raising.
#
# Returns the header as a String. The caller's hash is not modified.
def self.header(options = {})
  options = options.dup

  queue       = IndiferentHash.process_options options, :queue
  account     = IndiferentHash.process_options options, :account
  partition   = IndiferentHash.process_options options, :partition
  task_cpus   = IndiferentHash.process_options options, :task_cpus
  time        = IndiferentHash.process_options options, :time
  nodes       = IndiferentHash.process_options options, :nodes
  exclusive   = IndiferentHash.process_options options, :exclusive
  highmem     = IndiferentHash.process_options options, :highmem
  licenses    = IndiferentHash.process_options options, :licenses
  constraints = IndiferentHash.process_options options, :constraints
  gres        = IndiferentHash.process_options options, :gres

  constraints = [constraints, "highmem"].compact * "&" if highmem

  mem         = IndiferentHash.process_options options, :mem
  mem_per_cpu = IndiferentHash.process_options options, :mem_per_cpu

  batch_dir   = IndiferentHash.process_options options, :batch_dir
  batch_name  = IndiferentHash.process_options options, :batch_name

  fout = File.join(batch_dir, 'std.out')
  ferr = File.join(batch_dir, 'std.err')

  time = Misc.format_seconds Misc.timespan(time) unless time.nil? || time.include?(":")

  sbatch_params = {"job-name" => batch_name,
                   "qos" => queue,
                   "account" => account,
                   "partition" => partition,
                   "output" => fout,
                   "error" => ferr,
                   "cpus-per-task" => task_cpus,
                   "nodes" => nodes,
                   "time" => time,
                   "constraint" => constraints,
                   "exclusive" => exclusive,
                   "licenses" => licenses,
                   "gres" => gres,
                   "mem" => mem,
                   "mem-per-cpu" => mem_per_cpu,
  }

  header = "#!/bin/bash\n".dup

  sbatch_params.each do |name, value|
    next if value.nil? || value == false || value == ""
    if TrueClass === value
      header << "#SBATCH --#{name}" << "\n"
    elsif Array === value
      value.each do |v|
        header << "#SBATCH --#{name}=\"#{v}\"" << "\n"
      end
    else
      header << "#SBATCH --#{name}=\"#{value}\"" << "\n"
    end
  end

  header
end
|
|
93
|
+
|
|
94
|
+
# Submits the job script stored in +batch_dir+ (command.batch) with `sbatch`.
#
# Files used inside +batch_dir+:
#   exit.status                - if present, nothing is done (returns nil)
#   job.id                     - if present, its content is returned as the
#                                job id and nothing is submitted
#   dependencies.list          - one job id per line; joined into afterok:
#   canfail_dependencies.list  - one job id per line; joined into afterany:
#   std.out                    - if present, nothing is submitted (returns nil)
#
# batch_dir - directory of the batch job
# dry_run   - when truthy, prints how to submit the script and raises DryRun
#             with +batch_dir+ as message instead of submitting
#
# Returns the Integer job id, or nil when the job was already started/finished.
def self.run_template(batch_dir, dry_run)
  fout, ferr, fjob, fdep, fcfdep, fexit, fsync, fcmd =
    %w(std.out std.err job.id dependencies.list canfail_dependencies.list exit.status sync.log command.batch).
    map { |basename| File.join(batch_dir, basename) }

  return if Open.exists?(fexit)

  Log.info "Issuing SLURM file: #{fcmd}"

  return Open.read(fjob).to_i if File.exist?(fjob)

  # One dependency clause per list file that exists and has entries
  dep_clauses = [[fdep, "afterok:"], [fcfdep, "afterany:"]].map do |list_file, prefix|
    next unless File.exist? list_file
    ids = Open.read(list_file).split("\n")
    prefix + ids * ":" if ids.any?
  end.compact

  dep_str = dep_clauses.empty? ? "" : '--dependency=' + dep_clauses * ","

  cmd = "sbatch #{dep_str} '#{fcmd}'"

  # An existing std.out means the job was already started
  return if File.exist?(fout)

  if dry_run
    STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
    STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout slurm tail '#{batch_dir}'")
    raise DryRun, batch_dir
  end

  # Clear leftovers of a previous attempt before submitting
  [fsync, fexit, fout, ferr].each { |leftover| Open.rm leftover }

  # The job id is taken as the first run of digits in sbatch's output
  job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
  Log.debug "SBATCH job id: #{job}"
  Open.write(fjob, job.to_s)
  job
end
|
|
146
|
+
|
|
147
|
+
# Returns the raw output of `squeue`, restricted to +job+ when one is given.
#
# job - job id to query; nil lists everything squeue reports
#
# When querying a specific job, any StandardError from the command is
# swallowed and an empty String is returned.
def self.job_status(job = nil)
  return CMD.cmd("squeue").read if job.nil?

  begin
    CMD.cmd("squeue --job #{job}").read
  rescue
    ""
  end
end
|
|
158
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require_relative 'orchestrator/chains'
|
|
2
|
+
require_relative 'orchestrator/rules'
|
|
3
|
+
require_relative 'orchestrator/batches'
|
|
4
|
+
|
|
5
|
+
require_relative 'scheduler/slurm'
|
|
6
|
+
require_relative 'scheduler/pbs'
|
|
7
|
+
require_relative 'scheduler/lfs'
|
|
8
|
+
|
|
9
|
+
module Workflow::Scheduler
|
|
10
|
+
# Groups +jobs+ into batches with Workflow::Orchestrator.job_batches and
# hands them to process_batches.
#
# jobs    - jobs to schedule
# rules   - rules passed to Workflow::Orchestrator.job_batches
# options - options forwarded to process_batches
#
# Returns whatever process_batches returns.
def self.produce(jobs, rules = {}, options = {})
  Workflow::Scheduler.process_batches(Workflow::Orchestrator.job_batches(rules, jobs), options)
end
|
|
14
|
+
|
|
15
|
+
# Submits every batch to the configured batch system, dependencies first.
#
# batches         - Array of batch hashes; the keys read here are :deps
#                   (Array of batch hashes, or nil), :rules, :jobs and
#                   :top_level
# process_options - defaults added to each batch's :rules through
#                   IndiferentHash.add_defaults
#
# Batches are ordered so that each one comes after all batches in its :deps.
# A nil :deps is treated as "no dependencies". If the remaining batches can
# never become ready (circular dependencies, or a dependency that is not part
# of +batches+) a RuntimeError is raised instead of looping forever.
#
# The batch system comes from Scout::Config (default 'SLURM'). Each
# dependency is passed as the id returned for its :top_level job, prefixed
# with 'canfail:' when that job answers true to canfail?.
#
# Returns an Array with, per batch in submission order, the id returned by
# run_job, or the DryRun message when run_job raised DryRun.
# Raises RuntimeError for an unknown/missing batch system or unsatisfiable
# dependencies.
def self.process_batches(batches, process_options = {})
  pending = batches.dup

  sorted = []
  while pending.any?
    ready = pending.select{|batch| (Array(batch[:deps]) - sorted).empty? }
    raise "Cannot order batches: circular or unknown dependencies" if ready.empty?
    sorted.concat(ready)
    pending -= ready
  end

  batch_system = Scout::Config.get :system, :batch, :scheduler, 'env:BATCH_SYSTEM', default: 'SLURM'

  batch_ids = {}
  sorted.collect do |batch|
    job_options = batch[:rules]
    job_options = IndiferentHash.add_defaults job_options, process_options.dup

    batch_dependencies =
      if batch[:deps].nil?
        []
      else
        batch[:deps].collect{|dep|
          dep_target = dep[:top_level]
          id = batch_ids[dep_target].to_s

          dep_target.canfail? ? 'canfail:' + id : id
        }
      end

    job_options.merge!(:batch_dependencies => batch_dependencies )
    job_options.merge!(:manifest => batch[:jobs].collect{|d| d.task_signature })

    begin
      id, _dir = case batch_system
                 when 'SLURM'
                   SLURM.run_job(batch[:top_level], job_options)
                 when 'LSF'
                   LSF.run_job(batch[:top_level], job_options)
                 when 'PBS'
                   PBS.run_job(batch[:top_level], job_options)
                 when nil
                   raise "No batch system specified"
                 else
                   raise "Unknown batch system #{batch_system}"
                 end
      batch_ids[batch[:top_level]] = id
    rescue DryRun => e
      e.message
    end
  end
end
|
|
73
|
+
end
|
|
@@ -1,2 +1,11 @@
|
|
|
1
|
-
require_relative 'deployment/
|
|
1
|
+
require_relative 'deployment/local'
|
|
2
|
+
require_relative 'deployment/scheduler'
|
|
2
3
|
require_relative 'deployment/trace'
|
|
4
|
+
require_relative 'deployment/queue'
|
|
5
|
+
|
|
6
|
+
module Workflow
|
|
7
|
+
# Loads the orchestration rules for +jobs+ and delegates to
# Workflow::LocalExecutor.produce, forwarding every extra argument and block.
def self.produce(jobs, ...)
  Workflow::LocalExecutor.produce(jobs, Workflow::Orchestrator.load_rules_for_job(jobs), ...)
end
|
|
11
|
+
end
|
|
@@ -71,10 +71,31 @@ module EntityWorkflow
|
|
|
71
71
|
end
|
|
72
72
|
|
|
73
73
|
property_name = task_name.to_s.sub(/^(#{entity_name}_list|#{entity_name}|list)_/, '')
|
|
74
|
+
property_job_name = property_name + '_job'
|
|
75
|
+
|
|
76
|
+
property property_job_name => property_type do |*args|
|
|
77
|
+
job(task_name, *args)
|
|
78
|
+
end
|
|
79
|
+
|
|
74
80
|
property property_name => property_type do |*args|
|
|
75
|
-
job =
|
|
81
|
+
job = self.send(property_job_name)
|
|
82
|
+
|
|
83
|
+
job.join if job.running?
|
|
84
|
+
|
|
85
|
+
if job.error?
|
|
86
|
+
if job.recoverable_error?
|
|
87
|
+
job.clean
|
|
88
|
+
else
|
|
89
|
+
raise job.exception
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
job.run unless job.done?
|
|
94
|
+
|
|
95
|
+
job.load
|
|
76
96
|
Array === job ? job.collect(&:run) : job.run
|
|
77
97
|
end
|
|
98
|
+
|
|
78
99
|
end
|
|
79
100
|
|
|
80
101
|
def entity_task(task_name, *args, &block)
|
|
@@ -8,10 +8,13 @@ class Step
|
|
|
8
8
|
new_tokens = []
|
|
9
9
|
if workflow
|
|
10
10
|
workflow_name = workflow.name
|
|
11
|
-
new_tokens << ("workflow:"
|
|
12
|
-
new_tokens << ("task:"
|
|
11
|
+
new_tokens << ("workflow:" + workflow_name)
|
|
12
|
+
new_tokens << ("task:" + workflow_name << "#" << task_name.to_s)
|
|
13
13
|
end
|
|
14
|
-
new_tokens << ("task:"
|
|
14
|
+
new_tokens << ("task:" + task_name.to_s)
|
|
15
|
+
new_tokens << (task_name.to_s)
|
|
16
|
+
new_tokens << (workflow_name)
|
|
17
|
+
new_tokens << ("task")
|
|
15
18
|
|
|
16
19
|
Scout::Config.get(key, tokens + new_tokens, options)
|
|
17
20
|
end
|
|
@@ -38,8 +38,8 @@ class Step
|
|
|
38
38
|
save_info(@info = info)
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
-
def init_info
|
|
42
|
-
log
|
|
41
|
+
def init_info(status=:waiting)
|
|
42
|
+
log status unless info_file.nil? || Open.exists?(info_file)
|
|
43
43
|
end
|
|
44
44
|
|
|
45
45
|
def info
|
|
@@ -120,7 +120,7 @@ class Step
|
|
|
120
120
|
if info.include?(key)
|
|
121
121
|
case info[key]
|
|
122
122
|
when Array
|
|
123
|
-
info[key].concat
|
|
123
|
+
info[key].concat(Array === value ? value : [value])
|
|
124
124
|
when Hash
|
|
125
125
|
info[key].merge! value
|
|
126
126
|
else
|
|
@@ -201,7 +201,13 @@ class Step
|
|
|
201
201
|
end
|
|
202
202
|
|
|
203
203
|
def exception
|
|
204
|
-
info[:exception]
|
|
204
|
+
return nil unless info[:exception]
|
|
205
|
+
begin
|
|
206
|
+
Marshal.load(Base64.decode64(info[:exception]))
|
|
207
|
+
rescue
|
|
208
|
+
Log.exception $!
|
|
209
|
+
nil
|
|
210
|
+
end
|
|
205
211
|
end
|
|
206
212
|
|
|
207
213
|
# Marshal Step
|