scout-gear 10.9.0 → 10.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +25 -0
- data/VERSION +1 -1
- data/bin/scout +4 -1
- data/lib/scout/knowledge_base/registry.rb +2 -3
- data/lib/scout/workflow/definition.rb +11 -0
- data/lib/scout/workflow/deployment/local.rb +288 -0
- data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
- data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
- data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
- data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
- data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
- data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
- data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
- data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
- data/lib/scout/workflow/deployment/scheduler.rb +73 -0
- data/lib/scout/workflow/deployment.rb +10 -1
- data/lib/scout/workflow/exceptions.rb +2 -0
- data/lib/scout/workflow/step/config.rb +3 -0
- data/lib/scout/workflow/step/info.rb +2 -2
- data/lib/scout/workflow/step/progress.rb +52 -0
- data/lib/scout/workflow/step.rb +30 -1
- data/lib/scout/workflow/task.rb +2 -0
- data/scout-gear.gemspec +23 -4
- data/scout_commands/batch/list +1 -1
- data/scout_commands/workflow/cmd +5 -13
- data/scout_commands/workflow/info +1 -1
- data/scout_commands/workflow/task +61 -25
- data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
- data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
- data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
- data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
- data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
- data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
- data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
- data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
- data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
- data/test/scout/workflow/deployment/test_trace.rb +1 -1
- data/test/scout/workflow/step/test_progress.rb +27 -0
- data/test/scout/workflow/task/test_inputs.rb +17 -0
- data/test/test_helper.rb +2 -1
- metadata +22 -3
- data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
# Adapter for the IBM LSF batch system. Provides the interface expected by
# the generic SchedulerJob machinery: system name, shell environment
# preamble, #BSUB submission header, submission (bsub) and status (bjobs).
module LSF
  extend SchedulerJob

  # Identifier exported as BATCH_SYSTEM and used for scheduler selection.
  def self.system
    "LSF"
  end

  # Shell preamble for batch scripts: derives MAX_MEMORY (in MB) from the
  # node memory and the CPUs allotted to the job, then exports batch
  # metadata for the running job.
  def self.batch_system_variables
    <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $LSB_MAX_NUM_PROCESSORS )"
[ ! -z $LSB_MAX_MEM_RUSAGE ] && let MAX_MEMORY="$LSB_MAX_MEM_RUSAGE" || MAX_MEMORY="$MAX_MEMORY_DEFAULT"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$LSB_JOBID
export BATCH_SYSTEM=#{system}
    EOF
  end

  # Build the #BSUB header for a batch script from the job rules in
  # +options+. Consumed options: :queue, :task_cpus, :time, :nodes,
  # :workdir, :exclusive, :batch_dir, :batch_name.
  def self.header(options = {})
    options = options.dup

    queue = IndiferentHash.process_options options, :queue
    task_cpus = IndiferentHash.process_options options, :task_cpus
    time = IndiferentHash.process_options options, :time
    nodes = IndiferentHash.process_options options, :nodes
    workdir = IndiferentHash.process_options options, :workdir
    exclusive = IndiferentHash.process_options options, :exclusive

    batch_dir = IndiferentHash.process_options options, :batch_dir
    batch_name = IndiferentHash.process_options options, :batch_name
    batch_name ||= File.basename(batch_dir)

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"

    # bsub -W expects HH:MM; drop the seconds field
    time = time.split(":").values_at(0, 1) * ":"

    header =<<-EOF
#!/bin/bash
#BSUB -J "#{batch_name}"
#BSUB -cwd "#{workdir}"
#BSUB -oo "#{fout}"
#BSUB -eo "#{ferr}"
#BSUB -q "#{queue}"
#BSUB -n "#{task_cpus}"
#BSUB -W "#{time}"
    EOF

    header << "#BSUB -x" << "\n" if exclusive

    header
  end

  # Submit the prepared batch script under +batch_dir+ with bsub, wiring up
  # recorded dependencies; returns the numeric job id. With +dry_run+ the
  # submission command is printed and DryRun is raised instead.
  def self.run_template(batch_dir, dry_run)

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')
    fjob = File.join(batch_dir, 'job.id')
    fdep = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit = File.join(batch_dir, 'exit.status')
    fsync = File.join(batch_dir, 'sync.log')
    fcmd = File.join(batch_dir, 'command.batch')

    # Already finished; nothing to submit
    return if Open.exists?(fexit)

    Log.info "Issuing LSF file: #{fcmd}"
    Log.debug Open.read(fcmd)

    if File.exist?(fjob)
      job = Open.read(fjob).to_i
    else

      dependencies = Open.read(fdep).split("\n") if File.exist? fdep
      canfail_dependencies = Open.read(fcfdep).split("\n") if File.exist? fcfdep

      # Hard dependencies wait for post-processing to finish successfully;
      # canfail dependencies only need to have ended, successfully or not —
      # LSF's done() requires a DONE (successful) status, so ended() is the
      # correct condition here.
      normal_dep_list = dependencies && dependencies.any? ? dependencies.collect{|d| "post_done(#{d})"} : []
      canfail_dep_list = canfail_dependencies && canfail_dependencies.any? ? canfail_dependencies.collect{|d| "ended(#{d})"} : []

      dep_list = normal_dep_list + canfail_dep_list

      if dep_list.any?
        dep_str = '-w "' + dep_list * " && " + '"'
      else
        dep_str = ""
      end

      cmd = "bsub #{dep_str} < '#{fcmd}'"

      if File.exist?(fout)
        return
      elsif dry_run
        STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
        STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout lsf tail '#{batch_dir}'")
        raise DryRun, batch_dir
      else
        Open.rm fsync
        Open.rm fexit
        Open.rm fout
        Open.rm ferr

        job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
        Log.debug "BSUB job id: #{job}"
        Open.write(fjob, job.to_s)
        job
      end
    end
  end

  # Raw scheduler status output: all jobs, or a single job id.
  def self.job_status(job = nil)
    if job.nil?
      CMD.cmd("bjobs -w").read
    else
      CMD.cmd("bjobs -w #{job}").read
    end
  end
end
|
|
125
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
# Adapter for the PBS (Portable Batch System) scheduler. Provides the
# interface expected by SchedulerJob: system name, shell environment
# preamble, #PBS submission header, submission (qsub) and status (qstat).
module PBS
  extend SchedulerJob

  # Identifier exported as BATCH_SYSTEM and used for scheduler selection.
  def self.system
    "PBS"
  end

  # Shell preamble for batch scripts: derives MAX_MEMORY (in MB) from the
  # node memory and allotted CPUs, exports batch metadata and changes into
  # the submission working directory.
  def self.batch_system_variables
    <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $PBS_CPUS_PER_TASK )"
MAX_MEMORY="$MAX_MEMORY_DEFAULT"
[ ! -z $PBS_MEM_PER_CPU ] && let MAX_MEMORY="$PBS_MEM_PER_CPU * $PBS_CPUS_PER_TASK"
[ ! -z $PBS_MEM_PER_NODE ] && MAX_MEMORY="$PBS_MEM_PER_NODE"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$PBS_JOBID
export BATCH_SYSTEM=#{system}

cd ${PBS_O_WORKDIR}
    EOF
  end

  # Build the #PBS header for a batch script from the job rules in +options+.
  def self.header(options = {})
    options = options.dup

    workdir = IndiferentHash.process_options options, :workdir
    batch_dir = IndiferentHash.process_options options, :batch_dir
    batch_name = IndiferentHash.process_options options, :batch_name

    queue = IndiferentHash.process_options options, :queue
    account = IndiferentHash.process_options options, :account
    time = IndiferentHash.process_options options, :time
    nodes = IndiferentHash.process_options options, :nodes

    # PBS
    place = IndiferentHash.process_options options, :place, :place => 'scatter'
    # NOTE(review): this consumes :partition, so the "NOT USED" :partition
    # lookup below always yields nil — confirm whether a :system option was
    # intended here instead.
    system = IndiferentHash.process_options options, :partition
    filesystems = IndiferentHash.process_options options, :filesystems

    filesystems = "home" if filesystems.nil?

    filesystems = filesystems * "," if Array === filesystems

    # NOT USED: consumed so they do not leak into the remaining options
    partition = IndiferentHash.process_options options, :partition
    task_cpus = IndiferentHash.process_options options, :task_cpus
    exclusive = IndiferentHash.process_options options, :exclusive
    highmem = IndiferentHash.process_options options, :highmem
    licenses = IndiferentHash.process_options options, :licenses
    constraint = IndiferentHash.process_options options, :constraint
    gres = IndiferentHash.process_options options, :gres

    constraint = [constraint, "highmem"].compact * "&" if highmem

    mem = IndiferentHash.process_options options, :mem
    mem_per_cpu = IndiferentHash.process_options options, :mem_per_cpu

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"

    # Keys embed the full qsub flag prefix; values are appended quoted below.
    qsub_params = { "-l filesystems=" => filesystems,
                    "-l system=" => system,
                    "-l select=" => nodes,
                    "-l place=" => place,
                    "-l walltime=" => time,
                    "-q " => queue,
                    "-A " => account,
                    "-o " => fout,
                    "-e " => ferr,
                    "-k doe" => true,
    }

    header =<<-EOF
#!/bin/bash
    EOF

    qsub_params.each do |name,value|
      next if value.nil? || value == ""
      if TrueClass === value
        header << "#PBS #{name}" << "\n"
      elsif Array === value
        value.each do |v|
          header << "#PBS #{name}\"#{v}\"" << "\n"
        end
      else
        header << "#PBS #{name}\"#{value}\"" << "\n"
      end
    end

    header
  end

  # Submit the prepared batch script under +batch_dir+ with qsub, wiring up
  # recorded dependencies; returns the numeric job id. With +dry_run+ the
  # submission command is printed and DryRun is raised instead.
  def self.run_template(batch_dir, dry_run)

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')
    fjob = File.join(batch_dir, 'job.id')
    fdep = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit = File.join(batch_dir, 'exit.status')
    fsync = File.join(batch_dir, 'sync.log')
    fcmd = File.join(batch_dir, 'command.batch')

    # Already finished; nothing to submit
    return if Open.exists?(fexit)

    Log.info "Issuing PBS file: #{fcmd}"
    Log.debug Open.read(fcmd)

    if File.exist?(fjob)
      job = Open.read(fjob).to_i
    else

      dependencies = Open.read(fdep).split("\n") if File.exist? fdep
      canfail_dependencies = Open.read(fcfdep).split("\n") if File.exist? fcfdep

      # afterok: wait for success; afterany: wait for completion regardless
      normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
      canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

      if normal_dep_str.nil? && canfail_dep_str.nil?
        dep_str = ""
      else
        dep_str = '-W depend=' + [normal_dep_str, canfail_dep_str].compact * ","
      end

      cmd = "qsub #{dep_str} '#{fcmd}'"

      if File.exist?(fout)
        return
      elsif dry_run
        # Fixed: previously printed the misspelled 'squb' and the legacy
        # 'rbbt' command names
        STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "qsub '#{fcmd}'")
        STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout pbs tail '#{batch_dir}'")
        raise DryRun, batch_dir
      else
        Open.rm fsync
        Open.rm fexit
        Open.rm fout
        Open.rm ferr

        job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
        Log.debug "QSUB job id: #{job}"
        Open.write(fjob, job.to_s)
        job
      end
    end
  end

  # Raw scheduler status output: all jobs, or a single job id. Returns an
  # empty string when qstat fails for the given job (e.g. already purged).
  def self.job_status(job = nil)
    if job.nil?
      CMD.cmd("qstat").read
    else
      begin
        CMD.cmd("qstat #{job}").read
      rescue
        ""
      end
    end
  end

end
|
|
176
|
+
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
require_relative 'job'
|
|
2
|
+
require 'scout'
|
|
3
|
+
|
|
4
|
+
# Adapter for the Slurm workload manager. Provides the interface expected
# by SchedulerJob: system name, shell environment preamble, #SBATCH
# submission header, submission (sbatch) and status (squeue).
module SLURM

  extend SchedulerJob

  # Identifier exported as BATCH_SYSTEM and used for scheduler selection.
  def self.system
    "SLURM"
  end

  # Shell preamble for batch scripts: derives MAX_MEMORY (in MB) from the
  # node memory and allotted CPUs, then exports batch metadata.
  def self.batch_system_variables
    <<-EOF
let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $SLURM_CPUS_PER_TASK )"
MAX_MEMORY="$MAX_MEMORY_DEFAULT"
[ ! -z $SLURM_MEM_PER_CPU ] && let MAX_MEMORY="$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK"
[ ! -z $SLURM_MEM_PER_NODE ] && MAX_MEMORY="$SLURM_MEM_PER_NODE"
export MAX_MEMORY_DEFAULT
export MAX_MEMORY
export BATCH_JOB_ID=$SLURM_JOB_ID
export BATCH_SYSTEM=#{system}
    EOF
  end

  # Build the #SBATCH header for a batch script from the job rules in
  # +options+. The :constraints rule (note the plural option name, kept for
  # backward compatibility) is emitted as sbatch's --constraint flag.
  def self.header(options = {})
    options = options.dup

    queue = IndiferentHash.process_options options, :queue
    account = IndiferentHash.process_options options, :account
    partition = IndiferentHash.process_options options, :partition
    task_cpus = IndiferentHash.process_options options, :task_cpus
    time = IndiferentHash.process_options options, :time
    nodes = IndiferentHash.process_options options, :nodes
    workdir = IndiferentHash.process_options options, :workdir
    exclusive = IndiferentHash.process_options options, :exclusive
    highmem = IndiferentHash.process_options options, :highmem
    licenses = IndiferentHash.process_options options, :licenses
    constraints = IndiferentHash.process_options options, :constraints
    gres = IndiferentHash.process_options options, :gres

    constraints = [constraints, "highmem"].compact * "&" if highmem

    mem = IndiferentHash.process_options options, :mem
    mem_per_cpu = IndiferentHash.process_options options, :mem_per_cpu

    batch_dir = IndiferentHash.process_options options, :batch_dir
    batch_name = IndiferentHash.process_options options, :batch_name

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    time = Misc.format_seconds Misc.timespan(time) unless time.include? ":"

    # sbatch's long option is --constraint (singular); --constraints is not
    # recognized by sbatch and was previously emitted here.
    sbatch_params = {"job-name" => batch_name,
                     "qos" => queue,
                     "account" => account,
                     "partition" => partition,
                     "output" => fout,
                     "error" => ferr,
                     "cpus-per-task" => task_cpus,
                     "nodes" => nodes,
                     "time" => time,
                     "constraint" => constraints,
                     "exclusive" => exclusive,
                     "licenses" => licenses,
                     "gres" => gres,
                     "mem" => mem,
                     "mem-per-cpu" => mem_per_cpu,
    }

    header =<<-EOF
#!/bin/bash
    EOF

    sbatch_params.each do |name,value|
      next if value.nil? || value == ""
      if TrueClass === value
        header << "#SBATCH --#{name}" << "\n"
      elsif Array === value
        value.each do |v|
          header << "#SBATCH --#{name}=\"#{v}\"" << "\n"
        end
      else
        header << "#SBATCH --#{name}=\"#{value}\"" << "\n"
      end
    end

    header
  end

  # Submit the prepared batch script under +batch_dir+ with sbatch, wiring
  # up recorded dependencies; returns the numeric job id. With +dry_run+
  # the submission command is printed and DryRun is raised instead.
  def self.run_template(batch_dir, dry_run)

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')
    fjob = File.join(batch_dir, 'job.id')
    fdep = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit = File.join(batch_dir, 'exit.status')
    fsync = File.join(batch_dir, 'sync.log')
    fcmd = File.join(batch_dir, 'command.batch')

    # Already finished; nothing to submit
    return if Open.exists?(fexit)

    Log.info "Issuing SLURM file: #{fcmd}"

    if File.exist?(fjob)
      job = Open.read(fjob).to_i
    else

      dependencies = Open.read(fdep).split("\n") if File.exist? fdep
      canfail_dependencies = Open.read(fcfdep).split("\n") if File.exist? fcfdep

      # afterok: wait for success; afterany: wait for completion regardless
      normal_dep_str = dependencies && dependencies.any? ? "afterok:" + dependencies * ":" : nil
      canfail_dep_str = canfail_dependencies && canfail_dependencies.any? ? "afterany:" + canfail_dependencies * ":" : nil

      if normal_dep_str.nil? && canfail_dep_str.nil?
        dep_str = ""
      else
        dep_str = '--dependency=' + [normal_dep_str, canfail_dep_str].compact * ","
      end

      cmd = "sbatch #{dep_str} '#{fcmd}'"

      if File.exist?(fout)
        return
      elsif dry_run
        STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
        STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout slurm tail '#{batch_dir}'")
        raise DryRun, batch_dir
      else
        Open.rm fsync
        Open.rm fexit
        Open.rm fout
        Open.rm ferr

        job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
        Log.debug "SBATCH job id: #{job}"
        Open.write(fjob, job.to_s)
        job
      end
    end
  end

  # Raw scheduler status output: all jobs, or a single job id. Returns an
  # empty string when squeue fails for the given job (e.g. already purged).
  def self.job_status(job = nil)
    if job.nil?
      CMD.cmd("squeue").read
    else
      begin
        CMD.cmd("squeue --job #{job}").read
      rescue
        ""
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require_relative 'orchestrator/chains'
|
|
2
|
+
require_relative 'orchestrator/rules'
|
|
3
|
+
require_relative 'orchestrator/batches'
|
|
4
|
+
|
|
5
|
+
require_relative 'scheduler/slurm'
|
|
6
|
+
require_relative 'scheduler/pbs'
|
|
7
|
+
require_relative 'scheduler/lfs'
|
|
8
|
+
|
|
9
|
+
# Batch-system front end: turns orchestrator job batches into scheduler
# submissions (SLURM, LSF or PBS), submitting them in dependency order.
module Workflow::Scheduler
  # Group +jobs+ into batches according to +rules+ and submit them.
  def self.produce(jobs, rules = {}, options = {})
    batches = Workflow::Orchestrator.job_batches(rules, jobs)
    Workflow::Scheduler.process_batches(batches, options)
  end

  # Topologically sort +batches+ by their :deps and submit each one to the
  # configured batch system, passing the scheduler ids of its dependencies.
  # Raises when the dependency graph cannot be sorted (cycle) or when the
  # batch system is unknown. DryRun from the per-system submitters is
  # swallowed so a dry run walks the whole plan.
  def self.process_batches(batches, process_options = {})
    pending = batches.dup

    sorted = []
    while pending.any?
      # Batches whose dependencies (possibly nil) have all been scheduled
      leaf_nodes = pending.select{|batch| ((batch[:deps] || []) - sorted).empty? }
      # Without progress the original loop would spin forever
      raise "Could not order batches; circular dependencies?" if leaf_nodes.empty?
      sorted.concat(leaf_nodes)
      pending -= leaf_nodes
    end

    batch_system = Scout::Config.get :system, :batch, :scheduler, 'env:BATCH_SYSTEM', default: 'SLURM'

    # Maps each batch's top-level job to the scheduler id it was given
    batch_ids = {}
    sorted.collect do |batch|
      job_options = batch[:rules]
      job_options = IndiferentHash.add_defaults job_options, process_options.dup

      if batch[:deps].nil?
        batch_dependencies = []
      else
        batch_dependencies = batch[:deps].collect{|dep|
          dep_target = dep[:top_level]
          id = batch_ids[dep_target].to_s

          # Mark dependencies whose failure should not abort this batch
          if dep_target.canfail?
            'canfail:' + id
          else
            id
          end
        }
      end

      job_options.merge!(:batch_dependencies => batch_dependencies )
      job_options.merge!(:manifest => batch[:jobs].collect{|d| d.task_signature })

      begin
        id, dir = case batch_system
                  when 'SLURM'
                    SLURM.run_job(batch[:top_level], job_options)
                  when 'LSF'
                    LSF.run_job(batch[:top_level], job_options)
                  when 'PBS'
                    PBS.run_job(batch[:top_level], job_options)
                  when nil
                    raise "No batch system specified"
                  else
                    raise "Unknown batch system #{batch_system}"
                  end
        batch_ids[batch[:top_level]] = id
      rescue DryRun
        $!.message
      end
    end
  end
end
|
|
@@ -1,2 +1,11 @@
|
|
|
1
|
-
require_relative 'deployment/
|
|
1
|
+
require_relative 'deployment/local'
|
|
2
|
+
require_relative 'deployment/scheduler'
|
|
2
3
|
require_relative 'deployment/trace'
|
|
4
|
+
require_relative 'deployment/queue'
|
|
5
|
+
|
|
6
|
+
# Top-level convenience entry point for producing jobs locally.
module Workflow
  # Load the orchestration rules that apply to +jobs+ and delegate to the
  # local executor, forwarding any remaining arguments/keywords/block.
  def self.produce(jobs, ...)
    # NOTE(review): helper is named load_rules_for_job (singular) yet
    # receives the full jobs argument — confirm it accepts collections.
    rules = Workflow::Orchestrator.load_rules_for_job(jobs)
    Workflow::LocalExecutor.produce(jobs, rules, ...)
  end
end
|
|
@@ -12,6 +12,9 @@ class Step
|
|
|
12
12
|
new_tokens << ("task:" + workflow_name << "#" << task_name.to_s)
|
|
13
13
|
end
|
|
14
14
|
new_tokens << ("task:" + task_name.to_s)
|
|
15
|
+
new_tokens << (task_name.to_s)
|
|
16
|
+
new_tokens << (workflow_name)
|
|
17
|
+
new_tokens << ("task")
|
|
15
18
|
|
|
16
19
|
Scout::Config.get(key, tokens + new_tokens, options)
|
|
17
20
|
end
|
|
@@ -25,5 +25,57 @@ class Step
|
|
|
25
25
|
kwargs[:bar] = self.progress_bar(desc) unless kwargs.include?(:bar)
|
|
26
26
|
TSV.traverse obj, **kwargs, &block
|
|
27
27
|
end
|
|
28
|
+
|
|
29
|
+
# Wrap +stream+ so progress is reported while it is consumed, optionally
# compressing the monitored output.
#
# options[:bar] selects the progress bar: true builds a default bar, a Hash
# passes bar options, a Numeric sets the bar maximum, any other value is
# used as the bar itself (nil disables progress).
# The block, when given, is invoked per line; with arity 2 it also receives
# the bar and is then responsible for ticking it.
# options[:compress]/:gzip/:bgzip select the compression of the returned
# stream ('bgzip' or a truthy :bgzip picks bgzip over gzip).
def monitor_stream(stream, options = {}, &block)
  case options[:bar]
  when TrueClass
    bar = progress_bar
  when Hash
    bar = progress_bar options[:bar]
  when Numeric
    bar = progress_bar :max => options[:bar]
  else
    bar = options[:bar]
  end

  # NOTE(review): a block with arity > 2 (or negative arity) falls through
  # every branch and leaves out nil when a bar is set — confirm intended.
  out = if bar.nil?
          Open.line_monitor_stream stream, &block
        elsif (block.nil? || block.arity == 0)
          Open.line_monitor_stream stream do
            bar.tick
          end
        elsif block.arity == 1
          Open.line_monitor_stream stream do |line|
            bar.tick
            block.call line
          end
        elsif block.arity == 2
          # Caller manages ticking when it asks for the bar explicitly
          Open.line_monitor_stream stream do |line|
            block.call line, bar
          end
        end

  # Ensure the bar is closed and deregistered when the stream finishes or
  # aborts (abort also errors the bar via the 'true' flag)
  if bar
    ConcurrentStream.setup(out, :abort_callback => Proc.new{
      bar.done
      Log::ProgressBar.remove_bar(bar, true)
    }, :callback => Proc.new{
      bar.done
      Log::ProgressBar.remove_bar(bar)
    })
  end

  bgzip = (options[:compress] || options[:gzip]).to_s == 'bgzip'
  bgzip = true if options[:bgzip]

  gzip = true if options[:compress] || options[:gzip]
  if bgzip
    Open.bgzip(out)
  elsif gzip
    Open.gzip(out)
  else
    out
  end
end
|
|
28
80
|
end
|
|
29
81
|
|
data/lib/scout/workflow/step.rb
CHANGED
|
@@ -43,6 +43,18 @@ class Step
|
|
|
43
43
|
end
|
|
44
44
|
end
|
|
45
45
|
end
|
|
46
|
+
|
|
47
|
+
def non_default_inputs
|
|
48
|
+
@non_default_inputs ||= begin
|
|
49
|
+
if info_file && Open.exists?(info_file)
|
|
50
|
+
info[:non_default_inputs]
|
|
51
|
+
else
|
|
52
|
+
[]
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
|
|
46
58
|
def inputs
|
|
47
59
|
@inputs ||= begin
|
|
48
60
|
if info_file && Open.exists?(info_file)
|
|
@@ -90,6 +102,8 @@ class Step
|
|
|
90
102
|
return name.split(".").first
|
|
91
103
|
end
|
|
92
104
|
|
|
105
|
+
attr_accessor :task_name
|
|
106
|
+
|
|
93
107
|
def task_name
|
|
94
108
|
return @task_name if @task_name
|
|
95
109
|
@task_name ||= @task.name if @task.respond_to?(:name)
|
|
@@ -189,6 +203,7 @@ class Step
|
|
|
189
203
|
:pid => Process.pid, :pid_hostname => Misc.hostname,
|
|
190
204
|
:task_name => task_name, :workflow => workflow.to_s,
|
|
191
205
|
:provided_inputs => Annotation.purge(provided_inputs),
|
|
206
|
+
:non_default_inputs => non_default_inputs,
|
|
192
207
|
:inputs => Annotation.purge(inputs), :input_names => input_names, :type => type,
|
|
193
208
|
:dependencies => (dependencies || []) .collect{|d| d.path }
|
|
194
209
|
|
|
@@ -223,8 +238,22 @@ class Step
|
|
|
223
238
|
|
|
224
239
|
@result
|
|
225
240
|
rescue Exception => e
|
|
226
|
-
merge_info :status => :error, :exception => Base64.encode64(Marshal.dump(e)), :end => Time.now, :backtrace => e.backtrace, :message => "#{e.class}: #{e.message}"
|
|
227
241
|
begin
|
|
242
|
+
begin
|
|
243
|
+
if ConcurrentStreamProcessFailed === e
|
|
244
|
+
s = e.concurrent_stream
|
|
245
|
+
e.concurrent_stream = nil
|
|
246
|
+
exception_encoded = Base64.encode64(Marshal.dump(e))
|
|
247
|
+
e.concurrent_stream = s
|
|
248
|
+
else
|
|
249
|
+
exception_encoded = Base64.encode64(Marshal.dump(e))
|
|
250
|
+
end
|
|
251
|
+
merge_info :status => :error, :exception => exception_encoded, :end => Time.now, :backtrace => e.backtrace, :message => "#{e.class}: #{e.message}"
|
|
252
|
+
rescue Exception
|
|
253
|
+
exception_encoded = Base64.encode64(Marshal.dump(Exception.new(e.message)))
|
|
254
|
+
merge_info :status => :error, :exception => exception_encoded, :end => Time.now, :backtrace => e.backtrace, :message => "#{e.class}: #{e.message}"
|
|
255
|
+
end
|
|
256
|
+
|
|
228
257
|
abort_dependencies
|
|
229
258
|
ensure
|
|
230
259
|
raise e
|