toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/slurm.py
CHANGED
@@ -11,19 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+
 import logging
 import math
 import os
-
+import sys
+from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import
-
-from toil.batchSystems.abstractBatchSystem import
-
-
+from typing import NamedTuple, TypeVar
+
+from toil.batchSystems.abstractBatchSystem import (
+    EXIT_STATUS_UNAVAILABLE_VALUE,
+    BatchJobExitReason,
+    InsufficientSystemResources,
+)
+from toil.batchSystems.abstractGridEngineBatchSystem import (
+    AbstractGridEngineBatchSystem,
+)
 from toil.batchSystems.options import OptionSetter
-from toil.
+from toil.bus import get_job_kind
+from toil.common import Config
+from toil.job import JobDescription, Requirer
+from toil.lib.conversions import strtobool
 from toil.lib.misc import CalledProcessErrorStderr, call_command
+from toil.statsAndLogging import TRACE
 
 logger = logging.getLogger(__name__)
 
@@ -36,7 +48,7 @@ logger = logging.getLogger(__name__)
 # it, so Toil should wait for it.
 #
 # We map from each terminal state to the Toil-ontology exit reason.
-TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+TERMINAL_STATES: dict[str, BatchJobExitReason] = {
     "BOOT_FAIL": BatchJobExitReason.LOST,
     "CANCELLED": BatchJobExitReason.KILLED,
     "COMPLETED": BatchJobExitReason.FINISHED,
@@ -47,12 +59,12 @@ TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
     "PREEMPTED": BatchJobExitReason.KILLED,
     "REVOKED": BatchJobExitReason.KILLED,
     "SPECIAL_EXIT": BatchJobExitReason.FAILED,
-    "TIMEOUT": BatchJobExitReason.KILLED
+    "TIMEOUT": BatchJobExitReason.KILLED,
 }
 
 # If a job is in one of these states, it might eventually move to a different
 # state.
-NONTERMINAL_STATES: Set[str] = {
+NONTERMINAL_STATES: set[str] = {
     "CONFIGURING",
     "COMPLETING",
     "PENDING",
@@ -65,51 +77,194 @@ NONTERMINAL_STATES: Set[str] = {
     "SIGNALING",
     "STAGE_OUT",
     "STOPPED",
-    "SUSPENDED"
-}
+    "SUSPENDED",
+}
+
+
+def parse_slurm_time(slurm_time: str) -> int:
+    """
+    Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds.
+
+    Raises ValueError if not parseable.
+    """
+    # slurm returns time in days-hours:minutes:seconds format
+    # Sometimes it will only return minutes:seconds, so days may be omitted
+    # For ease of calculating, we'll make sure all the delimeters are ':'
+    # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
+    total_seconds = 0
+    elapsed_split: list[str] = slurm_time.replace("-", ":").split(":")
+    elapsed_split.reverse()
+    seconds_per_unit = [1, 60, 3600, 86400]
+    for index, multiplier in enumerate(seconds_per_unit):
+        if index < len(elapsed_split):
+            total_seconds += multiplier * int(elapsed_split[index])
+    return total_seconds
+
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
+    class PartitionInfo(NamedTuple):
+        partition_name: str
+        gres: bool
+        time_limit: float
+        priority: int
+        cpus: str
+        memory: str
+
+    class PartitionSet:
+        """
+        Set of available partitions detected on the slurm batch system
+        """
+
+        default_gpu_partition: SlurmBatchSystem.PartitionInfo | None
+        all_partitions: list[SlurmBatchSystem.PartitionInfo]
+        gpu_partitions: set[str]
+
+        def __init__(self) -> None:
+            self._get_partition_info()
+            self._get_gpu_partitions()
+
+        def _get_gpu_partitions(self) -> None:
+            """
+            Get all available GPU partitions. Also get the default GPU partition.
+            :return: None
+            """
+            gpu_partitions = [
+                partition for partition in self.all_partitions if partition.gres
+            ]
+            self.gpu_partitions = {p.partition_name for p in gpu_partitions}
+            # Grab the lowest priority GPU partition
+            # If no GPU partitions are available, then set the default to None
+            self.default_gpu_partition = None
+            if len(gpu_partitions) > 0:
+                self.default_gpu_partition = sorted(
+                    gpu_partitions, key=lambda x: x.priority
+                )[0]
+
+        def _get_partition_info(self) -> None:
+            """
+            Call the Slurm batch system with sinfo to grab all available partitions.
+            Then parse the output and store all available Slurm partitions
+            :return: None
+            """
+            sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"]
+
+            sinfo = call_command(sinfo_command)
+
+            parsed_partitions = []
+            for line in sinfo.split("\n")[1:]:
+                if line.strip():
+                    partition_name, gres, time, priority, cpus, memory = line.split(" ")
+                    try:
+                        # Parse time to a number so we can compute on it
+                        partition_time: float = parse_slurm_time(time)
+                    except ValueError:
+                        # Maybe time is unlimited?
+                        partition_time = float("inf")
+                    try:
+                        # Parse priority to an int so we can sort on it
+                        partition_priority = int(priority)
+                    except ValueError:
+                        logger.warning(
+                            "Could not parse priority %s for partition %s, assuming high priority",
+                            partition_name,
+                            priority,
+                        )
+                        partition_priority = sys.maxsize
+                    parsed_partitions.append(
+                        SlurmBatchSystem.PartitionInfo(
+                            partition_name.rstrip("*"),
+                            gres != "(null)",
+                            partition_time,
+                            partition_priority,
+                            cpus,
+                            memory,
+                        )
+                    )
+            self.all_partitions = parsed_partitions
+
+        def get_partition(self, time_limit: float | None) -> str | None:
+            """
+            Get the partition name to use for a job with the given time limit.
+            """
+
+            if time_limit is None:
+                # Just use Slurm's default
+                return None
+
+            winning_partition = None
+            for partition in self.all_partitions:
+                if partition.time_limit >= time_limit and (
+                    winning_partition is None
+                    or partition.time_limit < winning_partition.time_limit
+                ):
+                    # If this partition can fit the job and is faster than the current winner, take it
+                    winning_partition = partition
+            # TODO: Store partitions in a better indexed way
+            if winning_partition is None and len(self.all_partitions) > 0:
+                # We have partitions and none of them can fit this
+                raise RuntimeError(
+                    "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                )
+
+            if winning_partition is None:
+                return None
+            else:
+                return winning_partition.partition_name
 
     class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        # Our boss is always the enclosing class
+        boss: SlurmBatchSystem
 
-        def getRunningJobIDs(self):
+        def getRunningJobIDs(self) -> dict[int, int]:
            # Should return a dictionary of Job IDs and number of seconds
            times = {}
            with self.runningJobsLock:
-                currentjobs
+                currentjobs: dict[str, int] = {
+                    str(self.batchJobIDs[x][0]): x for x in self.runningJobs
+                }
            # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id
            # squeue arguments:
            # -h for no header
            # --format to get jobid i, state %t and time days-hours:minutes:seconds
 
-            lines = call_command(
+            lines = call_command(
+                ["squeue", "-h", "--format", "%i %t %M"], quiet=True
+            ).split("\n")
            for line in lines:
                values = line.split()
                if len(values) < 3:
                    continue
                slurm_jobid, state, elapsed_time = values
-                if slurm_jobid in currentjobs and state ==
-
+                if slurm_jobid in currentjobs and state == "R":
+                    try:
+                        seconds_running = parse_slurm_time(elapsed_time)
+                    except ValueError:
+                        # slurm may return INVALID instead of a time
+                        seconds_running = 0
                    times[currentjobs[slurm_jobid]] = seconds_running
 
            return times
 
-        def killJob(self, jobID):
-            call_command([
-
-        def prepareSubmission(
-
-
-
-
-
-
-
+        def killJob(self, jobID: int) -> None:
+            call_command(["scancel", self.getBatchSystemID(jobID)])
+
+        def prepareSubmission(
+            self,
+            cpu: int,
+            memory: int,
+            jobID: int,
+            command: str,
+            jobName: str,
+            job_environment: dict[str, str] | None = None,
+            gpus: int | None = None,
+        ) -> list[str]:
            # Make sure to use exec so we can get Slurm's signals in the Toil
            # worker instead of having an intervening Bash
-            return self.prepareSbatch(
+            return self.prepareSbatch(
+                cpu, memory, jobID, jobName, job_environment, gpus
+            ) + [f"--wrap=exec {command}"]
 
-        def submitJob(self, subLine):
+        def submitJob(self, subLine: list[str]) -> int:
            try:
                # Slurm is not quite clever enough to follow the XDG spec on
                # its own. If the submission command sees e.g. XDG_RUNTIME_DIR
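
The parse_slurm_time helper added above is simple enough to sanity-check by hand. A minimal standalone sketch of its arithmetic (copied out of the hunk for illustration, not imported from the packaged module):

    # Standalone copy of the parse_slurm_time arithmetic shown in the hunk above.
    def parse_slurm_time(slurm_time: str) -> int:
        # Normalize "days-hours:minutes:seconds" to colon-separated fields,
        # then weight fields from the right: seconds, minutes, hours, days.
        total_seconds = 0
        elapsed_split = slurm_time.replace("-", ":").split(":")
        elapsed_split.reverse()
        for index, multiplier in enumerate([1, 60, 3600, 86400]):
            if index < len(elapsed_split):
                total_seconds += multiplier * int(elapsed_split[index])
        return total_seconds

    assert parse_slurm_time("7-00:00:00") == 7 * 86400  # days-hours:minutes:seconds
    assert parse_slurm_time("1:00:00") == 3600          # hours:minutes:seconds
    assert parse_slurm_time("10:30") == 630             # minutes:seconds, as squeue reports

PartitionSet.get_partition then uses these parsed limits to route each job to the partition with the smallest time limit that still covers the job's own limit, falling back to Slurm's default partition when the job has no limit.
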
@@ -125,7 +280,11 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                # This doesn't get us a trustworthy XDG session in Slurm, but
                # it does let us see the one Slurm tries to give us.
                no_session_environment = os.environ.copy()
-                session_names = [
+                session_names = [
+                    n
+                    for n in no_session_environment.keys()
+                    if n.startswith("XDG_") or n.startswith("DBUS_")
+                ]
                for name in session_names:
                    del no_session_environment[name]
 
@@ -138,36 +297,44 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                logger.error(f"sbatch command failed with error: {e}")
                raise e
 
-        def coalesce_job_exit_codes(
+        def coalesce_job_exit_codes(
+            self, batch_job_id_list: list[str]
+        ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
            """
            Collect all job exit codes in a single call.
            :param batch_job_id_list: list of Job ID strings, where each string has the form
            "<job>[.<task>]".
            :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
            """
-            logger.
+            logger.log(
+                TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
+            )
            # Convert batch_job_id_list to list of integer job IDs.
-            job_id_list = [int(id.split(
+            job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list]
            status_dict = self._get_job_details(job_id_list)
-            exit_codes = []
+            exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = []
            for _, status in status_dict.items():
                exit_codes.append(self._get_job_return_code(status))
            return exit_codes
 
-        def getJobExitCode(
+        def getJobExitCode(
+            self, batchJobID: str
+        ) -> int | tuple[int, BatchJobExitReason | None] | None:
            """
            Get job exit code for given batch job ID.
            :param batchJobID: string of the form "<job>[.<task>]".
            :return: integer job exit code.
            """
-            logger.
+            logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID)
            # Convert batchJobID to an integer job ID.
-            job_id = int(batchJobID.split(
+            job_id = int(batchJobID.split(".")[0])
            status_dict = self._get_job_details([job_id])
            status = status_dict[job_id]
            return self._get_job_return_code(status)
 
-        def _get_job_details(
+        def _get_job_details(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
            """
            Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
            Fetch job details from Slurm's accounting system or job control system.
@@ -181,7 +348,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            status_dict = self._getJobDetailsFromScontrol(job_id_list)
            return status_dict
 
-        def _get_job_return_code(
+        def _get_job_return_code(
+            self, status: tuple[str | None, int | None]
+        ) -> int | tuple[int, BatchJobExitReason | None] | None:
            """
            Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
 
@@ -215,7 +384,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                # The only state that should produce a 0 ever is COMPLETED. So
                # if the job is COMPLETED and the exit reason is thus FINISHED,
                # pass along the code it has.
-                return (rc, exit_reason)
+                return (rc, exit_reason)  # type: ignore[return-value] # mypy doesn't understand enums well
 
            if rc == 0:
                # The job claims to be in a state other than COMPLETED, but
@@ -224,7 +393,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
 
            # If the code is nonzero, pass it along.
-            return (rc, exit_reason)
+            return (rc, exit_reason)  # type: ignore[return-value] # mypy doesn't understand enums well
 
        def _canonicalize_state(self, state: str) -> str:
            """
@@ -233,18 +402,23 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
            # Slurm will sometimes send something like "CANCELED by 30065" in
            # the state column for some reason.
-
+
            state_token = state
 
            if " " in state_token:
                state_token = state.split(" ", 1)[0]
 
-            if
+            if (
+                state_token not in TERMINAL_STATES
+                and state_token not in NONTERMINAL_STATES
+            ):
                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
-
+
            return state_token
 
-        def _getJobDetailsFromSacct(
+        def _getJobDetailsFromSacct(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
            """
            Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
            :param job_id_list: list of integer batch job IDs.
@@ -252,52 +426,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            containing the job's state and exit code.
            """
            job_ids = ",".join(str(id) for id in job_id_list)
-            args = [
-
-
-
-
-
+            args = [
+                "sacct",
+                "-n",  # no header
+                "-j",
+                job_ids,  # job
+                "--format",
+                "JobIDRaw,State,ExitCode",  # specify output columns
+                "-P",  # separate columns with pipes
+                "-S",
+                "1970-01-01",
+            ]  # override start time limit
            stdout = call_command(args, quiet=True)
 
            # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
            # job state and exit status. Initialize dict before processing output of `sacct`.
-            job_statuses = {}
+            job_statuses: dict[int, tuple[str | None, int | None]] = {}
            for job_id in job_id_list:
                job_statuses[job_id] = (None, None)
 
            for line in stdout.splitlines():
-                values = line.strip().split(
+                values = line.strip().split("|")
                if len(values) < 3:
                    continue
+                state: str
                job_id_raw, state, exitcode = values
                state = self._canonicalize_state(state)
-                logger.
+                logger.log(
+                    TRACE, "%s state of job %s is %s", args[0], job_id_raw, state
+                )
                # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                job_id_parts = job_id_raw.split(".")
                if len(job_id_parts) > 1:
                    continue
                job_id = int(job_id_parts[0])
-                status
+                status: int
+                signal: int
+                status, signal = (int(n) for n in exitcode.split(":"))
                if signal > 0:
                    # A non-zero signal may indicate e.g. an out-of-memory killed job
                    status = 128 + signal
-                logger.
-
+                logger.log(
+                    TRACE,
+                    "%s exit code of job %d is %s, return status %d",
+                    args[0],
+                    job_id,
+                    exitcode,
+                    status,
+                )
                job_statuses[job_id] = state, status
-            logger.
+            logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
            return job_statuses
 
-        def _getJobDetailsFromScontrol(
+        def _getJobDetailsFromScontrol(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
            """
            Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`.
            :param job_id_list: list of integer batch job IDs.
            :return: dict of job statuses, where key is the job-id, and value is a tuple
            containing the job's state and exit code.
            """
-            args = [
-                'show',
-                'job']
+            args = ["scontrol", "show", "job"]
            # `scontrol` can only return information about a single job,
            # or all the jobs it knows about.
            if len(job_id_list) == 1:
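
_getJobDetailsFromSacct above reads pipe-separated JobIDRaw|State|ExitCode rows, where ExitCode is a "status:signal" pair and a nonzero signal is folded into a shell-style exit status. A small sketch of that folding on a fabricated row (the job ID and values are hypothetical, not real sacct output):

    # Fabricated sacct -P row; real rows come from the sacct command built above.
    line = "123|FAILED|0:9"

    job_id_raw, state, exitcode = line.strip().split("|")
    status, signal = (int(n) for n in exitcode.split(":"))
    if signal > 0:
        # Fold the killing signal into a 128+N exit status, e.g. SIGKILL (9)
        # becomes 137, the same convention the hunk applies for OOM kills.
        status = 128 + signal

    assert (job_id_raw, state, status) == ("123", "FAILED", 137)
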
@@ -306,14 +496,16 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            stdout = call_command(args, quiet=True)
 
            # Job records are separated by a blank line.
+            job_records = None
            if isinstance(stdout, str):
-                job_records = stdout.strip().split(
+                job_records = stdout.strip().split("\n\n")
            elif isinstance(stdout, bytes):
-                job_records = stdout.decode(
+                job_records = stdout.decode("utf-8").strip().split("\n\n")
 
            # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
            # job state and exit status. Initialize dict before processing output of `scontrol`.
-            job_statuses = {}
+            job_statuses: dict[int, tuple[str | None, int | None]] = {}
+            job_id: int | None
            for job_id in job_id_list:
                job_statuses[job_id] = (None, None)
 
@@ -323,7 +515,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                return job_statuses
 
            for record in job_records:
-                job = {}
+                job: dict[str, str] = {}
+                job_id = None
                for line in record.splitlines():
                    for item in line.split():
                        # Output is in the form of many key=value pairs, multiple pairs on each line
@@ -331,59 +524,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                        # added to a dictionary.
                        # Note: In some cases, the value itself may contain white-space. So, if we find
                        # a key without a value, we consider that key part of the previous value.
-                        bits = item.split(
+                        bits = item.split("=", 1)
                        if len(bits) == 1:
-                            job[key] +=
+                            job[key] += " " + bits[0]  # type: ignore[has-type] # we depend on the previous iteration to populate key
                        else:
                            key = bits[0]
                            job[key] = bits[1]
                    # The first line of the record contains the JobId. Stop processing the remainder
                    # of this record, if we're not interested in this job.
-                    job_id = int(job[
+                    job_id = int(job["JobId"])
                    if job_id not in job_id_list:
-                        logger.
+                        logger.log(
+                            TRACE, "%s job %d is not in the list", args[0], job_id
+                        )
                        break
-                if job_id not in job_id_list:
+                if job_id is None or job_id not in job_id_list:
                    continue
-                state = job[
+                state = job["JobState"]
                state = self._canonicalize_state(state)
-                logger.
+                logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state)
                try:
-                    exitcode = job[
+                    exitcode = job["ExitCode"]
                    if exitcode is not None:
-                        status, signal = (int(n) for n in exitcode.split(
+                        status, signal = (int(n) for n in exitcode.split(":"))
                        if signal > 0:
                            # A non-zero signal may indicate e.g. an out-of-memory killed job
                            status = 128 + signal
-                        logger.
-
+                        logger.log(
+                            TRACE,
+                            "%s exit code of job %d is %s, return status %d",
+                            args[0],
+                            job_id,
+                            exitcode,
+                            status,
+                        )
                        rc = status
                    else:
                        rc = None
                except KeyError:
                    rc = None
                job_statuses[job_id] = (state, rc)
-            logger.
+            logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
            return job_statuses
 
        ###
        ### Implementation-specific helper methods
        ###
 
-        def prepareSbatch(
-
-
-
-
-
-
-
+        def prepareSbatch(
+            self,
+            cpu: int,
+            mem: int,
+            jobID: int,
+            jobName: str,
+            job_environment: dict[str, str] | None,
+            gpus: int | None,
+        ) -> list[str]:
            """
            Returns the sbatch command line to run to queue the job.
            """
 
            # Start by naming the job
-            sbatch_line = [
+            sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"]
 
            # Make sure the job gets a signal before it disappears so that e.g.
            # container cleanup finally blocks can run. Ask for SIGINT so we
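
The scontrol record parser in the hunk above splits each record into whitespace-separated key=value items and treats an item without an "=" as a continuation of the previous value, since values may themselves contain spaces. A distilled sketch of that rule on a fabricated record line (values invented; like the original, it relies on the first item containing an "="):

    # Fabricated scontrol-style line; the JobId/JobState keys follow the hunk above.
    line = "JobId=42 JobName=my cool job JobState=COMPLETED"

    job: dict[str, str] = {}
    for item in line.split():
        bits = item.split("=", 1)
        if len(bits) == 1:
            # No "=", so this token extends the previous key's value.
            job[key] += " " + bits[0]
        else:
            key = bits[0]
            job[key] = bits[1]

    assert job == {"JobId": "42", "JobName": "my cool job", "JobState": "COMPLETED"}
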
@@ -398,25 +600,28 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            # responded to this signal and use the right exit reason for it.
            sbatch_line.append("--signal=B:INT@30")
 
-            if gpus:
-                sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
            environment = {}
            environment.update(self.boss.environment)
            if job_environment:
                environment.update(job_environment)
 
            # "Native extensions" for SLURM (see DRMAA or SAGA)
-
+            # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
+            nativeConfig: str = self.boss.config.slurm_args  # type: ignore[attr-defined]
 
            # --export=[ALL,]<environment_toil_variables>
            set_exports = "--export=ALL"
 
            if nativeConfig is not None:
-                logger.debug(
+                logger.debug(
+                    "Native SLURM options appended to sbatch: %s", nativeConfig
+                )
 
                for arg in nativeConfig.split():
                    if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
-                        raise ValueError(
+                        raise ValueError(
+                            f"Some resource arguments are incompatible: {nativeConfig}"
+                        )
                    # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
                    if arg.startswith("--export"):
                        set_exports = arg
@@ -427,54 +632,149 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
            for k, v in environment.items():
                quoted_value = quote(os.environ[k] if v is None else v)
-                argList.append(f
+                argList.append(f"{k}={quoted_value}")
 
-            set_exports +=
+            set_exports += "," + ",".join(argList)
 
            # add --export to the sbatch
            sbatch_line.append(set_exports)
 
-            parallel_env =
+            parallel_env: str = self.boss.config.slurm_pe  # type: ignore[attr-defined]
            if cpu and cpu > 1 and parallel_env:
-                sbatch_line.append(f
+                sbatch_line.append(f"--partition={parallel_env}")
 
-            if mem is not None and self.boss.config.
+            if mem is not None and self.boss.config.slurm_allocate_mem:  # type: ignore[attr-defined]
                # memory passed in is in bytes, but slurm expects megabytes
-                sbatch_line.append(f
+                sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
            if cpu is not None:
-                sbatch_line.append(f
+                sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
+
+            time_limit: int = self.boss.config.slurm_time  # type: ignore[attr-defined]
+            if time_limit is not None:
+                # Put all the seconds in the seconds slot
+                sbatch_line.append(f"--time=0:{time_limit}")
 
-
-
-
+            if gpus:
+                # This block will add a gpu supported partition only if no partition is supplied by the user
+                sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
+                if not any(option.startswith("--partition") for option in sbatch_line):
+                    # no partition specified, so specify one
+                    # try to get the name of the lowest priority gpu supported partition
+                    lowest_gpu_partition = self.boss.partitions.default_gpu_partition
+                    if lowest_gpu_partition is None:
+                        # no gpu partitions are available, raise an error
+                        raise RuntimeError(
+                            f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                        )
+                    if (
+                        time_limit is not None
+                        and lowest_gpu_partition.time_limit < time_limit
+                    ):
+                        # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                        logger.warning(
+                            "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                            time_limit,
+                            lowest_gpu_partition.partition_name,
+                            lowest_gpu_partition.time_limit,
+                        )
+                    sbatch_line.append(
+                        f"--partition={lowest_gpu_partition.partition_name}"
+                    )
+                else:
+                    # there is a partition specified already, check if the partition has GPUs
+                    for i, option in enumerate(sbatch_line):
+                        if option.startswith("--partition"):
+                            # grab the partition name depending on if it's specified via an "=" or a space
+                            if "=" in option:
+                                partition_name = option[len("--partition=") :]
+                            else:
+                                partition_name = option[i + 1]
+                            available_gpu_partitions = (
+                                self.boss.partitions.gpu_partitions
+                            )
+                            if partition_name not in available_gpu_partitions:
+                                # the specified partition is not compatible, so warn the user that the job may not work
+                                logger.warning(
+                                    f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
+                                    f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
+                                )
+                            break
+
+            if not any(option.startswith("--partition") for option in sbatch_line):
+                # Pick a partition ourselves
+                chosen_partition = self.boss.partitions.get_partition(time_limit)
+                if chosen_partition is not None:
+                    # Route to that partition
+                    sbatch_line.append(f"--partition={chosen_partition}")
+
+            stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
+            stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
+            sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
 
            return sbatch_line
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+    ) -> None:
+        super().__init__(config, maxCores, maxMemory, maxDisk)
+        self.partitions = SlurmBatchSystem.PartitionSet()
+
+    # Override issuing jobs so we can check if we need to use Slurm's magic
+    # whole-node-memory feature.
+    def issueBatchJob(
+        self,
+        command: str,
+        job_desc: JobDescription,
+        job_environment: dict[str, str] | None = None,
+    ) -> int:
+        # Avoid submitting internal jobs to the batch queue, handle locally
+        local_id = self.handleLocalJob(command, job_desc)
+        if local_id is not None:
+            return local_id
+        else:
+            self.check_resource_request(job_desc)
+            gpus = self.count_needed_gpus(job_desc)
+            job_id = self.getNextJobID()
+            self.currentJobs.add(job_id)
+
+            if "memory" not in job_desc.requirements and self.config.slurm_default_all_mem:  # type: ignore[attr-defined]
+                # The job doesn't have its own memory requirement, and we are
+                # defaulting to whole node memory. Use Slurm's 0-memory sentinel.
+                memory = 0
+            else:
+                # Use the memory actually on the job, or the Toil default memory
+                memory = job_desc.memory
+
+            self.newJobsQueue.put(
+                (
+                    job_id,
+                    job_desc.cores,
+                    memory,
+                    command,
+                    get_job_kind(job_desc.get_names()),
+                    job_environment,
+                    gpus,
+                )
+            )
+            logger.debug(
+                "Issued the job command: %s with job id: %s and job name %s",
+                command,
+                str(job_id),
+                get_job_kind(job_desc.get_names()),
+            )
+            return job_id
 
    def _check_accelerator_request(self, requirer: Requirer) -> None:
        for accelerator in requirer.accelerators:
-            if accelerator[
-                raise InsufficientSystemResources(
-
-
-
-
+            if accelerator["kind"] != "gpu":
+                raise InsufficientSystemResources(
+                    requirer,
+                    "accelerators",
+                    details=[
+                        f"The accelerator {accelerator} could not be provided"
+                        "The Toil Slurm batch system only supports gpu accelerators at the moment."
+                    ],
+                )
 
    ###
    ### The interface for SLURM
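
Putting the prepareSbatch pieces together, a job asking for 2 cores and 4 GiB under default options (no GPUs, no --slurmTime, no TOIL_SLURM_ARGS) would be queued with roughly the flags below. This is a hand-assembled approximation with a hypothetical job ID, name, and log paths; the real method also appends any Toil-managed environment variables to --export. The memory figure comes from converting bytes to mebibytes via math.ceil(mem / 2 ** 20):

    import math

    mem = 4 * 2 ** 30  # requested memory in bytes (4 GiB)
    cpu = 2

    # Approximate sbatch command list under the assumptions above.
    sbatch_line = [
        "sbatch",
        "-J", "toil_job_7_myJob",             # hypothetical Toil job ID and name
        "--signal=B:INT@30",                  # warn the job before Slurm kills it
        "--export=ALL",
        f"--mem={math.ceil(mem / 2 ** 20)}",  # bytes -> MiB: --mem=4096
        f"--cpus-per-task={math.ceil(cpu)}",  # --cpus-per-task=2
        "-o", "/tmp/toil_job_7.%j.out",       # hypothetical stdout/stderr paths
        "-e", "/tmp/toil_job_7.%j.err",
    ]

    assert f"--mem={math.ceil(mem / 2 ** 20)}" == "--mem=4096"
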
@@ -488,17 +788,69 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
    # implement getWaitDuration().
 
    @classmethod
-    def add_options(cls, parser:
-
-
-
-
-
-
-
-
-
+    def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
+
+        parser.add_argument(
+            "--slurmAllocateMem",
+            dest="slurm_allocate_mem",
+            type=strtobool,
+            default=True,
+            env_var="TOIL_SLURM_ALLOCATE_MEM",
+            help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs "
+            "with memory allocations.",
+        )
+        # Keep these deprcated options for backward compatibility
+        parser.add_argument(
+            "--dont_allocate_mem",
+            action="store_false",
+            dest="slurm_allocate_mem",
+            help=SUPPRESS,
+        )
+        parser.add_argument(
+            "--allocate_mem",
+            action="store_true",
+            dest="slurm_allocate_mem",
+            help=SUPPRESS,
+        )
+
+        parser.add_argument(
+            "--slurmDefaultAllMem",
+            dest="slurm_default_all_mem",
+            type=strtobool,
+            default=False,
+            env_var="TOIL_SLURM_DEFAULT_ALL_MEM",
+            help="If True, assign Toil jobs without their own memory requirements all available "
+            "memory on a Slurm node (via Slurm --mem=0).",
+        )
+        parser.add_argument(
+            "--slurmTime",
+            dest="slurm_time",
+            type=parse_slurm_time,
+            default=None,
+            env_var="TOIL_SLURM_TIME",
+            help="Slurm job time limit, in [DD-]HH:MM:SS format.",
+        )
+        parser.add_argument(
+            "--slurmPE",
+            dest="slurm_pe",
+            default=None,
+            env_var="TOIL_SLURM_PE",
+            help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.",
+        )
+        parser.add_argument(
+            "--slurmArgs",
+            dest="slurm_args",
+            default="",
+            env_var="TOIL_SLURM_ARGS",
+            help="Extra arguments to pass to Slurm.",
+        )
+
+    OptionType = TypeVar("OptionType")
+
    @classmethod
    def setOptions(cls, setOption: OptionSetter) -> None:
-        setOption("
-
+        setOption("slurm_allocate_mem")
+        setOption("slurm_default_all_mem")
+        setOption("slurm_time")
+        setOption("slurm_pe")
+        setOption("slurm_args")