toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -11,62 +11,260 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ from __future__ import annotations
+
  import logging
  import math
  import os
- from argparse import ArgumentParser, _ArgumentGroup
+ import sys
+ from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
  from shlex import quote
- from typing import Dict, List, Optional, TypeVar, Union
-
- from toil.batchSystems.abstractGridEngineBatchSystem import \
- AbstractGridEngineBatchSystem
+ from typing import NamedTuple, TypeVar
+
+ from toil.batchSystems.abstractBatchSystem import (
+ EXIT_STATUS_UNAVAILABLE_VALUE,
+ BatchJobExitReason,
+ InsufficientSystemResources,
+ )
+ from toil.batchSystems.abstractGridEngineBatchSystem import (
+ AbstractGridEngineBatchSystem,
+ )
  from toil.batchSystems.options import OptionSetter
- from toil.job import Requirer
+ from toil.bus import get_job_kind
+ from toil.common import Config
+ from toil.job import JobDescription, Requirer
+ from toil.lib.conversions import strtobool
  from toil.lib.misc import CalledProcessErrorStderr, call_command
+ from toil.statsAndLogging import TRACE

  logger = logging.getLogger(__name__)

+ # We have a complete list of Slurm states. States not in one of these aren't
+ # allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>

- class SlurmBatchSystem(AbstractGridEngineBatchSystem):
+ # If a job is in one of these states, Slurm can't run it anymore.
+ # We don't include states where the job is held or paused here;
+ # those mean it could run and needs to wait for someone to un-hold
+ # it, so Toil should wait for it.
+ #
+ # We map from each terminal state to the Toil-ontology exit reason.
+ TERMINAL_STATES: dict[str, BatchJobExitReason] = {
+ "BOOT_FAIL": BatchJobExitReason.LOST,
+ "CANCELLED": BatchJobExitReason.KILLED,
+ "COMPLETED": BatchJobExitReason.FINISHED,
+ "DEADLINE": BatchJobExitReason.KILLED,
+ "FAILED": BatchJobExitReason.FAILED,
+ "NODE_FAIL": BatchJobExitReason.LOST,
+ "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+ "PREEMPTED": BatchJobExitReason.KILLED,
+ "REVOKED": BatchJobExitReason.KILLED,
+ "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+ "TIMEOUT": BatchJobExitReason.KILLED,
+ }
+
+ # If a job is in one of these states, it might eventually move to a different
+ # state.
+ NONTERMINAL_STATES: set[str] = {
+ "CONFIGURING",
+ "COMPLETING",
+ "PENDING",
+ "RUNNING",
+ "RESV_DEL_HOLD",
+ "REQUEUE_FED",
+ "REQUEUE_HOLD",
+ "REQUEUED",
+ "RESIZING",
+ "SIGNALING",
+ "STAGE_OUT",
+ "STOPPED",
+ "SUSPENDED",
+ }
+
+
+ def parse_slurm_time(slurm_time: str) -> int:
+ """
+ Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds.
+
+ Raises ValueError if not parseable.
+ """
+ # slurm returns time in days-hours:minutes:seconds format
+ # Sometimes it will only return minutes:seconds, so days may be omitted
+ # For ease of calculating, we'll make sure all the delimeters are ':'
+ # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
+ total_seconds = 0
+ elapsed_split: list[str] = slurm_time.replace("-", ":").split(":")
+ elapsed_split.reverse()
+ seconds_per_unit = [1, 60, 3600, 86400]
+ for index, multiplier in enumerate(seconds_per_unit):
+ if index < len(elapsed_split):
+ total_seconds += multiplier * int(elapsed_split[index])
+ return total_seconds

- class Worker(AbstractGridEngineBatchSystem.Worker):

- def getRunningJobIDs(self):
+ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
+ class PartitionInfo(NamedTuple):
+ partition_name: str
+ gres: bool
+ time_limit: float
+ priority: int
+ cpus: str
+ memory: str
+
+ class PartitionSet:
+ """
+ Set of available partitions detected on the slurm batch system
+ """
+
+ default_gpu_partition: SlurmBatchSystem.PartitionInfo | None
+ all_partitions: list[SlurmBatchSystem.PartitionInfo]
+ gpu_partitions: set[str]
+
+ def __init__(self) -> None:
+ self._get_partition_info()
+ self._get_gpu_partitions()
+
+ def _get_gpu_partitions(self) -> None:
+ """
+ Get all available GPU partitions. Also get the default GPU partition.
+ :return: None
+ """
+ gpu_partitions = [
+ partition for partition in self.all_partitions if partition.gres
+ ]
+ self.gpu_partitions = {p.partition_name for p in gpu_partitions}
+ # Grab the lowest priority GPU partition
+ # If no GPU partitions are available, then set the default to None
+ self.default_gpu_partition = None
+ if len(gpu_partitions) > 0:
+ self.default_gpu_partition = sorted(
+ gpu_partitions, key=lambda x: x.priority
+ )[0]
+
+ def _get_partition_info(self) -> None:
+ """
+ Call the Slurm batch system with sinfo to grab all available partitions.
+ Then parse the output and store all available Slurm partitions
+ :return: None
+ """
+ sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"]
+
+ sinfo = call_command(sinfo_command)
+
+ parsed_partitions = []
+ for line in sinfo.split("\n")[1:]:
+ if line.strip():
+ partition_name, gres, time, priority, cpus, memory = line.split(" ")
+ try:
+ # Parse time to a number so we can compute on it
+ partition_time: float = parse_slurm_time(time)
+ except ValueError:
+ # Maybe time is unlimited?
+ partition_time = float("inf")
+ try:
+ # Parse priority to an int so we can sort on it
+ partition_priority = int(priority)
+ except ValueError:
+ logger.warning(
+ "Could not parse priority %s for partition %s, assuming high priority",
+ partition_name,
+ priority,
+ )
+ partition_priority = sys.maxsize
+ parsed_partitions.append(
+ SlurmBatchSystem.PartitionInfo(
+ partition_name.rstrip("*"),
+ gres != "(null)",
+ partition_time,
+ partition_priority,
+ cpus,
+ memory,
+ )
+ )
+ self.all_partitions = parsed_partitions
+
+ def get_partition(self, time_limit: float | None) -> str | None:
+ """
+ Get the partition name to use for a job with the given time limit.
+ """
+
+ if time_limit is None:
+ # Just use Slurm's default
+ return None
+
+ winning_partition = None
+ for partition in self.all_partitions:
+ if partition.time_limit >= time_limit and (
+ winning_partition is None
+ or partition.time_limit < winning_partition.time_limit
+ ):
+ # If this partition can fit the job and is faster than the current winner, take it
+ winning_partition = partition
+ # TODO: Store partitions in a better indexed way
+ if winning_partition is None and len(self.all_partitions) > 0:
+ # We have partitions and none of them can fit this
+ raise RuntimeError(
+ "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+ )
+
+ if winning_partition is None:
+ return None
+ else:
+ return winning_partition.partition_name
+
+ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+ # Our boss is always the enclosing class
+ boss: SlurmBatchSystem
+
+ def getRunningJobIDs(self) -> dict[int, int]:
  # Should return a dictionary of Job IDs and number of seconds
  times = {}
  with self.runningJobsLock:
- currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs}
+ currentjobs: dict[str, int] = {
+ str(self.batchJobIDs[x][0]): x for x in self.runningJobs
+ }
  # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id
  # squeue arguments:
  # -h for no header
  # --format to get jobid i, state %t and time days-hours:minutes:seconds

- lines = call_command(['squeue', '-h', '--format', '%i %t %M'], quiet=True).split('\n')
+ lines = call_command(
+ ["squeue", "-h", "--format", "%i %t %M"], quiet=True
+ ).split("\n")
  for line in lines:
  values = line.split()
  if len(values) < 3:
  continue
  slurm_jobid, state, elapsed_time = values
- if slurm_jobid in currentjobs and state == 'R':
- seconds_running = self.parse_elapsed(elapsed_time)
+ if slurm_jobid in currentjobs and state == "R":
+ try:
+ seconds_running = parse_slurm_time(elapsed_time)
+ except ValueError:
+ # slurm may return INVALID instead of a time
+ seconds_running = 0
  times[currentjobs[slurm_jobid]] = seconds_running

  return times

- def killJob(self, jobID):
- call_command(['scancel', self.getBatchSystemID(jobID)])
-
- def prepareSubmission(self,
- cpu: int,
- memory: int,
- jobID: int,
- command: str,
- jobName: str,
- job_environment: Optional[Dict[str, str]] = None,
- gpus: Optional[int] = None) -> List[str]:
- return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap={command}']
-
- def submitJob(self, subLine):
+ def killJob(self, jobID: int) -> None:
+ call_command(["scancel", self.getBatchSystemID(jobID)])
+
+ def prepareSubmission(
+ self,
+ cpu: int,
+ memory: int,
+ jobID: int,
+ command: str,
+ jobName: str,
+ job_environment: dict[str, str] | None = None,
+ gpus: int | None = None,
+ ) -> list[str]:
+ # Make sure to use exec so we can get Slurm's signals in the Toil
+ # worker instead of having an intervening Bash
+ return self.prepareSbatch(
+ cpu, memory, jobID, jobName, job_environment, gpus
+ ) + [f"--wrap=exec {command}"]
+
+ def submitJob(self, subLine: list[str]) -> int:
  try:
  # Slurm is not quite clever enough to follow the XDG spec on
  # its own. If the submission command sees e.g. XDG_RUNTIME_DIR
@@ -82,7 +280,11 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # This doesn't get us a trustworthy XDG session in Slurm, but
  # it does let us see the one Slurm tries to give us.
  no_session_environment = os.environ.copy()
- session_names = [n for n in no_session_environment.keys() if n.startswith('XDG_') or n.startswith('DBUS_')]
+ session_names = [
+ n
+ for n in no_session_environment.keys()
+ if n.startswith("XDG_") or n.startswith("DBUS_")
+ ]
  for name in session_names:
  del no_session_environment[name]

@@ -92,39 +294,47 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  logger.debug("sbatch submitted job %d", result)
  return result
  except OSError as e:
- logger.error("sbatch command failed")
+ logger.error(f"sbatch command failed with error: {e}")
  raise e

- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
+ def coalesce_job_exit_codes(
+ self, batch_job_id_list: list[str]
+ ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
  """
  Collect all job exit codes in a single call.
  :param batch_job_id_list: list of Job ID strings, where each string has the form
  "<job>[.<task>]".
- :return: list of job exit codes, associated with the list of job IDs.
+ :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
  """
- logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
+ logger.log(
+ TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
+ )
  # Convert batch_job_id_list to list of integer job IDs.
- job_id_list = [int(id.split('.')[0]) for id in batch_job_id_list]
+ job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list]
  status_dict = self._get_job_details(job_id_list)
- exit_codes = []
+ exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = []
  for _, status in status_dict.items():
  exit_codes.append(self._get_job_return_code(status))
  return exit_codes

- def getJobExitCode(self, batchJobID: str) -> int:
+ def getJobExitCode(
+ self, batchJobID: str
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
  """
  Get job exit code for given batch job ID.
  :param batchJobID: string of the form "<job>[.<task>]".
  :return: integer job exit code.
  """
- logger.debug("Getting exit code for slurm job: %s", batchJobID)
+ logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID)
  # Convert batchJobID to an integer job ID.
- job_id = int(batchJobID.split('.')[0])
+ job_id = int(batchJobID.split(".")[0])
  status_dict = self._get_job_details([job_id])
  status = status_dict[job_id]
  return self._get_job_return_code(status)

- def _get_job_details(self, job_id_list: list) -> dict:
+ def _get_job_details(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
  Fetch job details from Slurm's accounting system or job control system.
@@ -138,20 +348,77 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  status_dict = self._getJobDetailsFromScontrol(job_id_list)
  return status_dict

- def _get_job_return_code(self, status: tuple) -> list:
+ def _get_job_return_code(
+ self, status: tuple[str | None, int | None]
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
  """
+ Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
+
+ The return code may have already been OR'd with the 128-offset
+ Slurm-reported signal.
+
+ Slurm will report return codes of 0 even if jobs time out instead
+ of succeeding:
+
+ 2093597|TIMEOUT|0:0
+ 2093597.batch|CANCELLED|0:15
+
+ So we guarantee here that, if the Slurm status string is not a
+ successful one as defined in
+ <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
+ will not return a successful return code.
+
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
- :param status: tuple containing the job's state and it's return code.
- :return: the job's return code if it's completed, otherwise None.
+ :param status: tuple containing the job's state and it's return code from Slurm.
+ :return: the job's return code for Toil if it's completed, otherwise None.
  """
  state, rc = status
- # If job is in a running state, set return code to None to indicate we don't have
- # an update.
- if state in ('PENDING', 'RUNNING', 'CONFIGURING', 'COMPLETING', 'RESIZING', 'SUSPENDED'):
- rc = None
- return rc

- def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
+ if state not in TERMINAL_STATES:
+ # Don't treat the job as exited yet
+ return None
+
+ exit_reason = TERMINAL_STATES[state]
+
+ if exit_reason == BatchJobExitReason.FINISHED:
+ # The only state that should produce a 0 ever is COMPLETED. So
+ # if the job is COMPLETED and the exit reason is thus FINISHED,
+ # pass along the code it has.
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well
+
+ if rc == 0:
+ # The job claims to be in a state other than COMPLETED, but
+ # also to have not encountered a problem. Say the exit status
+ # is unavailable.
+ return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
+
+ # If the code is nonzero, pass it along.
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well
+
+ def _canonicalize_state(self, state: str) -> str:
+ """
+ Turn a state string form SLURM into just the state token like "CANCELED".
+ """
+
+ # Slurm will sometimes send something like "CANCELED by 30065" in
+ # the state column for some reason.
+
+ state_token = state
+
+ if " " in state_token:
+ state_token = state.split(" ", 1)[0]
+
+ if (
+ state_token not in TERMINAL_STATES
+ and state_token not in NONTERMINAL_STATES
+ ):
+ raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+ return state_token
+
+ def _getJobDetailsFromSacct(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
  :param job_id_list: list of integer batch job IDs.
@@ -159,51 +426,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  containing the job's state and exit code.
  """
  job_ids = ",".join(str(id) for id in job_id_list)
- args = ['sacct',
- '-n', # no header
- '-j', job_ids, # job
- '--format', 'JobIDRaw,State,ExitCode', # specify output columns
- '-P', # separate columns with pipes
- '-S', '1970-01-01'] # override start time limit
+ args = [
+ "sacct",
+ "-n", # no header
+ "-j",
+ job_ids, # job
+ "--format",
+ "JobIDRaw,State,ExitCode", # specify output columns
+ "-P", # separate columns with pipes
+ "-S",
+ "1970-01-01",
+ ] # override start time limit
  stdout = call_command(args, quiet=True)

  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
  # job state and exit status. Initialize dict before processing output of `sacct`.
- job_statuses = {}
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
  for job_id in job_id_list:
  job_statuses[job_id] = (None, None)

  for line in stdout.splitlines():
- values = line.strip().split('|')
+ values = line.strip().split("|")
  if len(values) < 3:
  continue
+ state: str
  job_id_raw, state, exitcode = values
- logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
+ state = self._canonicalize_state(state)
+ logger.log(
+ TRACE, "%s state of job %s is %s", args[0], job_id_raw, state
+ )
  # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
  job_id_parts = job_id_raw.split(".")
  if len(job_id_parts) > 1:
  continue
  job_id = int(job_id_parts[0])
- status, signal = (int(n) for n in exitcode.split(':'))
+ status: int
+ signal: int
+ status, signal = (int(n) for n in exitcode.split(":"))
  if signal > 0:
  # A non-zero signal may indicate e.g. an out-of-memory killed job
  status = 128 + signal
- logger.debug("%s exit code of job %d is %s, return status %d",
- args[0], job_id, exitcode, status)
+ logger.log(
+ TRACE,
+ "%s exit code of job %d is %s, return status %d",
+ args[0],
+ job_id,
+ exitcode,
+ status,
+ )
  job_statuses[job_id] = state, status
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
  return job_statuses

- def _getJobDetailsFromScontrol(self, job_id_list: list) -> dict:
+ def _getJobDetailsFromScontrol(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`.
  :param job_id_list: list of integer batch job IDs.
  :return: dict of job statuses, where key is the job-id, and value is a tuple
  containing the job's state and exit code.
  """
- args = ['scontrol',
- 'show',
- 'job']
+ args = ["scontrol", "show", "job"]
  # `scontrol` can only return information about a single job,
  # or all the jobs it knows about.
  if len(job_id_list) == 1:
@@ -212,14 +496,16 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  stdout = call_command(args, quiet=True)

  # Job records are separated by a blank line.
+ job_records = None
  if isinstance(stdout, str):
- job_records = stdout.strip().split('\n\n')
+ job_records = stdout.strip().split("\n\n")
  elif isinstance(stdout, bytes):
- job_records = stdout.decode('utf-8').strip().split('\n\n')
+ job_records = stdout.decode("utf-8").strip().split("\n\n")

  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
  # job state and exit status. Initialize dict before processing output of `scontrol`.
- job_statuses = {}
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
+ job_id: int | None
  for job_id in job_id_list:
  job_statuses[job_id] = (None, None)

@@ -229,7 +515,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  return job_statuses

  for record in job_records:
- job = {}
+ job: dict[str, str] = {}
+ job_id = None
  for line in record.splitlines():
  for item in line.split():
  # Output is in the form of many key=value pairs, multiple pairs on each line
@@ -237,73 +524,104 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # added to a dictionary.
  # Note: In some cases, the value itself may contain white-space. So, if we find
  # a key without a value, we consider that key part of the previous value.
- bits = item.split('=', 1)
+ bits = item.split("=", 1)
  if len(bits) == 1:
- job[key] += ' ' + bits[0]
+ job[key] += " " + bits[0] # type: ignore[has-type] # we depend on the previous iteration to populate key
  else:
  key = bits[0]
  job[key] = bits[1]
  # The first line of the record contains the JobId. Stop processing the remainder
  # of this record, if we're not interested in this job.
- job_id = int(job['JobId'])
+ job_id = int(job["JobId"])
  if job_id not in job_id_list:
- logger.debug("%s job %d is not in the list", args[0], job_id)
+ logger.log(
+ TRACE, "%s job %d is not in the list", args[0], job_id
+ )
  break
- if job_id not in job_id_list:
+ if job_id is None or job_id not in job_id_list:
  continue
- state = job['JobState']
- logger.debug("%s state of job %s is %s", args[0], job_id, state)
+ state = job["JobState"]
+ state = self._canonicalize_state(state)
+ logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state)
  try:
- exitcode = job['ExitCode']
+ exitcode = job["ExitCode"]
  if exitcode is not None:
- status, signal = (int(n) for n in exitcode.split(':'))
+ status, signal = (int(n) for n in exitcode.split(":"))
  if signal > 0:
  # A non-zero signal may indicate e.g. an out-of-memory killed job
  status = 128 + signal
- logger.debug("%s exit code of job %d is %s, return status %d",
- args[0], job_id, exitcode, status)
+ logger.log(
+ TRACE,
+ "%s exit code of job %d is %s, return status %d",
+ args[0],
+ job_id,
+ exitcode,
+ status,
+ )
  rc = status
  else:
  rc = None
  except KeyError:
  rc = None
  job_statuses[job_id] = (state, rc)
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
  return job_statuses

  ###
  ### Implementation-specific helper methods
  ###

- def prepareSbatch(self,
- cpu: int,
- mem: int,
- jobID: int,
- jobName: str,
- job_environment: Optional[Dict[str, str]],
- gpus: Optional[int]) -> List[str]:
+ def prepareSbatch(
+ self,
+ cpu: int,
+ mem: int,
+ jobID: int,
+ jobName: str,
+ job_environment: dict[str, str] | None,
+ gpus: int | None,
+ ) -> list[str]:
+ """
+ Returns the sbatch command line to run to queue the job.
+ """
+
+ # Start by naming the job
+ sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"]
+
+ # Make sure the job gets a signal before it disappears so that e.g.
+ # container cleanup finally blocks can run. Ask for SIGINT so we
+ # can get the default Python KeyboardInterrupt which third-party
+ # code is likely to plan for. Make sure to send it to the batch
+ # shell process with "B:", not to all the srun steps it launches
+ # (because there shouldn't be any). We cunningly replaced the batch
+ # shell process with the Toil worker process, so Toil should be
+ # able to get the signal.
+ #
+ # TODO: Add a way to detect when the job failed because it
+ # responded to this signal and use the right exit reason for it.
+ sbatch_line.append("--signal=B:INT@30")

- # Returns the sbatch command line before the script to run
- sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
- if gpus:
- sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
  environment = {}
  environment.update(self.boss.environment)
  if job_environment:
  environment.update(job_environment)

  # "Native extensions" for SLURM (see DRMAA or SAGA)
- nativeConfig = os.getenv('TOIL_SLURM_ARGS')
+ # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
+ nativeConfig: str = self.boss.config.slurm_args # type: ignore[attr-defined]

  # --export=[ALL,]<environment_toil_variables>
  set_exports = "--export=ALL"

  if nativeConfig is not None:
- logger.debug("Native SLURM options appended to sbatch from TOIL_SLURM_ARGS env. variable: %s", nativeConfig)
+ logger.debug(
+ "Native SLURM options appended to sbatch: %s", nativeConfig
+ )

  for arg in nativeConfig.split():
  if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
- raise ValueError(f"Some resource arguments are incompatible: {nativeConfig}")
+ raise ValueError(
+ f"Some resource arguments are incompatible: {nativeConfig}"
+ )
  # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
  if arg.startswith("--export"):
  set_exports = arg
@@ -314,54 +632,149 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):

  for k, v in environment.items():
  quoted_value = quote(os.environ[k] if v is None else v)
- argList.append(f'{k}={quoted_value}')
+ argList.append(f"{k}={quoted_value}")

- set_exports += ',' + ','.join(argList)
+ set_exports += "," + ",".join(argList)

  # add --export to the sbatch
  sbatch_line.append(set_exports)

- parallel_env = os.getenv('TOIL_SLURM_PE')
+ parallel_env: str = self.boss.config.slurm_pe # type: ignore[attr-defined]
  if cpu and cpu > 1 and parallel_env:
- sbatch_line.append(f'--partition={parallel_env}')
+ sbatch_line.append(f"--partition={parallel_env}")

- if mem is not None and self.boss.config.allocate_mem:
+ if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined]
  # memory passed in is in bytes, but slurm expects megabytes
- sbatch_line.append(f'--mem={math.ceil(mem / 2 ** 20)}')
+ sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
  if cpu is not None:
- sbatch_line.append(f'--cpus-per-task={math.ceil(cpu)}')
+ sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")

- stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'out')
- stderrfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'err')
- sbatch_line.extend(['-o', stdoutfile, '-e', stderrfile])
+ time_limit: int = self.boss.config.slurm_time # type: ignore[attr-defined]
+ if time_limit is not None:
+ # Put all the seconds in the seconds slot
+ sbatch_line.append(f"--time=0:{time_limit}")
+
+ if gpus:
+ # This block will add a gpu supported partition only if no partition is supplied by the user
+ sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
+ if not any(option.startswith("--partition") for option in sbatch_line):
+ # no partition specified, so specify one
+ # try to get the name of the lowest priority gpu supported partition
+ lowest_gpu_partition = self.boss.partitions.default_gpu_partition
+ if lowest_gpu_partition is None:
+ # no gpu partitions are available, raise an error
+ raise RuntimeError(
+ f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+ )
+ if (
+ time_limit is not None
+ and lowest_gpu_partition.time_limit < time_limit
+ ):
+ # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+ logger.warning(
+ "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+ time_limit,
+ lowest_gpu_partition.partition_name,
+ lowest_gpu_partition.time_limit,
+ )
+ sbatch_line.append(
+ f"--partition={lowest_gpu_partition.partition_name}"
+ )
+ else:
+ # there is a partition specified already, check if the partition has GPUs
+ for i, option in enumerate(sbatch_line):
+ if option.startswith("--partition"):
+ # grab the partition name depending on if it's specified via an "=" or a space
+ if "=" in option:
+ partition_name = option[len("--partition=") :]
+ else:
+ partition_name = option[i + 1]
+ available_gpu_partitions = (
+ self.boss.partitions.gpu_partitions
+ )
+ if partition_name not in available_gpu_partitions:
+ # the specified partition is not compatible, so warn the user that the job may not work
+ logger.warning(
+ f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
+ f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
+ )
+ break
+
+ if not any(option.startswith("--partition") for option in sbatch_line):
+ # Pick a partition ourselves
+ chosen_partition = self.boss.partitions.get_partition(time_limit)
+ if chosen_partition is not None:
+ # Route to that partition
+ sbatch_line.append(f"--partition={chosen_partition}")
+
+ stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
+ stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
+ sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])

  return sbatch_line

- def parse_elapsed(self, elapsed):
- # slurm returns elapsed time in days-hours:minutes:seconds format
- # Sometimes it will only return minutes:seconds, so days may be omitted
- # For ease of calculating, we'll make sure all the delimeters are ':'
- # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
- total_seconds = 0
- try:
- elapsed = elapsed.replace('-', ':').split(':')
- elapsed.reverse()
- seconds_per_unit = [1, 60, 3600, 86400]
- for index, multiplier in enumerate(seconds_per_unit):
- if index < len(elapsed):
- total_seconds += multiplier * int(elapsed[index])
- except ValueError:
- pass # slurm may return INVALID instead of a time
- return total_seconds
+ def __init__(
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+ ) -> None:
+ super().__init__(config, maxCores, maxMemory, maxDisk)
+ self.partitions = SlurmBatchSystem.PartitionSet()
+
+ # Override issuing jobs so we can check if we need to use Slurm's magic
+ # whole-node-memory feature.
+ def issueBatchJob(
+ self,
+ command: str,
+ job_desc: JobDescription,
+ job_environment: dict[str, str] | None = None,
+ ) -> int:
+ # Avoid submitting internal jobs to the batch queue, handle locally
+ local_id = self.handleLocalJob(command, job_desc)
+ if local_id is not None:
+ return local_id
+ else:
+ self.check_resource_request(job_desc)
+ gpus = self.count_needed_gpus(job_desc)
+ job_id = self.getNextJobID()
+ self.currentJobs.add(job_id)
+
+ if "memory" not in job_desc.requirements and self.config.slurm_default_all_mem: # type: ignore[attr-defined]
+ # The job doesn't have its own memory requirement, and we are
+ # defaulting to whole node memory. Use Slurm's 0-memory sentinel.
+ memory = 0
+ else:
+ # Use the memory actually on the job, or the Toil default memory
+ memory = job_desc.memory
+
+ self.newJobsQueue.put(
+ (
+ job_id,
+ job_desc.cores,
+ memory,
+ command,
+ get_job_kind(job_desc.get_names()),
+ job_environment,
+ gpus,
+ )
+ )
+ logger.debug(
+ "Issued the job command: %s with job id: %s and job name %s",
+ command,
+ str(job_id),
+ get_job_kind(job_desc.get_names()),
+ )
+ return job_id

  def _check_accelerator_request(self, requirer: Requirer) -> None:
  for accelerator in requirer.accelerators:
- if accelerator['kind'] != 'gpu':
- raise InsufficientSystemResources(requirer, 'accelerators', details=
- [
- f'The accelerator {accelerator} could not be provided'
- 'The Toil Slurm batch system only supports gpu accelerators at the moment.'
- ])
+ if accelerator["kind"] != "gpu":
+ raise InsufficientSystemResources(
+ requirer,
+ "accelerators",
+ details=[
+ f"The accelerator {accelerator} could not be provided"
+ "The Toil Slurm batch system only supports gpu accelerators at the moment."
+ ],
+ )

  ###
  ### The interface for SLURM
@@ -375,17 +788,69 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # implement getWaitDuration().

  @classmethod
- def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]):
- allocate_mem = parser.add_mutually_exclusive_group()
- allocate_mem_help = ("A flag that can block allocating memory with '--mem' for job submissions "
- "on SLURM since some system servers may reject any job request that "
- "explicitly specifies the memory allocation. The default is to always allocate memory.")
- allocate_mem.add_argument("--dont_allocate_mem", action='store_false', dest="allocate_mem", help=allocate_mem_help)
- allocate_mem.add_argument("--allocate_mem", action='store_true', dest="allocate_mem", help=allocate_mem_help)
- allocate_mem.set_defaults(allocate_mem=True)
-
- OptionType = TypeVar('OptionType')
+ def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
+
+ parser.add_argument(
+ "--slurmAllocateMem",
+ dest="slurm_allocate_mem",
+ type=strtobool,
+ default=True,
+ env_var="TOIL_SLURM_ALLOCATE_MEM",
+ help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs "
+ "with memory allocations.",
+ )
+ # Keep these deprcated options for backward compatibility
+ parser.add_argument(
+ "--dont_allocate_mem",
+ action="store_false",
+ dest="slurm_allocate_mem",
+ help=SUPPRESS,
+ )
+ parser.add_argument(
+ "--allocate_mem",
+ action="store_true",
+ dest="slurm_allocate_mem",
+ help=SUPPRESS,
+ )
+
+ parser.add_argument(
+ "--slurmDefaultAllMem",
+ dest="slurm_default_all_mem",
+ type=strtobool,
+ default=False,
+ env_var="TOIL_SLURM_DEFAULT_ALL_MEM",
+ help="If True, assign Toil jobs without their own memory requirements all available "
+ "memory on a Slurm node (via Slurm --mem=0).",
+ )
+ parser.add_argument(
+ "--slurmTime",
+ dest="slurm_time",
+ type=parse_slurm_time,
+ default=None,
+ env_var="TOIL_SLURM_TIME",
+ help="Slurm job time limit, in [DD-]HH:MM:SS format.",
+ )
+ parser.add_argument(
+ "--slurmPE",
+ dest="slurm_pe",
+ default=None,
+ env_var="TOIL_SLURM_PE",
+ help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.",
+ )
+ parser.add_argument(
+ "--slurmArgs",
+ dest="slurm_args",
+ default="",
+ env_var="TOIL_SLURM_ARGS",
+ help="Extra arguments to pass to Slurm.",
+ )
+
+ OptionType = TypeVar("OptionType")
+
  @classmethod
  def setOptions(cls, setOption: OptionSetter) -> None:
- setOption("allocate_mem")
-
+ setOption("slurm_allocate_mem")
+ setOption("slurm_default_all_mem")
+ setOption("slurm_time")
+ setOption("slurm_pe")
+ setOption("slurm_args")