toil-7.0.0-py3-none-any.whl → toil-8.1.0b1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (197)
  1. toil/__init__.py +124 -86
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +39 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +651 -155
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +784 -397
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1137 -534
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +1031 -349
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +772 -412
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +204 -58
  49. toil/lib/aws/utils.py +290 -213
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/dockstore.py +379 -0
  55. toil/lib/ec2.py +322 -209
  56. toil/lib/ec2nodes.py +174 -105
  57. toil/lib/encryption/_dummy.py +5 -3
  58. toil/lib/encryption/_nacl.py +10 -6
  59. toil/lib/encryption/conftest.py +1 -0
  60. toil/lib/exceptions.py +26 -7
  61. toil/lib/expando.py +4 -2
  62. toil/lib/ftp_utils.py +217 -0
  63. toil/lib/generatedEC2Lists.py +127 -19
  64. toil/lib/history.py +1271 -0
  65. toil/lib/history_submission.py +681 -0
  66. toil/lib/humanize.py +6 -2
  67. toil/lib/io.py +121 -12
  68. toil/lib/iterables.py +4 -2
  69. toil/lib/memoize.py +12 -8
  70. toil/lib/misc.py +83 -18
  71. toil/lib/objects.py +2 -2
  72. toil/lib/resources.py +19 -7
  73. toil/lib/retry.py +125 -87
  74. toil/lib/threading.py +282 -80
  75. toil/lib/throttle.py +15 -14
  76. toil/lib/trs.py +390 -0
  77. toil/lib/web.py +38 -0
  78. toil/options/common.py +850 -402
  79. toil/options/cwl.py +185 -90
  80. toil/options/runner.py +50 -0
  81. toil/options/wdl.py +70 -19
  82. toil/provisioners/__init__.py +111 -46
  83. toil/provisioners/abstractProvisioner.py +322 -157
  84. toil/provisioners/aws/__init__.py +62 -30
  85. toil/provisioners/aws/awsProvisioner.py +980 -627
  86. toil/provisioners/clusterScaler.py +541 -279
  87. toil/provisioners/gceProvisioner.py +283 -180
  88. toil/provisioners/node.py +147 -79
  89. toil/realtimeLogger.py +34 -22
  90. toil/resource.py +137 -75
  91. toil/server/app.py +127 -61
  92. toil/server/celery_app.py +3 -1
  93. toil/server/cli/wes_cwl_runner.py +84 -55
  94. toil/server/utils.py +56 -31
  95. toil/server/wes/abstract_backend.py +64 -26
  96. toil/server/wes/amazon_wes_utils.py +21 -15
  97. toil/server/wes/tasks.py +121 -63
  98. toil/server/wes/toil_backend.py +142 -107
  99. toil/server/wsgi_app.py +4 -3
  100. toil/serviceManager.py +58 -22
  101. toil/statsAndLogging.py +183 -65
  102. toil/test/__init__.py +263 -179
  103. toil/test/batchSystems/batchSystemTest.py +438 -195
  104. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  105. toil/test/batchSystems/test_gridengine.py +173 -0
  106. toil/test/batchSystems/test_lsf_helper.py +67 -58
  107. toil/test/batchSystems/test_slurm.py +265 -49
  108. toil/test/cactus/test_cactus_integration.py +20 -22
  109. toil/test/cwl/conftest.py +39 -0
  110. toil/test/cwl/cwlTest.py +375 -72
  111. toil/test/cwl/measure_default_memory.cwl +12 -0
  112. toil/test/cwl/not_run_required_input.cwl +29 -0
  113. toil/test/cwl/optional-file.cwl +18 -0
  114. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  115. toil/test/docs/scriptsTest.py +60 -34
  116. toil/test/jobStores/jobStoreTest.py +412 -235
  117. toil/test/lib/aws/test_iam.py +116 -48
  118. toil/test/lib/aws/test_s3.py +16 -9
  119. toil/test/lib/aws/test_utils.py +5 -6
  120. toil/test/lib/dockerTest.py +118 -141
  121. toil/test/lib/test_conversions.py +113 -115
  122. toil/test/lib/test_ec2.py +57 -49
  123. toil/test/lib/test_history.py +212 -0
  124. toil/test/lib/test_misc.py +12 -5
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  127. toil/test/mesos/helloWorld.py +7 -6
  128. toil/test/mesos/stress.py +25 -20
  129. toil/test/options/options.py +7 -2
  130. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  131. toil/test/provisioners/clusterScalerTest.py +440 -250
  132. toil/test/provisioners/clusterTest.py +81 -42
  133. toil/test/provisioners/gceProvisionerTest.py +174 -100
  134. toil/test/provisioners/provisionerTest.py +25 -13
  135. toil/test/provisioners/restartScript.py +5 -4
  136. toil/test/server/serverTest.py +188 -141
  137. toil/test/sort/restart_sort.py +137 -68
  138. toil/test/sort/sort.py +134 -66
  139. toil/test/sort/sortTest.py +91 -49
  140. toil/test/src/autoDeploymentTest.py +140 -100
  141. toil/test/src/busTest.py +20 -18
  142. toil/test/src/checkpointTest.py +8 -2
  143. toil/test/src/deferredFunctionTest.py +49 -35
  144. toil/test/src/dockerCheckTest.py +33 -26
  145. toil/test/src/environmentTest.py +20 -10
  146. toil/test/src/fileStoreTest.py +538 -271
  147. toil/test/src/helloWorldTest.py +7 -4
  148. toil/test/src/importExportFileTest.py +61 -31
  149. toil/test/src/jobDescriptionTest.py +32 -17
  150. toil/test/src/jobEncapsulationTest.py +2 -0
  151. toil/test/src/jobFileStoreTest.py +74 -50
  152. toil/test/src/jobServiceTest.py +187 -73
  153. toil/test/src/jobTest.py +120 -70
  154. toil/test/src/miscTests.py +19 -18
  155. toil/test/src/promisedRequirementTest.py +82 -36
  156. toil/test/src/promisesTest.py +7 -6
  157. toil/test/src/realtimeLoggerTest.py +6 -6
  158. toil/test/src/regularLogTest.py +71 -37
  159. toil/test/src/resourceTest.py +80 -49
  160. toil/test/src/restartDAGTest.py +36 -22
  161. toil/test/src/resumabilityTest.py +9 -2
  162. toil/test/src/retainTempDirTest.py +45 -14
  163. toil/test/src/systemTest.py +12 -8
  164. toil/test/src/threadingTest.py +44 -25
  165. toil/test/src/toilContextManagerTest.py +10 -7
  166. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  167. toil/test/src/workerTest.py +33 -16
  168. toil/test/utils/toilDebugTest.py +70 -58
  169. toil/test/utils/toilKillTest.py +4 -5
  170. toil/test/utils/utilsTest.py +239 -102
  171. toil/test/wdl/wdltoil_test.py +789 -148
  172. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  173. toil/toilState.py +52 -26
  174. toil/utils/toilConfig.py +13 -4
  175. toil/utils/toilDebugFile.py +44 -27
  176. toil/utils/toilDebugJob.py +85 -25
  177. toil/utils/toilDestroyCluster.py +11 -6
  178. toil/utils/toilKill.py +8 -3
  179. toil/utils/toilLaunchCluster.py +251 -145
  180. toil/utils/toilMain.py +37 -16
  181. toil/utils/toilRsyncCluster.py +27 -14
  182. toil/utils/toilSshCluster.py +45 -22
  183. toil/utils/toilStats.py +75 -36
  184. toil/utils/toilStatus.py +226 -119
  185. toil/utils/toilUpdateEC2Instances.py +3 -1
  186. toil/version.py +6 -6
  187. toil/wdl/utils.py +5 -5
  188. toil/wdl/wdltoil.py +3528 -1053
  189. toil/worker.py +370 -149
  190. toil-8.1.0b1.dist-info/METADATA +178 -0
  191. toil-8.1.0b1.dist-info/RECORD +259 -0
  192. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  193. toil-7.0.0.dist-info/METADATA +0 -158
  194. toil-7.0.0.dist-info/RECORD +0 -244
  195. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  196. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  197. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
@@ -11,19 +11,32 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ from __future__ import annotations
15
+
16
+ import errno
14
17
  import logging
15
18
  import math
16
19
  import os
17
- from argparse import ArgumentParser, _ArgumentGroup
18
- from shlex import quote
19
- from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
20
-
21
- from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
22
- from toil.batchSystems.abstractGridEngineBatchSystem import \
23
- AbstractGridEngineBatchSystem
20
+ import sys
21
+ from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
22
+ import shlex
23
+ from typing import Callable, NamedTuple, TypeVar
24
+
25
+ from toil.batchSystems.abstractBatchSystem import (
26
+ EXIT_STATUS_UNAVAILABLE_VALUE,
27
+ BatchJobExitReason,
28
+ InsufficientSystemResources,
29
+ )
30
+ from toil.batchSystems.abstractGridEngineBatchSystem import (
31
+ AbstractGridEngineBatchSystem,
32
+ )
24
33
  from toil.batchSystems.options import OptionSetter
25
- from toil.job import Requirer
34
+ from toil.bus import get_job_kind
35
+ from toil.common import Config
36
+ from toil.job import JobDescription, Requirer
37
+ from toil.lib.conversions import strtobool
26
38
  from toil.lib.misc import CalledProcessErrorStderr, call_command
39
+ from toil.statsAndLogging import TRACE
27
40
 
28
41
  logger = logging.getLogger(__name__)
29
42
 
@@ -36,7 +49,7 @@ logger = logging.getLogger(__name__)
36
49
  # it, so Toil should wait for it.
37
50
  #
38
51
  # We map from each terminal state to the Toil-ontology exit reason.
39
- TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
52
+ TERMINAL_STATES: dict[str, BatchJobExitReason] = {
40
53
  "BOOT_FAIL": BatchJobExitReason.LOST,
41
54
  "CANCELLED": BatchJobExitReason.KILLED,
42
55
  "COMPLETED": BatchJobExitReason.FINISHED,
@@ -47,12 +60,12 @@ TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
47
60
  "PREEMPTED": BatchJobExitReason.KILLED,
48
61
  "REVOKED": BatchJobExitReason.KILLED,
49
62
  "SPECIAL_EXIT": BatchJobExitReason.FAILED,
50
- "TIMEOUT": BatchJobExitReason.KILLED
63
+ "TIMEOUT": BatchJobExitReason.KILLED,
51
64
  }
52
65
 
53
66
  # If a job is in one of these states, it might eventually move to a different
54
67
  # state.
55
- NONTERMINAL_STATES: Set[str] = {
68
+ NONTERMINAL_STATES: set[str] = {
56
69
  "CONFIGURING",
57
70
  "COMPLETING",
58
71
  "PENDING",
@@ -65,51 +78,215 @@ NONTERMINAL_STATES: Set[str] = {
65
78
  "SIGNALING",
66
79
  "STAGE_OUT",
67
80
  "STOPPED",
68
- "SUSPENDED"
69
- }
81
+ "SUSPENDED",
82
+ }
83
+
84
+
85
+ def parse_slurm_time(slurm_time: str) -> int:
86
+ """
87
+ Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds.
88
+
89
+ Raises ValueError if not parseable.
90
+ """
91
+ # slurm returns time in days-hours:minutes:seconds format
92
+ # Sometimes it will only return minutes:seconds, so days may be omitted
93
+ # For ease of calculating, we'll make sure all the delimiters are ':'
94
+ # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
95
+ total_seconds = 0
96
+ elapsed_split: list[str] = slurm_time.replace("-", ":").split(":")
97
+ elapsed_split.reverse()
98
+ seconds_per_unit = [1, 60, 3600, 86400]
99
+ for index, multiplier in enumerate(seconds_per_unit):
100
+ if index < len(elapsed_split):
101
+ total_seconds += multiplier * int(elapsed_split[index])
102
+ return total_seconds
103
+
70
104
 
71
105
  class SlurmBatchSystem(AbstractGridEngineBatchSystem):
106
+ class PartitionInfo(NamedTuple):
107
+ partition_name: str
108
+ gres: bool
109
+ time_limit: float
110
+ priority: int
111
+ cpus: str
112
+ memory: str
113
+
114
+ class PartitionSet:
115
+ """
116
+ Set of available partitions detected on the slurm batch system
117
+ """
118
+
119
+ default_gpu_partition: SlurmBatchSystem.PartitionInfo | None
120
+ all_partitions: list[SlurmBatchSystem.PartitionInfo]
121
+ gpu_partitions: set[str]
122
+
123
+ def __init__(self) -> None:
124
+ self._get_partition_info()
125
+ self._get_gpu_partitions()
126
+
127
+ def _get_gpu_partitions(self) -> None:
128
+ """
129
+ Get all available GPU partitions. Also get the default GPU partition.
130
+ :return: None
131
+ """
132
+ gpu_partitions = [
133
+ partition for partition in self.all_partitions if partition.gres
134
+ ]
135
+ self.gpu_partitions = {p.partition_name for p in gpu_partitions}
136
+ # Grab the lowest priority GPU partition
137
+ # If no GPU partitions are available, then set the default to None
138
+ self.default_gpu_partition = None
139
+ if len(gpu_partitions) > 0:
140
+ self.default_gpu_partition = sorted(
141
+ gpu_partitions, key=lambda x: x.priority
142
+ )[0]
143
+
144
+ def _get_partition_info(self) -> None:
145
+ """
146
+ Call the Slurm batch system with sinfo to grab all available partitions.
147
+ Then parse the output and store all available Slurm partitions
148
+ :return: None
149
+ """
150
+ sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"]
151
+
152
+ sinfo = call_command(sinfo_command)
153
+
154
+ parsed_partitions = []
155
+ for line in sinfo.split("\n")[1:]:
156
+ if line.strip():
157
+ partition_name, gres, time, priority, cpus, memory = line.split(" ")
158
+ try:
159
+ # Parse time to a number so we can compute on it
160
+ partition_time: float = parse_slurm_time(time)
161
+ except ValueError:
162
+ # Maybe time is unlimited?
163
+ partition_time = float("inf")
164
+ try:
165
+ # Parse priority to an int so we can sort on it
166
+ partition_priority = int(priority)
167
+ except ValueError:
168
+ logger.warning(
169
+ "Could not parse priority %s for partition %s, assuming high priority",
170
+ partition_name,
171
+ priority,
172
+ )
173
+ partition_priority = sys.maxsize
174
+ parsed_partitions.append(
175
+ SlurmBatchSystem.PartitionInfo(
176
+ partition_name.rstrip("*"),
177
+ gres != "(null)",
178
+ partition_time,
179
+ partition_priority,
180
+ cpus,
181
+ memory,
182
+ )
183
+ )
184
+ self.all_partitions = parsed_partitions
185
+
186
+ def get_partition(self, time_limit: float | None) -> str | None:
187
+ """
188
+ Get the partition name to use for a job with the given time limit.
189
+
190
+ :param time_limit: Time limit in seconds.
191
+ """
192
+
193
+ if time_limit is None:
194
+ # Just use Slurm's default
195
+ return None
196
+
197
+ winning_partition = None
198
+ for partition in self.all_partitions:
199
+ if partition.time_limit < time_limit:
200
+ # Can't use this
201
+ continue
202
+ if winning_partition is None:
203
+ # Anything beats None
204
+ winning_partition = partition
205
+ continue
206
+ if partition.gres and not winning_partition.gres:
207
+ # Never use a partition witn GRES if you can avoid it
208
+ continue
209
+ elif not partition.gres and winning_partition.gres:
210
+ # Never keep a partition with GRES if we find one without
211
+ winning_partition = partition
212
+ continue
213
+ if partition.priority > winning_partition.priority:
214
+ # After that, don't raise priority
215
+ continue
216
+ elif partition.priority < winning_partition.priority:
217
+ # And always lower it
218
+ winning_partition = partition
219
+ continue
220
+ if partition.time_limit < winning_partition.time_limit:
221
+ # Finally, lower time limit
222
+ winning_partition = partition
223
+
224
+ # TODO: Store partitions in a better indexed way
225
+ if winning_partition is None and len(self.all_partitions) > 0:
226
+ # We have partitions and none of them can fit this
227
+ raise RuntimeError(
228
+ f"Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
229
+ )
230
+
231
+ if winning_partition is None:
232
+ return None
233
+ else:
234
+ return winning_partition.partition_name
72
235
 
73
236
  class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
237
+ # Our boss is always the enclosing class
238
+ boss: SlurmBatchSystem
74
239
 
75
- def getRunningJobIDs(self):
240
+ def getRunningJobIDs(self) -> dict[int, int]:
76
241
  # Should return a dictionary of Job IDs and number of seconds
77
242
  times = {}
78
243
  with self.runningJobsLock:
79
- currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs}
244
+ currentjobs: dict[str, int] = {
245
+ str(self.batchJobIDs[x][0]): x for x in self.runningJobs
246
+ }
80
247
  # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id
81
248
  # squeue arguments:
82
249
  # -h for no header
83
250
  # --format to get jobid i, state %t and time days-hours:minutes:seconds
84
251
 
85
- lines = call_command(['squeue', '-h', '--format', '%i %t %M'], quiet=True).split('\n')
252
+ lines = call_command(
253
+ ["squeue", "-h", "--format", "%i %t %M"], quiet=True
254
+ ).split("\n")
86
255
  for line in lines:
87
256
  values = line.split()
88
257
  if len(values) < 3:
89
258
  continue
90
259
  slurm_jobid, state, elapsed_time = values
91
- if slurm_jobid in currentjobs and state == 'R':
92
- seconds_running = self.parse_elapsed(elapsed_time)
260
+ if slurm_jobid in currentjobs and state == "R":
261
+ try:
262
+ seconds_running = parse_slurm_time(elapsed_time)
263
+ except ValueError:
264
+ # slurm may return INVALID instead of a time
265
+ seconds_running = 0
93
266
  times[currentjobs[slurm_jobid]] = seconds_running
94
267
 
95
268
  return times
96
269
 
97
- def killJob(self, jobID):
98
- call_command(['scancel', self.getBatchSystemID(jobID)])
99
-
100
- def prepareSubmission(self,
101
- cpu: int,
102
- memory: int,
103
- jobID: int,
104
- command: str,
105
- jobName: str,
106
- job_environment: Optional[Dict[str, str]] = None,
107
- gpus: Optional[int] = None) -> List[str]:
270
+ def killJob(self, jobID: int) -> None:
271
+ call_command(["scancel", self.getBatchSystemID(jobID)])
272
+
273
+ def prepareSubmission(
274
+ self,
275
+ cpu: int,
276
+ memory: int,
277
+ jobID: int,
278
+ command: str,
279
+ jobName: str,
280
+ job_environment: dict[str, str] | None = None,
281
+ gpus: int | None = None,
282
+ ) -> list[str]:
108
283
  # Make sure to use exec so we can get Slurm's signals in the Toil
109
284
  # worker instead of having an intervening Bash
110
- return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']
285
+ return self.prepareSbatch(
286
+ cpu, memory, jobID, jobName, job_environment, gpus
287
+ ) + [f"--wrap=exec {command}"]
111
288
 
112
- def submitJob(self, subLine):
289
+ def submitJob(self, subLine: list[str]) -> int:
113
290
  try:
114
291
  # Slurm is not quite clever enough to follow the XDG spec on
115
292
  # its own. If the submission command sees e.g. XDG_RUNTIME_DIR
@@ -125,7 +302,11 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
125
302
  # This doesn't get us a trustworthy XDG session in Slurm, but
126
303
  # it does let us see the one Slurm tries to give us.
127
304
  no_session_environment = os.environ.copy()
128
- session_names = [n for n in no_session_environment.keys() if n.startswith('XDG_') or n.startswith('DBUS_')]
305
+ session_names = [
306
+ n
307
+ for n in no_session_environment.keys()
308
+ if n.startswith("XDG_") or n.startswith("DBUS_")
309
+ ]
129
310
  for name in session_names:
130
311
  del no_session_environment[name]
131
312
 
@@ -138,36 +319,44 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
138
319
  logger.error(f"sbatch command failed with error: {e}")
139
320
  raise e
140
321
 
141
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
322
+ def coalesce_job_exit_codes(
323
+ self, batch_job_id_list: list[str]
324
+ ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
142
325
  """
143
326
  Collect all job exit codes in a single call.
144
327
  :param batch_job_id_list: list of Job ID strings, where each string has the form
145
328
  "<job>[.<task>]".
146
329
  :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
147
330
  """
148
- logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
331
+ logger.log(
332
+ TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
333
+ )
149
334
  # Convert batch_job_id_list to list of integer job IDs.
150
- job_id_list = [int(id.split('.')[0]) for id in batch_job_id_list]
335
+ job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list]
151
336
  status_dict = self._get_job_details(job_id_list)
152
- exit_codes = []
337
+ exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = []
153
338
  for _, status in status_dict.items():
154
339
  exit_codes.append(self._get_job_return_code(status))
155
340
  return exit_codes
156
341
 
157
- def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
342
+ def getJobExitCode(
343
+ self, batchJobID: str
344
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
158
345
  """
159
346
  Get job exit code for given batch job ID.
160
347
  :param batchJobID: string of the form "<job>[.<task>]".
161
348
  :return: integer job exit code.
162
349
  """
163
- logger.debug("Getting exit code for slurm job: %s", batchJobID)
350
+ logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID)
164
351
  # Convert batchJobID to an integer job ID.
165
- job_id = int(batchJobID.split('.')[0])
352
+ job_id = int(batchJobID.split(".")[0])
166
353
  status_dict = self._get_job_details([job_id])
167
354
  status = status_dict[job_id]
168
355
  return self._get_job_return_code(status)
169
356
 
170
- def _get_job_details(self, job_id_list: list) -> dict:
357
+ def _get_job_details(
358
+ self, job_id_list: list[int]
359
+ ) -> dict[int, tuple[str | None, int | None]]:
171
360
  """
172
361
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
173
362
  Fetch job details from Slurm's accounting system or job control system.
@@ -177,11 +366,15 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
177
366
  """
178
367
  try:
179
368
  status_dict = self._getJobDetailsFromSacct(job_id_list)
180
- except CalledProcessErrorStderr:
369
+ except (CalledProcessErrorStderr, OSError) as e:
370
+ if isinstance(e, OSError):
371
+ logger.warning("Could not run sacct: %s", e)
181
372
  status_dict = self._getJobDetailsFromScontrol(job_id_list)
182
373
  return status_dict
183
374
 
184
- def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
375
+ def _get_job_return_code(
376
+ self, status: tuple[str | None, int | None]
377
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
185
378
  """
186
379
  Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
187
380
 
@@ -215,7 +408,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
215
408
  # The only state that should produce a 0 ever is COMPLETED. So
216
409
  # if the job is COMPLETED and the exit reason is thus FINISHED,
217
410
  # pass along the code it has.
218
- return (rc, exit_reason)
411
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well
219
412
 
220
413
  if rc == 0:
221
414
  # The job claims to be in a state other than COMPLETED, but
@@ -224,7 +417,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
224
417
  return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
225
418
 
226
419
  # If the code is nonzero, pass it along.
227
- return (rc, exit_reason)
420
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well
228
421
 
229
422
  def _canonicalize_state(self, state: str) -> str:
230
423
  """
@@ -233,18 +426,23 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
233
426
 
234
427
  # Slurm will sometimes send something like "CANCELED by 30065" in
235
428
  # the state column for some reason.
236
-
429
+
237
430
  state_token = state
238
431
 
239
432
  if " " in state_token:
240
433
  state_token = state.split(" ", 1)[0]
241
434
 
242
- if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
435
+ if (
436
+ state_token not in TERMINAL_STATES
437
+ and state_token not in NONTERMINAL_STATES
438
+ ):
243
439
  raise RuntimeError("Toil job in unimplemented Slurm state " + state)
244
-
440
+
245
441
  return state_token
246
442
 
247
- def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
443
+ def _getJobDetailsFromSacct(
444
+ self, job_id_list: list[int]
445
+ ) -> dict[int, tuple[str | None, int | None]]:
248
446
  """
249
447
  Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
250
448
  :param job_id_list: list of integer batch job IDs.
@@ -252,52 +450,82 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
252
450
  containing the job's state and exit code.
253
451
  """
254
452
  job_ids = ",".join(str(id) for id in job_id_list)
255
- args = ['sacct',
256
- '-n', # no header
257
- '-j', job_ids, # job
258
- '--format', 'JobIDRaw,State,ExitCode', # specify output columns
259
- '-P', # separate columns with pipes
260
- '-S', '1970-01-01'] # override start time limit
261
- stdout = call_command(args, quiet=True)
453
+ args = [
454
+ "sacct",
455
+ "-n", # no header
456
+ "-j",
457
+ job_ids, # job
458
+ "--format",
459
+ "JobIDRaw,State,ExitCode", # specify output columns
460
+ "-P", # separate columns with pipes
461
+ "-S",
462
+ "1970-01-01",
463
+ ] # override start time limit
262
464
 
263
465
  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
264
466
  # job state and exit status. Initialize dict before processing output of `sacct`.
265
- job_statuses = {}
467
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
468
+
469
+ try:
470
+ stdout = call_command(args, quiet=True)
471
+ except OSError as e:
472
+ if e.errno == errno.E2BIG:
473
+ # Argument list is too big, recurse on half the argument list
474
+ if len(job_id_list) == 1:
475
+ # 1 is too big, we can't recurse further, bail out
476
+ raise
477
+ job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
478
+ job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
479
+ return job_statuses
480
+ else:
481
+ raise
482
+
266
483
  for job_id in job_id_list:
267
484
  job_statuses[job_id] = (None, None)
268
485
 
269
486
  for line in stdout.splitlines():
270
- values = line.strip().split('|')
487
+ values = line.strip().split("|")
271
488
  if len(values) < 3:
272
489
  continue
490
+ state: str
273
491
  job_id_raw, state, exitcode = values
274
492
  state = self._canonicalize_state(state)
275
- logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
493
+ logger.log(
494
+ TRACE, "%s state of job %s is %s", args[0], job_id_raw, state
495
+ )
276
496
  # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
277
497
  job_id_parts = job_id_raw.split(".")
278
498
  if len(job_id_parts) > 1:
279
499
  continue
280
500
  job_id = int(job_id_parts[0])
281
- status, signal = (int(n) for n in exitcode.split(':'))
501
+ status: int
502
+ signal: int
503
+ status, signal = (int(n) for n in exitcode.split(":"))
282
504
  if signal > 0:
283
505
  # A non-zero signal may indicate e.g. an out-of-memory killed job
284
506
  status = 128 + signal
285
- logger.debug("%s exit code of job %d is %s, return status %d",
286
- args[0], job_id, exitcode, status)
507
+ logger.log(
508
+ TRACE,
509
+ "%s exit code of job %d is %s, return status %d",
510
+ args[0],
511
+ job_id,
512
+ exitcode,
513
+ status,
514
+ )
287
515
  job_statuses[job_id] = state, status
288
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
516
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
289
517
  return job_statuses
290
518
 
291
- def _getJobDetailsFromScontrol(self, job_id_list: list) -> dict:
519
+ def _getJobDetailsFromScontrol(
520
+ self, job_id_list: list[int]
521
+ ) -> dict[int, tuple[str | None, int | None]]:
292
522
  """
293
523
  Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`.
294
524
  :param job_id_list: list of integer batch job IDs.
295
525
  :return: dict of job statuses, where key is the job-id, and value is a tuple
296
526
  containing the job's state and exit code.
297
527
  """
298
- args = ['scontrol',
299
- 'show',
300
- 'job']
528
+ args = ["scontrol", "show", "job"]
301
529
  # `scontrol` can only return information about a single job,
302
530
  # or all the jobs it knows about.
303
531
  if len(job_id_list) == 1:
@@ -306,14 +534,16 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
306
534
  stdout = call_command(args, quiet=True)
307
535
 
308
536
  # Job records are separated by a blank line.
537
+ job_records = None
309
538
  if isinstance(stdout, str):
310
- job_records = stdout.strip().split('\n\n')
539
+ job_records = stdout.strip().split("\n\n")
311
540
  elif isinstance(stdout, bytes):
312
- job_records = stdout.decode('utf-8').strip().split('\n\n')
541
+ job_records = stdout.decode("utf-8").strip().split("\n\n")
313
542
 
314
543
  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
315
544
  # job state and exit status. Initialize dict before processing output of `scontrol`.
316
- job_statuses = {}
545
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
546
+ job_id: int | None
317
547
  for job_id in job_id_list:
318
548
  job_statuses[job_id] = (None, None)
319
549
 
@@ -323,7 +553,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
323
553
  return job_statuses
324
554
 
325
555
  for record in job_records:
326
- job = {}
556
+ job: dict[str, str] = {}
557
+ job_id = None
327
558
  for line in record.splitlines():
328
559
  for item in line.split():
329
560
  # Output is in the form of many key=value pairs, multiple pairs on each line
@@ -331,59 +562,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
331
562
  # added to a dictionary.
332
563
  # Note: In some cases, the value itself may contain white-space. So, if we find
333
564
  # a key without a value, we consider that key part of the previous value.
334
- bits = item.split('=', 1)
565
+ bits = item.split("=", 1)
335
566
  if len(bits) == 1:
336
- job[key] += ' ' + bits[0]
567
+ job[key] += " " + bits[0] # type: ignore[has-type] # we depend on the previous iteration to populate key
337
568
  else:
338
569
  key = bits[0]
339
570
  job[key] = bits[1]
340
571
  # The first line of the record contains the JobId. Stop processing the remainder
341
572
  # of this record, if we're not interested in this job.
342
- job_id = int(job['JobId'])
573
+ job_id = int(job["JobId"])
343
574
  if job_id not in job_id_list:
344
- logger.debug("%s job %d is not in the list", args[0], job_id)
575
+ logger.log(
576
+ TRACE, "%s job %d is not in the list", args[0], job_id
577
+ )
345
578
  break
346
- if job_id not in job_id_list:
579
+ if job_id is None or job_id not in job_id_list:
347
580
  continue
348
- state = job['JobState']
581
+ state = job["JobState"]
349
582
  state = self._canonicalize_state(state)
350
- logger.debug("%s state of job %s is %s", args[0], job_id, state)
583
+ logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state)
351
584
  try:
352
- exitcode = job['ExitCode']
585
+ exitcode = job["ExitCode"]
353
586
  if exitcode is not None:
354
- status, signal = (int(n) for n in exitcode.split(':'))
587
+ status, signal = (int(n) for n in exitcode.split(":"))
355
588
  if signal > 0:
356
589
  # A non-zero signal may indicate e.g. an out-of-memory killed job
357
590
  status = 128 + signal
358
- logger.debug("%s exit code of job %d is %s, return status %d",
359
- args[0], job_id, exitcode, status)
591
+ logger.log(
592
+ TRACE,
593
+ "%s exit code of job %d is %s, return status %d",
594
+ args[0],
595
+ job_id,
596
+ exitcode,
597
+ status,
598
+ )
360
599
  rc = status
361
600
  else:
362
601
  rc = None
363
602
  except KeyError:
364
603
  rc = None
365
604
  job_statuses[job_id] = (state, rc)
366
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
605
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
367
606
  return job_statuses
368
607
 
369
608
  ###
370
609
  ### Implementation-specific helper methods
371
610
  ###
372
611
 
373
- def prepareSbatch(self,
374
- cpu: int,
375
- mem: int,
376
- jobID: int,
377
- jobName: str,
378
- job_environment: Optional[Dict[str, str]],
379
- gpus: Optional[int]) -> List[str]:
380
-
612
+ def prepareSbatch(
613
+ self,
614
+ cpu: int,
615
+ mem: int,
616
+ jobID: int,
617
+ jobName: str,
618
+ job_environment: dict[str, str] | None,
619
+ gpus: int | None,
620
+ ) -> list[str]:
381
621
  """
382
622
  Returns the sbatch command line to run to queue the job.
383
623
  """
384
624
 
385
625
  # Start by naming the job
386
- sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
626
+ sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"]
387
627
 
388
628
  # Make sure the job gets a signal before it disappears so that e.g.
389
629
  # container cleanup finally blocks can run. Ask for SIGINT so we
@@ -398,83 +638,271 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
398
638
  # responded to this signal and use the right exit reason for it.
399
639
  sbatch_line.append("--signal=B:INT@30")
400
640
 
401
- if gpus:
402
- sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
403
641
  environment = {}
404
642
  environment.update(self.boss.environment)
405
643
  if job_environment:
406
644
  environment.update(job_environment)
407
645
 
408
646
  # "Native extensions" for SLURM (see DRMAA or SAGA)
409
- nativeConfig = os.getenv('TOIL_SLURM_ARGS')
647
+ # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
648
+ nativeConfig: str = self.boss.config.slurm_args # type: ignore[attr-defined]
649
+
650
+ # For parsing user-provided option overrides (or self-generated
651
+ # options) we need a way to recognize long, long-with-equals, and
652
+ # short forms.
653
+ def option_detector(long: str, short: str | None = None) -> Callable[[str], bool]:
654
+ """
655
+ Get a function that returns true if it sees the long or short
656
+ option.
657
+ """
658
+ def is_match(option: str) -> bool:
659
+ return option == f"--{long}" or option.startswith(f"--{long}=") or (short is not None and option == f"-{short}")
660
+ return is_match
661
+
662
+ def any_option_detector(options: list[str | tuple[str, str]]) -> Callable[[str], bool]:
663
+ """
664
+ Get a function that returns true if it sees any of the long
665
+ options or long or short option pairs.
666
+ """
667
+ detectors = [option_detector(o) if isinstance(o, str) else option_detector(*o) for o in options]
668
+ def is_match(option: str) -> bool:
669
+ for detector in detectors:
670
+ if detector(option):
671
+ return True
672
+ return False
673
+ return is_match
674
+
675
+ is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
676
+ is_any_cpus_option = any_option_detector([("cpus-per-task", "c"), "cpus-per-gpu"])
677
+ is_export_option = option_detector("export")
678
+ is_export_file_option = option_detector("export-file")
679
+ is_time_option = option_detector("time", "t")
680
+ is_partition_option = option_detector("partition", "p")
681
+
682
+ # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or
683
+ # with our own determinations if they aren't there.
410
684
 
411
685
  # --export=[ALL,]<environment_toil_variables>
412
- set_exports = "--export=ALL"
686
+ export_all = True
687
+ export_list = [] # Some items here may be multiple comma-separated values
688
+ time_limit: int | None = self.boss.config.slurm_time # type: ignore[attr-defined]
689
+ partition: str | None = None
413
690
 
414
691
  if nativeConfig is not None:
415
- logger.debug("Native SLURM options appended to sbatch from TOIL_SLURM_ARGS env. variable: %s", nativeConfig)
692
+ logger.debug(
693
+ "Native SLURM options appended to sbatch: %s", nativeConfig
694
+ )
695
+
696
+ # Do a mini argument parse to pull out export and parse time if
697
+ # needed
698
+ args = shlex.split(nativeConfig)
699
+ i = 0
700
+ while i < len(args):
701
+ arg = args[i]
702
+ if is_any_mem_option(arg) or is_any_cpus_option(arg):
703
+ # Prohibit arguments that set CPUs or memory
704
+ raise ValueError(
705
+ f"Cannot use Slurm argument {arg} which conflicts "
706
+ f"with Toil's own arguments to Slurm"
707
+ )
708
+ elif is_export_option(arg):
709
+ # Capture the export argument value so we can modify it
710
+ export_all = False
711
+ if "=" not in arg:
712
+ if i + 1 >= len(args):
713
+ raise ValueError(
714
+ f"No value supplied for Slurm {arg} argument"
715
+ )
716
+ i += 1
717
+ export_list.append(args[i])
718
+ else:
719
+ export_list.append(arg.split("=", 1)[1])
720
+ elif is_export_file_option(arg):
721
+ # Keep --export-file but turn off --export=ALL in that
722
+ # case.
723
+ export_all = False
724
+ sbatch_line.append(arg)
725
+ elif is_time_option(arg):
726
+ # Capture the time limit in seconds so we can use it for picking a partition
727
+ if "=" not in arg:
728
+ if i + 1 >= len(args):
729
+ raise ValueError(
730
+ f"No value supplied for Slurm {arg} argument"
731
+ )
732
+ i += 1
733
+ time_string = args[i]
734
+ else:
735
+ time_string = arg.split("=", 1)[1]
736
+ time_limit = parse_slurm_time(time_string)
737
+ elif is_partition_option(arg):
738
+ # Capture the partition so we can run checks on it and know not to assign one
739
+ if "=" not in arg:
740
+ if i + 1 >= len(args):
741
+ raise ValueError(
742
+ f"No value supplied for Slurm {arg} argument"
743
+ )
744
+ i += 1
745
+ partition = args[i]
746
+ else:
747
+ partition = arg.split("=", 1)[1]
748
+ else:
749
+ # Other arguments pass through.
750
+ sbatch_line.append(arg)
751
+ i += 1
416
752
 
417
- for arg in nativeConfig.split():
418
- if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
419
- raise ValueError(f"Some resource arguments are incompatible: {nativeConfig}")
420
- # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
421
- if arg.startswith("--export"):
422
- set_exports = arg
423
- sbatch_line.extend(nativeConfig.split())
753
+ if export_all:
754
+ # We don't have any export overrides so we need to start with
755
+ # an ALL
756
+ export_list.append("ALL")
424
757
 
425
758
  if environment:
426
759
  argList = []
427
760
 
428
761
  for k, v in environment.items():
429
- quoted_value = quote(os.environ[k] if v is None else v)
430
- argList.append(f'{k}={quoted_value}')
431
-
432
- set_exports += ',' + ','.join(argList)
433
-
434
- # add --export to the sbatch
435
- sbatch_line.append(set_exports)
436
-
437
- parallel_env = os.getenv('TOIL_SLURM_PE')
438
- if cpu and cpu > 1 and parallel_env:
439
- sbatch_line.append(f'--partition={parallel_env}')
440
-
441
- if mem is not None and self.boss.config.allocate_mem:
762
+ # TODO: The sbatch man page doesn't say we can quote these;
763
+ # if we need to send characters like , itself we need to
764
+ # use --export-file and clean it up when the command has
765
+ # been issued.
766
+ quoted_value = shlex.quote(os.environ[k] if v is None else v)
767
+ argList.append(f"{k}={quoted_value}")
768
+
769
+ export_list.extend(argList)
770
+
771
+ # If partition isn't set and we have a GPU partition override
772
+ # that applies, apply it
773
+ gpu_partition_override: str | None = self.boss.config.slurm_gpu_partition # type: ignore[attr-defined]
774
+ if partition is None and gpus and gpu_partition_override:
775
+ partition = gpu_partition_override
776
+
777
+ # If partition isn't set and we have a parallel partition override
778
+ # that applies, apply it
779
+ parallel_env: str | None = self.boss.config.slurm_pe # type: ignore[attr-defined]
780
+ if partition is None and cpu and cpu > 1 and parallel_env:
781
+ partition = parallel_env
782
+
783
+ # If partition isn't set and we have a general partition override
784
+ # that applies, apply it
785
+ partition_override: str | None = self.boss.config.slurm_partition # type: ignore[attr-defined]
786
+ if partition is None and partition_override:
787
+ partition = partition_override
788
+
789
+ if partition is None and gpus:
790
+ # Send to a GPU partition
791
+ gpu_partition = self.boss.partitions.default_gpu_partition
792
+ if gpu_partition is None:
793
+ # no gpu partitions are available, raise an error
794
+ raise RuntimeError(
795
+ f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
796
+ )
797
+ if (
798
+ time_limit is not None
799
+ and gpu_partition.time_limit < time_limit
800
+ ):
801
+ # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
802
+ logger.warning(
803
+ "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
804
+ time_limit,
805
+ gpu_partition.partition_name,
806
+ gpu_partition.time_limit,
807
+ )
808
+ partition = gpu_partition.partition_name
809
+
810
+ if partition is None:
811
+ # Pick a partition based on time limit
812
+ partition = self.boss.partitions.get_partition(time_limit)
813
+
814
+ # Now generate all the arguments
815
+ if len(export_list) > 0:
816
+ # add --export to the sbatch
817
+ sbatch_line.append("--export=" + ",".join(export_list))
818
+ if partition is not None:
819
+ sbatch_line.append(f"--partition={partition}")
820
+ if gpus:
821
+ # Generate GPU assignment argument
822
+ sbatch_line.append(f"--gres=gpu:{gpus}")
823
+ if partition is not None and partition not in self.boss.partitions.gpu_partitions:
824
+ # the specified partition is not compatible, so warn the user that the job may not work
825
+ logger.warning(
826
+ f"Job {jobName} needs GPUs, but specified partition {partition} does not have them. This job may not work."
827
+ f"Try specifying one of these partitions instead: {', '.join(self.boss.partitions.gpu_partitions)}."
828
+ )
829
+ if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined]
442
830
  # memory passed in is in bytes, but slurm expects megabytes
443
- sbatch_line.append(f'--mem={math.ceil(mem / 2 ** 20)}')
831
+ sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
444
832
  if cpu is not None:
445
- sbatch_line.append(f'--cpus-per-task={math.ceil(cpu)}')
833
+ sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
834
+ if time_limit is not None:
835
+ # Put all the seconds in the seconds slot
836
+ sbatch_line.append(f"--time=0:{time_limit}")
446
837
 
447
- stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'out')
448
- stderrfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'err')
449
- sbatch_line.extend(['-o', stdoutfile, '-e', stderrfile])
838
+ stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
839
+ stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
840
+ sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
450
841
 
451
842
  return sbatch_line
452
843
 
453
- def parse_elapsed(self, elapsed):
454
- # slurm returns elapsed time in days-hours:minutes:seconds format
455
- # Sometimes it will only return minutes:seconds, so days may be omitted
456
- # For ease of calculating, we'll make sure all the delimeters are ':'
457
- # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
458
- total_seconds = 0
459
- try:
460
- elapsed = elapsed.replace('-', ':').split(':')
461
- elapsed.reverse()
462
- seconds_per_unit = [1, 60, 3600, 86400]
463
- for index, multiplier in enumerate(seconds_per_unit):
464
- if index < len(elapsed):
465
- total_seconds += multiplier * int(elapsed[index])
466
- except ValueError:
467
- pass # slurm may return INVALID instead of a time
468
- return total_seconds
844
+ def __init__(
845
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
846
+ ) -> None:
847
+ super().__init__(config, maxCores, maxMemory, maxDisk)
848
+ self.partitions = SlurmBatchSystem.PartitionSet()
849
+
850
+ # Override issuing jobs so we can check if we need to use Slurm's magic
851
+ # whole-node-memory feature.
852
+ def issueBatchJob(
853
+ self,
854
+ command: str,
855
+ job_desc: JobDescription,
856
+ job_environment: dict[str, str] | None = None,
857
+ ) -> int:
858
+ # Avoid submitting internal jobs to the batch queue, handle locally
859
+ local_id = self.handleLocalJob(command, job_desc)
860
+ if local_id is not None:
861
+ return local_id
862
+ else:
863
+ self.check_resource_request(job_desc)
864
+ gpus = self.count_needed_gpus(job_desc)
865
+ job_id = self.getNextJobID()
866
+ self.currentJobs.add(job_id)
867
+
868
+ if "memory" not in job_desc.requirements and self.config.slurm_default_all_mem: # type: ignore[attr-defined]
869
+ # The job doesn't have its own memory requirement, and we are
870
+ # defaulting to whole node memory. Use Slurm's 0-memory sentinel.
871
+ memory = 0
872
+ else:
873
+ # Use the memory actually on the job, or the Toil default memory
874
+ memory = job_desc.memory
875
+
876
+ self.newJobsQueue.put(
877
+ (
878
+ job_id,
879
+ job_desc.cores,
880
+ memory,
881
+ command,
882
+ get_job_kind(job_desc.get_names()),
883
+ job_environment,
884
+ gpus,
885
+ )
886
+ )
887
+ logger.debug(
888
+ "Issued the job command: %s with job id: %s and job name %s",
889
+ command,
890
+ str(job_id),
891
+ get_job_kind(job_desc.get_names()),
892
+ )
893
+ return job_id
469
894
 
470
895
  def _check_accelerator_request(self, requirer: Requirer) -> None:
471
896
  for accelerator in requirer.accelerators:
472
- if accelerator['kind'] != 'gpu':
473
- raise InsufficientSystemResources(requirer, 'accelerators', details=
474
- [
475
- f'The accelerator {accelerator} could not be provided'
476
- 'The Toil Slurm batch system only supports gpu accelerators at the moment.'
477
- ])
897
+ if accelerator["kind"] != "gpu":
898
+ raise InsufficientSystemResources(
899
+ requirer,
900
+ "accelerators",
901
+ details=[
902
+ f"The accelerator {accelerator} could not be provided"
903
+ "The Toil Slurm batch system only supports gpu accelerators at the moment."
904
+ ],
905
+ )
478
906
 
479
907
  ###
480
908
  ### The interface for SLURM
@@ -488,17 +916,85 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
488
916
  # implement getWaitDuration().
489
917
 
490
918
  @classmethod
491
- def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]):
492
- allocate_mem = parser.add_mutually_exclusive_group()
493
- allocate_mem_help = ("A flag that can block allocating memory with '--mem' for job submissions "
494
- "on SLURM since some system servers may reject any job request that "
495
- "explicitly specifies the memory allocation. The default is to always allocate memory.")
496
- allocate_mem.add_argument("--dont_allocate_mem", action='store_false', dest="allocate_mem", help=allocate_mem_help)
497
- allocate_mem.add_argument("--allocate_mem", action='store_true', dest="allocate_mem", help=allocate_mem_help)
498
- allocate_mem.set_defaults(allocate_mem=True)
499
-
500
- OptionType = TypeVar('OptionType')
919
+ def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
920
+
921
+ parser.add_argument(
922
+ "--slurmAllocateMem",
923
+ dest="slurm_allocate_mem",
924
+ type=strtobool,
925
+ default=True,
926
+ env_var="TOIL_SLURM_ALLOCATE_MEM",
927
+ help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs "
928
+ "with memory allocations.",
929
+ )
930
+ # Keep these deprecated options for backward compatibility
931
+ parser.add_argument(
932
+ "--dont_allocate_mem",
933
+ action="store_false",
934
+ dest="slurm_allocate_mem",
935
+ help=SUPPRESS,
936
+ )
937
+ parser.add_argument(
938
+ "--allocate_mem",
939
+ action="store_true",
940
+ dest="slurm_allocate_mem",
941
+ help=SUPPRESS,
942
+ )
943
+
944
+ parser.add_argument(
945
+ "--slurmDefaultAllMem",
946
+ dest="slurm_default_all_mem",
947
+ type=strtobool,
948
+ default=False,
949
+ env_var="TOIL_SLURM_DEFAULT_ALL_MEM",
950
+ help="If True, assign Toil jobs without their own memory requirements all available "
951
+ "memory on a Slurm node (via Slurm --mem=0).",
952
+ )
953
+ parser.add_argument(
954
+ "--slurmTime",
955
+ dest="slurm_time",
956
+ type=parse_slurm_time,
957
+ default=None,
958
+ env_var="TOIL_SLURM_TIME",
959
+ help="Slurm job time limit, in [DD-]HH:MM:SS format.",
960
+ )
961
+ parser.add_argument(
962
+ "--slurmPartition",
963
+ dest="slurm_partition",
964
+ default=None,
965
+ env_var="TOIL_SLURM_PARTITION",
966
+ help="Partition to send Slurm jobs to.",
967
+ )
968
+ parser.add_argument(
969
+ "--slurmGPUPartition",
970
+ dest="slurm_gpu_partition",
971
+ default=None,
972
+ env_var="TOIL_SLURM_GPU_PARTITION",
973
+ help="Partition to send Slurm jobs to if they ask for GPUs.",
974
+ )
975
+ parser.add_argument(
976
+ "--slurmPE",
977
+ dest="slurm_pe",
978
+ default=None,
979
+ env_var="TOIL_SLURM_PE",
980
+ help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.",
981
+ )
982
+ parser.add_argument(
983
+ "--slurmArgs",
984
+ dest="slurm_args",
985
+ default="",
986
+ env_var="TOIL_SLURM_ARGS",
987
+ help="Extra arguments to pass to Slurm.",
988
+ )
989
+
990
+ OptionType = TypeVar("OptionType")
991
+
501
992
  @classmethod
502
993
  def setOptions(cls, setOption: OptionSetter) -> None:
503
- setOption("allocate_mem")
504
-
994
+ setOption("slurm_allocate_mem")
995
+ setOption("slurm_default_all_mem")
996
+ setOption("slurm_time")
997
+ setOption("slurm_partition")
998
+ setOption("slurm_gpu_partition")
999
+ setOption("slurm_pe")
1000
+ setOption("slurm_args")