toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (190)
  1. toil/__init__.py +121 -83
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +38 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +489 -137
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +630 -359
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1114 -532
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +988 -315
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +727 -403
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +193 -58
  49. toil/lib/aws/utils.py +238 -218
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +322 -209
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +4 -2
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +99 -11
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +65 -18
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +19 -7
  71. toil/lib/retry.py +115 -77
  72. toil/lib/threading.py +282 -80
  73. toil/lib/throttle.py +15 -14
  74. toil/options/common.py +834 -401
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +70 -19
  78. toil/provisioners/__init__.py +111 -46
  79. toil/provisioners/abstractProvisioner.py +322 -157
  80. toil/provisioners/aws/__init__.py +62 -30
  81. toil/provisioners/aws/awsProvisioner.py +980 -627
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +147 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +127 -61
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +148 -64
  98. toil/test/__init__.py +263 -179
  99. toil/test/batchSystems/batchSystemTest.py +438 -195
  100. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +93 -47
  104. toil/test/cactus/test_cactus_integration.py +20 -22
  105. toil/test/cwl/cwlTest.py +271 -71
  106. toil/test/cwl/measure_default_memory.cwl +12 -0
  107. toil/test/cwl/not_run_required_input.cwl +29 -0
  108. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  109. toil/test/docs/scriptsTest.py +60 -34
  110. toil/test/jobStores/jobStoreTest.py +412 -235
  111. toil/test/lib/aws/test_iam.py +116 -48
  112. toil/test/lib/aws/test_s3.py +16 -9
  113. toil/test/lib/aws/test_utils.py +5 -6
  114. toil/test/lib/dockerTest.py +118 -141
  115. toil/test/lib/test_conversions.py +113 -115
  116. toil/test/lib/test_ec2.py +57 -49
  117. toil/test/lib/test_integration.py +104 -0
  118. toil/test/lib/test_misc.py +12 -5
  119. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  120. toil/test/mesos/helloWorld.py +7 -6
  121. toil/test/mesos/stress.py +25 -20
  122. toil/test/options/options.py +7 -2
  123. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  124. toil/test/provisioners/clusterScalerTest.py +440 -250
  125. toil/test/provisioners/clusterTest.py +81 -42
  126. toil/test/provisioners/gceProvisionerTest.py +174 -100
  127. toil/test/provisioners/provisionerTest.py +25 -13
  128. toil/test/provisioners/restartScript.py +5 -4
  129. toil/test/server/serverTest.py +188 -141
  130. toil/test/sort/restart_sort.py +137 -68
  131. toil/test/sort/sort.py +134 -66
  132. toil/test/sort/sortTest.py +91 -49
  133. toil/test/src/autoDeploymentTest.py +140 -100
  134. toil/test/src/busTest.py +20 -18
  135. toil/test/src/checkpointTest.py +8 -2
  136. toil/test/src/deferredFunctionTest.py +49 -35
  137. toil/test/src/dockerCheckTest.py +33 -26
  138. toil/test/src/environmentTest.py +20 -10
  139. toil/test/src/fileStoreTest.py +538 -271
  140. toil/test/src/helloWorldTest.py +7 -4
  141. toil/test/src/importExportFileTest.py +61 -31
  142. toil/test/src/jobDescriptionTest.py +32 -17
  143. toil/test/src/jobEncapsulationTest.py +2 -0
  144. toil/test/src/jobFileStoreTest.py +74 -50
  145. toil/test/src/jobServiceTest.py +187 -73
  146. toil/test/src/jobTest.py +120 -70
  147. toil/test/src/miscTests.py +19 -18
  148. toil/test/src/promisedRequirementTest.py +82 -36
  149. toil/test/src/promisesTest.py +7 -6
  150. toil/test/src/realtimeLoggerTest.py +6 -6
  151. toil/test/src/regularLogTest.py +71 -37
  152. toil/test/src/resourceTest.py +80 -49
  153. toil/test/src/restartDAGTest.py +36 -22
  154. toil/test/src/resumabilityTest.py +9 -2
  155. toil/test/src/retainTempDirTest.py +45 -14
  156. toil/test/src/systemTest.py +12 -8
  157. toil/test/src/threadingTest.py +44 -25
  158. toil/test/src/toilContextManagerTest.py +10 -7
  159. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  160. toil/test/src/workerTest.py +33 -16
  161. toil/test/utils/toilDebugTest.py +70 -58
  162. toil/test/utils/toilKillTest.py +4 -5
  163. toil/test/utils/utilsTest.py +239 -102
  164. toil/test/wdl/wdltoil_test.py +789 -148
  165. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  166. toil/toilState.py +52 -26
  167. toil/utils/toilConfig.py +13 -4
  168. toil/utils/toilDebugFile.py +44 -27
  169. toil/utils/toilDebugJob.py +85 -25
  170. toil/utils/toilDestroyCluster.py +11 -6
  171. toil/utils/toilKill.py +8 -3
  172. toil/utils/toilLaunchCluster.py +251 -145
  173. toil/utils/toilMain.py +37 -16
  174. toil/utils/toilRsyncCluster.py +27 -14
  175. toil/utils/toilSshCluster.py +45 -22
  176. toil/utils/toilStats.py +75 -36
  177. toil/utils/toilStatus.py +226 -119
  178. toil/utils/toilUpdateEC2Instances.py +3 -1
  179. toil/version.py +11 -11
  180. toil/wdl/utils.py +5 -5
  181. toil/wdl/wdltoil.py +3513 -1052
  182. toil/worker.py +269 -128
  183. toil-8.0.0.dist-info/METADATA +173 -0
  184. toil-8.0.0.dist-info/RECORD +253 -0
  185. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  186. toil-7.0.0.dist-info/METADATA +0 -158
  187. toil-7.0.0.dist-info/RECORD +0 -244
  188. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
  189. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  190. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
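The list above is the registry-level summary of the upgrade; the per-file diff shown below is for toil/batchSystems/slurm.py (entry 21). As a quick, hedged sketch (standard library only, not part of the package), you can confirm which of the two versions is installed in your environment before reading the diff against it:

    from importlib.metadata import version

    # Prints the installed distribution version, e.g. "7.0.0" before the
    # upgrade and "8.0.0" after it.
    print(version("toil"))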
toil/batchSystems/slurm.py
@@ -11,19 +11,31 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ from __future__ import annotations
+
  import logging
  import math
  import os
- from argparse import ArgumentParser, _ArgumentGroup
+ import sys
+ from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
  from shlex import quote
- from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
-
- from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
- from toil.batchSystems.abstractGridEngineBatchSystem import \
- AbstractGridEngineBatchSystem
+ from typing import NamedTuple, TypeVar
+
+ from toil.batchSystems.abstractBatchSystem import (
+ EXIT_STATUS_UNAVAILABLE_VALUE,
+ BatchJobExitReason,
+ InsufficientSystemResources,
+ )
+ from toil.batchSystems.abstractGridEngineBatchSystem import (
+ AbstractGridEngineBatchSystem,
+ )
  from toil.batchSystems.options import OptionSetter
- from toil.job import Requirer
+ from toil.bus import get_job_kind
+ from toil.common import Config
+ from toil.job import JobDescription, Requirer
+ from toil.lib.conversions import strtobool
  from toil.lib.misc import CalledProcessErrorStderr, call_command
+ from toil.statsAndLogging import TRACE

  logger = logging.getLogger(__name__)

@@ -36,7 +48,7 @@ logger = logging.getLogger(__name__)
  # it, so Toil should wait for it.
  #
  # We map from each terminal state to the Toil-ontology exit reason.
- TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+ TERMINAL_STATES: dict[str, BatchJobExitReason] = {
  "BOOT_FAIL": BatchJobExitReason.LOST,
  "CANCELLED": BatchJobExitReason.KILLED,
  "COMPLETED": BatchJobExitReason.FINISHED,
@@ -47,12 +59,12 @@ TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
  "PREEMPTED": BatchJobExitReason.KILLED,
  "REVOKED": BatchJobExitReason.KILLED,
  "SPECIAL_EXIT": BatchJobExitReason.FAILED,
- "TIMEOUT": BatchJobExitReason.KILLED
+ "TIMEOUT": BatchJobExitReason.KILLED,
  }

  # If a job is in one of these states, it might eventually move to a different
  # state.
- NONTERMINAL_STATES: Set[str] = {
+ NONTERMINAL_STATES: set[str] = {
  "CONFIGURING",
  "COMPLETING",
  "PENDING",
@@ -65,51 +77,194 @@ NONTERMINAL_STATES: Set[str] = {
  "SIGNALING",
  "STAGE_OUT",
  "STOPPED",
- "SUSPENDED"
- }
+ "SUSPENDED",
+ }
+
+
+ def parse_slurm_time(slurm_time: str) -> int:
+ """
+ Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds.
+
+ Raises ValueError if not parseable.
+ """
+ # slurm returns time in days-hours:minutes:seconds format
+ # Sometimes it will only return minutes:seconds, so days may be omitted
+ # For ease of calculating, we'll make sure all the delimeters are ':'
+ # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
+ total_seconds = 0
+ elapsed_split: list[str] = slurm_time.replace("-", ":").split(":")
+ elapsed_split.reverse()
+ seconds_per_unit = [1, 60, 3600, 86400]
+ for index, multiplier in enumerate(seconds_per_unit):
+ if index < len(elapsed_split):
+ total_seconds += multiplier * int(elapsed_split[index])
+ return total_seconds
+

  class SlurmBatchSystem(AbstractGridEngineBatchSystem):
+ class PartitionInfo(NamedTuple):
+ partition_name: str
+ gres: bool
+ time_limit: float
+ priority: int
+ cpus: str
+ memory: str
+
+ class PartitionSet:
+ """
+ Set of available partitions detected on the slurm batch system
+ """
+
+ default_gpu_partition: SlurmBatchSystem.PartitionInfo | None
+ all_partitions: list[SlurmBatchSystem.PartitionInfo]
+ gpu_partitions: set[str]
+
+ def __init__(self) -> None:
+ self._get_partition_info()
+ self._get_gpu_partitions()
+
+ def _get_gpu_partitions(self) -> None:
+ """
+ Get all available GPU partitions. Also get the default GPU partition.
+ :return: None
+ """
+ gpu_partitions = [
+ partition for partition in self.all_partitions if partition.gres
+ ]
+ self.gpu_partitions = {p.partition_name for p in gpu_partitions}
+ # Grab the lowest priority GPU partition
+ # If no GPU partitions are available, then set the default to None
+ self.default_gpu_partition = None
+ if len(gpu_partitions) > 0:
+ self.default_gpu_partition = sorted(
+ gpu_partitions, key=lambda x: x.priority
+ )[0]
+
+ def _get_partition_info(self) -> None:
+ """
+ Call the Slurm batch system with sinfo to grab all available partitions.
+ Then parse the output and store all available Slurm partitions
+ :return: None
+ """
+ sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"]
+
+ sinfo = call_command(sinfo_command)
+
+ parsed_partitions = []
+ for line in sinfo.split("\n")[1:]:
+ if line.strip():
+ partition_name, gres, time, priority, cpus, memory = line.split(" ")
+ try:
+ # Parse time to a number so we can compute on it
+ partition_time: float = parse_slurm_time(time)
+ except ValueError:
+ # Maybe time is unlimited?
+ partition_time = float("inf")
+ try:
+ # Parse priority to an int so we can sort on it
+ partition_priority = int(priority)
+ except ValueError:
+ logger.warning(
+ "Could not parse priority %s for partition %s, assuming high priority",
+ partition_name,
+ priority,
+ )
+ partition_priority = sys.maxsize
+ parsed_partitions.append(
+ SlurmBatchSystem.PartitionInfo(
+ partition_name.rstrip("*"),
+ gres != "(null)",
+ partition_time,
+ partition_priority,
+ cpus,
+ memory,
+ )
+ )
+ self.all_partitions = parsed_partitions
+
+ def get_partition(self, time_limit: float | None) -> str | None:
+ """
+ Get the partition name to use for a job with the given time limit.
+ """
+
+ if time_limit is None:
+ # Just use Slurm's default
+ return None
+
+ winning_partition = None
+ for partition in self.all_partitions:
+ if partition.time_limit >= time_limit and (
+ winning_partition is None
+ or partition.time_limit < winning_partition.time_limit
+ ):
+ # If this partition can fit the job and is faster than the current winner, take it
+ winning_partition = partition
+ # TODO: Store partitions in a better indexed way
+ if winning_partition is None and len(self.all_partitions) > 0:
+ # We have partitions and none of them can fit this
+ raise RuntimeError(
+ "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+ )
+
+ if winning_partition is None:
+ return None
+ else:
+ return winning_partition.partition_name

  class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+ # Our boss is always the enclosing class
+ boss: SlurmBatchSystem

- def getRunningJobIDs(self):
+ def getRunningJobIDs(self) -> dict[int, int]:
  # Should return a dictionary of Job IDs and number of seconds
  times = {}
  with self.runningJobsLock:
- currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs}
+ currentjobs: dict[str, int] = {
+ str(self.batchJobIDs[x][0]): x for x in self.runningJobs
+ }
  # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id
  # squeue arguments:
  # -h for no header
  # --format to get jobid i, state %t and time days-hours:minutes:seconds

- lines = call_command(['squeue', '-h', '--format', '%i %t %M'], quiet=True).split('\n')
+ lines = call_command(
+ ["squeue", "-h", "--format", "%i %t %M"], quiet=True
+ ).split("\n")
  for line in lines:
  values = line.split()
  if len(values) < 3:
  continue
  slurm_jobid, state, elapsed_time = values
- if slurm_jobid in currentjobs and state == 'R':
- seconds_running = self.parse_elapsed(elapsed_time)
+ if slurm_jobid in currentjobs and state == "R":
+ try:
+ seconds_running = parse_slurm_time(elapsed_time)
+ except ValueError:
+ # slurm may return INVALID instead of a time
+ seconds_running = 0
  times[currentjobs[slurm_jobid]] = seconds_running

  return times

- def killJob(self, jobID):
- call_command(['scancel', self.getBatchSystemID(jobID)])
-
- def prepareSubmission(self,
- cpu: int,
- memory: int,
- jobID: int,
- command: str,
- jobName: str,
- job_environment: Optional[Dict[str, str]] = None,
- gpus: Optional[int] = None) -> List[str]:
+ def killJob(self, jobID: int) -> None:
+ call_command(["scancel", self.getBatchSystemID(jobID)])
+
+ def prepareSubmission(
+ self,
+ cpu: int,
+ memory: int,
+ jobID: int,
+ command: str,
+ jobName: str,
+ job_environment: dict[str, str] | None = None,
+ gpus: int | None = None,
+ ) -> list[str]:
  # Make sure to use exec so we can get Slurm's signals in the Toil
  # worker instead of having an intervening Bash
- return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']
+ return self.prepareSbatch(
+ cpu, memory, jobID, jobName, job_environment, gpus
+ ) + [f"--wrap=exec {command}"]

- def submitJob(self, subLine):
+ def submitJob(self, subLine: list[str]) -> int:
  try:
  # Slurm is not quite clever enough to follow the XDG spec on
  # its own. If the submission command sees e.g. XDG_RUNTIME_DIR
@@ -125,7 +280,11 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # This doesn't get us a trustworthy XDG session in Slurm, but
  # it does let us see the one Slurm tries to give us.
  no_session_environment = os.environ.copy()
- session_names = [n for n in no_session_environment.keys() if n.startswith('XDG_') or n.startswith('DBUS_')]
+ session_names = [
+ n
+ for n in no_session_environment.keys()
+ if n.startswith("XDG_") or n.startswith("DBUS_")
+ ]
  for name in session_names:
  del no_session_environment[name]

@@ -138,36 +297,44 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  logger.error(f"sbatch command failed with error: {e}")
  raise e

- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
+ def coalesce_job_exit_codes(
+ self, batch_job_id_list: list[str]
+ ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
  """
  Collect all job exit codes in a single call.
  :param batch_job_id_list: list of Job ID strings, where each string has the form
  "<job>[.<task>]".
  :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
  """
- logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
+ logger.log(
+ TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
+ )
  # Convert batch_job_id_list to list of integer job IDs.
- job_id_list = [int(id.split('.')[0]) for id in batch_job_id_list]
+ job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list]
  status_dict = self._get_job_details(job_id_list)
- exit_codes = []
+ exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = []
  for _, status in status_dict.items():
  exit_codes.append(self._get_job_return_code(status))
  return exit_codes

- def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
+ def getJobExitCode(
+ self, batchJobID: str
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
  """
  Get job exit code for given batch job ID.
  :param batchJobID: string of the form "<job>[.<task>]".
  :return: integer job exit code.
  """
- logger.debug("Getting exit code for slurm job: %s", batchJobID)
+ logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID)
  # Convert batchJobID to an integer job ID.
- job_id = int(batchJobID.split('.')[0])
+ job_id = int(batchJobID.split(".")[0])
  status_dict = self._get_job_details([job_id])
  status = status_dict[job_id]
  return self._get_job_return_code(status)

- def _get_job_details(self, job_id_list: list) -> dict:
+ def _get_job_details(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
  Fetch job details from Slurm's accounting system or job control system.
@@ -181,7 +348,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  status_dict = self._getJobDetailsFromScontrol(job_id_list)
  return status_dict

- def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
+ def _get_job_return_code(
+ self, status: tuple[str | None, int | None]
+ ) -> int | tuple[int, BatchJobExitReason | None] | None:
  """
  Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.

@@ -215,7 +384,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # The only state that should produce a 0 ever is COMPLETED. So
  # if the job is COMPLETED and the exit reason is thus FINISHED,
  # pass along the code it has.
- return (rc, exit_reason)
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well

  if rc == 0:
  # The job claims to be in a state other than COMPLETED, but
@@ -224,7 +393,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)

  # If the code is nonzero, pass it along.
- return (rc, exit_reason)
+ return (rc, exit_reason) # type: ignore[return-value] # mypy doesn't understand enums well

  def _canonicalize_state(self, state: str) -> str:
  """
@@ -233,18 +402,23 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):

  # Slurm will sometimes send something like "CANCELED by 30065" in
  # the state column for some reason.
-
+
  state_token = state

  if " " in state_token:
  state_token = state.split(" ", 1)[0]

- if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
+ if (
+ state_token not in TERMINAL_STATES
+ and state_token not in NONTERMINAL_STATES
+ ):
  raise RuntimeError("Toil job in unimplemented Slurm state " + state)
-
+
  return state_token

- def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
+ def _getJobDetailsFromSacct(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
  :param job_id_list: list of integer batch job IDs.
@@ -252,52 +426,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  containing the job's state and exit code.
  """
  job_ids = ",".join(str(id) for id in job_id_list)
- args = ['sacct',
- '-n', # no header
- '-j', job_ids, # job
- '--format', 'JobIDRaw,State,ExitCode', # specify output columns
- '-P', # separate columns with pipes
- '-S', '1970-01-01'] # override start time limit
+ args = [
+ "sacct",
+ "-n", # no header
+ "-j",
+ job_ids, # job
+ "--format",
+ "JobIDRaw,State,ExitCode", # specify output columns
+ "-P", # separate columns with pipes
+ "-S",
+ "1970-01-01",
+ ] # override start time limit
  stdout = call_command(args, quiet=True)

  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
  # job state and exit status. Initialize dict before processing output of `sacct`.
- job_statuses = {}
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
  for job_id in job_id_list:
  job_statuses[job_id] = (None, None)

  for line in stdout.splitlines():
- values = line.strip().split('|')
+ values = line.strip().split("|")
  if len(values) < 3:
  continue
+ state: str
  job_id_raw, state, exitcode = values
  state = self._canonicalize_state(state)
- logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
+ logger.log(
+ TRACE, "%s state of job %s is %s", args[0], job_id_raw, state
+ )
  # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
  job_id_parts = job_id_raw.split(".")
  if len(job_id_parts) > 1:
  continue
  job_id = int(job_id_parts[0])
- status, signal = (int(n) for n in exitcode.split(':'))
+ status: int
+ signal: int
+ status, signal = (int(n) for n in exitcode.split(":"))
  if signal > 0:
  # A non-zero signal may indicate e.g. an out-of-memory killed job
  status = 128 + signal
- logger.debug("%s exit code of job %d is %s, return status %d",
- args[0], job_id, exitcode, status)
+ logger.log(
+ TRACE,
+ "%s exit code of job %d is %s, return status %d",
+ args[0],
+ job_id,
+ exitcode,
+ status,
+ )
  job_statuses[job_id] = state, status
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
  return job_statuses

- def _getJobDetailsFromScontrol(self, job_id_list: list) -> dict:
+ def _getJobDetailsFromScontrol(
+ self, job_id_list: list[int]
+ ) -> dict[int, tuple[str | None, int | None]]:
  """
  Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`.
  :param job_id_list: list of integer batch job IDs.
  :return: dict of job statuses, where key is the job-id, and value is a tuple
  containing the job's state and exit code.
  """
- args = ['scontrol',
- 'show',
- 'job']
+ args = ["scontrol", "show", "job"]
  # `scontrol` can only return information about a single job,
  # or all the jobs it knows about.
  if len(job_id_list) == 1:
@@ -306,14 +496,16 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  stdout = call_command(args, quiet=True)

  # Job records are separated by a blank line.
+ job_records = None
  if isinstance(stdout, str):
- job_records = stdout.strip().split('\n\n')
+ job_records = stdout.strip().split("\n\n")
  elif isinstance(stdout, bytes):
- job_records = stdout.decode('utf-8').strip().split('\n\n')
+ job_records = stdout.decode("utf-8").strip().split("\n\n")

  # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
  # job state and exit status. Initialize dict before processing output of `scontrol`.
- job_statuses = {}
+ job_statuses: dict[int, tuple[str | None, int | None]] = {}
+ job_id: int | None
  for job_id in job_id_list:
  job_statuses[job_id] = (None, None)

@@ -323,7 +515,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  return job_statuses

  for record in job_records:
- job = {}
+ job: dict[str, str] = {}
+ job_id = None
  for line in record.splitlines():
  for item in line.split():
  # Output is in the form of many key=value pairs, multiple pairs on each line
@@ -331,59 +524,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # added to a dictionary.
  # Note: In some cases, the value itself may contain white-space. So, if we find
  # a key without a value, we consider that key part of the previous value.
- bits = item.split('=', 1)
+ bits = item.split("=", 1)
  if len(bits) == 1:
- job[key] += ' ' + bits[0]
+ job[key] += " " + bits[0] # type: ignore[has-type] # we depend on the previous iteration to populate key
  else:
  key = bits[0]
  job[key] = bits[1]
  # The first line of the record contains the JobId. Stop processing the remainder
  # of this record, if we're not interested in this job.
- job_id = int(job['JobId'])
+ job_id = int(job["JobId"])
  if job_id not in job_id_list:
- logger.debug("%s job %d is not in the list", args[0], job_id)
+ logger.log(
+ TRACE, "%s job %d is not in the list", args[0], job_id
+ )
  break
- if job_id not in job_id_list:
+ if job_id is None or job_id not in job_id_list:
  continue
- state = job['JobState']
+ state = job["JobState"]
  state = self._canonicalize_state(state)
- logger.debug("%s state of job %s is %s", args[0], job_id, state)
+ logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state)
  try:
- exitcode = job['ExitCode']
+ exitcode = job["ExitCode"]
  if exitcode is not None:
- status, signal = (int(n) for n in exitcode.split(':'))
+ status, signal = (int(n) for n in exitcode.split(":"))
  if signal > 0:
  # A non-zero signal may indicate e.g. an out-of-memory killed job
  status = 128 + signal
- logger.debug("%s exit code of job %d is %s, return status %d",
- args[0], job_id, exitcode, status)
+ logger.log(
+ TRACE,
+ "%s exit code of job %d is %s, return status %d",
+ args[0],
+ job_id,
+ exitcode,
+ status,
+ )
  rc = status
  else:
  rc = None
  except KeyError:
  rc = None
  job_statuses[job_id] = (state, rc)
- logger.debug("%s returning job statuses: %s", args[0], job_statuses)
+ logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
  return job_statuses

  ###
  ### Implementation-specific helper methods
  ###

- def prepareSbatch(self,
- cpu: int,
- mem: int,
- jobID: int,
- jobName: str,
- job_environment: Optional[Dict[str, str]],
- gpus: Optional[int]) -> List[str]:
-
+ def prepareSbatch(
+ self,
+ cpu: int,
+ mem: int,
+ jobID: int,
+ jobName: str,
+ job_environment: dict[str, str] | None,
+ gpus: int | None,
+ ) -> list[str]:
  """
  Returns the sbatch command line to run to queue the job.
  """

  # Start by naming the job
- sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
+ sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"]

  # Make sure the job gets a signal before it disappears so that e.g.
  # container cleanup finally blocks can run. Ask for SIGINT so we
@@ -398,25 +600,28 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # responded to this signal and use the right exit reason for it.
  sbatch_line.append("--signal=B:INT@30")

- if gpus:
- sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
  environment = {}
  environment.update(self.boss.environment)
  if job_environment:
  environment.update(job_environment)

  # "Native extensions" for SLURM (see DRMAA or SAGA)
- nativeConfig = os.getenv('TOIL_SLURM_ARGS')
+ # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
+ nativeConfig: str = self.boss.config.slurm_args # type: ignore[attr-defined]

  # --export=[ALL,]<environment_toil_variables>
  set_exports = "--export=ALL"

  if nativeConfig is not None:
- logger.debug("Native SLURM options appended to sbatch from TOIL_SLURM_ARGS env. variable: %s", nativeConfig)
+ logger.debug(
+ "Native SLURM options appended to sbatch: %s", nativeConfig
+ )

  for arg in nativeConfig.split():
  if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
- raise ValueError(f"Some resource arguments are incompatible: {nativeConfig}")
+ raise ValueError(
+ f"Some resource arguments are incompatible: {nativeConfig}"
+ )
  # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
  if arg.startswith("--export"):
  set_exports = arg
@@ -427,54 +632,149 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):

  for k, v in environment.items():
  quoted_value = quote(os.environ[k] if v is None else v)
- argList.append(f'{k}={quoted_value}')
+ argList.append(f"{k}={quoted_value}")

- set_exports += ',' + ','.join(argList)
+ set_exports += "," + ",".join(argList)

  # add --export to the sbatch
  sbatch_line.append(set_exports)

- parallel_env = os.getenv('TOIL_SLURM_PE')
+ parallel_env: str = self.boss.config.slurm_pe # type: ignore[attr-defined]
  if cpu and cpu > 1 and parallel_env:
- sbatch_line.append(f'--partition={parallel_env}')
+ sbatch_line.append(f"--partition={parallel_env}")

- if mem is not None and self.boss.config.allocate_mem:
+ if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined]
  # memory passed in is in bytes, but slurm expects megabytes
- sbatch_line.append(f'--mem={math.ceil(mem / 2 ** 20)}')
+ sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
  if cpu is not None:
- sbatch_line.append(f'--cpus-per-task={math.ceil(cpu)}')
+ sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
+
+ time_limit: int = self.boss.config.slurm_time # type: ignore[attr-defined]
+ if time_limit is not None:
+ # Put all the seconds in the seconds slot
+ sbatch_line.append(f"--time=0:{time_limit}")

- stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'out')
- stderrfile: str = self.boss.format_std_out_err_path(jobID, '%j', 'err')
- sbatch_line.extend(['-o', stdoutfile, '-e', stderrfile])
+ if gpus:
+ # This block will add a gpu supported partition only if no partition is supplied by the user
+ sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
+ if not any(option.startswith("--partition") for option in sbatch_line):
+ # no partition specified, so specify one
+ # try to get the name of the lowest priority gpu supported partition
+ lowest_gpu_partition = self.boss.partitions.default_gpu_partition
+ if lowest_gpu_partition is None:
+ # no gpu partitions are available, raise an error
+ raise RuntimeError(
+ f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+ )
+ if (
+ time_limit is not None
+ and lowest_gpu_partition.time_limit < time_limit
+ ):
+ # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+ logger.warning(
+ "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+ time_limit,
+ lowest_gpu_partition.partition_name,
+ lowest_gpu_partition.time_limit,
+ )
+ sbatch_line.append(
+ f"--partition={lowest_gpu_partition.partition_name}"
+ )
+ else:
+ # there is a partition specified already, check if the partition has GPUs
+ for i, option in enumerate(sbatch_line):
+ if option.startswith("--partition"):
+ # grab the partition name depending on if it's specified via an "=" or a space
+ if "=" in option:
+ partition_name = option[len("--partition=") :]
+ else:
+ partition_name = option[i + 1]
+ available_gpu_partitions = (
+ self.boss.partitions.gpu_partitions
+ )
+ if partition_name not in available_gpu_partitions:
+ # the specified partition is not compatible, so warn the user that the job may not work
+ logger.warning(
+ f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
+ f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
+ )
+ break
+
+ if not any(option.startswith("--partition") for option in sbatch_line):
+ # Pick a partition ourselves
+ chosen_partition = self.boss.partitions.get_partition(time_limit)
+ if chosen_partition is not None:
+ # Route to that partition
+ sbatch_line.append(f"--partition={chosen_partition}")
+
+ stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
+ stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
+ sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])

  return sbatch_line

- def parse_elapsed(self, elapsed):
- # slurm returns elapsed time in days-hours:minutes:seconds format
- # Sometimes it will only return minutes:seconds, so days may be omitted
- # For ease of calculating, we'll make sure all the delimeters are ':'
- # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
- total_seconds = 0
- try:
- elapsed = elapsed.replace('-', ':').split(':')
- elapsed.reverse()
- seconds_per_unit = [1, 60, 3600, 86400]
- for index, multiplier in enumerate(seconds_per_unit):
- if index < len(elapsed):
- total_seconds += multiplier * int(elapsed[index])
- except ValueError:
- pass # slurm may return INVALID instead of a time
- return total_seconds
+ def __init__(
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+ ) -> None:
+ super().__init__(config, maxCores, maxMemory, maxDisk)
+ self.partitions = SlurmBatchSystem.PartitionSet()
+
+ # Override issuing jobs so we can check if we need to use Slurm's magic
+ # whole-node-memory feature.
+ def issueBatchJob(
+ self,
+ command: str,
+ job_desc: JobDescription,
+ job_environment: dict[str, str] | None = None,
+ ) -> int:
+ # Avoid submitting internal jobs to the batch queue, handle locally
+ local_id = self.handleLocalJob(command, job_desc)
+ if local_id is not None:
+ return local_id
+ else:
+ self.check_resource_request(job_desc)
+ gpus = self.count_needed_gpus(job_desc)
+ job_id = self.getNextJobID()
+ self.currentJobs.add(job_id)
+
+ if "memory" not in job_desc.requirements and self.config.slurm_default_all_mem: # type: ignore[attr-defined]
+ # The job doesn't have its own memory requirement, and we are
+ # defaulting to whole node memory. Use Slurm's 0-memory sentinel.
+ memory = 0
+ else:
+ # Use the memory actually on the job, or the Toil default memory
+ memory = job_desc.memory
+
+ self.newJobsQueue.put(
+ (
+ job_id,
+ job_desc.cores,
+ memory,
+ command,
+ get_job_kind(job_desc.get_names()),
+ job_environment,
+ gpus,
+ )
+ )
+ logger.debug(
+ "Issued the job command: %s with job id: %s and job name %s",
+ command,
+ str(job_id),
+ get_job_kind(job_desc.get_names()),
+ )
+ return job_id

  def _check_accelerator_request(self, requirer: Requirer) -> None:
  for accelerator in requirer.accelerators:
- if accelerator['kind'] != 'gpu':
- raise InsufficientSystemResources(requirer, 'accelerators', details=
- [
- f'The accelerator {accelerator} could not be provided'
- 'The Toil Slurm batch system only supports gpu accelerators at the moment.'
- ])
+ if accelerator["kind"] != "gpu":
+ raise InsufficientSystemResources(
+ requirer,
+ "accelerators",
+ details=[
+ f"The accelerator {accelerator} could not be provided"
+ "The Toil Slurm batch system only supports gpu accelerators at the moment."
+ ],
+ )

  ###
  ### The interface for SLURM
@@ -488,17 +788,69 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
  # implement getWaitDuration().

  @classmethod
- def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]):
- allocate_mem = parser.add_mutually_exclusive_group()
- allocate_mem_help = ("A flag that can block allocating memory with '--mem' for job submissions "
- "on SLURM since some system servers may reject any job request that "
- "explicitly specifies the memory allocation. The default is to always allocate memory.")
- allocate_mem.add_argument("--dont_allocate_mem", action='store_false', dest="allocate_mem", help=allocate_mem_help)
- allocate_mem.add_argument("--allocate_mem", action='store_true', dest="allocate_mem", help=allocate_mem_help)
- allocate_mem.set_defaults(allocate_mem=True)
-
- OptionType = TypeVar('OptionType')
+ def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
+
+ parser.add_argument(
+ "--slurmAllocateMem",
+ dest="slurm_allocate_mem",
+ type=strtobool,
+ default=True,
+ env_var="TOIL_SLURM_ALLOCATE_MEM",
+ help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs "
+ "with memory allocations.",
+ )
+ # Keep these deprcated options for backward compatibility
+ parser.add_argument(
+ "--dont_allocate_mem",
+ action="store_false",
+ dest="slurm_allocate_mem",
+ help=SUPPRESS,
+ )
+ parser.add_argument(
+ "--allocate_mem",
+ action="store_true",
+ dest="slurm_allocate_mem",
+ help=SUPPRESS,
+ )
+
+ parser.add_argument(
+ "--slurmDefaultAllMem",
+ dest="slurm_default_all_mem",
+ type=strtobool,
+ default=False,
+ env_var="TOIL_SLURM_DEFAULT_ALL_MEM",
+ help="If True, assign Toil jobs without their own memory requirements all available "
+ "memory on a Slurm node (via Slurm --mem=0).",
+ )
+ parser.add_argument(
+ "--slurmTime",
+ dest="slurm_time",
+ type=parse_slurm_time,
+ default=None,
+ env_var="TOIL_SLURM_TIME",
+ help="Slurm job time limit, in [DD-]HH:MM:SS format.",
+ )
+ parser.add_argument(
+ "--slurmPE",
+ dest="slurm_pe",
+ default=None,
+ env_var="TOIL_SLURM_PE",
+ help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.",
+ )
+ parser.add_argument(
+ "--slurmArgs",
+ dest="slurm_args",
+ default="",
+ env_var="TOIL_SLURM_ARGS",
+ help="Extra arguments to pass to Slurm.",
+ )
+
+ OptionType = TypeVar("OptionType")
+
  @classmethod
  def setOptions(cls, setOption: OptionSetter) -> None:
- setOption("allocate_mem")
-
+ setOption("slurm_allocate_mem")
+ setOption("slurm_default_all_mem")
+ setOption("slurm_time")
+ setOption("slurm_pe")
+ setOption("slurm_args")