toil-7.0.0-py3-none-any.whl → toil-8.1.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197):
  1. toil/__init__.py +124 -86
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +39 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +651 -155
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +784 -397
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1137 -534
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +1031 -349
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +772 -412
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +204 -58
  49. toil/lib/aws/utils.py +290 -213
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/dockstore.py +379 -0
  55. toil/lib/ec2.py +322 -209
  56. toil/lib/ec2nodes.py +174 -105
  57. toil/lib/encryption/_dummy.py +5 -3
  58. toil/lib/encryption/_nacl.py +10 -6
  59. toil/lib/encryption/conftest.py +1 -0
  60. toil/lib/exceptions.py +26 -7
  61. toil/lib/expando.py +4 -2
  62. toil/lib/ftp_utils.py +217 -0
  63. toil/lib/generatedEC2Lists.py +127 -19
  64. toil/lib/history.py +1271 -0
  65. toil/lib/history_submission.py +681 -0
  66. toil/lib/humanize.py +6 -2
  67. toil/lib/io.py +121 -12
  68. toil/lib/iterables.py +4 -2
  69. toil/lib/memoize.py +12 -8
  70. toil/lib/misc.py +83 -18
  71. toil/lib/objects.py +2 -2
  72. toil/lib/resources.py +19 -7
  73. toil/lib/retry.py +125 -87
  74. toil/lib/threading.py +282 -80
  75. toil/lib/throttle.py +15 -14
  76. toil/lib/trs.py +390 -0
  77. toil/lib/web.py +38 -0
  78. toil/options/common.py +850 -402
  79. toil/options/cwl.py +185 -90
  80. toil/options/runner.py +50 -0
  81. toil/options/wdl.py +70 -19
  82. toil/provisioners/__init__.py +111 -46
  83. toil/provisioners/abstractProvisioner.py +322 -157
  84. toil/provisioners/aws/__init__.py +62 -30
  85. toil/provisioners/aws/awsProvisioner.py +980 -627
  86. toil/provisioners/clusterScaler.py +541 -279
  87. toil/provisioners/gceProvisioner.py +283 -180
  88. toil/provisioners/node.py +147 -79
  89. toil/realtimeLogger.py +34 -22
  90. toil/resource.py +137 -75
  91. toil/server/app.py +127 -61
  92. toil/server/celery_app.py +3 -1
  93. toil/server/cli/wes_cwl_runner.py +84 -55
  94. toil/server/utils.py +56 -31
  95. toil/server/wes/abstract_backend.py +64 -26
  96. toil/server/wes/amazon_wes_utils.py +21 -15
  97. toil/server/wes/tasks.py +121 -63
  98. toil/server/wes/toil_backend.py +142 -107
  99. toil/server/wsgi_app.py +4 -3
  100. toil/serviceManager.py +58 -22
  101. toil/statsAndLogging.py +183 -65
  102. toil/test/__init__.py +263 -179
  103. toil/test/batchSystems/batchSystemTest.py +438 -195
  104. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  105. toil/test/batchSystems/test_gridengine.py +173 -0
  106. toil/test/batchSystems/test_lsf_helper.py +67 -58
  107. toil/test/batchSystems/test_slurm.py +265 -49
  108. toil/test/cactus/test_cactus_integration.py +20 -22
  109. toil/test/cwl/conftest.py +39 -0
  110. toil/test/cwl/cwlTest.py +375 -72
  111. toil/test/cwl/measure_default_memory.cwl +12 -0
  112. toil/test/cwl/not_run_required_input.cwl +29 -0
  113. toil/test/cwl/optional-file.cwl +18 -0
  114. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  115. toil/test/docs/scriptsTest.py +60 -34
  116. toil/test/jobStores/jobStoreTest.py +412 -235
  117. toil/test/lib/aws/test_iam.py +116 -48
  118. toil/test/lib/aws/test_s3.py +16 -9
  119. toil/test/lib/aws/test_utils.py +5 -6
  120. toil/test/lib/dockerTest.py +118 -141
  121. toil/test/lib/test_conversions.py +113 -115
  122. toil/test/lib/test_ec2.py +57 -49
  123. toil/test/lib/test_history.py +212 -0
  124. toil/test/lib/test_misc.py +12 -5
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  127. toil/test/mesos/helloWorld.py +7 -6
  128. toil/test/mesos/stress.py +25 -20
  129. toil/test/options/options.py +7 -2
  130. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  131. toil/test/provisioners/clusterScalerTest.py +440 -250
  132. toil/test/provisioners/clusterTest.py +81 -42
  133. toil/test/provisioners/gceProvisionerTest.py +174 -100
  134. toil/test/provisioners/provisionerTest.py +25 -13
  135. toil/test/provisioners/restartScript.py +5 -4
  136. toil/test/server/serverTest.py +188 -141
  137. toil/test/sort/restart_sort.py +137 -68
  138. toil/test/sort/sort.py +134 -66
  139. toil/test/sort/sortTest.py +91 -49
  140. toil/test/src/autoDeploymentTest.py +140 -100
  141. toil/test/src/busTest.py +20 -18
  142. toil/test/src/checkpointTest.py +8 -2
  143. toil/test/src/deferredFunctionTest.py +49 -35
  144. toil/test/src/dockerCheckTest.py +33 -26
  145. toil/test/src/environmentTest.py +20 -10
  146. toil/test/src/fileStoreTest.py +538 -271
  147. toil/test/src/helloWorldTest.py +7 -4
  148. toil/test/src/importExportFileTest.py +61 -31
  149. toil/test/src/jobDescriptionTest.py +32 -17
  150. toil/test/src/jobEncapsulationTest.py +2 -0
  151. toil/test/src/jobFileStoreTest.py +74 -50
  152. toil/test/src/jobServiceTest.py +187 -73
  153. toil/test/src/jobTest.py +120 -70
  154. toil/test/src/miscTests.py +19 -18
  155. toil/test/src/promisedRequirementTest.py +82 -36
  156. toil/test/src/promisesTest.py +7 -6
  157. toil/test/src/realtimeLoggerTest.py +6 -6
  158. toil/test/src/regularLogTest.py +71 -37
  159. toil/test/src/resourceTest.py +80 -49
  160. toil/test/src/restartDAGTest.py +36 -22
  161. toil/test/src/resumabilityTest.py +9 -2
  162. toil/test/src/retainTempDirTest.py +45 -14
  163. toil/test/src/systemTest.py +12 -8
  164. toil/test/src/threadingTest.py +44 -25
  165. toil/test/src/toilContextManagerTest.py +10 -7
  166. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  167. toil/test/src/workerTest.py +33 -16
  168. toil/test/utils/toilDebugTest.py +70 -58
  169. toil/test/utils/toilKillTest.py +4 -5
  170. toil/test/utils/utilsTest.py +239 -102
  171. toil/test/wdl/wdltoil_test.py +789 -148
  172. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  173. toil/toilState.py +52 -26
  174. toil/utils/toilConfig.py +13 -4
  175. toil/utils/toilDebugFile.py +44 -27
  176. toil/utils/toilDebugJob.py +85 -25
  177. toil/utils/toilDestroyCluster.py +11 -6
  178. toil/utils/toilKill.py +8 -3
  179. toil/utils/toilLaunchCluster.py +251 -145
  180. toil/utils/toilMain.py +37 -16
  181. toil/utils/toilRsyncCluster.py +27 -14
  182. toil/utils/toilSshCluster.py +45 -22
  183. toil/utils/toilStats.py +75 -36
  184. toil/utils/toilStatus.py +226 -119
  185. toil/utils/toilUpdateEC2Instances.py +3 -1
  186. toil/version.py +6 -6
  187. toil/wdl/utils.py +5 -5
  188. toil/wdl/wdltoil.py +3528 -1053
  189. toil/worker.py +370 -149
  190. toil-8.1.0b1.dist-info/METADATA +178 -0
  191. toil-8.1.0b1.dist-info/RECORD +259 -0
  192. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  193. toil-7.0.0.dist-info/METADATA +0 -158
  194. toil-7.0.0.dist-info/RECORD +0 -244
  195. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  196. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  197. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
@@ -30,22 +30,10 @@ import tempfile
30
30
  import time
31
31
  import uuid
32
32
  from argparse import ArgumentParser, _ArgumentGroup
33
+ from collections.abc import Iterator
33
34
  from queue import Empty, Queue
34
35
  from threading import Condition, Event, RLock, Thread
35
- from typing import (Any,
36
- Callable,
37
- Dict,
38
- Iterator,
39
- List,
40
- Literal,
41
- Optional,
42
- Set,
43
- Tuple,
44
- Type,
45
- TypeVar,
46
- Union,
47
- cast,
48
- overload)
36
+ from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast, overload
49
37
 
50
38
  from toil.lib.conversions import opt_strtobool
51
39
 
@@ -53,72 +41,79 @@ if sys.version_info < (3, 10):
53
41
  from typing_extensions import ParamSpec
54
42
  else:
55
43
  from typing import ParamSpec
56
- if sys.version_info >= (3, 8):
57
- from typing import Protocol, TypedDict, runtime_checkable
44
+
45
+ if sys.version_info < (3, 11):
46
+ from typing_extensions import NotRequired
58
47
  else:
59
- from typing_extensions import Protocol, TypedDict, runtime_checkable
60
- # TODO: When this gets into the standard library, get it from there and drop
48
+ from typing import NotRequired
49
+
50
+ from typing import Protocol, TypedDict, runtime_checkable
51
+
61
52
  import urllib3
62
53
  import yaml
54
+
63
55
  # The Right Way to use the Kubernetes module is to `import kubernetes` and then you get all your stuff as like ApiClient. But this doesn't work for the stubs: the stubs seem to only support importing things from the internal modules in `kubernetes` where they are actually defined. See for example <https://github.com/MaterializeInc/kubernetes-stubs/issues/9> and <https://github.com/MaterializeInc/kubernetes-stubs/issues/10>. So we just import all the things we use into our global namespace here.
64
- from kubernetes.client import (BatchV1Api,
65
- CoreV1Api,
66
- CustomObjectsApi,
67
- V1Affinity,
68
- V1Container,
69
- V1ContainerStatus,
70
- V1EmptyDirVolumeSource,
71
- V1HostPathVolumeSource,
72
- V1Job,
73
- V1JobCondition,
74
- V1JobSpec,
75
- V1NodeAffinity,
76
- V1NodeSelector,
77
- V1NodeSelectorRequirement,
78
- V1NodeSelectorTerm,
79
- V1ObjectMeta,
80
- V1Pod,
81
- V1PodSpec,
82
- V1PodTemplateSpec,
83
- V1PreferredSchedulingTerm,
84
- V1ResourceRequirements,
85
- V1SecretVolumeSource,
86
- V1Toleration,
87
- V1Volume,
88
- V1VolumeMount, V1SecurityContext)
56
+ from kubernetes.client import (
57
+ BatchV1Api,
58
+ CoreV1Api,
59
+ CustomObjectsApi,
60
+ V1Affinity,
61
+ V1Container,
62
+ V1ContainerStatus,
63
+ V1EmptyDirVolumeSource,
64
+ V1HostPathVolumeSource,
65
+ V1Job,
66
+ V1JobCondition,
67
+ V1JobSpec,
68
+ V1NodeAffinity,
69
+ V1NodeSelector,
70
+ V1NodeSelectorRequirement,
71
+ V1NodeSelectorTerm,
72
+ V1ObjectMeta,
73
+ V1Pod,
74
+ V1PodSpec,
75
+ V1PodTemplateSpec,
76
+ V1PreferredSchedulingTerm,
77
+ V1ResourceRequirements,
78
+ V1SecretVolumeSource,
79
+ V1SecurityContext,
80
+ V1Toleration,
81
+ V1Volume,
82
+ V1VolumeMount,
83
+ )
89
84
  from kubernetes.client.api_client import ApiClient
90
85
  from kubernetes.client.exceptions import ApiException
91
86
  from kubernetes.config.config_exception import ConfigException
92
87
  from kubernetes.config.incluster_config import load_incluster_config
93
- from kubernetes.config.kube_config import (list_kube_config_contexts,
94
- load_kube_config)
88
+ from kubernetes.config.kube_config import list_kube_config_contexts, load_kube_config
89
+
95
90
  # TODO: Watch API is not typed yet
96
91
  from kubernetes.watch import Watch # type: ignore
97
- # typing-extensions dependency on Pythons that are new enough.
98
- from typing_extensions import NotRequired
99
92
 
100
93
  from toil import applianceSelf
101
- from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
102
- BatchJobExitReason,
103
- InsufficientSystemResources,
104
- ResourcePool,
105
- UpdatedBatchJobInfo)
94
+ from toil.batchSystems.abstractBatchSystem import (
95
+ EXIT_STATUS_UNAVAILABLE_VALUE,
96
+ BatchJobExitReason,
97
+ InsufficientSystemResources,
98
+ ResourcePool,
99
+ UpdatedBatchJobInfo,
100
+ )
106
101
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
107
102
  from toil.batchSystems.contained_executor import pack_job
108
103
  from toil.batchSystems.options import OptionSetter
109
104
  from toil.common import Config, Toil
110
- from toil.options.common import SYS_MAX_SIZE
111
105
  from toil.job import JobDescription, Requirer
112
106
  from toil.lib.conversions import human2bytes
113
107
  from toil.lib.misc import get_user_name, slow_down, utc_now
114
108
  from toil.lib.retry import ErrorCondition, retry
109
+ from toil.options.common import SYS_MAX_SIZE
115
110
  from toil.resource import Resource
116
111
 
117
112
  logger = logging.getLogger(__name__)
118
- retryable_kubernetes_errors: List[Union[Type[Exception], ErrorCondition]] = [
113
+ retryable_kubernetes_errors: list[Union[type[Exception], ErrorCondition]] = [
119
114
  urllib3.exceptions.MaxRetryError,
120
115
  urllib3.exceptions.ProtocolError,
121
- ApiException
116
+ ApiException,
122
117
  ]
123
118
 
124
119
 
@@ -132,8 +127,10 @@ def is_retryable_kubernetes_error(e: Exception) -> bool:
132
127
  return True
133
128
  return False
134
129
 
130
+
135
131
  # Represents a collection of label or taint keys and their sets of acceptable (or unacceptable) values.
136
- KeyValuesList = List[Tuple[str, List[str]]]
132
+ KeyValuesList = list[tuple[str, list[str]]]
133
+
137
134
 
138
135
  class KubernetesBatchSystem(BatchSystemCleanupSupport):
139
136
  @classmethod
@@ -150,8 +147,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
150
147
  core: NotRequired[CoreV1Api]
151
148
  customObjects: NotRequired[CustomObjectsApi]
152
149
 
153
-
154
- def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) -> None:
150
+ def __init__(
151
+ self, config: Config, maxCores: int, maxMemory: int, maxDisk: int
152
+ ) -> None:
155
153
  super().__init__(config, maxCores, maxMemory, maxDisk)
156
154
 
157
155
  # Re-type the config to make sure it has all the fields we need.
@@ -162,8 +160,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
162
160
  # Otherwise if we are at debug log level, we dump every
163
161
  # request/response to Kubernetes, including tokens which we shouldn't
164
162
  # reveal on CI.
165
- logging.getLogger('kubernetes').setLevel(logging.ERROR)
166
- logging.getLogger('requests_oauthlib').setLevel(logging.ERROR)
163
+ logging.getLogger("kubernetes").setLevel(logging.ERROR)
164
+ logging.getLogger("requests_oauthlib").setLevel(logging.ERROR)
167
165
 
168
166
  # This will hold the last time our Kubernetes credentials were refreshed
169
167
  self.credential_time: Optional[datetime.datetime] = None
@@ -171,7 +169,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
171
169
  self._apis: KubernetesBatchSystem._ApiStorageDict = {}
172
170
 
173
171
  # Get our namespace (and our Kubernetes credentials to make sure they exist)
174
- self.namespace: str = self._api('namespace')
172
+ self.namespace: str = self._api("namespace")
175
173
 
176
174
  # Decide if we are going to mount a Kubernetes host path as the Toil
177
175
  # work dir in the workers, for shared caching.
@@ -190,7 +188,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
190
188
  self.unique_id = uuid.uuid4()
191
189
 
192
190
  # Create a prefix for jobs, starting with our username
193
- self.job_prefix: str = f'{username}-toil-{self.unique_id}-'
191
+ self.job_prefix: str = f"{username}-toil-{self.unique_id}-"
194
192
  # Instead of letting Kubernetes assign unique job names, we assign our
195
193
  # own based on a numerical job ID. This functionality is managed by the
196
194
  # BatchSystemLocalSupport.
@@ -214,55 +212,61 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
214
212
  # Try and guess what Toil work dir the workers will use.
215
213
  # We need to be able to provision (possibly shared) space there.
216
214
  self.worker_work_dir: str = Toil.getToilWorkDir(config.workDir)
217
- if (config.workDir is None and
218
- os.getenv('TOIL_WORKDIR') is None and
219
- self.worker_work_dir == tempfile.gettempdir()):
215
+ if (
216
+ config.workDir is None
217
+ and os.getenv("TOIL_WORKDIR") is None
218
+ and self.worker_work_dir == tempfile.gettempdir()
219
+ ):
220
220
 
221
221
  # We defaulted to the system temp directory. But we think the
222
222
  # worker Dockerfiles will make them use /var/lib/toil instead.
223
223
  # TODO: Keep this in sync with the Dockerfile.
224
- self.worker_work_dir = '/var/lib/toil'
224
+ self.worker_work_dir = "/var/lib/toil"
225
225
 
226
226
  # A Toil-managed Kubernetes cluster will have most of its temp space at
227
227
  # /var/tmp, which is where really large temp files really belong
228
228
  # according to https://systemd.io/TEMPORARY_DIRECTORIES/. So we will
229
229
  # set the default temporary directory to there for all our jobs.
230
- self.environment['TMPDIR'] = '/var/tmp'
230
+ self.environment["TMPDIR"] = "/var/tmp"
231
231
 
232
232
  # Get the name of the AWS secret, if any, to mount in containers.
233
- self.aws_secret_name: Optional[str] = os.environ.get("TOIL_AWS_SECRET_NAME", None)
233
+ self.aws_secret_name: Optional[str] = os.environ.get(
234
+ "TOIL_AWS_SECRET_NAME", None
235
+ )
234
236
 
235
237
  # Set this to True to enable the experimental wait-for-job-update code
236
238
  self.enable_watching: bool = os.environ.get("KUBE_WATCH_ENABLED", False)
237
239
 
238
240
  # This will be a label to select all our jobs.
239
- self.run_id: str = f'toil-{self.unique_id}'
241
+ self.run_id: str = f"toil-{self.unique_id}"
240
242
 
241
243
  # Keep track of available resources.
242
- maxMillicores = int(SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000)
243
- self.resource_sources: List[ResourcePool] = [
244
+ maxMillicores = int(
245
+ SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000
246
+ )
247
+ self.resource_sources: list[ResourcePool] = [
244
248
  # A pool representing available job slots
245
- ResourcePool(self.config.max_jobs, 'job slots'),
249
+ ResourcePool(self.config.max_jobs, "job slots"),
246
250
  # A pool representing available CPU in units of millicores (1 CPU
247
251
  # unit = 1000 millicores)
248
- ResourcePool(maxMillicores, 'cores'),
252
+ ResourcePool(maxMillicores, "cores"),
249
253
  # A pool representing available memory in bytes
250
- ResourcePool(self.maxMemory, 'memory'),
254
+ ResourcePool(self.maxMemory, "memory"),
251
255
  # A pool representing the available space in bytes
252
- ResourcePool(self.maxDisk, 'disk'),
256
+ ResourcePool(self.maxDisk, "disk"),
253
257
  ]
254
258
 
255
259
  # A set of job IDs that are queued (useful for getIssuedBatchJobIDs())
256
- self._queued_job_ids: Set[int] = set()
260
+ self._queued_job_ids: set[int] = set()
257
261
 
258
262
  # Keep track of the acquired resources for each job
259
- self._acquired_resources: Dict[str, List[int]] = {}
263
+ self._acquired_resources: dict[str, list[int]] = {}
260
264
 
261
265
  # Queue for jobs to be submitted to the Kubernetes cluster
262
- self._jobs_queue: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue()
266
+ self._jobs_queue: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue()
263
267
 
264
268
  # A set of job IDs that should be killed
265
- self._killed_queue_jobs: Set[int] = set()
269
+ self._killed_queue_jobs: set[int] = set()
266
270
 
267
271
  # We use this event to signal shutdown
268
272
  self._shutting_down: Event = Event()
@@ -286,7 +290,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
286
290
  """
287
291
 
288
292
  if not kubernetes_object:
289
- return 'None'
293
+ return "None"
290
294
 
291
295
  # We need a Kubernetes widget that knows how to translate
292
296
  # its data structures to nice YAML-able dicts. See:
@@ -296,7 +300,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
296
300
  # Convert to a dict
297
301
  root_dict = api_client.sanitize_for_serialization(kubernetes_object)
298
302
 
299
- def drop_boring(here: Dict[str, Any]) -> None:
303
+ def drop_boring(here: dict[str, Any]) -> None:
300
304
  """
301
305
  Drop boring fields recursively.
302
306
  """
@@ -304,7 +308,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
304
308
  for k, v in here.items():
305
309
  if isinstance(v, dict):
306
310
  drop_boring(v)
307
- if k in ['managedFields']:
311
+ if k in ["managedFields"]:
308
312
  boring_keys.append(k)
309
313
  for k in boring_keys:
310
314
  del here[k]
@@ -314,33 +318,43 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
314
318
 
315
319
  @overload
316
320
  def _api(
317
- self, kind: Literal['batch'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
318
- ) -> BatchV1Api:
319
- ...
321
+ self,
322
+ kind: Literal["batch"],
323
+ max_age_seconds: float = 5 * 60,
324
+ errors: Optional[list[int]] = None,
325
+ ) -> BatchV1Api: ...
320
326
 
321
327
  @overload
322
328
  def _api(
323
- self, kind: Literal['core'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
324
- ) -> CoreV1Api:
325
- ...
329
+ self,
330
+ kind: Literal["core"],
331
+ max_age_seconds: float = 5 * 60,
332
+ errors: Optional[list[int]] = None,
333
+ ) -> CoreV1Api: ...
326
334
 
327
335
  @overload
328
336
  def _api(
329
- self, kind: Literal['customObjects'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
330
- ) -> CustomObjectsApi:
331
- ...
337
+ self,
338
+ kind: Literal["customObjects"],
339
+ max_age_seconds: float = 5 * 60,
340
+ errors: Optional[list[int]] = None,
341
+ ) -> CustomObjectsApi: ...
332
342
 
333
343
  @overload
334
344
  def _api(
335
- self, kind: Literal['namespace'], max_age_seconds: float = 5 * 60
336
- ) -> str:
337
- ...
345
+ self, kind: Literal["namespace"], max_age_seconds: float = 5 * 60
346
+ ) -> str: ...
338
347
 
339
348
  def _api(
340
349
  self,
341
- kind: Union[Literal['batch'], Literal['core'], Literal['customObjects'], Literal['namespace']],
350
+ kind: Union[
351
+ Literal["batch"],
352
+ Literal["core"],
353
+ Literal["customObjects"],
354
+ Literal["namespace"],
355
+ ],
342
356
  max_age_seconds: float = 5 * 60,
343
- errors: Optional[List[int]] = None
357
+ errors: Optional[list[int]] = None,
344
358
  ) -> Union[BatchV1Api, CoreV1Api, CustomObjectsApi, str]:
345
359
  """
346
360
  The Kubernetes module isn't clever enough to renew its credentials when
@@ -373,44 +387,53 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
373
387
 
374
388
  now = utc_now()
375
389
 
376
- if self.credential_time is None or (now - self.credential_time).total_seconds() > max_age_seconds:
390
+ if (
391
+ self.credential_time is None
392
+ or (now - self.credential_time).total_seconds() > max_age_seconds
393
+ ):
377
394
  # Credentials need a refresh
378
395
  try:
379
396
  # Load ~/.kube/config or KUBECONFIG
380
397
  load_kube_config()
381
398
  # Worked. We're using kube config
382
- config_source = 'kube'
399
+ config_source = "kube"
383
400
  except ConfigException:
384
401
  # Didn't work. Try pod-based credentials in case we are in a pod.
385
402
  try:
386
403
  load_incluster_config()
387
404
  # Worked. We're using in_cluster config
388
- config_source = 'in_cluster'
405
+ config_source = "in_cluster"
389
406
  except ConfigException:
390
- raise RuntimeError('Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod.')
407
+ raise RuntimeError(
408
+ "Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod."
409
+ )
391
410
 
392
411
  # Now fill in the API objects with these credentials
393
- self._apis['batch'] = BatchV1Api()
394
- self._apis['core'] = CoreV1Api()
395
- self._apis['customObjects'] = CustomObjectsApi()
412
+ self._apis["batch"] = BatchV1Api()
413
+ self._apis["core"] = CoreV1Api()
414
+ self._apis["customObjects"] = CustomObjectsApi()
396
415
 
397
416
  # And save the time
398
417
  self.credential_time = now
399
- if kind == 'namespace':
418
+ if kind == "namespace":
400
419
  # We just need the namespace string
401
- if config_source == 'in_cluster':
420
+ if config_source == "in_cluster":
402
421
  # Our namespace comes from a particular file.
403
- with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as fh:
422
+ with open(
423
+ "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
424
+ ) as fh:
404
425
  return fh.read().strip()
405
426
  else:
406
427
  # Find all contexts and the active context.
407
428
  # The active context gets us our namespace.
408
429
  contexts, activeContext = list_kube_config_contexts()
409
430
  if not contexts:
410
- raise RuntimeError("No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG")
431
+ raise RuntimeError(
432
+ "No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG"
433
+ )
411
434
 
412
435
  # Identify the namespace to work in
413
- namespace = activeContext.get('context', {}).get('namespace', 'default')
436
+ namespace = activeContext.get("context", {}).get("namespace", "default")
414
437
  assert isinstance(namespace, str)
415
438
  return namespace
416
439
 
@@ -430,11 +453,13 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
430
453
  ErrorCondition(
431
454
  error=ApiException,
432
455
  error_codes=errors,
433
- retry_on_this_condition=False
456
+ retry_on_this_condition=False,
434
457
  )
435
458
  )
436
459
  decorator = retry(errors=error_list)
437
- wrapper = KubernetesBatchSystem.DecoratorWrapper(api_object, decorator)
460
+ wrapper = KubernetesBatchSystem.DecoratorWrapper(
461
+ api_object, decorator
462
+ )
438
463
  return cast(Union[BatchV1Api, CoreV1Api, CustomObjectsApi], wrapper)
439
464
  except KeyError:
440
465
  raise RuntimeError(f"Unknown Kubernetes API type: {kind}")
@@ -445,7 +470,12 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
445
470
  """
446
471
 
447
472
  P = ParamSpec("P")
448
- def __init__(self, to_wrap: Any, decorator: Callable[[Callable[P, Any]], Callable[P, Any]]) -> None:
473
+
474
+ def __init__(
475
+ self,
476
+ to_wrap: Any,
477
+ decorator: Callable[[Callable[P, Any]], Callable[P, Any]],
478
+ ) -> None:
449
479
  """
450
480
  Make a wrapper around the given object.
451
481
  When methods on the object are called, they will be called through
@@ -469,16 +499,19 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
469
499
  return attr
470
500
 
471
501
  ItemT = TypeVar("ItemT")
502
+
472
503
  class _ItemsHaver(Protocol[ItemT]):
473
504
  """
474
505
  Anything that has a .items that is a list of something.
475
506
  """
507
+
476
508
  # KubernetesBatchSystem isn't defined until the class executes, so any
477
509
  # up-references to types from there that are in signatures (and not
478
510
  # method code) need to be quoted
479
- items: List["KubernetesBatchSystem.ItemT"]
511
+ items: list["KubernetesBatchSystem.ItemT"]
480
512
 
481
513
  CovItemT = TypeVar("CovItemT", covariant=True)
514
+
482
515
  class _WatchEvent(Protocol[CovItemT]):
483
516
  """
484
517
  An event from a Kubernetes watch stream.
@@ -490,23 +523,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
490
523
  # __getitem__ instead.
491
524
 
492
525
  @overload
493
- def __getitem__(self, name: Literal['type']) -> str:
494
- ...
526
+ def __getitem__(self, name: Literal["type"]) -> str: ...
495
527
 
496
528
  @overload
497
- def __getitem__(self, name: Literal['object']) -> "KubernetesBatchSystem.CovItemT":
498
- ...
529
+ def __getitem__(
530
+ self, name: Literal["object"]
531
+ ) -> "KubernetesBatchSystem.CovItemT": ...
499
532
 
500
533
  @overload
501
- def __getitem__(self, name: Literal['raw_object']) -> Dict[str, Any]:
502
- ...
534
+ def __getitem__(self, name: Literal["raw_object"]) -> dict[str, Any]: ...
503
535
 
504
- def __getitem__(self, name: Union[Literal['type'], Literal['object'], Literal['raw_object']]) -> Any:
505
- ...
536
+ def __getitem__(
537
+ self, name: Union[Literal["type"], Literal["object"], Literal["raw_object"]]
538
+ ) -> Any: ...
506
539
 
507
540
  P = ParamSpec("P")
508
541
  R = TypeVar("R")
509
- def _stream_until_error(self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs) -> Iterator[_WatchEvent[R]]:
542
+
543
+ def _stream_until_error(
544
+ self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs
545
+ ) -> Iterator[_WatchEvent[R]]:
510
546
  """
511
547
  Kubernetes kubernetes.watch.Watch().stream() streams can fail and raise
512
548
  errors. We don't want to have those errors fail the entire workflow, so
@@ -572,7 +608,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
572
608
 
573
609
  # Loop through all jobs inside the queue and see if any of them
574
610
  # could be launched.
575
- jobs: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue()
611
+ jobs: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue()
576
612
  while True:
577
613
  try:
578
614
  job = self._jobs_queue.get_nowait()
@@ -584,7 +620,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
584
620
  logger.debug(f"Skipping killed job {job_id}")
585
621
  continue
586
622
 
587
- job_name = f'{self.job_prefix}{job_id}'
623
+ job_name = f"{self.job_prefix}{job_id}"
588
624
  result = self._launch_job(job_name, job_desc, spec)
589
625
  if result is False:
590
626
  # Not enough resources to launch this job.
@@ -605,7 +641,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
605
641
  logger.debug(f"Roughly {self._jobs_queue.qsize} jobs in the queue")
606
642
 
607
643
  def setUserScript(self, userScript: Resource) -> None:
608
- logger.info(f'Setting user script for deployment: {userScript}')
644
+ logger.info(f"Setting user script for deployment: {userScript}")
609
645
  self.user_script = userScript
610
646
 
611
647
  # setEnv is provided by BatchSystemSupport, updates self.environment
@@ -657,18 +693,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
657
693
  # Amazon just uses a label, while Google
658
694
  # <https://cloud.google.com/kubernetes-engine/docs/how-to/preemptible-vms>
659
695
  # uses a label and a taint.
660
- PREEMPTIBLE_SCHEMES = {'labels': [('eks.amazonaws.com/capacityType', ['SPOT']),
661
- ('cloud.google.com/gke-preemptible', ['true'])],
662
- 'taints': [('cloud.google.com/gke-preemptible', ['true'])]}
696
+ PREEMPTIBLE_SCHEMES = {
697
+ "labels": [
698
+ ("eks.amazonaws.com/capacityType", ["SPOT"]),
699
+ ("cloud.google.com/gke-preemptible", ["true"]),
700
+ ],
701
+ "taints": [("cloud.google.com/gke-preemptible", ["true"])],
702
+ }
663
703
 
664
704
  if preemptible:
665
705
  # We want to seek preemptible labels and tolerate preemptible taints.
666
- self.desired_labels += PREEMPTIBLE_SCHEMES['labels']
667
- self.tolerated_taints += PREEMPTIBLE_SCHEMES['taints']
706
+ self.desired_labels += PREEMPTIBLE_SCHEMES["labels"]
707
+ self.tolerated_taints += PREEMPTIBLE_SCHEMES["taints"]
668
708
  else:
669
709
  # We want to prohibit preemptible labels
670
- self.prohibited_labels += PREEMPTIBLE_SCHEMES['labels']
671
-
710
+ self.prohibited_labels += PREEMPTIBLE_SCHEMES["labels"]
672
711
 
673
712
  def apply(self, pod_spec: V1PodSpec) -> None:
674
713
  """
@@ -679,29 +718,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
679
718
  # Convert our collections to Kubernetes expressions.
680
719
 
681
720
  # REQUIRE that ALL of these requirements be satisfied
682
- required_selector_requirements: List[V1NodeSelectorRequirement] = []
721
+ required_selector_requirements: list[V1NodeSelectorRequirement] = []
683
722
  # PREFER that EACH of these terms be satisfied
684
- preferred_scheduling_terms: List[V1PreferredSchedulingTerm] = []
723
+ preferred_scheduling_terms: list[V1PreferredSchedulingTerm] = []
685
724
  # And this list of tolerations to apply
686
- tolerations: List[V1Toleration] = []
725
+ tolerations: list[V1Toleration] = []
687
726
 
688
727
  for label, values in self.required_labels:
689
728
  # Collect requirements for the required labels
690
- has_label = V1NodeSelectorRequirement(key=label,
691
- operator='In',
692
- values=values)
729
+ has_label = V1NodeSelectorRequirement(
730
+ key=label, operator="In", values=values
731
+ )
693
732
  required_selector_requirements.append(has_label)
694
733
  for label, values in self.desired_labels:
695
734
  # Collect preferences for the preferred labels
696
- has_label = V1NodeSelectorRequirement(key=label,
697
- operator='In',
698
- values=values)
699
- term = V1NodeSelectorTerm(
700
- match_expressions=[has_label]
735
+ has_label = V1NodeSelectorRequirement(
736
+ key=label, operator="In", values=values
701
737
  )
738
+ term = V1NodeSelectorTerm(match_expressions=[has_label])
702
739
  # Each becomes a separate preference, more is better.
703
- preference = V1PreferredSchedulingTerm(weight=1,
704
- preference=term)
740
+ preference = V1PreferredSchedulingTerm(weight=1, preference=term)
705
741
 
706
742
  preferred_scheduling_terms.append(preference)
707
743
  for label, values in self.prohibited_labels:
@@ -712,15 +748,14 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
712
748
  # <https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement>
713
749
  # So we create a NotIn for each label and AND them
714
750
  # all together.
715
- not_labeled = V1NodeSelectorRequirement(key=label,
716
- operator='NotIn',
717
- values=values)
751
+ not_labeled = V1NodeSelectorRequirement(
752
+ key=label, operator="NotIn", values=values
753
+ )
718
754
  required_selector_requirements.append(not_labeled)
719
755
  for taint, values in self.tolerated_taints:
720
756
  for value in values:
721
757
  # Each toleration can tolerate one value
722
- taint_ok = V1Toleration(key=taint,
723
- value=value)
758
+ taint_ok = V1Toleration(key=taint, value=value)
724
759
  tolerations.append(taint_ok)
725
760
 
726
761
  # Now combine everything
@@ -734,16 +769,22 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
734
769
  match_expressions=required_selector_requirements
735
770
  )
736
771
  # And a selector to hold the term
737
- requirements_selector = V1NodeSelector(node_selector_terms=[requirements_term])
772
+ requirements_selector = V1NodeSelector(
773
+ node_selector_terms=[requirements_term]
774
+ )
738
775
 
739
776
  # Make an affinity that prefers the preferences and requires the requirements
740
777
  node_affinity = V1NodeAffinity(
741
- preferred_during_scheduling_ignored_during_execution=preferred_scheduling_terms if preferred_scheduling_terms else None,
742
- required_during_scheduling_ignored_during_execution=requirements_selector
778
+ preferred_during_scheduling_ignored_during_execution=(
779
+ preferred_scheduling_terms
780
+ if preferred_scheduling_terms
781
+ else None
782
+ ),
783
+ required_during_scheduling_ignored_during_execution=requirements_selector,
743
784
  )
744
785
 
745
786
  # Apply the affinity
746
- pod_spec.affinity = V1Affinity(node_affinity = node_affinity)
787
+ pod_spec.affinity = V1Affinity(node_affinity=node_affinity)
747
788
 
748
789
  if tolerations:
749
790
  # Apply the tolerations
@@ -751,18 +792,22 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
751
792
 
752
793
  def _check_accelerator_request(self, requirer: Requirer) -> None:
753
794
  for accelerator in requirer.accelerators:
754
- if accelerator['kind'] != 'gpu' and 'model' not in accelerator:
795
+ if accelerator["kind"] != "gpu" and "model" not in accelerator:
755
796
  # We can only provide GPUs or things with a model right now
756
- raise InsufficientSystemResources(requirer, 'accelerators', details=[
757
- f'The accelerator {accelerator} could not be provided.',
758
- 'The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.'
759
- ])
797
+ raise InsufficientSystemResources(
798
+ requirer,
799
+ "accelerators",
800
+ details=[
801
+ f"The accelerator {accelerator} could not be provided.",
802
+ "The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.",
803
+ ],
804
+ )
760
805
 
761
806
  def _create_pod_spec(
762
- self,
763
- command: str,
764
- job_desc: JobDescription,
765
- job_environment: Optional[Dict[str, str]] = None
807
+ self,
808
+ command: str,
809
+ job_desc: JobDescription,
810
+ job_environment: Optional[dict[str, str]] = None,
766
811
  ) -> V1PodSpec:
767
812
  """
768
813
  Make the specification for a pod that can execute the given job.
@@ -789,9 +834,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
789
834
  # OOMing. We also want to provision some extra space so that when
790
835
  # we test _isPodStuckOOM we never get True unless the job has
791
836
  # exceeded job_desc.memory.
792
- requirements_dict = {'cpu': job_desc.cores,
793
- 'memory': job_desc.memory + 1024 * 1024 * 512,
794
- 'ephemeral-storage': job_desc.disk + 1024 * 1024 * 512}
837
+ requirements_dict = {
838
+ "cpu": job_desc.cores,
839
+ "memory": job_desc.memory + 1024 * 1024 * 512,
840
+ "ephemeral-storage": job_desc.disk + 1024 * 1024 * 512,
841
+ }
795
842
 
796
843
  # Also start on the placement constraints
797
844
  placement = KubernetesBatchSystem.Placement()
@@ -801,19 +848,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
801
848
  # Add in requirements for accelerators (GPUs).
802
849
  # See https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
803
850
 
804
- if accelerator['kind'] == 'gpu':
851
+ if accelerator["kind"] == "gpu":
805
852
  # We can't schedule GPUs without a brand, because the
806
853
  # Kubernetes resources are <brand>.com/gpu. If no brand is
807
854
  # specified, default to nvidia, which is very popular.
808
- vendor = accelerator.get('brand', 'nvidia')
855
+ vendor = accelerator.get("brand", "nvidia")
809
856
  key = f'{vendor}.com/{accelerator["kind"]}'
810
857
  if key not in requirements_dict:
811
858
  requirements_dict[key] = 0
812
- requirements_dict[key] += accelerator['count']
859
+ requirements_dict[key] += accelerator["count"]
813
860
 
814
- if 'model' in accelerator:
861
+ if "model" in accelerator:
815
862
  # TODO: What if the cluster uses some other accelerator model labeling scheme?
816
- placement.required_labels.append(('accelerator', [accelerator['model']]))
863
+ placement.required_labels.append(
864
+ ("accelerator", [accelerator["model"]])
865
+ )
817
866
 
818
867
  # TODO: Support AMD's labeling scheme: https://github.com/RadeonOpenCompute/k8s-device-plugin/tree/master/cmd/k8s-node-labeller
819
868
  # That just has each trait of the accelerator as a separate label, but nothing that quite corresponds to a model.
@@ -825,14 +874,15 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
825
874
  # the UCSC Kubernetes admins want it that way. For GPUs, Kubernetes
826
875
  # requires them to be equal.
827
876
  limits_dict = requests_dict
828
- resources = V1ResourceRequirements(limits=limits_dict,
829
- requests=requests_dict)
877
+ resources = V1ResourceRequirements(limits=limits_dict, requests=requests_dict)
830
878
 
831
879
  # Collect volumes and mounts
832
880
  volumes = []
833
881
  mounts = []
834
882
 
835
- def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: bool = False) -> None:
883
+ def mount_host_path(
884
+ volume_name: str, host_path: str, mount_path: str, create: bool = False
885
+ ) -> None:
836
886
  """
837
887
  Add a host path volume with the given name to mount the given path.
838
888
 
@@ -840,10 +890,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
840
890
  not exist. Otherwise, when the directory does not exist, the
841
891
  pod will wait for it to come into existence.
842
892
  """
843
- volume_type = 'DirectoryOrCreate' if create else 'Directory'
893
+ volume_type = "DirectoryOrCreate" if create else "Directory"
844
894
  volume_source = V1HostPathVolumeSource(path=host_path, type=volume_type)
845
- volume = V1Volume(name=volume_name,
846
- host_path=volume_source)
895
+ volume = V1Volume(name=volume_name, host_path=volume_source)
847
896
  volumes.append(volume)
848
897
  volume_mount = V1VolumeMount(mount_path=mount_path, name=volume_name)
849
898
  mounts.append(volume_mount)
@@ -851,49 +900,63 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
851
900
  if self.host_path is not None:
852
901
  # Provision Toil WorkDir from a HostPath volume, to share with other pods.
853
902
  # Create the directory if it doesn't exist already.
854
- mount_host_path('workdir', self.host_path, self.worker_work_dir, create=True)
903
+ mount_host_path(
904
+ "workdir", self.host_path, self.worker_work_dir, create=True
905
+ )
855
906
  # We also need to mount across /run/lock, where we will put
856
907
  # per-node coordination info.
857
908
  # Don't create this; it really should always exist.
858
- mount_host_path('coordination', '/run/lock', '/run/lock')
909
+ mount_host_path("coordination", "/run/lock", "/run/lock")
859
910
  else:
860
911
  # Provision Toil WorkDir as an ephemeral volume
861
- ephemeral_volume_name = 'workdir'
912
+ ephemeral_volume_name = "workdir"
862
913
  ephemeral_volume_source = V1EmptyDirVolumeSource()
863
- ephemeral_volume = V1Volume(name=ephemeral_volume_name,
864
- empty_dir=ephemeral_volume_source)
914
+ ephemeral_volume = V1Volume(
915
+ name=ephemeral_volume_name, empty_dir=ephemeral_volume_source
916
+ )
865
917
  volumes.append(ephemeral_volume)
866
- ephemeral_volume_mount = V1VolumeMount(mount_path=self.worker_work_dir, name=ephemeral_volume_name)
918
+ ephemeral_volume_mount = V1VolumeMount(
919
+ mount_path=self.worker_work_dir, name=ephemeral_volume_name
920
+ )
867
921
  mounts.append(ephemeral_volume_mount)
868
922
  # And don't share coordination directory
869
923
 
870
924
  if self.aws_secret_name is not None:
871
925
  # Also mount an AWS secret, if provided.
872
926
  # TODO: make this generic somehow
873
- secret_volume_name = 's3-credentials'
874
- secret_volume_source = V1SecretVolumeSource(secret_name=self.aws_secret_name)
875
- secret_volume = V1Volume(name=secret_volume_name,
876
- secret=secret_volume_source)
927
+ secret_volume_name = "s3-credentials"
928
+ secret_volume_source = V1SecretVolumeSource(
929
+ secret_name=self.aws_secret_name
930
+ )
931
+ secret_volume = V1Volume(
932
+ name=secret_volume_name, secret=secret_volume_source
933
+ )
877
934
  volumes.append(secret_volume)
878
- secret_volume_mount = V1VolumeMount(mount_path='/root/.aws', name=secret_volume_name)
935
+ secret_volume_mount = V1VolumeMount(
936
+ mount_path="/root/.aws", name=secret_volume_name
937
+ )
879
938
  mounts.append(secret_volume_mount)
880
939
 
881
940
  # Make a container definition
882
- container = V1Container(command=command_list,
883
- image=self.docker_image,
884
- name="runner-container",
885
- resources=resources,
886
- volume_mounts=mounts)
941
+ container = V1Container(
942
+ command=command_list,
943
+ image=self.docker_image,
944
+ name="runner-container",
945
+ resources=resources,
946
+ volume_mounts=mounts,
947
+ )
887
948
 
888
949
  # In case security context rules are not allowed to be set, we only apply
889
950
  # a security context at all if we need to turn on privileged mode.
890
951
  if self.config.kubernetes_privileged:
891
- container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
952
+ container.security_context = V1SecurityContext(
953
+ privileged=self.config.kubernetes_privileged
954
+ )
892
955
 
893
956
  # Wrap the container in a spec
894
- pod_spec = V1PodSpec(containers=[container],
895
- volumes=volumes,
896
- restart_policy="Never")
957
+ pod_spec = V1PodSpec(
958
+ containers=[container], volumes=volumes, restart_policy="Never"
959
+ )
897
960
  # Tell the spec where to land
898
961
  placement.apply(pod_spec)
899
962
 
@@ -903,7 +966,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
903
966
 
904
967
  return pod_spec
905
968
 
906
- def _release_acquired_resources(self, resources: List[int], notify: bool = False) -> None:
969
+ def _release_acquired_resources(
970
+ self, resources: list[int], notify: bool = False
971
+ ) -> None:
907
972
  """
908
973
  Release all resources acquired for a job.
909
974
 
@@ -922,10 +987,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
922
987
  self._work_available.notify_all()
923
988
 
924
989
  def _launch_job(
925
- self,
926
- job_name: str,
927
- job_desc: JobDescription,
928
- pod_spec: V1PodSpec
990
+ self, job_name: str, job_desc: JobDescription, pod_spec: V1PodSpec
929
991
  ) -> bool:
930
992
  """
931
993
  Try to launch the given job to the Kubernetes cluster. Return False if
@@ -933,19 +995,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
933
995
  """
934
996
 
935
997
  # Limit the amount of resources requested at a time.
936
- resource_requests: List[int] = [1, int(job_desc.cores * 1000), job_desc.memory, job_desc.disk]
998
+ resource_requests: list[int] = [
999
+ 1,
1000
+ int(job_desc.cores * 1000),
1001
+ job_desc.memory,
1002
+ job_desc.disk,
1003
+ ]
937
1004
 
938
1005
  acquired = []
939
1006
  for source, request in zip(self.resource_sources, resource_requests):
940
1007
  # For each kind of resource we want, go get it
941
- assert ((isinstance(source, ResourcePool) and isinstance(request, int)))
1008
+ assert isinstance(source, ResourcePool) and isinstance(request, int)
942
1009
  if source.acquireNow(request):
943
1010
  acquired.append(request)
944
1011
  else:
945
1012
  # We can't get everything
946
- self._release_acquired_resources(acquired,
1013
+ self._release_acquired_resources(
1014
+ acquired,
947
1015
  # Put it back quietly.
948
- notify=False)
1016
+ notify=False,
1017
+ )
949
1018
  return False
950
1019
 
951
1020
  self._acquired_resources[job_name] = acquired
@@ -954,9 +1023,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
954
1023
 
955
1024
  # Make metadata to label the job/pod with info.
956
1025
  # Don't let the cluster autoscaler evict any Toil jobs.
957
- metadata = V1ObjectMeta(name=job_name,
958
- labels={"toil_run": self.run_id},
959
- annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"})
1026
+ metadata = V1ObjectMeta(
1027
+ name=job_name,
1028
+ labels={"toil_run": self.run_id},
1029
+ annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"},
1030
+ )
960
1031
 
961
1032
  # Wrap the spec in a template
962
1033
  template = V1PodTemplateSpec(spec=pod_spec, metadata=metadata)
@@ -964,18 +1035,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
964
1035
  # Make another spec for the job, asking to run the template with no
965
1036
  # backoff/retry. Specify our own TTL to avoid catching the notice
966
1037
  # of over-zealous abandoned job cleanup scripts.
967
- job_spec = V1JobSpec(template=template,
968
- backoff_limit=0,
969
- ttl_seconds_after_finished=self.finished_job_ttl)
1038
+ job_spec = V1JobSpec(
1039
+ template=template,
1040
+ backoff_limit=0,
1041
+ ttl_seconds_after_finished=self.finished_job_ttl,
1042
+ )
970
1043
 
971
1044
  # And make the actual job
972
- job = V1Job(spec=job_spec,
973
- metadata=metadata,
974
- api_version="batch/v1",
975
- kind="Job")
1045
+ job = V1Job(
1046
+ spec=job_spec, metadata=metadata, api_version="batch/v1", kind="Job"
1047
+ )
976
1048
 
977
1049
  # Launch the job
978
- launched = self._api('batch', errors=[]).create_namespaced_job(self.namespace, job)
1050
+ launched = self._api("batch", errors=[]).create_namespaced_job(
1051
+ self.namespace, job
1052
+ )
979
1053
 
980
1054
  logger.debug(f"Launched job: {job_name}")
981
1055
 
@@ -983,10 +1057,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
983
1057
 
984
1058
  def _delete_job(
985
1059
  self,
986
- job_name: str, *,
1060
+ job_name: str,
1061
+ *,
987
1062
  propagation_policy: Literal["Foreground", "Background"] = "Foreground",
988
1063
  gone_ok: bool = False,
989
- resource_notify: bool = True
1064
+ resource_notify: bool = True,
990
1065
  ) -> None:
991
1066
  """
992
1067
  Given the name of a kubernetes job, delete the job and release all
@@ -999,11 +1074,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
999
1074
  the self._work_available condition.
1000
1075
  """
1001
1076
  try:
1002
- logger.debug(f'Deleting Kubernetes job {job_name}')
1003
- self._api('batch', errors=[404] if gone_ok else []).delete_namespaced_job(
1004
- job_name,
1005
- self.namespace,
1006
- propagation_policy=propagation_policy
1077
+ logger.debug(f"Deleting Kubernetes job {job_name}")
1078
+ self._api("batch", errors=[404] if gone_ok else []).delete_namespaced_job(
1079
+ job_name, self.namespace, propagation_policy=propagation_policy
1007
1080
  )
1008
1081
  finally:
1009
1082
  # We should always release the acquired resources.
@@ -1014,7 +1087,12 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1014
1087
  self._release_acquired_resources(resources, notify=resource_notify)
1015
1088
  del self._acquired_resources[job_name]
1016
1089
 
1017
- def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
1090
+ def issueBatchJob(
1091
+ self,
1092
+ command: str,
1093
+ job_desc: JobDescription,
1094
+ job_environment: Optional[dict[str, str]] = None,
1095
+ ) -> int:
1018
1096
  # Try the job as local
1019
1097
  localID = self.handleLocalJob(command, job_desc)
1020
1098
  if localID is not None:
@@ -1027,7 +1105,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1027
1105
  self.check_resource_request(job_desc)
1028
1106
 
1029
1107
  # Make a pod that describes running the job
1030
- pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
1108
+ pod_spec = self._create_pod_spec(
1109
+ command, job_desc, job_environment=job_environment
1110
+ )
1031
1111
 
1032
1112
  # Make a batch system scope job ID
1033
1113
  job_id = self.getNextJobID()
@@ -1055,6 +1135,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1055
1135
  kwargs, so we can't just set unused ones to None. But we also don't
1056
1136
  want to duplicate code for every combination of possible present keys.
1057
1137
  """
1138
+
1058
1139
  _continue: NotRequired[str]
1059
1140
  label_selector: NotRequired[str]
1060
1141
  field_selector: NotRequired[str]
@@ -1084,30 +1165,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1084
1165
  token = None
1085
1166
 
1086
1167
  while True:
1087
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"}
1168
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1169
+ "label_selector": f"toil_run={self.run_id}"
1170
+ }
1088
1171
 
1089
1172
  if onlySucceeded:
1090
- kwargs['field_selector'] = "status.successful==1"
1173
+ kwargs["field_selector"] = "status.successful==1"
1091
1174
 
1092
1175
  if token is not None:
1093
- kwargs['_continue'] = token
1176
+ kwargs["_continue"] = token
1094
1177
 
1095
- results = self._api('batch', errors=[]).list_namespaced_job(
1096
- self.namespace,
1097
- **kwargs
1178
+ results = self._api("batch", errors=[]).list_namespaced_job(
1179
+ self.namespace, **kwargs
1098
1180
  )
1099
-
1181
+
1100
1182
  # These jobs belong to us
1101
1183
  yield from (j for j in results.items if not self._is_deleted(j))
1102
1184
 
1103
1185
  # Remember the continuation token, if any
1104
- token = getattr(results.metadata, 'continue', None)
1186
+ token = getattr(results.metadata, "continue", None)
1105
1187
 
1106
1188
  if token is None:
1107
1189
  # There isn't one. We got everything.
1108
1190
  break
1109
1191
 
1110
-
1111
1192
  def _ourPodObject(self) -> Iterator[V1Pod]:
1112
1193
  """
1113
1194
  Yield Kubernetes V1Pod objects that we are responsible for that the
@@ -1117,25 +1198,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1117
1198
  token = None
1118
1199
 
1119
1200
  while True:
1120
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"}
1201
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1202
+ "label_selector": f"toil_run={self.run_id}"
1203
+ }
1121
1204
 
1122
1205
  if token is not None:
1123
- kwargs['_continue'] = token
1206
+ kwargs["_continue"] = token
1124
1207
 
1125
- results = self._api('core', errors=[]).list_namespaced_pod(
1126
- self.namespace,
1127
- **kwargs
1208
+ results = self._api("core", errors=[]).list_namespaced_pod(
1209
+ self.namespace, **kwargs
1128
1210
  )
1129
1211
 
1130
1212
  yield from (j for j in results.items if not self._is_deleted(j))
1131
1213
  # Remember the continuation token, if any
1132
- token = getattr(results.metadata, 'continue', None)
1214
+ token = getattr(results.metadata, "continue", None)
1133
1215
 
1134
1216
  if token is None:
1135
1217
  # There isn't one. We got everything.
1136
1218
  break
1137
1219
 
1138
-
1139
1220
  def _getPodForJob(self, jobObject: V1Job) -> Optional[V1Pod]:
1140
1221
  """
1141
1222
  Get the pod that belongs to the given job, or None if the job's pod is
@@ -1149,22 +1230,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1149
1230
  """
1150
1231
 
1151
1232
  # Make sure the job has the fields we need
1152
- assert(jobObject.metadata is not None)
1233
+ assert jobObject.metadata is not None
1153
1234
 
1154
1235
  token = None
1155
1236
 
1156
1237
  while True:
1157
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f'job-name={jobObject.metadata.name}'}
1238
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1239
+ "label_selector": f"job-name={jobObject.metadata.name}"
1240
+ }
1158
1241
  if token is not None:
1159
- kwargs['_continue'] = token
1160
- results = self._api('core', errors=[]).list_namespaced_pod(self.namespace, **kwargs)
1242
+ kwargs["_continue"] = token
1243
+ results = self._api("core", errors=[]).list_namespaced_pod(
1244
+ self.namespace, **kwargs
1245
+ )
1161
1246
 
1162
1247
  for pod in results.items:
1163
1248
  # Return the first pod we find
1164
1249
  return pod
1165
1250
 
1166
1251
  # Remember the continuation token, if any
1167
- token = getattr(results.metadata, 'continue', None)
1252
+ token = getattr(results.metadata, "continue", None)
1168
1253
 
1169
1254
  if token is None:
1170
1255
  # There isn't one. We got everything.
@@ -1188,12 +1273,13 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1188
1273
  assert podObject.metadata is not None
1189
1274
  assert podObject.metadata.name is not None
1190
1275
 
1191
- return self._api('core', errors=[]).read_namespaced_pod_log(
1192
- podObject.metadata.name,
1193
- namespace=self.namespace
1276
+ return self._api("core", errors=[]).read_namespaced_pod_log(
1277
+ podObject.metadata.name, namespace=self.namespace
1194
1278
  )
1195
1279
 
1196
- def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2) -> bool:
1280
+ def _isPodStuckOOM(
1281
+ self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2
1282
+ ) -> bool:
1197
1283
  """
1198
1284
  Poll the current memory usage for the pod from the cluster.
1199
1285
 
@@ -1223,14 +1309,18 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1223
1309
  assert podObject.metadata.name is not None
1224
1310
 
1225
1311
  # Compose a query to get just the pod we care about
1226
- query = f'metadata.name={podObject.metadata.name}'
1312
+ query = f"metadata.name={podObject.metadata.name}"
1227
1313
 
1228
1314
  # Look for it, but manage our own exceptions
1229
1315
  try:
1230
1316
  # TODO: When the Kubernetes Python API actually wraps the metrics API, switch to that
1231
- response = self._api('customObjects').list_namespaced_custom_object('metrics.k8s.io', 'v1beta1',
1232
- self.namespace, 'pods',
1233
- field_selector=query)
1317
+ response = self._api("customObjects").list_namespaced_custom_object(
1318
+ "metrics.k8s.io",
1319
+ "v1beta1",
1320
+ self.namespace,
1321
+ "pods",
1322
+ field_selector=query,
1323
+ )
1234
1324
  except Exception as e:
1235
1325
  # We couldn't talk to the metrics service on this attempt. We don't
1236
1326
  # retry, but we also don't want to just ignore all errors. We only
@@ -1246,7 +1336,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1246
1336
  raise
1247
1337
 
1248
1338
  # Pull out the items
1249
- items = response.get('items', [])
1339
+ items = response.get("items", [])
1250
1340
 
1251
1341
  if len(items) == 0:
1252
1342
  # If there's no statistics we can't say we're stuck OOM
@@ -1255,7 +1345,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1255
1345
  # Assume the first result is the right one, because of the selector.
1256
1346
  # That means we don't need to bother with _continue.
1257
1347
  # Assume it has exactly one pod, because we made it.
1258
- containers = items[0].get('containers', [{}])
1348
+ containers = items[0].get("containers", [{}])
1259
1349
 
1260
1350
  if len(containers) == 0:
1261
1351
  # If there are no containers (because none have started yet?), we can't say we're stuck OOM
@@ -1264,26 +1354,37 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1264
1354
  # Otherwise, assume it just has one container.
1265
1355
  # Grab the memory usage string, like 123Ki, and convert to bytes.
1266
1356
  # If anything is missing, assume 0 bytes used.
1267
- bytesUsed = human2bytes(containers[0].get('usage', {}).get('memory', '0'))
1357
+ bytesUsed = human2bytes(containers[0].get("usage", {}).get("memory", "0"))
1268
1358
 
1269
1359
  # Also get the limit out of the pod object's spec
1270
1360
  assert podObject.spec is not None
1271
1361
  assert len(podObject.spec.containers) > 0
1272
1362
  assert podObject.spec.containers[0].resources is not None
1273
1363
  assert podObject.spec.containers[0].resources.limits is not None
1274
- assert 'memory' in podObject.spec.containers[0].resources.limits
1275
- bytesAllowed = human2bytes(podObject.spec.containers[0].resources.limits['memory'])
1364
+ assert "memory" in podObject.spec.containers[0].resources.limits
1365
+ bytesAllowed = human2bytes(
1366
+ podObject.spec.containers[0].resources.limits["memory"]
1367
+ )
1276
1368
 
1277
1369
  if bytesAllowed - bytesUsed < minFreeBytes:
1278
1370
  # This is too much!
1279
- logger.warning('Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.',
1280
- podObject.metadata.name, bytesUsed, bytesAllowed)
1371
+ logger.warning(
1372
+ "Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.",
1373
+ podObject.metadata.name,
1374
+ bytesUsed,
1375
+ bytesAllowed,
1376
+ )
1281
1377
 
1282
1378
  return True
1283
1379
  else:
1284
1380
  return False
1285
1381
 
1286
- def _isPodStuckWaiting(self, pod_object: V1Pod, reason: Optional[str] = None, timeout: Optional[float] = None) -> bool:
1382
+ def _isPodStuckWaiting(
1383
+ self,
1384
+ pod_object: V1Pod,
1385
+ reason: Optional[str] = None,
1386
+ timeout: Optional[float] = None,
1387
+ ) -> bool:
1287
1388
  """
1288
1389
  Return True if the pod looks to be in a waiting state, and false otherwise.
1289
1390
 
@@ -1307,7 +1408,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1307
1408
  # Can't be stuck
1308
1409
  return False
1309
1410
 
1310
- waiting_info = getattr(getattr(container_statuses[0], 'state', None), 'waiting', None)
1411
+ waiting_info = getattr(
1412
+ getattr(container_statuses[0], "state", None), "waiting", None
1413
+ )
1311
1414
  if waiting_info is None:
1312
1415
  # Pod is not waiting
1313
1416
  return False
@@ -1316,15 +1419,17 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1316
1419
  # Pod fails reason filter
1317
1420
  return False
1318
1421
 
1319
- start_time = getattr(pod_object.status, 'start_time', None)
1320
- if timeout is not None and (start_time is None or (utc_now() - start_time).total_seconds() < timeout):
1422
+ start_time = getattr(pod_object.status, "start_time", None)
1423
+ if timeout is not None and (
1424
+ start_time is None or (utc_now() - start_time).total_seconds() < timeout
1425
+ ):
1321
1426
  # It hasn't been waiting too long, or we care but don't know how
1322
1427
  # long it has been waiting
1323
1428
  return False
1324
1429
 
1325
1430
  return True
1326
1431
 
1327
- def _is_deleted(self, kube_thing: Union['V1Job', 'V1Pod']) -> bool:
1432
+ def _is_deleted(self, kube_thing: Union["V1Job", "V1Pod"]) -> bool:
1328
1433
  """
1329
1434
  Determine if a job or pod is in the process of being deleted, and
1330
1435
  shouldn't count anymore.
@@ -1333,7 +1438,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1333
1438
  # Kubernetes "Terminating" is the same as having the deletion_timestamp
1334
1439
  # set in the metadata of the object.
1335
1440
 
1336
- deletion_timestamp: Optional[datetime.datetime] = getattr(getattr(kube_thing, 'metadata', None), 'deletion_timestamp', None)
1441
+ deletion_timestamp: Optional[datetime.datetime] = getattr(
1442
+ getattr(kube_thing, "metadata", None), "deletion_timestamp", None
1443
+ )
1337
1444
  # If the deletion timestamp is set to anything, it is in the process of
1338
1445
  # being deleted. We will treat that as as good as gone.
1339
1446
  return deletion_timestamp is not None
@@ -1350,8 +1457,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1350
1457
 
1351
1458
  assert jobObject.metadata is not None
1352
1459
  assert jobObject.metadata.name is not None
1353
- return int(jobObject.metadata.name[len(self.job_prefix):])
1354
-
1460
+ return int(jobObject.metadata.name[len(self.job_prefix) :])
1355
1461
 
1356
1462
  def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]:
1357
1463
 
@@ -1367,22 +1473,27 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1367
1473
  # Otherwise we need to maybe wait.
1368
1474
  if self.enable_watching and maxWait >= 1:
1369
1475
  # We can try a watch. Watches can only work in whole seconds.
1370
- for event in self._stream_until_error(self._api('batch').list_namespaced_job,
1371
- self.namespace,
1372
- label_selector=f"toil_run={self.run_id}",
1373
- timeout_seconds=math.floor(maxWait)):
1476
+ for event in self._stream_until_error(
1477
+ self._api("batch").list_namespaced_job,
1478
+ self.namespace,
1479
+ label_selector=f"toil_run={self.run_id}",
1480
+ timeout_seconds=math.floor(maxWait),
1481
+ ):
1374
1482
  # Grab the metadata data, ID, the list of conditions of the current job, and the total pods
1375
- jobObject = event['object']
1376
-
1483
+ jobObject = event["object"]
1484
+
1377
1485
  if self._is_deleted(jobObject):
1378
1486
  # Job is already deleted, so ignore it.
1379
- logger.warning('Kubernetes job %s is deleted; ignore its update', getattr(getattr(jobObject, 'metadata', None), 'name', None))
1487
+ logger.warning(
1488
+ "Kubernetes job %s is deleted; ignore its update",
1489
+ getattr(getattr(jobObject, "metadata", None), "name", None),
1490
+ )
1380
1491
  continue
1381
-
1492
+
1382
1493
  assert jobObject.metadata is not None
1383
1494
  assert jobObject.metadata.name is not None
1384
-
1385
- jobID = int(jobObject.metadata.name[len(self.job_prefix):])
1495
+
1496
+ jobID = int(jobObject.metadata.name[len(self.job_prefix) :])
1386
1497
  if jobObject.status is None:
1387
1498
  # Can't tell what is up with this job.
1388
1499
  continue
@@ -1392,7 +1503,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1392
1503
  failed_pods = jobObject.status.failed or 0
1393
1504
  # Fetch out the condition object that has info about how the job is going.
1394
1505
  condition: Optional[V1JobCondition] = None
1395
- if jobObject.status.conditions is not None and len(jobObject.status.conditions) > 0:
1506
+ if (
1507
+ jobObject.status.conditions is not None
1508
+ and len(jobObject.status.conditions) > 0
1509
+ ):
1396
1510
  condition = jobObject.status.conditions[0]
1397
1511
 
1398
1512
  totalPods = active_pods + succeeded_pods + failed_pods
@@ -1402,14 +1516,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1402
1516
 
1403
1517
  # Check if there are any active pods
1404
1518
  if active_pods > 0:
1405
- logger.info("%s has %d pods running" % jobObject.metadata.name, active_pods)
1519
+ logger.info(
1520
+ "%s has %d pods running" % jobObject.metadata.name, active_pods
1521
+ )
1406
1522
  continue
1407
1523
  elif succeeded_pods > 0 or failed_pods > 0:
1408
1524
  # No more active pods in the current job ; must be finished
1409
- logger.info("%s RESULTS -> Succeeded: %d Failed:%d Active:%d" % jobObject.metadata.name,
1410
- succeeded_pods, failed_pods, active_pods)
1525
+ logger.info(
1526
+ "%s RESULTS -> Succeeded: %d Failed:%d Active:%d"
1527
+ % jobObject.metadata.name,
1528
+ succeeded_pods,
1529
+ failed_pods,
1530
+ active_pods,
1531
+ )
1411
1532
  # Log out success/failure given a reason
1412
- logger.info("%s REASON: %s", getattr(condition, 'type', None), getattr(condition, 'reason', None))
1533
+ logger.info(
1534
+ "%s REASON: %s",
1535
+ getattr(condition, "type", None),
1536
+ getattr(condition, "reason", None),
1537
+ )
1413
1538
 
1414
1539
  # Log out reason of failure and pod exit code
1415
1540
  if failed_pods > 0:
@@ -1419,22 +1544,40 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1419
1544
  if condition is not None:
1420
1545
  logger.warning("Failed Job Message: %s", condition.message)
1421
1546
  pod = self._getPodForJob(jobObject)
1422
- statuses: List[V1ContainerStatus] = getattr(getattr(pod, 'status', None), 'container_statuses', [])
1423
- if len(statuses) > 0 and statuses[0].state is not None and statuses[0].state.terminated is not None:
1547
+ statuses: list[V1ContainerStatus] = getattr(
1548
+ getattr(pod, "status", None), "container_statuses", []
1549
+ )
1550
+ if (
1551
+ len(statuses) > 0
1552
+ and statuses[0].state is not None
1553
+ and statuses[0].state.terminated is not None
1554
+ ):
1424
1555
  exitCode = statuses[0].state.terminated.exit_code
1425
1556
 
1426
1557
  raw_runtime = 0.0
1427
- if jobObject.status.completion_time is not None and jobObject.status.start_time is not None:
1428
- raw_runtime = (jobObject.status.completion_time - jobObject.status.start_time).total_seconds()
1558
+ if (
1559
+ jobObject.status.completion_time is not None
1560
+ and jobObject.status.start_time is not None
1561
+ ):
1562
+ raw_runtime = (
1563
+ jobObject.status.completion_time
1564
+ - jobObject.status.start_time
1565
+ ).total_seconds()
1429
1566
  runtime = slow_down(raw_runtime)
1430
- result = UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=exitReason)
1567
+ result = UpdatedBatchJobInfo(
1568
+ jobID=jobID,
1569
+ exitStatus=exitCode,
1570
+ wallTime=runtime,
1571
+ exitReason=exitReason,
1572
+ )
1431
1573
 
1432
- if (exitReason == BatchJobExitReason.FAILED) or (succeeded_pods + failed_pods == totalPods):
1574
+ if (exitReason == BatchJobExitReason.FAILED) or (
1575
+ succeeded_pods + failed_pods == totalPods
1576
+ ):
1433
1577
  # Cleanup if job is all finished or there was a pod that failed
1434
1578
  # TODO: use delete_job() to release acquired resources
1435
1579
  self._delete_job(
1436
- jobObject.metadata.name,
1437
- propagation_policy='Foreground'
1580
+ jobObject.metadata.name, propagation_policy="Foreground"
1438
1581
  )
1439
1582
  # Make sure the job is deleted so we won't see it again.
1440
1583
  self._waitForJobDeath(jobObject.metadata.name)
@@ -1442,12 +1585,19 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1442
1585
  continue
1443
1586
  else:
1444
1587
  # Job is not running/updating ; no active, successful, or failed pods yet
1445
- logger.debug("Job {} -> {}".format(jobObject.metadata.name, getattr(condition, 'reason', None)))
1588
+ logger.debug(
1589
+ "Job {} -> {}".format(
1590
+ jobObject.metadata.name, getattr(condition, "reason", None)
1591
+ )
1592
+ )
1446
1593
  # Pod could be pending; don't say it's lost.
1447
1594
  continue
1448
1595
  else:
1449
1596
  # Try polling instead
1450
- while result is None and (datetime.datetime.now() - entry).total_seconds() < maxWait:
1597
+ while (
1598
+ result is None
1599
+ and (datetime.datetime.now() - entry).total_seconds() < maxWait
1600
+ ):
1451
1601
  # We still have nothing and we haven't hit the timeout.
1452
1602
 
1453
1603
  # Poll
@@ -1455,12 +1605,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1455
1605
 
1456
1606
  if result is None:
1457
1607
  # Still nothing. Wait a second, or some fraction of our max wait time.
1458
- time.sleep(min(maxWait/2, 1.0))
1608
+ time.sleep(min(maxWait / 2, 1.0))
1459
1609
 
1460
1610
  # When we get here, either we found something or we ran out of time
1461
1611
  return result
1462
1612
 
1463
-
1464
1613
  def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]:
1465
1614
  """
1466
1615
  Return None if no updated (completed or failed) batch job is currently
@@ -1484,25 +1633,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1484
1633
  # Find a job that is done, failed, or stuck
1485
1634
  jobObject = None
1486
1635
  # Put 'done', 'failed', or 'stuck' here
1487
- chosenFor = ''
1636
+ chosenFor = ""
1488
1637
 
1489
1638
  for j in self._ourJobObject(onlySucceeded=True):
1490
1639
  # Look for succeeded jobs because that's the only filter Kubernetes has
1491
1640
  jobObject = j
1492
- chosenFor = 'done'
1641
+ chosenFor = "done"
1493
1642
 
1494
1643
  if jobObject is None:
1495
1644
  for j in self._ourJobObject():
1496
1645
  # If there aren't any succeeded jobs, scan all jobs
1497
1646
  # See how many times each failed
1498
- failCount = getattr(j.status, 'failed', 0)
1647
+ failCount = getattr(j.status, "failed", 0)
1499
1648
  if failCount is None:
1500
1649
  # Make sure it is an int
1501
1650
  failCount = 0
1502
1651
  if failCount > 0:
1503
1652
  # Take the first failed one you find
1504
1653
  jobObject = j
1505
- chosenFor = 'failed'
1654
+ chosenFor = "failed"
1506
1655
  break
1507
1656
 
1508
1657
  if jobObject is None:
@@ -1515,23 +1664,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1515
1664
  continue
1516
1665
 
1517
1666
  # Containers can get stuck in Waiting with reason ImagePullBackOff
1518
- if self._isPodStuckWaiting(pod, reason='ImagePullBackoff'):
1667
+ if self._isPodStuckWaiting(pod, reason="ImagePullBackoff"):
1519
1668
  # Assume it will never finish, even if the registry comes back or whatever.
1520
1669
  # We can get into this state when we send in a non-existent image.
1521
1670
  # See https://github.com/kubernetes/kubernetes/issues/58384
1522
1671
  jobObject = j
1523
- chosenFor = 'stuck'
1524
- logger.warning('Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?'
1525
- ' Check TOIL_APPLIANCE_SELF.')
1672
+ chosenFor = "stuck"
1673
+ logger.warning(
1674
+ "Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?"
1675
+ " Check TOIL_APPLIANCE_SELF."
1676
+ )
1526
1677
  break
1527
1678
 
1528
1679
  # Containers can also get stuck in Waiting with reason
1529
1680
  # ContainerCreating, if for example their mounts don't work.
1530
- if self._isPodStuckWaiting(pod, reason='ContainerCreating', timeout=self.pod_timeout):
1681
+ if self._isPodStuckWaiting(
1682
+ pod, reason="ContainerCreating", timeout=self.pod_timeout
1683
+ ):
1531
1684
  # Assume that it will never finish.
1532
1685
  jobObject = j
1533
- chosenFor = 'stuck'
1534
- logger.warning('Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?', self.pod_timeout)
1686
+ chosenFor = "stuck"
1687
+ logger.warning(
1688
+ "Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?",
1689
+ self.pod_timeout,
1690
+ )
1535
1691
  break
1536
1692
 
1537
1693
  # Pods can also get stuck nearly but not quite out of memory,
@@ -1541,7 +1697,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1541
1697
  # We found a job that probably should be OOM! Report it as stuck.
1542
1698
  # Polling function takes care of the logging.
1543
1699
  jobObject = j
1544
- chosenFor = 'stuck'
1700
+ chosenFor = "stuck"
1545
1701
  break
1546
1702
 
1547
1703
  if jobObject is None:
@@ -1549,25 +1705,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1549
1705
  return None
1550
1706
  else:
1551
1707
  # We actually have something
1552
- logger.debug('Identified stopped Kubernetes job %s as %s', getattr(jobObject.metadata, 'name', None), chosenFor)
1553
-
1708
+ logger.debug(
1709
+ "Identified stopped Kubernetes job %s as %s",
1710
+ getattr(jobObject.metadata, "name", None),
1711
+ chosenFor,
1712
+ )
1554
1713
 
1555
1714
  # Otherwise we got something.
1556
1715
 
1557
1716
  # Work out what the job's ID was (whatever came after our name prefix)
1558
1717
  assert jobObject.metadata is not None
1559
1718
  assert jobObject.metadata.name is not None
1560
- jobID = int(jobObject.metadata.name[len(self.job_prefix):])
1719
+ jobID = int(jobObject.metadata.name[len(self.job_prefix) :])
1561
1720
 
1562
1721
  # Grab the pod
1563
1722
  pod = self._getPodForJob(jobObject)
1564
1723
 
1565
1724
  if pod is not None:
1566
- if chosenFor == 'done' or chosenFor == 'failed':
1725
+ if chosenFor == "done" or chosenFor == "failed":
1567
1726
  # The job actually finished or failed
1568
1727
 
1569
1728
  # Get the statuses of the pod's containers
1570
- containerStatuses = getattr(getattr(pod, 'status', None), 'container_statuses', None)
1729
+ containerStatuses = getattr(
1730
+ getattr(pod, "status", None), "container_statuses", None
1731
+ )
1571
1732
 
1572
1733
  # Get when the pod started (reached the Kubelet) as a datetime
1573
1734
  start_time = self._get_start_time(pod, jobObject)
@@ -1577,18 +1738,24 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1577
1738
  # This happens when a pod is "Scheduled". But how could a
1578
1739
  # 'done' or 'failed' pod be merely "Scheduled"?
1579
1740
  # Complain so we can find out.
1580
- logger.warning('Exit code and runtime unavailable; pod has no container statuses')
1581
- logger.warning('Pod: %s', str(pod))
1741
+ logger.warning(
1742
+ "Exit code and runtime unavailable; pod has no container statuses"
1743
+ )
1744
+ logger.warning("Pod: %s", str(pod))
1582
1745
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1583
1746
  # Say it stopped now and started when it was scheduled/submitted.
1584
1747
  # We still need a strictly positive runtime.
1585
1748
  runtime = slow_down((utc_now() - start_time).total_seconds())
1586
1749
  else:
1587
1750
  # Get the termination info from the pod's main (only) container
1588
- terminatedInfo = getattr(getattr(containerStatuses[0], 'state', None), 'terminated', None)
1751
+ terminatedInfo = getattr(
1752
+ getattr(containerStatuses[0], "state", None), "terminated", None
1753
+ )
1589
1754
  if terminatedInfo is None:
1590
- logger.warning('Exit code and runtime unavailable; pod stopped without container terminating')
1591
- logger.warning('Pod: %s', str(pod))
1755
+ logger.warning(
1756
+ "Exit code and runtime unavailable; pod stopped without container terminating"
1757
+ )
1758
+ logger.warning("Pod: %s", str(pod))
1592
1759
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1593
1760
  # Say it stopped now and started when it was scheduled/submitted.
1594
1761
  # We still need a strictly positive runtime.
@@ -1603,34 +1770,42 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1603
1770
  # created. And we need to look at the pod's end time
1604
1771
  # because the job only gets a completion time if
1605
1772
  # successful.
1606
- runtime = slow_down((terminatedInfo.finished_at -
1607
- start_time).total_seconds())
1773
+ runtime = slow_down(
1774
+ (terminatedInfo.finished_at - start_time).total_seconds()
1775
+ )
1608
1776
 
1609
- if chosenFor == 'failed':
1777
+ if chosenFor == "failed":
1610
1778
  # Warn the user with the failed pod's log
1611
1779
  # TODO: cut this down somehow?
1612
- logger.warning('Log from failed pod: %s', self._getLogForPod(pod))
1780
+ logger.warning(
1781
+ "Log from failed pod: %s", self._getLogForPod(pod)
1782
+ )
1613
1783
 
1614
1784
  else:
1615
1785
  # The job has gotten stuck
1616
1786
 
1617
- assert chosenFor == 'stuck'
1787
+ assert chosenFor == "stuck"
1618
1788
 
1619
1789
  # Synthesize an exit code
1620
1790
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1621
1791
  # Say it ran from when the job was submitted to when the pod got stuck
1622
- runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds())
1792
+ runtime = slow_down(
1793
+ (utc_now() - self._get_start_time(job=jobObject)).total_seconds()
1794
+ )
1623
1795
  else:
1624
1796
  # The pod went away from under the job.
1625
- logging.warning('Exit code and runtime unavailable; pod vanished')
1797
+ logging.warning("Exit code and runtime unavailable; pod vanished")
1626
1798
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1627
1799
  # Say it ran from when the job was submitted to when the pod vanished
1628
- runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds())
1629
-
1800
+ runtime = slow_down(
1801
+ (utc_now() - self._get_start_time(job=jobObject)).total_seconds()
1802
+ )
1630
1803
 
1631
1804
  try:
1632
1805
  # Delete the job and all dependents (pods), hoping to get a 404 if it's magically gone
1633
- self._delete_job(jobObject.metadata.name, propagation_policy='Foreground', gone_ok=True)
1806
+ self._delete_job(
1807
+ jobObject.metadata.name, propagation_policy="Foreground", gone_ok=True
1808
+ )
1634
1809
 
1635
1810
  # That just kicks off the deletion process. Foreground doesn't
1636
1811
  # actually block. See
@@ -1646,7 +1821,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1646
1821
  # Otherwise everything is fine and the job is gone.
1647
1822
 
1648
1823
  # Return the one finished job we found
1649
- return UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None)
1824
+ return UpdatedBatchJobInfo(
1825
+ jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None
1826
+ )
1650
1827
 
1651
1828
  def _waitForJobDeath(self, jobName: str) -> None:
1652
1829
  """
@@ -1660,7 +1837,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1660
1837
  while True:
1661
1838
  try:
1662
1839
  # Look for the job
1663
- job_object = self._api('batch', errors=[404]).read_namespaced_job(jobName, self.namespace)
1840
+ job_object = self._api("batch", errors=[404]).read_namespaced_job(
1841
+ jobName, self.namespace
1842
+ )
1664
1843
  if self._is_deleted(job_object):
1665
1844
  # The job looks deleted, so we can treat it as not being there.
1666
1845
  return
@@ -1685,59 +1864,80 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1685
1864
  # Shutdown scheduling thread
1686
1865
  self._shutting_down.set()
1687
1866
  with self._work_available:
1688
- self._work_available.notify_all() # Wake it up.
1867
+ self._work_available.notify_all() # Wake it up.
1689
1868
 
1690
1869
  self.schedulingThread.join()
1691
1870
 
1692
1871
  # Kill all of our jobs and clean up pods that are associated with those jobs
1693
1872
  try:
1694
- logger.debug('Deleting all Kubernetes jobs for toil_run=%s', self.run_id)
1695
- self._api('batch', errors=[404]).delete_collection_namespaced_job(
1873
+ logger.debug("Deleting all Kubernetes jobs for toil_run=%s", self.run_id)
1874
+ self._api("batch", errors=[404]).delete_collection_namespaced_job(
1696
1875
  self.namespace,
1697
1876
  label_selector=f"toil_run={self.run_id}",
1698
- propagation_policy='Background'
1877
+ propagation_policy="Background",
1878
+ )
1879
+ logger.debug(
1880
+ "Killed jobs with delete_collection_namespaced_job; cleaned up"
1699
1881
  )
1700
- logger.debug('Killed jobs with delete_collection_namespaced_job; cleaned up')
1701
1882
  # TODO: should we release all resources? We're shutting down so would it matter?
1702
1883
  except ApiException as e:
1703
1884
  if e.status != 404:
1704
1885
  # Anything other than a 404 is weird here.
1705
- logger.error("Exception when calling BatchV1Api->delete_collection_namespaced_job: %s" % e)
1886
+ logger.error(
1887
+ "Exception when calling BatchV1Api->delete_collection_namespaced_job: %s"
1888
+ % e
1889
+ )
1706
1890
 
1707
1891
  # If batch delete fails, try to delete all remaining jobs individually.
1708
- logger.debug('Deleting Kubernetes jobs individually for toil_run=%s', self.run_id)
1892
+ logger.debug(
1893
+ "Deleting Kubernetes jobs individually for toil_run=%s", self.run_id
1894
+ )
1709
1895
  for job_id in self._getIssuedNonLocalBatchJobIDs():
1710
- job_name = f'{self.job_prefix}{job_id}'
1711
- self._delete_job(job_name, propagation_policy='Background', resource_notify=False)
1896
+ job_name = f"{self.job_prefix}{job_id}"
1897
+ self._delete_job(
1898
+ job_name, propagation_policy="Background", resource_notify=False
1899
+ )
1712
1900
 
1713
1901
  # Aggregate all pods and check if any pod has failed to cleanup or is orphaned.
1714
1902
  ourPods = self._ourPodObject()
1715
1903
 
1716
1904
  for pod in ourPods:
1717
1905
  try:
1718
- phase = getattr(pod.status, 'phase', None)
1719
- if phase == 'Failed':
1720
- logger.debug('Failed pod encountered at shutdown:\n%s', self._pretty_print(pod))
1721
- if phase == 'Orphaned':
1722
- logger.debug('Orphaned pod encountered at shutdown:\n%s', self._pretty_print(pod))
1906
+ phase = getattr(pod.status, "phase", None)
1907
+ if phase == "Failed":
1908
+ logger.debug(
1909
+ "Failed pod encountered at shutdown:\n%s",
1910
+ self._pretty_print(pod),
1911
+ )
1912
+ if phase == "Orphaned":
1913
+ logger.debug(
1914
+ "Orphaned pod encountered at shutdown:\n%s",
1915
+ self._pretty_print(pod),
1916
+ )
1723
1917
  except:
1724
1918
  # Don't get mad if that doesn't work.
1725
1919
  pass
1726
1920
  if pod.metadata is not None and pod.metadata.name is not None:
1727
1921
  try:
1728
- logger.debug('Cleaning up pod at shutdown: %s', pod.metadata.name)
1729
- response = self._api('core', errors=[404]).delete_namespaced_pod(
1922
+ logger.debug(
1923
+ "Cleaning up pod at shutdown: %s", pod.metadata.name
1924
+ )
1925
+ response = self._api(
1926
+ "core", errors=[404]
1927
+ ).delete_namespaced_pod(
1730
1928
  pod.metadata.name,
1731
1929
  self.namespace,
1732
- propagation_policy='Background'
1930
+ propagation_policy="Background",
1733
1931
  )
1734
1932
  except ApiException as e:
1735
1933
  if e.status != 404:
1736
1934
  # Anything other than a 404 is weird here.
1737
- logger.error("Exception when calling CoreV1Api->delete_namespaced_pod: %s" % e)
1738
-
1935
+ logger.error(
1936
+ "Exception when calling CoreV1Api->delete_namespaced_pod: %s"
1937
+ % e
1938
+ )
1739
1939
 
1740
- def _getIssuedNonLocalBatchJobIDs(self) -> List[int]:
1940
+ def _getIssuedNonLocalBatchJobIDs(self) -> list[int]:
1741
1941
  """
1742
1942
  Get the issued batch job IDs that are not for local jobs.
1743
1943
  """
@@ -1749,29 +1949,35 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1749
1949
  jobIDs.append(self._getIDForOurJob(job))
1750
1950
  return jobIDs
1751
1951
 
1752
- def getIssuedBatchJobIDs(self) -> List[int]:
1952
+ def getIssuedBatchJobIDs(self) -> list[int]:
1753
1953
  # Make sure to send the local jobs and queued jobs also
1754
1954
  with self._mutex:
1755
1955
  queued_jobs = list(self._queued_job_ids)
1756
- return self._getIssuedNonLocalBatchJobIDs() + list(self.getIssuedLocalJobIDs()) + queued_jobs
1956
+ return (
1957
+ self._getIssuedNonLocalBatchJobIDs()
1958
+ + list(self.getIssuedLocalJobIDs())
1959
+ + queued_jobs
1960
+ )
1757
1961
 
1758
- def _get_start_time(self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None) -> datetime.datetime:
1962
+ def _get_start_time(
1963
+ self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None
1964
+ ) -> datetime.datetime:
1759
1965
  """
1760
1966
  Get an actual or estimated start time for a pod.
1761
1967
  """
1762
1968
 
1763
1969
  # Get when the pod started (reached the Kubelet) as a datetime
1764
- start_time = getattr(getattr(pod, 'status', None), 'start_time', None)
1970
+ start_time = getattr(getattr(pod, "status", None), "start_time", None)
1765
1971
  if start_time is None:
1766
1972
  # If the pod never made it to the kubelet to get a
1767
1973
  # start_time, say it was when the job was submitted.
1768
- start_time = getattr(getattr(job, 'status', None), 'start_time', None)
1974
+ start_time = getattr(getattr(job, "status", None), "start_time", None)
1769
1975
  if start_time is None:
1770
1976
  # If this is still unset, say it was just now.
1771
1977
  start_time = utc_now()
1772
1978
  return start_time
1773
1979
 
1774
- def getRunningBatchJobIDs(self) -> Dict[int, float]:
1980
+ def getRunningBatchJobIDs(self) -> dict[int, float]:
1775
1981
  # We need a dict from jobID (integer) to seconds it has been running
1776
1982
  secondsPerJob = dict()
1777
1983
  for job in self._ourJobObject():
@@ -1782,7 +1988,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1782
1988
  # Jobs whose pods are gone are not running
1783
1989
  continue
1784
1990
 
1785
- if getattr(pod.status, 'phase', None) == 'Running':
1991
+ if getattr(pod.status, "phase", None) == "Running":
1786
1992
  # The job's pod is running
1787
1993
 
1788
1994
  # Estimate the runtime
@@ -1794,7 +2000,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1794
2000
  secondsPerJob.update(self.getRunningLocalJobIDs())
1795
2001
  return secondsPerJob
1796
2002
 
1797
- def killBatchJobs(self, jobIDs: List[int]) -> None:
2003
+ def killBatchJobs(self, jobIDs: list[int]) -> None:
1798
2004
 
1799
2005
  # Kill all the ones that are local
1800
2006
  self.killLocalJobs(jobIDs)
@@ -1803,7 +2009,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1803
2009
 
1804
2010
  # First get the jobs we even issued non-locally
1805
2011
  issued_on_kubernetes = set(self._getIssuedNonLocalBatchJobIDs())
1806
- deleted_jobs: List[str] = []
2012
+ deleted_jobs: list[str] = []
1807
2013
 
1808
2014
  for job_id in jobIDs:
1809
2015
  # For each job we are supposed to kill
@@ -1829,10 +2035,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1829
2035
 
1830
2036
  # Delete the requested job in the foreground.
1831
2037
  # This doesn't block, but it does delete expeditiously.
1832
- self._delete_job(job_name, propagation_policy='Foreground')
2038
+ self._delete_job(job_name, propagation_policy="Foreground")
1833
2039
 
1834
2040
  deleted_jobs.append(job_name)
1835
- logger.debug('Killed job by request: %s', job_name)
2041
+ logger.debug("Killed job by request: %s", job_name)
1836
2042
 
1837
2043
  for job_name in deleted_jobs:
1838
2044
  # Now we need to wait for all the jobs we killed to be gone.
@@ -1842,7 +2048,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1842
2048
  # the potential deadlock (if the user code needs exclusive access to
1843
2049
  # a resource) onto the user code, instead of always hanging
1844
2050
  # whenever we can't certify that a faulty node is no longer running
1845
- # the user code.
2051
+ # the user code.
1846
2052
  self._waitForJobDeath(job_name)
1847
2053
 
1848
2054
  @classmethod
@@ -1853,9 +2059,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1853
2059
 
1854
2060
  # Make a Kubernetes-acceptable version of our username: not too long,
1855
2061
  # and all lowercase letters, numbers, or - or .
1856
- acceptable_chars = set(string.ascii_lowercase + string.digits + '-.')
2062
+ acceptable_chars = set(string.ascii_lowercase + string.digits + "-.")
1857
2063
 
1858
- return ''.join([c for c in get_user_name().lower() if c in acceptable_chars])[:100]
2064
+ return "".join([c for c in get_user_name().lower() if c in acceptable_chars])[
2065
+ :100
2066
+ ]
1859
2067
 
1860
2068
  @runtime_checkable
1861
2069
  class KubernetesConfig(Protocol):
@@ -1867,38 +2075,66 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1867
2075
  have to let the fact that this also has to be a Config just be manually
1868
2076
  enforced.
1869
2077
  """
2078
+
1870
2079
  kubernetes_host_path: Optional[str]
1871
2080
  kubernetes_owner: str
1872
2081
  kubernetes_service_account: Optional[str]
1873
2082
  kubernetes_pod_timeout: float
1874
2083
 
1875
-
1876
2084
  @classmethod
1877
2085
  def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
1878
- parser.add_argument("--kubernetesHostPath", dest="kubernetes_host_path", default=None, env_var="TOIL_KUBERNETES_HOST_PATH",
1879
- help="Path on Kubernetes hosts to use as shared inter-pod temp directory. "
1880
- "(default: %(default)s)")
1881
- parser.add_argument("--kubernetesOwner", dest="kubernetes_owner", default=None, env_var="TOIL_KUBERNETES_OWNER",
1882
- help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will "
1883
- f"be generated at runtime. "
1884
- f"(Generated default: {cls.get_default_kubernetes_owner()})")
1885
- parser.add_argument("--kubernetesServiceAccount", dest="kubernetes_service_account", default=None, env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT",
1886
- help="Service account to run jobs as. "
1887
- "(default: %(default)s)")
1888
- parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
1889
- help="Seconds to wait for a scheduled Kubernetes pod to start running. "
1890
- "(default: %(default)s)")
1891
- parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
1892
- help="Whether to ask worker pods to run in privileged mode. This should be used to access "
1893
- "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
1894
- "this is set to True. (default: %(default)s)")
1895
-
1896
- OptionType = TypeVar('OptionType')
2086
+ parser.add_argument(
2087
+ "--kubernetesHostPath",
2088
+ dest="kubernetes_host_path",
2089
+ default=None,
2090
+ env_var="TOIL_KUBERNETES_HOST_PATH",
2091
+ help="Path on Kubernetes hosts to use as shared inter-pod temp directory. "
2092
+ "(default: %(default)s)",
2093
+ )
2094
+ parser.add_argument(
2095
+ "--kubernetesOwner",
2096
+ dest="kubernetes_owner",
2097
+ default=None,
2098
+ env_var="TOIL_KUBERNETES_OWNER",
2099
+ help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will "
2100
+ f"be generated at runtime. "
2101
+ f"(Generated default: {cls.get_default_kubernetes_owner()})",
2102
+ )
2103
+ parser.add_argument(
2104
+ "--kubernetesServiceAccount",
2105
+ dest="kubernetes_service_account",
2106
+ default=None,
2107
+ env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT",
2108
+ help="Service account to run jobs as. " "(default: %(default)s)",
2109
+ )
2110
+ parser.add_argument(
2111
+ "--kubernetesPodTimeout",
2112
+ dest="kubernetes_pod_timeout",
2113
+ default=120,
2114
+ env_var="TOIL_KUBERNETES_POD_TIMEOUT",
2115
+ type=float,
2116
+ help="Seconds to wait for a scheduled Kubernetes pod to start running. "
2117
+ "(default: %(default)s)",
2118
+ )
2119
+ parser.add_argument(
2120
+ "--kubernetesPrivileged",
2121
+ dest="kubernetes_privileged",
2122
+ default=False,
2123
+ env_var="TOIL_KUBERNETES_PRIVILEGED",
2124
+ type=opt_strtobool,
2125
+ help="Whether to ask worker pods to run in privileged mode. This should be used to access "
2126
+ "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
2127
+ "this is set to True. (default: %(default)s)",
2128
+ )
2129
+
2130
+ OptionType = TypeVar("OptionType")
2131
+
1897
2132
  @classmethod
1898
2133
  def setOptions(cls, setOption: OptionSetter) -> None:
1899
2134
  setOption("kubernetes_host_path")
1900
2135
  setOption("kubernetes_owner")
1901
- setOption("kubernetes_service_account",)
2136
+ setOption(
2137
+ "kubernetes_service_account",
2138
+ )
1902
2139
  setOption("kubernetes_pod_timeout")
1903
2140
  setOption("kubernetes_privileged")
1904
-