toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -30,93 +30,90 @@ import tempfile
30
30
  import time
31
31
  import uuid
32
32
  from argparse import ArgumentParser, _ArgumentGroup
33
+ from collections.abc import Iterator
33
34
  from queue import Empty, Queue
34
35
  from threading import Condition, Event, RLock, Thread
35
- from typing import (Any,
36
- Callable,
37
- Dict,
38
- Iterator,
39
- List,
40
- Literal,
41
- Optional,
42
- Set,
43
- Tuple,
44
- Type,
45
- TypeVar,
46
- Union,
47
- cast,
48
- overload)
36
+ from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast, overload
37
+
38
+ from toil.lib.conversions import opt_strtobool
49
39
 
50
40
  if sys.version_info < (3, 10):
51
41
  from typing_extensions import ParamSpec
52
42
  else:
53
43
  from typing import ParamSpec
54
- if sys.version_info >= (3, 8):
55
- from typing import Protocol, TypedDict, runtime_checkable
44
+
45
+ if sys.version_info < (3, 11):
46
+ from typing_extensions import NotRequired
56
47
  else:
57
- from typing_extensions import Protocol, TypedDict, runtime_checkable
58
- # TODO: When this gets into the standard library, get it from there and drop
48
+ from typing import NotRequired
49
+
50
+ from typing import Protocol, TypedDict, runtime_checkable
51
+
59
52
  import urllib3
60
53
  import yaml
54
+
61
55
  # The Right Way to use the Kubernetes module is to `import kubernetes` and then you get all your stuff as like ApiClient. But this doesn't work for the stubs: the stubs seem to only support importing things from the internal modules in `kubernetes` where they are actually defined. See for example <https://github.com/MaterializeInc/kubernetes-stubs/issues/9 and <https://github.com/MaterializeInc/kubernetes-stubs/issues/10>. So we just import all the things we use into our global namespace here.
62
- from kubernetes.client import (BatchV1Api,
63
- CoreV1Api,
64
- CustomObjectsApi,
65
- V1Affinity,
66
- V1Container,
67
- V1ContainerStatus,
68
- V1EmptyDirVolumeSource,
69
- V1HostPathVolumeSource,
70
- V1Job,
71
- V1JobCondition,
72
- V1JobSpec,
73
- V1NodeAffinity,
74
- V1NodeSelector,
75
- V1NodeSelectorRequirement,
76
- V1NodeSelectorTerm,
77
- V1ObjectMeta,
78
- V1Pod,
79
- V1PodSpec,
80
- V1PodTemplateSpec,
81
- V1PreferredSchedulingTerm,
82
- V1ResourceRequirements,
83
- V1SecretVolumeSource,
84
- V1Toleration,
85
- V1Volume,
86
- V1VolumeMount)
56
+ from kubernetes.client import (
57
+ BatchV1Api,
58
+ CoreV1Api,
59
+ CustomObjectsApi,
60
+ V1Affinity,
61
+ V1Container,
62
+ V1ContainerStatus,
63
+ V1EmptyDirVolumeSource,
64
+ V1HostPathVolumeSource,
65
+ V1Job,
66
+ V1JobCondition,
67
+ V1JobSpec,
68
+ V1NodeAffinity,
69
+ V1NodeSelector,
70
+ V1NodeSelectorRequirement,
71
+ V1NodeSelectorTerm,
72
+ V1ObjectMeta,
73
+ V1Pod,
74
+ V1PodSpec,
75
+ V1PodTemplateSpec,
76
+ V1PreferredSchedulingTerm,
77
+ V1ResourceRequirements,
78
+ V1SecretVolumeSource,
79
+ V1SecurityContext,
80
+ V1Toleration,
81
+ V1Volume,
82
+ V1VolumeMount,
83
+ )
87
84
  from kubernetes.client.api_client import ApiClient
88
85
  from kubernetes.client.exceptions import ApiException
89
86
  from kubernetes.config.config_exception import ConfigException
90
87
  from kubernetes.config.incluster_config import load_incluster_config
91
- from kubernetes.config.kube_config import (list_kube_config_contexts,
92
- load_kube_config)
88
+ from kubernetes.config.kube_config import list_kube_config_contexts, load_kube_config
89
+
93
90
  # TODO: Watch API is not typed yet
94
91
  from kubernetes.watch import Watch # type: ignore
95
- # typing-extensions dependency on Pythons that are new enough.
96
- from typing_extensions import NotRequired
97
92
 
98
93
  from toil import applianceSelf
99
- from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
100
- BatchJobExitReason,
101
- InsufficientSystemResources,
102
- ResourcePool,
103
- UpdatedBatchJobInfo)
94
+ from toil.batchSystems.abstractBatchSystem import (
95
+ EXIT_STATUS_UNAVAILABLE_VALUE,
96
+ BatchJobExitReason,
97
+ InsufficientSystemResources,
98
+ ResourcePool,
99
+ UpdatedBatchJobInfo,
100
+ )
104
101
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
105
102
  from toil.batchSystems.contained_executor import pack_job
106
103
  from toil.batchSystems.options import OptionSetter
107
104
  from toil.common import Config, Toil
108
- from toil.options.common import SYS_MAX_SIZE
109
105
  from toil.job import JobDescription, Requirer
110
106
  from toil.lib.conversions import human2bytes
111
107
  from toil.lib.misc import get_user_name, slow_down, utc_now
112
108
  from toil.lib.retry import ErrorCondition, retry
109
+ from toil.options.common import SYS_MAX_SIZE
113
110
  from toil.resource import Resource
114
111
 
115
112
  logger = logging.getLogger(__name__)
116
- retryable_kubernetes_errors: List[Union[Type[Exception], ErrorCondition]] = [
113
+ retryable_kubernetes_errors: list[Union[type[Exception], ErrorCondition]] = [
117
114
  urllib3.exceptions.MaxRetryError,
118
115
  urllib3.exceptions.ProtocolError,
119
- ApiException
116
+ ApiException,
120
117
  ]
121
118
 
122
119
 
@@ -130,8 +127,10 @@ def is_retryable_kubernetes_error(e: Exception) -> bool:
130
127
  return True
131
128
  return False
132
129
 
130
+
133
131
  # Represents a collection of label or taint keys and their sets of acceptable (or unacceptable) values.
134
- KeyValuesList = List[Tuple[str, List[str]]]
132
+ KeyValuesList = list[tuple[str, list[str]]]
133
+
135
134
 
136
135
  class KubernetesBatchSystem(BatchSystemCleanupSupport):
137
136
  @classmethod
@@ -148,8 +147,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
148
147
  core: NotRequired[CoreV1Api]
149
148
  customObjects: NotRequired[CustomObjectsApi]
150
149
 
151
-
152
- def __init__(self, config: Config, maxCores: int, maxMemory: int, maxDisk: int) -> None:
150
+ def __init__(
151
+ self, config: Config, maxCores: int, maxMemory: int, maxDisk: int
152
+ ) -> None:
153
153
  super().__init__(config, maxCores, maxMemory, maxDisk)
154
154
 
155
155
  # Re-type the config to make sure it has all the fields we need.
@@ -160,8 +160,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
160
160
  # Otherwise if we are at debug log level, we dump every
161
161
  # request/response to Kubernetes, including tokens which we shouldn't
162
162
  # reveal on CI.
163
- logging.getLogger('kubernetes').setLevel(logging.ERROR)
164
- logging.getLogger('requests_oauthlib').setLevel(logging.ERROR)
163
+ logging.getLogger("kubernetes").setLevel(logging.ERROR)
164
+ logging.getLogger("requests_oauthlib").setLevel(logging.ERROR)
165
165
 
166
166
  # This will hold the last time our Kubernetes credentials were refreshed
167
167
  self.credential_time: Optional[datetime.datetime] = None
@@ -169,7 +169,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
169
169
  self._apis: KubernetesBatchSystem._ApiStorageDict = {}
170
170
 
171
171
  # Get our namespace (and our Kubernetes credentials to make sure they exist)
172
- self.namespace: str = self._api('namespace')
172
+ self.namespace: str = self._api("namespace")
173
173
 
174
174
  # Decide if we are going to mount a Kubernetes host path as the Toil
175
175
  # work dir in the workers, for shared caching.
@@ -188,7 +188,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
188
188
  self.unique_id = uuid.uuid4()
189
189
 
190
190
  # Create a prefix for jobs, starting with our username
191
- self.job_prefix: str = f'{username}-toil-{self.unique_id}-'
191
+ self.job_prefix: str = f"{username}-toil-{self.unique_id}-"
192
192
  # Instead of letting Kubernetes assign unique job names, we assign our
193
193
  # own based on a numerical job ID. This functionality is managed by the
194
194
  # BatchSystemLocalSupport.
@@ -212,55 +212,61 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
212
212
  # Try and guess what Toil work dir the workers will use.
213
213
  # We need to be able to provision (possibly shared) space there.
214
214
  self.worker_work_dir: str = Toil.getToilWorkDir(config.workDir)
215
- if (config.workDir is None and
216
- os.getenv('TOIL_WORKDIR') is None and
217
- self.worker_work_dir == tempfile.gettempdir()):
215
+ if (
216
+ config.workDir is None
217
+ and os.getenv("TOIL_WORKDIR") is None
218
+ and self.worker_work_dir == tempfile.gettempdir()
219
+ ):
218
220
 
219
221
  # We defaulted to the system temp directory. But we think the
220
222
  # worker Dockerfiles will make them use /var/lib/toil instead.
221
223
  # TODO: Keep this in sync with the Dockerfile.
222
- self.worker_work_dir = '/var/lib/toil'
224
+ self.worker_work_dir = "/var/lib/toil"
223
225
 
224
226
  # A Toil-managed Kubernetes cluster will have most of its temp space at
225
227
  # /var/tmp, which is where really large temp files really belong
226
228
  # according to https://systemd.io/TEMPORARY_DIRECTORIES/. So we will
227
229
  # set the default temporary directory to there for all our jobs.
228
- self.environment['TMPDIR'] = '/var/tmp'
230
+ self.environment["TMPDIR"] = "/var/tmp"
229
231
 
230
232
  # Get the name of the AWS secret, if any, to mount in containers.
231
- self.aws_secret_name: Optional[str] = os.environ.get("TOIL_AWS_SECRET_NAME", None)
233
+ self.aws_secret_name: Optional[str] = os.environ.get(
234
+ "TOIL_AWS_SECRET_NAME", None
235
+ )
232
236
 
233
237
  # Set this to True to enable the experimental wait-for-job-update code
234
238
  self.enable_watching: bool = os.environ.get("KUBE_WATCH_ENABLED", False)
235
239
 
236
240
  # This will be a label to select all our jobs.
237
- self.run_id: str = f'toil-{self.unique_id}'
241
+ self.run_id: str = f"toil-{self.unique_id}"
238
242
 
239
243
  # Keep track of available resources.
240
- maxMillicores = int(SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000)
241
- self.resource_sources: List[ResourcePool] = [
244
+ maxMillicores = int(
245
+ SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000
246
+ )
247
+ self.resource_sources: list[ResourcePool] = [
242
248
  # A pool representing available job slots
243
- ResourcePool(self.config.max_jobs, 'job slots'),
249
+ ResourcePool(self.config.max_jobs, "job slots"),
244
250
  # A pool representing available CPU in units of millicores (1 CPU
245
251
  # unit = 1000 millicores)
246
- ResourcePool(maxMillicores, 'cores'),
252
+ ResourcePool(maxMillicores, "cores"),
247
253
  # A pool representing available memory in bytes
248
- ResourcePool(self.maxMemory, 'memory'),
254
+ ResourcePool(self.maxMemory, "memory"),
249
255
  # A pool representing the available space in bytes
250
- ResourcePool(self.maxDisk, 'disk'),
256
+ ResourcePool(self.maxDisk, "disk"),
251
257
  ]
252
258
 
253
259
  # A set of job IDs that are queued (useful for getIssuedBatchJobIDs())
254
- self._queued_job_ids: Set[int] = set()
260
+ self._queued_job_ids: set[int] = set()
255
261
 
256
262
  # Keep track of the acquired resources for each job
257
- self._acquired_resources: Dict[str, List[int]] = {}
263
+ self._acquired_resources: dict[str, list[int]] = {}
258
264
 
259
265
  # Queue for jobs to be submitted to the Kubernetes cluster
260
- self._jobs_queue: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue()
266
+ self._jobs_queue: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue()
261
267
 
262
268
  # A set of job IDs that should be killed
263
- self._killed_queue_jobs: Set[int] = set()
269
+ self._killed_queue_jobs: set[int] = set()
264
270
 
265
271
  # We use this event to signal shutdown
266
272
  self._shutting_down: Event = Event()
@@ -284,7 +290,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
284
290
  """
285
291
 
286
292
  if not kubernetes_object:
287
- return 'None'
293
+ return "None"
288
294
 
289
295
  # We need a Kubernetes widget that knows how to translate
290
296
  # its data structures to nice YAML-able dicts. See:
@@ -294,7 +300,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
294
300
  # Convert to a dict
295
301
  root_dict = api_client.sanitize_for_serialization(kubernetes_object)
296
302
 
297
- def drop_boring(here: Dict[str, Any]) -> None:
303
+ def drop_boring(here: dict[str, Any]) -> None:
298
304
  """
299
305
  Drop boring fields recursively.
300
306
  """
@@ -302,7 +308,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
302
308
  for k, v in here.items():
303
309
  if isinstance(v, dict):
304
310
  drop_boring(v)
305
- if k in ['managedFields']:
311
+ if k in ["managedFields"]:
306
312
  boring_keys.append(k)
307
313
  for k in boring_keys:
308
314
  del here[k]
@@ -312,33 +318,43 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
312
318
 
313
319
  @overload
314
320
  def _api(
315
- self, kind: Literal['batch'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
316
- ) -> BatchV1Api:
317
- ...
321
+ self,
322
+ kind: Literal["batch"],
323
+ max_age_seconds: float = 5 * 60,
324
+ errors: Optional[list[int]] = None,
325
+ ) -> BatchV1Api: ...
318
326
 
319
327
  @overload
320
328
  def _api(
321
- self, kind: Literal['core'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
322
- ) -> CoreV1Api:
323
- ...
329
+ self,
330
+ kind: Literal["core"],
331
+ max_age_seconds: float = 5 * 60,
332
+ errors: Optional[list[int]] = None,
333
+ ) -> CoreV1Api: ...
324
334
 
325
335
  @overload
326
336
  def _api(
327
- self, kind: Literal['customObjects'], max_age_seconds: float = 5 * 60, errors: Optional[List[int]] = None
328
- ) -> CustomObjectsApi:
329
- ...
337
+ self,
338
+ kind: Literal["customObjects"],
339
+ max_age_seconds: float = 5 * 60,
340
+ errors: Optional[list[int]] = None,
341
+ ) -> CustomObjectsApi: ...
330
342
 
331
343
  @overload
332
344
  def _api(
333
- self, kind: Literal['namespace'], max_age_seconds: float = 5 * 60
334
- ) -> str:
335
- ...
345
+ self, kind: Literal["namespace"], max_age_seconds: float = 5 * 60
346
+ ) -> str: ...
336
347
 
337
348
  def _api(
338
349
  self,
339
- kind: Union[Literal['batch'], Literal['core'], Literal['customObjects'], Literal['namespace']],
350
+ kind: Union[
351
+ Literal["batch"],
352
+ Literal["core"],
353
+ Literal["customObjects"],
354
+ Literal["namespace"],
355
+ ],
340
356
  max_age_seconds: float = 5 * 60,
341
- errors: Optional[List[int]] = None
357
+ errors: Optional[list[int]] = None,
342
358
  ) -> Union[BatchV1Api, CoreV1Api, CustomObjectsApi, str]:
343
359
  """
344
360
  The Kubernetes module isn't clever enough to renew its credentials when
@@ -371,44 +387,53 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
371
387
 
372
388
  now = utc_now()
373
389
 
374
- if self.credential_time is None or (now - self.credential_time).total_seconds() > max_age_seconds:
390
+ if (
391
+ self.credential_time is None
392
+ or (now - self.credential_time).total_seconds() > max_age_seconds
393
+ ):
375
394
  # Credentials need a refresh
376
395
  try:
377
396
  # Load ~/.kube/config or KUBECONFIG
378
397
  load_kube_config()
379
398
  # Worked. We're using kube config
380
- config_source = 'kube'
399
+ config_source = "kube"
381
400
  except ConfigException:
382
401
  # Didn't work. Try pod-based credentials in case we are in a pod.
383
402
  try:
384
403
  load_incluster_config()
385
404
  # Worked. We're using in_cluster config
386
- config_source = 'in_cluster'
405
+ config_source = "in_cluster"
387
406
  except ConfigException:
388
- raise RuntimeError('Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod.')
407
+ raise RuntimeError(
408
+ "Could not load Kubernetes configuration from ~/.kube/config, $KUBECONFIG, or current pod."
409
+ )
389
410
 
390
411
  # Now fill in the API objects with these credentials
391
- self._apis['batch'] = BatchV1Api()
392
- self._apis['core'] = CoreV1Api()
393
- self._apis['customObjects'] = CustomObjectsApi()
412
+ self._apis["batch"] = BatchV1Api()
413
+ self._apis["core"] = CoreV1Api()
414
+ self._apis["customObjects"] = CustomObjectsApi()
394
415
 
395
416
  # And save the time
396
417
  self.credential_time = now
397
- if kind == 'namespace':
418
+ if kind == "namespace":
398
419
  # We just need the namespace string
399
- if config_source == 'in_cluster':
420
+ if config_source == "in_cluster":
400
421
  # Our namespace comes from a particular file.
401
- with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as fh:
422
+ with open(
423
+ "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
424
+ ) as fh:
402
425
  return fh.read().strip()
403
426
  else:
404
427
  # Find all contexts and the active context.
405
428
  # The active context gets us our namespace.
406
429
  contexts, activeContext = list_kube_config_contexts()
407
430
  if not contexts:
408
- raise RuntimeError("No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG")
431
+ raise RuntimeError(
432
+ "No Kubernetes contexts available in ~/.kube/config or $KUBECONFIG"
433
+ )
409
434
 
410
435
  # Identify the namespace to work in
411
- namespace = activeContext.get('context', {}).get('namespace', 'default')
436
+ namespace = activeContext.get("context", {}).get("namespace", "default")
412
437
  assert isinstance(namespace, str)
413
438
  return namespace
414
439
 
@@ -428,11 +453,13 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
428
453
  ErrorCondition(
429
454
  error=ApiException,
430
455
  error_codes=errors,
431
- retry_on_this_condition=False
456
+ retry_on_this_condition=False,
432
457
  )
433
458
  )
434
459
  decorator = retry(errors=error_list)
435
- wrapper = KubernetesBatchSystem.DecoratorWrapper(api_object, decorator)
460
+ wrapper = KubernetesBatchSystem.DecoratorWrapper(
461
+ api_object, decorator
462
+ )
436
463
  return cast(Union[BatchV1Api, CoreV1Api, CustomObjectsApi], wrapper)
437
464
  except KeyError:
438
465
  raise RuntimeError(f"Unknown Kubernetes API type: {kind}")
@@ -443,7 +470,12 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
443
470
  """
444
471
 
445
472
  P = ParamSpec("P")
446
- def __init__(self, to_wrap: Any, decorator: Callable[[Callable[P, Any]], Callable[P, Any]]) -> None:
473
+
474
+ def __init__(
475
+ self,
476
+ to_wrap: Any,
477
+ decorator: Callable[[Callable[P, Any]], Callable[P, Any]],
478
+ ) -> None:
447
479
  """
448
480
  Make a wrapper around the given object.
449
481
  When methods on the object are called, they will be called through
@@ -467,16 +499,19 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
467
499
  return attr
468
500
 
469
501
  ItemT = TypeVar("ItemT")
502
+
470
503
  class _ItemsHaver(Protocol[ItemT]):
471
504
  """
472
505
  Anything that has a .items that is a list of something.
473
506
  """
507
+
474
508
  # KubernetesBatchSystem isn't defined until the class executes, so any
475
509
  # up-references to types from there that are in signatures (and not
476
510
  # method code) need to be quoted
477
- items: List["KubernetesBatchSystem.ItemT"]
511
+ items: list["KubernetesBatchSystem.ItemT"]
478
512
 
479
513
  CovItemT = TypeVar("CovItemT", covariant=True)
514
+
480
515
  class _WatchEvent(Protocol[CovItemT]):
481
516
  """
482
517
  An event from a Kubernetes watch stream.
@@ -488,23 +523,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
488
523
  # __getitem__ instead.
489
524
 
490
525
  @overload
491
- def __getitem__(self, name: Literal['type']) -> str:
492
- ...
526
+ def __getitem__(self, name: Literal["type"]) -> str: ...
493
527
 
494
528
  @overload
495
- def __getitem__(self, name: Literal['object']) -> "KubernetesBatchSystem.CovItemT":
496
- ...
529
+ def __getitem__(
530
+ self, name: Literal["object"]
531
+ ) -> "KubernetesBatchSystem.CovItemT": ...
497
532
 
498
533
  @overload
499
- def __getitem__(self, name: Literal['raw_object']) -> Dict[str, Any]:
500
- ...
534
+ def __getitem__(self, name: Literal["raw_object"]) -> dict[str, Any]: ...
501
535
 
502
- def __getitem__(self, name: Union[Literal['type'], Literal['object'], Literal['raw_object']]) -> Any:
503
- ...
536
+ def __getitem__(
537
+ self, name: Union[Literal["type"], Literal["object"], Literal["raw_object"]]
538
+ ) -> Any: ...
504
539
 
505
540
  P = ParamSpec("P")
506
541
  R = TypeVar("R")
507
- def _stream_until_error(self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs) -> Iterator[_WatchEvent[R]]:
542
+
543
+ def _stream_until_error(
544
+ self, method: Callable[P, _ItemsHaver[R]], *args: P.args, **kwargs: P.kwargs
545
+ ) -> Iterator[_WatchEvent[R]]:
508
546
  """
509
547
  Kubernetes kubernetes.watch.Watch().stream() streams can fail and raise
510
548
  errors. We don't want to have those errors fail the entire workflow, so
@@ -570,7 +608,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
570
608
 
571
609
  # Loop through all jobs inside the queue and see if any of them
572
610
  # could be launched.
573
- jobs: Queue[Tuple[int, JobDescription, V1PodSpec]] = Queue()
611
+ jobs: Queue[tuple[int, JobDescription, V1PodSpec]] = Queue()
574
612
  while True:
575
613
  try:
576
614
  job = self._jobs_queue.get_nowait()
@@ -582,7 +620,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
582
620
  logger.debug(f"Skipping killed job {job_id}")
583
621
  continue
584
622
 
585
- job_name = f'{self.job_prefix}{job_id}'
623
+ job_name = f"{self.job_prefix}{job_id}"
586
624
  result = self._launch_job(job_name, job_desc, spec)
587
625
  if result is False:
588
626
  # Not enough resources to launch this job.
@@ -603,7 +641,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
603
641
  logger.debug(f"Roughly {self._jobs_queue.qsize} jobs in the queue")
604
642
 
605
643
  def setUserScript(self, userScript: Resource) -> None:
606
- logger.info(f'Setting user script for deployment: {userScript}')
644
+ logger.info(f"Setting user script for deployment: {userScript}")
607
645
  self.user_script = userScript
608
646
 
609
647
  # setEnv is provided by BatchSystemSupport, updates self.environment
@@ -655,18 +693,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
655
693
  # Amazon just uses a label, while Google
656
694
  # <https://cloud.google.com/kubernetes-engine/docs/how-to/preemptible-vms>
657
695
  # uses a label and a taint.
658
- PREEMPTIBLE_SCHEMES = {'labels': [('eks.amazonaws.com/capacityType', ['SPOT']),
659
- ('cloud.google.com/gke-preemptible', ['true'])],
660
- 'taints': [('cloud.google.com/gke-preemptible', ['true'])]}
696
+ PREEMPTIBLE_SCHEMES = {
697
+ "labels": [
698
+ ("eks.amazonaws.com/capacityType", ["SPOT"]),
699
+ ("cloud.google.com/gke-preemptible", ["true"]),
700
+ ],
701
+ "taints": [("cloud.google.com/gke-preemptible", ["true"])],
702
+ }
661
703
 
662
704
  if preemptible:
663
705
  # We want to seek preemptible labels and tolerate preemptible taints.
664
- self.desired_labels += PREEMPTIBLE_SCHEMES['labels']
665
- self.tolerated_taints += PREEMPTIBLE_SCHEMES['taints']
706
+ self.desired_labels += PREEMPTIBLE_SCHEMES["labels"]
707
+ self.tolerated_taints += PREEMPTIBLE_SCHEMES["taints"]
666
708
  else:
667
709
  # We want to prohibit preemptible labels
668
- self.prohibited_labels += PREEMPTIBLE_SCHEMES['labels']
669
-
710
+ self.prohibited_labels += PREEMPTIBLE_SCHEMES["labels"]
670
711
 
671
712
  def apply(self, pod_spec: V1PodSpec) -> None:
672
713
  """
@@ -677,29 +718,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
677
718
  # Convert our collections to Kubernetes expressions.
678
719
 
679
720
  # REQUIRE that ALL of these requirements be satisfied
680
- required_selector_requirements: List[V1NodeSelectorRequirement] = []
721
+ required_selector_requirements: list[V1NodeSelectorRequirement] = []
681
722
  # PREFER that EACH of these terms be satisfied
682
- preferred_scheduling_terms: List[V1PreferredSchedulingTerm] = []
723
+ preferred_scheduling_terms: list[V1PreferredSchedulingTerm] = []
683
724
  # And this list of tolerations to apply
684
- tolerations: List[V1Toleration] = []
725
+ tolerations: list[V1Toleration] = []
685
726
 
686
727
  for label, values in self.required_labels:
687
728
  # Collect requirements for the required labels
688
- has_label = V1NodeSelectorRequirement(key=label,
689
- operator='In',
690
- values=values)
729
+ has_label = V1NodeSelectorRequirement(
730
+ key=label, operator="In", values=values
731
+ )
691
732
  required_selector_requirements.append(has_label)
692
733
  for label, values in self.desired_labels:
693
734
  # Collect preferences for the preferred labels
694
- has_label = V1NodeSelectorRequirement(key=label,
695
- operator='In',
696
- values=values)
697
- term = V1NodeSelectorTerm(
698
- match_expressions=[has_label]
735
+ has_label = V1NodeSelectorRequirement(
736
+ key=label, operator="In", values=values
699
737
  )
738
+ term = V1NodeSelectorTerm(match_expressions=[has_label])
700
739
  # Each becomes a separate preference, more is better.
701
- preference = V1PreferredSchedulingTerm(weight=1,
702
- preference=term)
740
+ preference = V1PreferredSchedulingTerm(weight=1, preference=term)
703
741
 
704
742
  preferred_scheduling_terms.append(preference)
705
743
  for label, values in self.prohibited_labels:
@@ -710,15 +748,14 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
710
748
  # <https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement>
711
749
  # So we create a NotIn for each label and AND them
712
750
  # all together.
713
- not_labeled = V1NodeSelectorRequirement(key=label,
714
- operator='NotIn',
715
- values=values)
751
+ not_labeled = V1NodeSelectorRequirement(
752
+ key=label, operator="NotIn", values=values
753
+ )
716
754
  required_selector_requirements.append(not_labeled)
717
755
  for taint, values in self.tolerated_taints:
718
756
  for value in values:
719
757
  # Each toleration can tolerate one value
720
- taint_ok = V1Toleration(key=taint,
721
- value=value)
758
+ taint_ok = V1Toleration(key=taint, value=value)
722
759
  tolerations.append(taint_ok)
723
760
 
724
761
  # Now combine everything
@@ -732,16 +769,22 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
732
769
  match_expressions=required_selector_requirements
733
770
  )
734
771
  # And a selector to hold the term
735
- requirements_selector = V1NodeSelector(node_selector_terms=[requirements_term])
772
+ requirements_selector = V1NodeSelector(
773
+ node_selector_terms=[requirements_term]
774
+ )
736
775
 
737
776
  # Make an affinity that prefers the preferences and requires the requirements
738
777
  node_affinity = V1NodeAffinity(
739
- preferred_during_scheduling_ignored_during_execution=preferred_scheduling_terms if preferred_scheduling_terms else None,
740
- required_during_scheduling_ignored_during_execution=requirements_selector
778
+ preferred_during_scheduling_ignored_during_execution=(
779
+ preferred_scheduling_terms
780
+ if preferred_scheduling_terms
781
+ else None
782
+ ),
783
+ required_during_scheduling_ignored_during_execution=requirements_selector,
741
784
  )
742
785
 
743
786
  # Apply the affinity
744
- pod_spec.affinity = V1Affinity(node_affinity = node_affinity)
787
+ pod_spec.affinity = V1Affinity(node_affinity=node_affinity)
745
788
 
746
789
  if tolerations:
747
790
  # Apply the tolerations
@@ -749,17 +792,22 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
749
792
 
750
793
  def _check_accelerator_request(self, requirer: Requirer) -> None:
751
794
  for accelerator in requirer.accelerators:
752
- if accelerator['kind'] != 'gpu' and 'model' not in accelerator:
795
+ if accelerator["kind"] != "gpu" and "model" not in accelerator:
753
796
  # We can only provide GPUs or things with a model right now
754
- raise InsufficientSystemResources(requirer, 'accelerators', details=[
755
- f'The accelerator {accelerator} could not be provided.',
756
- 'The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.'
757
- ])
797
+ raise InsufficientSystemResources(
798
+ requirer,
799
+ "accelerators",
800
+ details=[
801
+ f"The accelerator {accelerator} could not be provided.",
802
+ "The Toil Kubernetes batch system only knows how to request gpu accelerators or accelerators with a defined model.",
803
+ ],
804
+ )
758
805
 
759
806
  def _create_pod_spec(
760
- self,
761
- job_desc: JobDescription,
762
- job_environment: Optional[Dict[str, str]] = None
807
+ self,
808
+ command: str,
809
+ job_desc: JobDescription,
810
+ job_environment: Optional[dict[str, str]] = None,
763
811
  ) -> V1PodSpec:
764
812
  """
765
813
  Make the specification for a pod that can execute the given job.
@@ -770,7 +818,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
770
818
  environment.update(job_environment)
771
819
 
772
820
  # Make a command to run it in the executor
773
- command_list = pack_job(job_desc, self.user_script, environment=environment)
821
+ command_list = pack_job(command, self.user_script, environment=environment)
774
822
 
775
823
  # The Kubernetes API makes sense only in terms of the YAML format. Objects
776
824
  # represent sections of the YAML files. Except from our point of view, all
@@ -786,9 +834,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
786
834
  # OOMing. We also want to provision some extra space so that when
787
835
  # we test _isPodStuckOOM we never get True unless the job has
788
836
  # exceeded job_desc.memory.
789
- requirements_dict = {'cpu': job_desc.cores,
790
- 'memory': job_desc.memory + 1024 * 1024 * 512,
791
- 'ephemeral-storage': job_desc.disk + 1024 * 1024 * 512}
837
+ requirements_dict = {
838
+ "cpu": job_desc.cores,
839
+ "memory": job_desc.memory + 1024 * 1024 * 512,
840
+ "ephemeral-storage": job_desc.disk + 1024 * 1024 * 512,
841
+ }
792
842
 
793
843
  # Also start on the placement constraints
794
844
  placement = KubernetesBatchSystem.Placement()
@@ -798,19 +848,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
798
848
  # Add in requirements for accelerators (GPUs).
799
849
  # See https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
800
850
 
801
- if accelerator['kind'] == 'gpu':
851
+ if accelerator["kind"] == "gpu":
802
852
  # We can't schedule GPUs without a brand, because the
803
853
  # Kubernetes resources are <brand>.com/gpu. If no brand is
804
854
  # specified, default to nvidia, which is very popular.
805
- vendor = accelerator.get('brand', 'nvidia')
855
+ vendor = accelerator.get("brand", "nvidia")
806
856
  key = f'{vendor}.com/{accelerator["kind"]}'
807
857
  if key not in requirements_dict:
808
858
  requirements_dict[key] = 0
809
- requirements_dict[key] += accelerator['count']
859
+ requirements_dict[key] += accelerator["count"]
810
860
 
811
- if 'model' in accelerator:
861
+ if "model" in accelerator:
812
862
  # TODO: What if the cluster uses some other accelerator model labeling scheme?
813
- placement.required_labels.append(('accelerator', [accelerator['model']]))
863
+ placement.required_labels.append(
864
+ ("accelerator", [accelerator["model"]])
865
+ )
814
866
 
815
867
  # TODO: Support AMD's labeling scheme: https://github.com/RadeonOpenCompute/k8s-device-plugin/tree/master/cmd/k8s-node-labeller
816
868
  # That just has each trait of the accelerator as a separate label, but nothing that quite corresponds to a model.
@@ -822,14 +874,15 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
822
874
  # the UCSC Kubernetes admins want it that way. For GPUs, Kubernetes
823
875
  # requires them to be equal.
824
876
  limits_dict = requests_dict
825
- resources = V1ResourceRequirements(limits=limits_dict,
826
- requests=requests_dict)
877
+ resources = V1ResourceRequirements(limits=limits_dict, requests=requests_dict)
827
878
 
828
879
  # Collect volumes and mounts
829
880
  volumes = []
830
881
  mounts = []
831
882
 
832
- def mount_host_path(volume_name: str, host_path: str, mount_path: str, create: bool = False) -> None:
883
+ def mount_host_path(
884
+ volume_name: str, host_path: str, mount_path: str, create: bool = False
885
+ ) -> None:
833
886
  """
834
887
  Add a host path volume with the given name to mount the given path.
835
888
 
@@ -837,10 +890,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
837
890
  not exist. Otherwise, when the directory does not exist, the
838
891
  pod will wait for it to come into existence.
839
892
  """
840
- volume_type = 'DirectoryOrCreate' if create else 'Directory'
893
+ volume_type = "DirectoryOrCreate" if create else "Directory"
841
894
  volume_source = V1HostPathVolumeSource(path=host_path, type=volume_type)
842
- volume = V1Volume(name=volume_name,
843
- host_path=volume_source)
895
+ volume = V1Volume(name=volume_name, host_path=volume_source)
844
896
  volumes.append(volume)
845
897
  volume_mount = V1VolumeMount(mount_path=mount_path, name=volume_name)
846
898
  mounts.append(volume_mount)
@@ -848,43 +900,63 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
848
900
  if self.host_path is not None:
849
901
  # Provision Toil WorkDir from a HostPath volume, to share with other pods.
850
902
  # Create the directory if it doesn't exist already.
851
- mount_host_path('workdir', self.host_path, self.worker_work_dir, create=True)
903
+ mount_host_path(
904
+ "workdir", self.host_path, self.worker_work_dir, create=True
905
+ )
852
906
  # We also need to mount across /run/lock, where we will put
853
907
  # per-node coordination info.
854
908
  # Don't create this; it really should always exist.
855
- mount_host_path('coordination', '/run/lock', '/run/lock')
909
+ mount_host_path("coordination", "/run/lock", "/run/lock")
856
910
  else:
857
911
  # Provision Toil WorkDir as an ephemeral volume
858
- ephemeral_volume_name = 'workdir'
912
+ ephemeral_volume_name = "workdir"
859
913
  ephemeral_volume_source = V1EmptyDirVolumeSource()
860
- ephemeral_volume = V1Volume(name=ephemeral_volume_name,
861
- empty_dir=ephemeral_volume_source)
914
+ ephemeral_volume = V1Volume(
915
+ name=ephemeral_volume_name, empty_dir=ephemeral_volume_source
916
+ )
862
917
  volumes.append(ephemeral_volume)
863
- ephemeral_volume_mount = V1VolumeMount(mount_path=self.worker_work_dir, name=ephemeral_volume_name)
918
+ ephemeral_volume_mount = V1VolumeMount(
919
+ mount_path=self.worker_work_dir, name=ephemeral_volume_name
920
+ )
864
921
  mounts.append(ephemeral_volume_mount)
865
922
  # And don't share coordination directory
866
923
 
867
924
  if self.aws_secret_name is not None:
868
925
  # Also mount an AWS secret, if provided.
869
926
  # TODO: make this generic somehow
870
- secret_volume_name = 's3-credentials'
871
- secret_volume_source = V1SecretVolumeSource(secret_name=self.aws_secret_name)
872
- secret_volume = V1Volume(name=secret_volume_name,
873
- secret=secret_volume_source)
927
+ secret_volume_name = "s3-credentials"
928
+ secret_volume_source = V1SecretVolumeSource(
929
+ secret_name=self.aws_secret_name
930
+ )
931
+ secret_volume = V1Volume(
932
+ name=secret_volume_name, secret=secret_volume_source
933
+ )
874
934
  volumes.append(secret_volume)
875
- secret_volume_mount = V1VolumeMount(mount_path='/root/.aws', name=secret_volume_name)
935
+ secret_volume_mount = V1VolumeMount(
936
+ mount_path="/root/.aws", name=secret_volume_name
937
+ )
876
938
  mounts.append(secret_volume_mount)
877
939
 
878
940
  # Make a container definition
879
- container = V1Container(command=command_list,
880
- image=self.docker_image,
881
- name="runner-container",
882
- resources=resources,
883
- volume_mounts=mounts)
941
+ container = V1Container(
942
+ command=command_list,
943
+ image=self.docker_image,
944
+ name="runner-container",
945
+ resources=resources,
946
+ volume_mounts=mounts,
947
+ )
948
+
949
+ # In case security context rules are not allowed to be set, we only apply
950
+ # a security context at all if we need to turn on privileged mode.
951
+ if self.config.kubernetes_privileged:
952
+ container.security_context = V1SecurityContext(
953
+ privileged=self.config.kubernetes_privileged
954
+ )
955
+
884
956
  # Wrap the container in a spec
885
- pod_spec = V1PodSpec(containers=[container],
886
- volumes=volumes,
887
- restart_policy="Never")
957
+ pod_spec = V1PodSpec(
958
+ containers=[container], volumes=volumes, restart_policy="Never"
959
+ )
888
960
  # Tell the spec where to land
889
961
  placement.apply(pod_spec)
890
962
 
@@ -894,7 +966,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
894
966
 
895
967
  return pod_spec
896
968
 
897
- def _release_acquired_resources(self, resources: List[int], notify: bool = False) -> None:
969
+ def _release_acquired_resources(
970
+ self, resources: list[int], notify: bool = False
971
+ ) -> None:
898
972
  """
899
973
  Release all resources acquired for a job.
900
974
 
@@ -913,10 +987,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
913
987
  self._work_available.notify_all()
914
988
 
915
989
  def _launch_job(
916
- self,
917
- job_name: str,
918
- job_desc: JobDescription,
919
- pod_spec: V1PodSpec
990
+ self, job_name: str, job_desc: JobDescription, pod_spec: V1PodSpec
920
991
  ) -> bool:
921
992
  """
922
993
  Try to launch the given job to the Kubernetes cluster. Return False if
@@ -924,19 +995,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
924
995
  """
925
996
 
926
997
  # Limit the amount of resources requested at a time.
927
- resource_requests: List[int] = [1, int(job_desc.cores * 1000), job_desc.memory, job_desc.disk]
998
+ resource_requests: list[int] = [
999
+ 1,
1000
+ int(job_desc.cores * 1000),
1001
+ job_desc.memory,
1002
+ job_desc.disk,
1003
+ ]
928
1004
 
929
1005
  acquired = []
930
1006
  for source, request in zip(self.resource_sources, resource_requests):
931
1007
  # For each kind of resource we want, go get it
932
- assert ((isinstance(source, ResourcePool) and isinstance(request, int)))
1008
+ assert isinstance(source, ResourcePool) and isinstance(request, int)
933
1009
  if source.acquireNow(request):
934
1010
  acquired.append(request)
935
1011
  else:
936
1012
  # We can't get everything
937
- self._release_acquired_resources(acquired,
1013
+ self._release_acquired_resources(
1014
+ acquired,
938
1015
  # Put it back quietly.
939
- notify=False)
1016
+ notify=False,
1017
+ )
940
1018
  return False
941
1019
 
942
1020
  self._acquired_resources[job_name] = acquired
@@ -945,9 +1023,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
945
1023
 
946
1024
  # Make metadata to label the job/pod with info.
947
1025
  # Don't let the cluster autoscaler evict any Toil jobs.
948
- metadata = V1ObjectMeta(name=job_name,
949
- labels={"toil_run": self.run_id},
950
- annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"})
1026
+ metadata = V1ObjectMeta(
1027
+ name=job_name,
1028
+ labels={"toil_run": self.run_id},
1029
+ annotations={"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"},
1030
+ )
951
1031
 
952
1032
  # Wrap the spec in a template
953
1033
  template = V1PodTemplateSpec(spec=pod_spec, metadata=metadata)
@@ -955,18 +1035,21 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
955
1035
  # Make another spec for the job, asking to run the template with no
956
1036
  # backoff/retry. Specify our own TTL to avoid catching the notice
957
1037
  # of over-zealous abandoned job cleanup scripts.
958
- job_spec = V1JobSpec(template=template,
959
- backoff_limit=0,
960
- ttl_seconds_after_finished=self.finished_job_ttl)
1038
+ job_spec = V1JobSpec(
1039
+ template=template,
1040
+ backoff_limit=0,
1041
+ ttl_seconds_after_finished=self.finished_job_ttl,
1042
+ )
961
1043
 
962
1044
  # And make the actual job
963
- job = V1Job(spec=job_spec,
964
- metadata=metadata,
965
- api_version="batch/v1",
966
- kind="Job")
1045
+ job = V1Job(
1046
+ spec=job_spec, metadata=metadata, api_version="batch/v1", kind="Job"
1047
+ )
967
1048
 
968
1049
  # Launch the job
969
- launched = self._api('batch', errors=[]).create_namespaced_job(self.namespace, job)
1050
+ launched = self._api("batch", errors=[]).create_namespaced_job(
1051
+ self.namespace, job
1052
+ )
970
1053
 
971
1054
  logger.debug(f"Launched job: {job_name}")
972
1055
 
@@ -974,10 +1057,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
974
1057
 
975
1058
  def _delete_job(
976
1059
  self,
977
- job_name: str, *,
1060
+ job_name: str,
1061
+ *,
978
1062
  propagation_policy: Literal["Foreground", "Background"] = "Foreground",
979
1063
  gone_ok: bool = False,
980
- resource_notify: bool = True
1064
+ resource_notify: bool = True,
981
1065
  ) -> None:
982
1066
  """
983
1067
  Given the name of a kubernetes job, delete the job and release all
@@ -990,11 +1074,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
990
1074
  the self._work_available condition.
991
1075
  """
992
1076
  try:
993
- logger.debug(f'Deleting Kubernetes job {job_name}')
994
- self._api('batch', errors=[404] if gone_ok else []).delete_namespaced_job(
995
- job_name,
996
- self.namespace,
997
- propagation_policy=propagation_policy
1077
+ logger.debug(f"Deleting Kubernetes job {job_name}")
1078
+ self._api("batch", errors=[404] if gone_ok else []).delete_namespaced_job(
1079
+ job_name, self.namespace, propagation_policy=propagation_policy
998
1080
  )
999
1081
  finally:
1000
1082
  # We should always release the acquired resources.
@@ -1005,9 +1087,14 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1005
1087
  self._release_acquired_resources(resources, notify=resource_notify)
1006
1088
  del self._acquired_resources[job_name]
1007
1089
 
1008
- def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
1090
+ def issueBatchJob(
1091
+ self,
1092
+ command: str,
1093
+ job_desc: JobDescription,
1094
+ job_environment: Optional[dict[str, str]] = None,
1095
+ ) -> int:
1009
1096
  # Try the job as local
1010
- localID = self.handleLocalJob(job_desc)
1097
+ localID = self.handleLocalJob(command, job_desc)
1011
1098
  if localID is not None:
1012
1099
  # It is a local job
1013
1100
  return localID
@@ -1018,7 +1105,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1018
1105
  self.check_resource_request(job_desc)
1019
1106
 
1020
1107
  # Make a pod that describes running the job
1021
- pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
1108
+ pod_spec = self._create_pod_spec(
1109
+ command, job_desc, job_environment=job_environment
1110
+ )
1022
1111
 
1023
1112
  # Make a batch system scope job ID
1024
1113
  job_id = self.getNextJobID()
@@ -1046,6 +1135,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1046
1135
  kwargs, so we can't just set unused ones to None. But we also don't
1047
1136
  want to duplicate code for every combination of possible present keys.
1048
1137
  """
1138
+
1049
1139
  _continue: NotRequired[str]
1050
1140
  label_selector: NotRequired[str]
1051
1141
  field_selector: NotRequired[str]
@@ -1075,30 +1165,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1075
1165
  token = None
1076
1166
 
1077
1167
  while True:
1078
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"}
1168
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1169
+ "label_selector": f"toil_run={self.run_id}"
1170
+ }
1079
1171
 
1080
1172
  if onlySucceeded:
1081
- kwargs['field_selector'] = "status.successful==1"
1173
+ kwargs["field_selector"] = "status.successful==1"
1082
1174
 
1083
1175
  if token is not None:
1084
- kwargs['_continue'] = token
1176
+ kwargs["_continue"] = token
1085
1177
 
1086
- results = self._api('batch', errors=[]).list_namespaced_job(
1087
- self.namespace,
1088
- **kwargs
1178
+ results = self._api("batch", errors=[]).list_namespaced_job(
1179
+ self.namespace, **kwargs
1089
1180
  )
1090
-
1181
+
1091
1182
  # These jobs belong to us
1092
1183
  yield from (j for j in results.items if not self._is_deleted(j))
1093
1184
 
1094
1185
  # Remember the continuation token, if any
1095
- token = getattr(results.metadata, 'continue', None)
1186
+ token = getattr(results.metadata, "continue", None)
1096
1187
 
1097
1188
  if token is None:
1098
1189
  # There isn't one. We got everything.
1099
1190
  break
1100
1191
 
1101
-
1102
1192
  def _ourPodObject(self) -> Iterator[V1Pod]:
1103
1193
  """
1104
1194
  Yield Kubernetes V1Pod objects that we are responsible for that the
@@ -1108,25 +1198,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1108
1198
  token = None
1109
1199
 
1110
1200
  while True:
1111
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f"toil_run={self.run_id}"}
1201
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1202
+ "label_selector": f"toil_run={self.run_id}"
1203
+ }
1112
1204
 
1113
1205
  if token is not None:
1114
- kwargs['_continue'] = token
1206
+ kwargs["_continue"] = token
1115
1207
 
1116
- results = self._api('core', errors=[]).list_namespaced_pod(
1117
- self.namespace,
1118
- **kwargs
1208
+ results = self._api("core", errors=[]).list_namespaced_pod(
1209
+ self.namespace, **kwargs
1119
1210
  )
1120
1211
 
1121
1212
  yield from (j for j in results.items if not self._is_deleted(j))
1122
1213
  # Remember the continuation token, if any
1123
- token = getattr(results.metadata, 'continue', None)
1214
+ token = getattr(results.metadata, "continue", None)
1124
1215
 
1125
1216
  if token is None:
1126
1217
  # There isn't one. We got everything.
1127
1218
  break
1128
1219
 
1129
-
1130
1220
  def _getPodForJob(self, jobObject: V1Job) -> Optional[V1Pod]:
1131
1221
  """
1132
1222
  Get the pod that belongs to the given job, or None if the job's pod is
@@ -1140,22 +1230,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1140
1230
  """
1141
1231
 
1142
1232
  # Make sure the job has the fields we need
1143
- assert(jobObject.metadata is not None)
1233
+ assert jobObject.metadata is not None
1144
1234
 
1145
1235
  token = None
1146
1236
 
1147
1237
  while True:
1148
- kwargs: KubernetesBatchSystem._ArgsDict = {'label_selector': f'job-name={jobObject.metadata.name}'}
1238
+ kwargs: KubernetesBatchSystem._ArgsDict = {
1239
+ "label_selector": f"job-name={jobObject.metadata.name}"
1240
+ }
1149
1241
  if token is not None:
1150
- kwargs['_continue'] = token
1151
- results = self._api('core', errors=[]).list_namespaced_pod(self.namespace, **kwargs)
1242
+ kwargs["_continue"] = token
1243
+ results = self._api("core", errors=[]).list_namespaced_pod(
1244
+ self.namespace, **kwargs
1245
+ )
1152
1246
 
1153
1247
  for pod in results.items:
1154
1248
  # Return the first pod we find
1155
1249
  return pod
1156
1250
 
1157
1251
  # Remember the continuation token, if any
1158
- token = getattr(results.metadata, 'continue', None)
1252
+ token = getattr(results.metadata, "continue", None)
1159
1253
 
1160
1254
  if token is None:
1161
1255
  # There isn't one. We got everything.
@@ -1179,12 +1273,13 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1179
1273
  assert podObject.metadata is not None
1180
1274
  assert podObject.metadata.name is not None
1181
1275
 
1182
- return self._api('core', errors=[]).read_namespaced_pod_log(
1183
- podObject.metadata.name,
1184
- namespace=self.namespace
1276
+ return self._api("core", errors=[]).read_namespaced_pod_log(
1277
+ podObject.metadata.name, namespace=self.namespace
1185
1278
  )
1186
1279
 
1187
- def _isPodStuckOOM(self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2) -> bool:
1280
+ def _isPodStuckOOM(
1281
+ self, podObject: V1Pod, minFreeBytes: float = 1024 * 1024 * 2
1282
+ ) -> bool:
1188
1283
  """
1189
1284
  Poll the current memory usage for the pod from the cluster.
1190
1285
 
@@ -1214,14 +1309,18 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1214
1309
  assert podObject.metadata.name is not None
1215
1310
 
1216
1311
  # Compose a query to get just the pod we care about
1217
- query = f'metadata.name={podObject.metadata.name}'
1312
+ query = f"metadata.name={podObject.metadata.name}"
1218
1313
 
1219
1314
  # Look for it, but manage our own exceptions
1220
1315
  try:
1221
1316
  # TODO: When the Kubernetes Python API actually wraps the metrics API, switch to that
1222
- response = self._api('customObjects').list_namespaced_custom_object('metrics.k8s.io', 'v1beta1',
1223
- self.namespace, 'pods',
1224
- field_selector=query)
1317
+ response = self._api("customObjects").list_namespaced_custom_object(
1318
+ "metrics.k8s.io",
1319
+ "v1beta1",
1320
+ self.namespace,
1321
+ "pods",
1322
+ field_selector=query,
1323
+ )
1225
1324
  except Exception as e:
1226
1325
  # We couldn't talk to the metrics service on this attempt. We don't
1227
1326
  # retry, but we also don't want to just ignore all errors. We only
@@ -1237,7 +1336,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1237
1336
  raise
1238
1337
 
1239
1338
  # Pull out the items
1240
- items = response.get('items', [])
1339
+ items = response.get("items", [])
1241
1340
 
1242
1341
  if len(items) == 0:
1243
1342
  # If there's no statistics we can't say we're stuck OOM
@@ -1246,7 +1345,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1246
1345
  # Assume the first result is the right one, because of the selector.
1247
1346
  # That means we don't need to bother with _continue.
1248
1347
  # Assume it has exactly one pod, because we made it.
1249
- containers = items[0].get('containers', [{}])
1348
+ containers = items[0].get("containers", [{}])
1250
1349
 
1251
1350
  if len(containers) == 0:
1252
1351
  # If there are no containers (because none have started yet?), we can't say we're stuck OOM
@@ -1255,26 +1354,37 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1255
1354
  # Otherwise, assume it just has one container.
1256
1355
  # Grab the memory usage string, like 123Ki, and convert to bytes.
1257
1356
  # If anything is missing, assume 0 bytes used.
1258
- bytesUsed = human2bytes(containers[0].get('usage', {}).get('memory', '0'))
1357
+ bytesUsed = human2bytes(containers[0].get("usage", {}).get("memory", "0"))
1259
1358
 
1260
1359
  # Also get the limit out of the pod object's spec
1261
1360
  assert podObject.spec is not None
1262
1361
  assert len(podObject.spec.containers) > 0
1263
1362
  assert podObject.spec.containers[0].resources is not None
1264
1363
  assert podObject.spec.containers[0].resources.limits is not None
1265
- assert 'memory' in podObject.spec.containers[0].resources.limits
1266
- bytesAllowed = human2bytes(podObject.spec.containers[0].resources.limits['memory'])
1364
+ assert "memory" in podObject.spec.containers[0].resources.limits
1365
+ bytesAllowed = human2bytes(
1366
+ podObject.spec.containers[0].resources.limits["memory"]
1367
+ )
1267
1368
 
1268
1369
  if bytesAllowed - bytesUsed < minFreeBytes:
1269
1370
  # This is too much!
1270
- logger.warning('Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.',
1271
- podObject.metadata.name, bytesUsed, bytesAllowed)
1371
+ logger.warning(
1372
+ "Pod %s has used %d of %d bytes of memory; reporting as stuck due to OOM.",
1373
+ podObject.metadata.name,
1374
+ bytesUsed,
1375
+ bytesAllowed,
1376
+ )
1272
1377
 
1273
1378
  return True
1274
1379
  else:
1275
1380
  return False
1276
1381
 
1277
- def _isPodStuckWaiting(self, pod_object: V1Pod, reason: Optional[str] = None, timeout: Optional[float] = None) -> bool:
1382
+ def _isPodStuckWaiting(
1383
+ self,
1384
+ pod_object: V1Pod,
1385
+ reason: Optional[str] = None,
1386
+ timeout: Optional[float] = None,
1387
+ ) -> bool:
1278
1388
  """
1279
1389
  Return True if the pod looks to be in a waiting state, and false otherwise.
1280
1390
 
@@ -1298,7 +1408,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1298
1408
  # Can't be stuck
1299
1409
  return False
1300
1410
 
1301
- waiting_info = getattr(getattr(container_statuses[0], 'state', None), 'waiting', None)
1411
+ waiting_info = getattr(
1412
+ getattr(container_statuses[0], "state", None), "waiting", None
1413
+ )
1302
1414
  if waiting_info is None:
1303
1415
  # Pod is not waiting
1304
1416
  return False
@@ -1307,15 +1419,17 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1307
1419
  # Pod fails reason filter
1308
1420
  return False
1309
1421
 
1310
- start_time = getattr(pod_object.status, 'start_time', None)
1311
- if timeout is not None and (start_time is None or (utc_now() - start_time).total_seconds() < timeout):
1422
+ start_time = getattr(pod_object.status, "start_time", None)
1423
+ if timeout is not None and (
1424
+ start_time is None or (utc_now() - start_time).total_seconds() < timeout
1425
+ ):
1312
1426
  # It hasn't been waiting too long, or we care but don't know how
1313
1427
  # long it has been waiting
1314
1428
  return False
1315
1429
 
1316
1430
  return True
1317
1431
 
1318
- def _is_deleted(self, kube_thing: Union['V1Job', 'V1Pod']) -> bool:
1432
+ def _is_deleted(self, kube_thing: Union["V1Job", "V1Pod"]) -> bool:
1319
1433
  """
1320
1434
  Determine if a job or pod is in the process of being deleted, and
1321
1435
  shouldn't count anymore.
@@ -1324,7 +1438,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1324
1438
  # Kubernetes "Terminating" is the same as having the deletion_timestamp
1325
1439
  # set in the metadata of the object.
1326
1440
 
1327
- deletion_timestamp: Optional[datetime.datetime] = getattr(getattr(kube_thing, 'metadata', None), 'deletion_timestamp', None)
1441
+ deletion_timestamp: Optional[datetime.datetime] = getattr(
1442
+ getattr(kube_thing, "metadata", None), "deletion_timestamp", None
1443
+ )
1328
1444
  # If the deletion timestamp is set to anything, it is in the process of
1329
1445
  # being deleted. We will treat that as as good as gone.
1330
1446
  return deletion_timestamp is not None
@@ -1341,8 +1457,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1341
1457
 
1342
1458
  assert jobObject.metadata is not None
1343
1459
  assert jobObject.metadata.name is not None
1344
- return int(jobObject.metadata.name[len(self.job_prefix):])
1345
-
1460
+ return int(jobObject.metadata.name[len(self.job_prefix) :])
1346
1461
 
1347
1462
  def getUpdatedBatchJob(self, maxWait: float) -> Optional[UpdatedBatchJobInfo]:
1348
1463
 
@@ -1358,22 +1473,27 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1358
1473
  # Otherwise we need to maybe wait.
1359
1474
  if self.enable_watching and maxWait >= 1:
1360
1475
  # We can try a watch. Watches can only work in whole seconds.
1361
- for event in self._stream_until_error(self._api('batch').list_namespaced_job,
1362
- self.namespace,
1363
- label_selector=f"toil_run={self.run_id}",
1364
- timeout_seconds=math.floor(maxWait)):
1476
+ for event in self._stream_until_error(
1477
+ self._api("batch").list_namespaced_job,
1478
+ self.namespace,
1479
+ label_selector=f"toil_run={self.run_id}",
1480
+ timeout_seconds=math.floor(maxWait),
1481
+ ):
1365
1482
  # Grab the metadata data, ID, the list of conditions of the current job, and the total pods
1366
- jobObject = event['object']
1367
-
1483
+ jobObject = event["object"]
1484
+
1368
1485
  if self._is_deleted(jobObject):
1369
1486
  # Job is already deleted, so ignore it.
1370
- logger.warning('Kubernetes job %s is deleted; ignore its update', getattr(getattr(jobObject, 'metadata', None), 'name', None))
1487
+ logger.warning(
1488
+ "Kubernetes job %s is deleted; ignore its update",
1489
+ getattr(getattr(jobObject, "metadata", None), "name", None),
1490
+ )
1371
1491
  continue
1372
-
1492
+
1373
1493
  assert jobObject.metadata is not None
1374
1494
  assert jobObject.metadata.name is not None
1375
-
1376
- jobID = int(jobObject.metadata.name[len(self.job_prefix):])
1495
+
1496
+ jobID = int(jobObject.metadata.name[len(self.job_prefix) :])
1377
1497
  if jobObject.status is None:
1378
1498
  # Can't tell what is up with this job.
1379
1499
  continue
@@ -1383,7 +1503,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1383
1503
  failed_pods = jobObject.status.failed or 0
1384
1504
  # Fetch out the condition object that has info about how the job is going.
1385
1505
  condition: Optional[V1JobCondition] = None
1386
- if jobObject.status.conditions is not None and len(jobObject.status.conditions) > 0:
1506
+ if (
1507
+ jobObject.status.conditions is not None
1508
+ and len(jobObject.status.conditions) > 0
1509
+ ):
1387
1510
  condition = jobObject.status.conditions[0]
1388
1511
 
1389
1512
  totalPods = active_pods + succeeded_pods + failed_pods
@@ -1393,14 +1516,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1393
1516
 
1394
1517
  # Check if there are any active pods
1395
1518
  if active_pods > 0:
1396
- logger.info("%s has %d pods running" % jobObject.metadata.name, active_pods)
1519
+ logger.info(
1520
+ "%s has %d pods running" % jobObject.metadata.name, active_pods
1521
+ )
1397
1522
  continue
1398
1523
  elif succeeded_pods > 0 or failed_pods > 0:
1399
1524
  # No more active pods in the current job ; must be finished
1400
- logger.info("%s RESULTS -> Succeeded: %d Failed:%d Active:%d" % jobObject.metadata.name,
1401
- succeeded_pods, failed_pods, active_pods)
1525
+ logger.info(
1526
+ "%s RESULTS -> Succeeded: %d Failed:%d Active:%d"
1527
+ % jobObject.metadata.name,
1528
+ succeeded_pods,
1529
+ failed_pods,
1530
+ active_pods,
1531
+ )
1402
1532
  # Log out success/failure given a reason
1403
- logger.info("%s REASON: %s", getattr(condition, 'type', None), getattr(condition, 'reason', None))
1533
+ logger.info(
1534
+ "%s REASON: %s",
1535
+ getattr(condition, "type", None),
1536
+ getattr(condition, "reason", None),
1537
+ )
1404
1538
 
1405
1539
  # Log out reason of failure and pod exit code
1406
1540
  if failed_pods > 0:
@@ -1410,22 +1544,40 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1410
1544
  if condition is not None:
1411
1545
  logger.warning("Failed Job Message: %s", condition.message)
1412
1546
  pod = self._getPodForJob(jobObject)
1413
- statuses: List[V1ContainerStatus] = getattr(getattr(pod, 'status', None), 'container_statuses', [])
1414
- if len(statuses) > 0 and statuses[0].state is not None and statuses[0].state.terminated is not None:
1547
+ statuses: list[V1ContainerStatus] = getattr(
1548
+ getattr(pod, "status", None), "container_statuses", []
1549
+ )
1550
+ if (
1551
+ len(statuses) > 0
1552
+ and statuses[0].state is not None
1553
+ and statuses[0].state.terminated is not None
1554
+ ):
1415
1555
  exitCode = statuses[0].state.terminated.exit_code
1416
1556
 
1417
1557
  raw_runtime = 0.0
1418
- if jobObject.status.completion_time is not None and jobObject.status.start_time is not None:
1419
- raw_runtime = (jobObject.status.completion_time - jobObject.status.start_time).total_seconds()
1558
+ if (
1559
+ jobObject.status.completion_time is not None
1560
+ and jobObject.status.start_time is not None
1561
+ ):
1562
+ raw_runtime = (
1563
+ jobObject.status.completion_time
1564
+ - jobObject.status.start_time
1565
+ ).total_seconds()
1420
1566
  runtime = slow_down(raw_runtime)
1421
- result = UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=exitReason)
1567
+ result = UpdatedBatchJobInfo(
1568
+ jobID=jobID,
1569
+ exitStatus=exitCode,
1570
+ wallTime=runtime,
1571
+ exitReason=exitReason,
1572
+ )
1422
1573
 
1423
- if (exitReason == BatchJobExitReason.FAILED) or (succeeded_pods + failed_pods == totalPods):
1574
+ if (exitReason == BatchJobExitReason.FAILED) or (
1575
+ succeeded_pods + failed_pods == totalPods
1576
+ ):
1424
1577
  # Cleanup if job is all finished or there was a pod that failed
1425
1578
  # TODO: use delete_job() to release acquired resources
1426
1579
  self._delete_job(
1427
- jobObject.metadata.name,
1428
- propagation_policy='Foreground'
1580
+ jobObject.metadata.name, propagation_policy="Foreground"
1429
1581
  )
1430
1582
  # Make sure the job is deleted so we won't see it again.
1431
1583
  self._waitForJobDeath(jobObject.metadata.name)
@@ -1433,12 +1585,19 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1433
1585
  continue
1434
1586
  else:
1435
1587
  # Job is not running/updating ; no active, successful, or failed pods yet
1436
- logger.debug("Job {} -> {}".format(jobObject.metadata.name, getattr(condition, 'reason', None)))
1588
+ logger.debug(
1589
+ "Job {} -> {}".format(
1590
+ jobObject.metadata.name, getattr(condition, "reason", None)
1591
+ )
1592
+ )
1437
1593
  # Pod could be pending; don't say it's lost.
1438
1594
  continue
1439
1595
  else:
1440
1596
  # Try polling instead
1441
- while result is None and (datetime.datetime.now() - entry).total_seconds() < maxWait:
1597
+ while (
1598
+ result is None
1599
+ and (datetime.datetime.now() - entry).total_seconds() < maxWait
1600
+ ):
1442
1601
  # We still have nothing and we haven't hit the timeout.
1443
1602
 
1444
1603
  # Poll
@@ -1446,12 +1605,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1446
1605
 
1447
1606
  if result is None:
1448
1607
  # Still nothing. Wait a second, or some fraction of our max wait time.
1449
- time.sleep(min(maxWait/2, 1.0))
1608
+ time.sleep(min(maxWait / 2, 1.0))
1450
1609
 
1451
1610
  # When we get here, either we found something or we ran out of time
1452
1611
  return result
1453
1612
 
1454
-
1455
1613
  def _getUpdatedBatchJobImmediately(self) -> Optional[UpdatedBatchJobInfo]:
1456
1614
  """
1457
1615
  Return None if no updated (completed or failed) batch job is currently
@@ -1475,25 +1633,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1475
1633
  # Find a job that is done, failed, or stuck
1476
1634
  jobObject = None
1477
1635
  # Put 'done', 'failed', or 'stuck' here
1478
- chosenFor = ''
1636
+ chosenFor = ""
1479
1637
 
1480
1638
  for j in self._ourJobObject(onlySucceeded=True):
1481
1639
  # Look for succeeded jobs because that's the only filter Kubernetes has
1482
1640
  jobObject = j
1483
- chosenFor = 'done'
1641
+ chosenFor = "done"
1484
1642
 
1485
1643
  if jobObject is None:
1486
1644
  for j in self._ourJobObject():
1487
1645
  # If there aren't any succeeded jobs, scan all jobs
1488
1646
  # See how many times each failed
1489
- failCount = getattr(j.status, 'failed', 0)
1647
+ failCount = getattr(j.status, "failed", 0)
1490
1648
  if failCount is None:
1491
1649
  # Make sure it is an int
1492
1650
  failCount = 0
1493
1651
  if failCount > 0:
1494
1652
  # Take the first failed one you find
1495
1653
  jobObject = j
1496
- chosenFor = 'failed'
1654
+ chosenFor = "failed"
1497
1655
  break
1498
1656
 
1499
1657
  if jobObject is None:
@@ -1506,23 +1664,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1506
1664
  continue
1507
1665
 
1508
1666
  # Containers can get stuck in Waiting with reason ImagePullBackOff
1509
- if self._isPodStuckWaiting(pod, reason='ImagePullBackoff'):
1667
+ if self._isPodStuckWaiting(pod, reason="ImagePullBackoff"):
1510
1668
  # Assume it will never finish, even if the registry comes back or whatever.
1511
1669
  # We can get into this state when we send in a non-existent image.
1512
1670
  # See https://github.com/kubernetes/kubernetes/issues/58384
1513
1671
  jobObject = j
1514
- chosenFor = 'stuck'
1515
- logger.warning('Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?'
1516
- ' Check TOIL_APPLIANCE_SELF.')
1672
+ chosenFor = "stuck"
1673
+ logger.warning(
1674
+ "Failing stuck job (ImagePullBackoff); did you try to run a non-existent Docker image?"
1675
+ " Check TOIL_APPLIANCE_SELF."
1676
+ )
1517
1677
  break
1518
1678
 
1519
1679
  # Containers can also get stuck in Waiting with reason
1520
1680
  # ContainerCreating, if for example their mounts don't work.
1521
- if self._isPodStuckWaiting(pod, reason='ContainerCreating', timeout=self.pod_timeout):
1681
+ if self._isPodStuckWaiting(
1682
+ pod, reason="ContainerCreating", timeout=self.pod_timeout
1683
+ ):
1522
1684
  # Assume that it will never finish.
1523
1685
  jobObject = j
1524
- chosenFor = 'stuck'
1525
- logger.warning('Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?', self.pod_timeout)
1686
+ chosenFor = "stuck"
1687
+ logger.warning(
1688
+ "Failing stuck job (ContainerCreating longer than %s seconds); did you try to mount something impossible?",
1689
+ self.pod_timeout,
1690
+ )
1526
1691
  break
1527
1692
 
1528
1693
  # Pods can also get stuck nearly but not quite out of memory,
@@ -1532,7 +1697,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1532
1697
  # We found a job that probably should be OOM! Report it as stuck.
1533
1698
  # Polling function takes care of the logging.
1534
1699
  jobObject = j
1535
- chosenFor = 'stuck'
1700
+ chosenFor = "stuck"
1536
1701
  break
1537
1702
 
1538
1703
  if jobObject is None:
@@ -1540,25 +1705,30 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1540
1705
  return None
1541
1706
  else:
1542
1707
  # We actually have something
1543
- logger.debug('Identified stopped Kubernetes job %s as %s', getattr(jobObject.metadata, 'name', None), chosenFor)
1544
-
1708
+ logger.debug(
1709
+ "Identified stopped Kubernetes job %s as %s",
1710
+ getattr(jobObject.metadata, "name", None),
1711
+ chosenFor,
1712
+ )
1545
1713
 
1546
1714
  # Otherwise we got something.
1547
1715
 
1548
1716
  # Work out what the job's ID was (whatever came after our name prefix)
1549
1717
  assert jobObject.metadata is not None
1550
1718
  assert jobObject.metadata.name is not None
1551
- jobID = int(jobObject.metadata.name[len(self.job_prefix):])
1719
+ jobID = int(jobObject.metadata.name[len(self.job_prefix) :])
1552
1720
 
1553
1721
  # Grab the pod
1554
1722
  pod = self._getPodForJob(jobObject)
1555
1723
 
1556
1724
  if pod is not None:
1557
- if chosenFor == 'done' or chosenFor == 'failed':
1725
+ if chosenFor == "done" or chosenFor == "failed":
1558
1726
  # The job actually finished or failed
1559
1727
 
1560
1728
  # Get the statuses of the pod's containers
1561
- containerStatuses = getattr(getattr(pod, 'status', None), 'container_statuses', None)
1729
+ containerStatuses = getattr(
1730
+ getattr(pod, "status", None), "container_statuses", None
1731
+ )
1562
1732
 
1563
1733
  # Get when the pod started (reached the Kubelet) as a datetime
1564
1734
  start_time = self._get_start_time(pod, jobObject)
@@ -1568,18 +1738,24 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1568
1738
  # This happens when a pod is "Scheduled". But how could a
1569
1739
  # 'done' or 'failed' pod be merely "Scheduled"?
1570
1740
  # Complain so we can find out.
1571
- logger.warning('Exit code and runtime unavailable; pod has no container statuses')
1572
- logger.warning('Pod: %s', str(pod))
1741
+ logger.warning(
1742
+ "Exit code and runtime unavailable; pod has no container statuses"
1743
+ )
1744
+ logger.warning("Pod: %s", str(pod))
1573
1745
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1574
1746
  # Say it stopped now and started when it was scheduled/submitted.
1575
1747
  # We still need a strictly positive runtime.
1576
1748
  runtime = slow_down((utc_now() - start_time).total_seconds())
1577
1749
  else:
1578
1750
  # Get the termination info from the pod's main (only) container
1579
- terminatedInfo = getattr(getattr(containerStatuses[0], 'state', None), 'terminated', None)
1751
+ terminatedInfo = getattr(
1752
+ getattr(containerStatuses[0], "state", None), "terminated", None
1753
+ )
1580
1754
  if terminatedInfo is None:
1581
- logger.warning('Exit code and runtime unavailable; pod stopped without container terminating')
1582
- logger.warning('Pod: %s', str(pod))
1755
+ logger.warning(
1756
+ "Exit code and runtime unavailable; pod stopped without container terminating"
1757
+ )
1758
+ logger.warning("Pod: %s", str(pod))
1583
1759
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1584
1760
  # Say it stopped now and started when it was scheduled/submitted.
1585
1761
  # We still need a strictly positive runtime.
@@ -1594,34 +1770,42 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1594
1770
  # created. And we need to look at the pod's end time
1595
1771
  # because the job only gets a completion time if
1596
1772
  # successful.
1597
- runtime = slow_down((terminatedInfo.finished_at -
1598
- start_time).total_seconds())
1773
+ runtime = slow_down(
1774
+ (terminatedInfo.finished_at - start_time).total_seconds()
1775
+ )
1599
1776
 
1600
- if chosenFor == 'failed':
1777
+ if chosenFor == "failed":
1601
1778
  # Warn the user with the failed pod's log
1602
1779
  # TODO: cut this down somehow?
1603
- logger.warning('Log from failed pod: %s', self._getLogForPod(pod))
1780
+ logger.warning(
1781
+ "Log from failed pod: %s", self._getLogForPod(pod)
1782
+ )
1604
1783
 
1605
1784
  else:
1606
1785
  # The job has gotten stuck
1607
1786
 
1608
- assert chosenFor == 'stuck'
1787
+ assert chosenFor == "stuck"
1609
1788
 
1610
1789
  # Synthesize an exit code
1611
1790
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1612
1791
  # Say it ran from when the job was submitted to when the pod got stuck
1613
- runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds())
1792
+ runtime = slow_down(
1793
+ (utc_now() - self._get_start_time(job=jobObject)).total_seconds()
1794
+ )
1614
1795
  else:
1615
1796
  # The pod went away from under the job.
1616
- logging.warning('Exit code and runtime unavailable; pod vanished')
1797
+ logging.warning("Exit code and runtime unavailable; pod vanished")
1617
1798
  exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
1618
1799
  # Say it ran from when the job was submitted to when the pod vanished
1619
- runtime = slow_down((utc_now() - self._get_start_time(job=jobObject)).total_seconds())
1620
-
1800
+ runtime = slow_down(
1801
+ (utc_now() - self._get_start_time(job=jobObject)).total_seconds()
1802
+ )
1621
1803
 
1622
1804
  try:
1623
1805
  # Delete the job and all dependents (pods), hoping to get a 404 if it's magically gone
1624
- self._delete_job(jobObject.metadata.name, propagation_policy='Foreground', gone_ok=True)
1806
+ self._delete_job(
1807
+ jobObject.metadata.name, propagation_policy="Foreground", gone_ok=True
1808
+ )
1625
1809
 
1626
1810
  # That just kicks off the deletion process. Foreground doesn't
1627
1811
  # actually block. See
@@ -1637,7 +1821,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1637
1821
  # Otherwise everything is fine and the job is gone.
1638
1822
 
1639
1823
  # Return the one finished job we found
1640
- return UpdatedBatchJobInfo(jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None)
1824
+ return UpdatedBatchJobInfo(
1825
+ jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None
1826
+ )
1641
1827
 
1642
1828
  def _waitForJobDeath(self, jobName: str) -> None:
1643
1829
  """
@@ -1651,7 +1837,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1651
1837
  while True:
1652
1838
  try:
1653
1839
  # Look for the job
1654
- job_object = self._api('batch', errors=[404]).read_namespaced_job(jobName, self.namespace)
1840
+ job_object = self._api("batch", errors=[404]).read_namespaced_job(
1841
+ jobName, self.namespace
1842
+ )
1655
1843
  if self._is_deleted(job_object):
1656
1844
  # The job looks deleted, so we can treat it as not being there.
1657
1845
  return
@@ -1676,59 +1864,80 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1676
1864
  # Shutdown scheduling thread
1677
1865
  self._shutting_down.set()
1678
1866
  with self._work_available:
1679
- self._work_available.notify_all() # Wake it up.
1867
+ self._work_available.notify_all() # Wake it up.
1680
1868
 
1681
1869
  self.schedulingThread.join()
1682
1870
 
1683
1871
  # Kill all of our jobs and clean up pods that are associated with those jobs
1684
1872
  try:
1685
- logger.debug('Deleting all Kubernetes jobs for toil_run=%s', self.run_id)
1686
- self._api('batch', errors=[404]).delete_collection_namespaced_job(
1873
+ logger.debug("Deleting all Kubernetes jobs for toil_run=%s", self.run_id)
1874
+ self._api("batch", errors=[404]).delete_collection_namespaced_job(
1687
1875
  self.namespace,
1688
1876
  label_selector=f"toil_run={self.run_id}",
1689
- propagation_policy='Background'
1877
+ propagation_policy="Background",
1878
+ )
1879
+ logger.debug(
1880
+ "Killed jobs with delete_collection_namespaced_job; cleaned up"
1690
1881
  )
1691
- logger.debug('Killed jobs with delete_collection_namespaced_job; cleaned up')
1692
1882
  # TODO: should we release all resources? We're shutting down so would it matter?
1693
1883
  except ApiException as e:
1694
1884
  if e.status != 404:
1695
1885
  # Anything other than a 404 is weird here.
1696
- logger.error("Exception when calling BatchV1Api->delete_collection_namespaced_job: %s" % e)
1886
+ logger.error(
1887
+ "Exception when calling BatchV1Api->delete_collection_namespaced_job: %s"
1888
+ % e
1889
+ )
1697
1890
 
1698
1891
  # If batch delete fails, try to delete all remaining jobs individually.
1699
- logger.debug('Deleting Kubernetes jobs individually for toil_run=%s', self.run_id)
1892
+ logger.debug(
1893
+ "Deleting Kubernetes jobs individually for toil_run=%s", self.run_id
1894
+ )
1700
1895
  for job_id in self._getIssuedNonLocalBatchJobIDs():
1701
- job_name = f'{self.job_prefix}{job_id}'
1702
- self._delete_job(job_name, propagation_policy='Background', resource_notify=False)
1896
+ job_name = f"{self.job_prefix}{job_id}"
1897
+ self._delete_job(
1898
+ job_name, propagation_policy="Background", resource_notify=False
1899
+ )
1703
1900
 
1704
1901
  # Aggregate all pods and check if any pod has failed to cleanup or is orphaned.
1705
1902
  ourPods = self._ourPodObject()
1706
1903
 
1707
1904
  for pod in ourPods:
1708
1905
  try:
1709
- phase = getattr(pod.status, 'phase', None)
1710
- if phase == 'Failed':
1711
- logger.debug('Failed pod encountered at shutdown:\n%s', self._pretty_print(pod))
1712
- if phase == 'Orphaned':
1713
- logger.debug('Orphaned pod encountered at shutdown:\n%s', self._pretty_print(pod))
1906
+ phase = getattr(pod.status, "phase", None)
1907
+ if phase == "Failed":
1908
+ logger.debug(
1909
+ "Failed pod encountered at shutdown:\n%s",
1910
+ self._pretty_print(pod),
1911
+ )
1912
+ if phase == "Orphaned":
1913
+ logger.debug(
1914
+ "Orphaned pod encountered at shutdown:\n%s",
1915
+ self._pretty_print(pod),
1916
+ )
1714
1917
  except:
1715
1918
  # Don't get mad if that doesn't work.
1716
1919
  pass
1717
1920
  if pod.metadata is not None and pod.metadata.name is not None:
1718
1921
  try:
1719
- logger.debug('Cleaning up pod at shutdown: %s', pod.metadata.name)
1720
- response = self._api('core', errors=[404]).delete_namespaced_pod(
1922
+ logger.debug(
1923
+ "Cleaning up pod at shutdown: %s", pod.metadata.name
1924
+ )
1925
+ response = self._api(
1926
+ "core", errors=[404]
1927
+ ).delete_namespaced_pod(
1721
1928
  pod.metadata.name,
1722
1929
  self.namespace,
1723
- propagation_policy='Background'
1930
+ propagation_policy="Background",
1724
1931
  )
1725
1932
  except ApiException as e:
1726
1933
  if e.status != 404:
1727
1934
  # Anything other than a 404 is weird here.
1728
- logger.error("Exception when calling CoreV1Api->delete_namespaced_pod: %s" % e)
1729
-
1935
+ logger.error(
1936
+ "Exception when calling CoreV1Api->delete_namespaced_pod: %s"
1937
+ % e
1938
+ )
1730
1939
 
1731
- def _getIssuedNonLocalBatchJobIDs(self) -> List[int]:
1940
+ def _getIssuedNonLocalBatchJobIDs(self) -> list[int]:
1732
1941
  """
1733
1942
  Get the issued batch job IDs that are not for local jobs.
1734
1943
  """
@@ -1740,29 +1949,35 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1740
1949
  jobIDs.append(self._getIDForOurJob(job))
1741
1950
  return jobIDs
1742
1951
 
1743
- def getIssuedBatchJobIDs(self) -> List[int]:
1952
+ def getIssuedBatchJobIDs(self) -> list[int]:
1744
1953
  # Make sure to send the local jobs and queued jobs also
1745
1954
  with self._mutex:
1746
1955
  queued_jobs = list(self._queued_job_ids)
1747
- return self._getIssuedNonLocalBatchJobIDs() + list(self.getIssuedLocalJobIDs()) + queued_jobs
1956
+ return (
1957
+ self._getIssuedNonLocalBatchJobIDs()
1958
+ + list(self.getIssuedLocalJobIDs())
1959
+ + queued_jobs
1960
+ )
1748
1961
 
1749
- def _get_start_time(self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None) -> datetime.datetime:
1962
+ def _get_start_time(
1963
+ self, pod: Optional[V1Pod] = None, job: Optional[V1Job] = None
1964
+ ) -> datetime.datetime:
1750
1965
  """
1751
1966
  Get an actual or estimated start time for a pod.
1752
1967
  """
1753
1968
 
1754
1969
  # Get when the pod started (reached the Kubelet) as a datetime
1755
- start_time = getattr(getattr(pod, 'status', None), 'start_time', None)
1970
+ start_time = getattr(getattr(pod, "status", None), "start_time", None)
1756
1971
  if start_time is None:
1757
1972
  # If the pod never made it to the kubelet to get a
1758
1973
  # start_time, say it was when the job was submitted.
1759
- start_time = getattr(getattr(job, 'status', None), 'start_time', None)
1974
+ start_time = getattr(getattr(job, "status", None), "start_time", None)
1760
1975
  if start_time is None:
1761
1976
  # If this is still unset, say it was just now.
1762
1977
  start_time = utc_now()
1763
1978
  return start_time
1764
1979
 
1765
- def getRunningBatchJobIDs(self) -> Dict[int, float]:
1980
+ def getRunningBatchJobIDs(self) -> dict[int, float]:
1766
1981
  # We need a dict from jobID (integer) to seconds it has been running
1767
1982
  secondsPerJob = dict()
1768
1983
  for job in self._ourJobObject():
@@ -1773,7 +1988,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1773
1988
  # Jobs whose pods are gone are not running
1774
1989
  continue
1775
1990
 
1776
- if getattr(pod.status, 'phase', None) == 'Running':
1991
+ if getattr(pod.status, "phase", None) == "Running":
1777
1992
  # The job's pod is running
1778
1993
 
1779
1994
  # Estimate the runtime
@@ -1785,7 +2000,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1785
2000
  secondsPerJob.update(self.getRunningLocalJobIDs())
1786
2001
  return secondsPerJob
1787
2002
 
1788
- def killBatchJobs(self, jobIDs: List[int]) -> None:
2003
+ def killBatchJobs(self, jobIDs: list[int]) -> None:
1789
2004
 
1790
2005
  # Kill all the ones that are local
1791
2006
  self.killLocalJobs(jobIDs)
@@ -1794,7 +2009,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1794
2009
 
1795
2010
  # First get the jobs we even issued non-locally
1796
2011
  issued_on_kubernetes = set(self._getIssuedNonLocalBatchJobIDs())
1797
- deleted_jobs: List[str] = []
2012
+ deleted_jobs: list[str] = []
1798
2013
 
1799
2014
  for job_id in jobIDs:
1800
2015
  # For each job we are supposed to kill
@@ -1820,10 +2035,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1820
2035
 
1821
2036
  # Delete the requested job in the foreground.
1822
2037
  # This doesn't block, but it does delete expeditiously.
1823
- self._delete_job(job_name, propagation_policy='Foreground')
2038
+ self._delete_job(job_name, propagation_policy="Foreground")
1824
2039
 
1825
2040
  deleted_jobs.append(job_name)
1826
- logger.debug('Killed job by request: %s', job_name)
2041
+ logger.debug("Killed job by request: %s", job_name)
1827
2042
 
1828
2043
  for job_name in deleted_jobs:
1829
2044
  # Now we need to wait for all the jobs we killed to be gone.
@@ -1833,7 +2048,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1833
2048
  # the potential deadlock (if the user code needs exclusive access to
1834
2049
  # a resource) onto the user code, instead of always hanging
1835
2050
  # whenever we can't certify that a faulty node is no longer running
1836
- # the user code.
2051
+ # the user code.
1837
2052
  self._waitForJobDeath(job_name)
1838
2053
 
1839
2054
  @classmethod
@@ -1844,9 +2059,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1844
2059
 
1845
2060
  # Make a Kubernetes-acceptable version of our username: not too long,
1846
2061
  # and all lowercase letters, numbers, or - or .
1847
- acceptable_chars = set(string.ascii_lowercase + string.digits + '-.')
2062
+ acceptable_chars = set(string.ascii_lowercase + string.digits + "-.")
1848
2063
 
1849
- return ''.join([c for c in get_user_name().lower() if c in acceptable_chars])[:100]
2064
+ return "".join([c for c in get_user_name().lower() if c in acceptable_chars])[
2065
+ :100
2066
+ ]
1850
2067
 
1851
2068
  @runtime_checkable
1852
2069
  class KubernetesConfig(Protocol):
@@ -1858,33 +2075,66 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1858
2075
  have to let the fact that this also has to be a Config just be manually
1859
2076
  enforced.
1860
2077
  """
2078
+
1861
2079
  kubernetes_host_path: Optional[str]
1862
2080
  kubernetes_owner: str
1863
2081
  kubernetes_service_account: Optional[str]
1864
2082
  kubernetes_pod_timeout: float
1865
2083
 
1866
-
1867
2084
  @classmethod
1868
2085
  def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
1869
- parser.add_argument("--kubernetesHostPath", dest="kubernetes_host_path", default=None, env_var="TOIL_KUBERNETES_HOST_PATH",
1870
- help="Path on Kubernetes hosts to use as shared inter-pod temp directory. "
1871
- "(default: %(default)s)")
1872
- parser.add_argument("--kubernetesOwner", dest="kubernetes_owner", default=None, env_var="TOIL_KUBERNETES_OWNER",
1873
- help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will "
1874
- f"be generated at runtime. "
1875
- f"(Generated default: {cls.get_default_kubernetes_owner()})")
1876
- parser.add_argument("--kubernetesServiceAccount", dest="kubernetes_service_account", default=None, env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT",
1877
- help="Service account to run jobs as. "
1878
- "(default: %(default)s)")
1879
- parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
1880
- help="Seconds to wait for a scheduled Kubernetes pod to start running. "
1881
- "(default: %(default)s)")
1882
-
1883
- OptionType = TypeVar('OptionType')
2086
+ parser.add_argument(
2087
+ "--kubernetesHostPath",
2088
+ dest="kubernetes_host_path",
2089
+ default=None,
2090
+ env_var="TOIL_KUBERNETES_HOST_PATH",
2091
+ help="Path on Kubernetes hosts to use as shared inter-pod temp directory. "
2092
+ "(default: %(default)s)",
2093
+ )
2094
+ parser.add_argument(
2095
+ "--kubernetesOwner",
2096
+ dest="kubernetes_owner",
2097
+ default=None,
2098
+ env_var="TOIL_KUBERNETES_OWNER",
2099
+ help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will "
2100
+ f"be generated at runtime. "
2101
+ f"(Generated default: {cls.get_default_kubernetes_owner()})",
2102
+ )
2103
+ parser.add_argument(
2104
+ "--kubernetesServiceAccount",
2105
+ dest="kubernetes_service_account",
2106
+ default=None,
2107
+ env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT",
2108
+ help="Service account to run jobs as. " "(default: %(default)s)",
2109
+ )
2110
+ parser.add_argument(
2111
+ "--kubernetesPodTimeout",
2112
+ dest="kubernetes_pod_timeout",
2113
+ default=120,
2114
+ env_var="TOIL_KUBERNETES_POD_TIMEOUT",
2115
+ type=float,
2116
+ help="Seconds to wait for a scheduled Kubernetes pod to start running. "
2117
+ "(default: %(default)s)",
2118
+ )
2119
+ parser.add_argument(
2120
+ "--kubernetesPrivileged",
2121
+ dest="kubernetes_privileged",
2122
+ default=False,
2123
+ env_var="TOIL_KUBERNETES_PRIVILEGED",
2124
+ type=opt_strtobool,
2125
+ help="Whether to ask worker pods to run in privileged mode. This should be used to access "
2126
+ "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
2127
+ "this is set to True. (default: %(default)s)",
2128
+ )
2129
+
2130
+ OptionType = TypeVar("OptionType")
2131
+
1884
2132
  @classmethod
1885
2133
  def setOptions(cls, setOption: OptionSetter) -> None:
1886
2134
  setOption("kubernetes_host_path")
1887
2135
  setOption("kubernetes_owner")
1888
- setOption("kubernetes_service_account",)
2136
+ setOption(
2137
+ "kubernetes_service_account",
2138
+ )
1889
2139
  setOption("kubernetes_pod_timeout")
1890
-
2140
+ setOption("kubernetes_privileged")