toil 9.1.1__py3-none-any.whl → 9.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +5 -9
- toil/batchSystems/abstractBatchSystem.py +23 -22
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -12
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +4 -4
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/gridengine.py +3 -4
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +65 -63
- toil/batchSystems/local_support.py +2 -3
- toil/batchSystems/lsf.py +6 -7
- toil/batchSystems/mesos/batchSystem.py +11 -7
- toil/batchSystems/mesos/test/__init__.py +1 -2
- toil/batchSystems/options.py +9 -10
- toil/batchSystems/registry.py +3 -7
- toil/batchSystems/singleMachine.py +8 -11
- toil/batchSystems/slurm.py +49 -38
- toil/batchSystems/torque.py +3 -4
- toil/bus.py +36 -34
- toil/common.py +129 -89
- toil/cwl/cwltoil.py +857 -729
- toil/cwl/utils.py +44 -35
- toil/fileStores/__init__.py +3 -1
- toil/fileStores/abstractFileStore.py +28 -30
- toil/fileStores/cachingFileStore.py +8 -8
- toil/fileStores/nonCachingFileStore.py +10 -21
- toil/job.py +159 -158
- toil/jobStores/abstractJobStore.py +68 -69
- toil/jobStores/aws/jobStore.py +249 -213
- toil/jobStores/aws/utils.py +13 -24
- toil/jobStores/fileJobStore.py +28 -22
- toil/jobStores/googleJobStore.py +21 -17
- toil/jobStores/utils.py +3 -7
- toil/leader.py +17 -22
- toil/lib/accelerators.py +6 -4
- toil/lib/aws/__init__.py +9 -10
- toil/lib/aws/ami.py +33 -19
- toil/lib/aws/iam.py +6 -6
- toil/lib/aws/s3.py +259 -157
- toil/lib/aws/session.py +76 -76
- toil/lib/aws/utils.py +51 -43
- toil/lib/checksum.py +19 -15
- toil/lib/compatibility.py +3 -2
- toil/lib/conversions.py +45 -18
- toil/lib/directory.py +29 -26
- toil/lib/docker.py +93 -99
- toil/lib/dockstore.py +77 -50
- toil/lib/ec2.py +39 -38
- toil/lib/ec2nodes.py +11 -4
- toil/lib/exceptions.py +8 -5
- toil/lib/ftp_utils.py +9 -14
- toil/lib/generatedEC2Lists.py +161 -20
- toil/lib/history.py +141 -97
- toil/lib/history_submission.py +163 -72
- toil/lib/io.py +27 -17
- toil/lib/memoize.py +2 -1
- toil/lib/misc.py +15 -11
- toil/lib/pipes.py +40 -25
- toil/lib/plugins.py +12 -8
- toil/lib/resources.py +1 -0
- toil/lib/retry.py +32 -38
- toil/lib/threading.py +12 -12
- toil/lib/throttle.py +1 -2
- toil/lib/trs.py +113 -51
- toil/lib/url.py +14 -23
- toil/lib/web.py +7 -2
- toil/options/common.py +18 -15
- toil/options/cwl.py +2 -2
- toil/options/runner.py +9 -5
- toil/options/wdl.py +1 -3
- toil/provisioners/__init__.py +9 -9
- toil/provisioners/abstractProvisioner.py +22 -20
- toil/provisioners/aws/__init__.py +20 -14
- toil/provisioners/aws/awsProvisioner.py +10 -8
- toil/provisioners/clusterScaler.py +19 -18
- toil/provisioners/gceProvisioner.py +2 -3
- toil/provisioners/node.py +11 -13
- toil/realtimeLogger.py +4 -4
- toil/resource.py +5 -5
- toil/server/app.py +2 -2
- toil/server/cli/wes_cwl_runner.py +11 -11
- toil/server/utils.py +18 -21
- toil/server/wes/abstract_backend.py +9 -8
- toil/server/wes/amazon_wes_utils.py +3 -3
- toil/server/wes/tasks.py +3 -5
- toil/server/wes/toil_backend.py +17 -21
- toil/server/wsgi_app.py +3 -3
- toil/serviceManager.py +3 -4
- toil/statsAndLogging.py +12 -13
- toil/test/__init__.py +33 -24
- toil/test/batchSystems/batchSystemTest.py +12 -11
- toil/test/batchSystems/batch_system_plugin_test.py +3 -5
- toil/test/batchSystems/test_slurm.py +38 -24
- toil/test/cwl/conftest.py +5 -6
- toil/test/cwl/cwlTest.py +194 -78
- toil/test/cwl/download_file_uri.json +6 -0
- toil/test/cwl/download_file_uri_no_hostname.json +6 -0
- toil/test/docs/scripts/tutorial_staging.py +1 -0
- toil/test/jobStores/jobStoreTest.py +9 -7
- toil/test/lib/aws/test_iam.py +1 -3
- toil/test/lib/aws/test_s3.py +1 -1
- toil/test/lib/dockerTest.py +9 -9
- toil/test/lib/test_ec2.py +12 -11
- toil/test/lib/test_history.py +4 -4
- toil/test/lib/test_trs.py +16 -14
- toil/test/lib/test_url.py +7 -6
- toil/test/lib/url_plugin_test.py +12 -18
- toil/test/provisioners/aws/awsProvisionerTest.py +10 -8
- toil/test/provisioners/clusterScalerTest.py +2 -5
- toil/test/provisioners/clusterTest.py +1 -3
- toil/test/server/serverTest.py +13 -4
- toil/test/sort/restart_sort.py +2 -6
- toil/test/sort/sort.py +3 -8
- toil/test/src/deferredFunctionTest.py +7 -7
- toil/test/src/environmentTest.py +1 -2
- toil/test/src/fileStoreTest.py +5 -5
- toil/test/src/importExportFileTest.py +5 -6
- toil/test/src/jobServiceTest.py +22 -14
- toil/test/src/jobTest.py +121 -25
- toil/test/src/miscTests.py +5 -7
- toil/test/src/promisedRequirementTest.py +8 -7
- toil/test/src/regularLogTest.py +2 -3
- toil/test/src/resourceTest.py +5 -8
- toil/test/src/restartDAGTest.py +5 -6
- toil/test/src/resumabilityTest.py +2 -2
- toil/test/src/retainTempDirTest.py +3 -3
- toil/test/src/systemTest.py +3 -3
- toil/test/src/threadingTest.py +1 -1
- toil/test/src/workerTest.py +1 -2
- toil/test/utils/toilDebugTest.py +6 -4
- toil/test/utils/toilKillTest.py +1 -1
- toil/test/utils/utilsTest.py +15 -14
- toil/test/wdl/wdltoil_test.py +247 -124
- toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
- toil/toilState.py +2 -3
- toil/utils/toilDebugFile.py +3 -8
- toil/utils/toilDebugJob.py +1 -2
- toil/utils/toilLaunchCluster.py +1 -2
- toil/utils/toilSshCluster.py +2 -0
- toil/utils/toilStats.py +19 -24
- toil/utils/toilStatus.py +11 -14
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +313 -209
- toil/worker.py +18 -12
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/METADATA +11 -14
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/RECORD +150 -153
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/WHEEL +1 -1
- toil/test/cwl/staging_cat.cwl +0 -27
- toil/test/cwl/staging_make_file.cwl +0 -25
- toil/test/cwl/staging_workflow.cwl +0 -43
- toil/test/cwl/zero_default.cwl +0 -61
- toil/test/utils/ABCWorkflowDebug/ABC.txt +0 -1
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/entry_points.txt +0 -0
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/licenses/LICENSE +0 -0
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/kubernetes.py
CHANGED
|
@@ -31,29 +31,24 @@ import tempfile
|
|
|
31
31
|
import time
|
|
32
32
|
import uuid
|
|
33
33
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
34
|
-
from collections.abc import Iterator
|
|
34
|
+
from collections.abc import Callable, Iterator
|
|
35
35
|
from queue import Empty, Queue
|
|
36
36
|
from threading import Condition, Event, RLock, Thread
|
|
37
|
-
from typing import Any,
|
|
37
|
+
from typing import Any, Literal, ParamSpec, TypeVar, Union, cast, overload
|
|
38
38
|
|
|
39
39
|
from toil.lib.conversions import opt_strtobool
|
|
40
40
|
from toil.lib.throttle import LocalThrottle
|
|
41
41
|
|
|
42
|
-
if sys.version_info < (3, 10):
|
|
43
|
-
from typing_extensions import ParamSpec
|
|
44
|
-
else:
|
|
45
|
-
from typing import ParamSpec
|
|
46
|
-
|
|
47
42
|
if sys.version_info < (3, 11):
|
|
48
43
|
from typing_extensions import NotRequired
|
|
49
44
|
else:
|
|
50
45
|
from typing import NotRequired
|
|
51
46
|
|
|
47
|
+
import json
|
|
52
48
|
from typing import Protocol, TypedDict, runtime_checkable
|
|
53
49
|
|
|
54
|
-
import urllib3
|
|
55
50
|
import ruamel.yaml as yaml
|
|
56
|
-
import
|
|
51
|
+
import urllib3
|
|
57
52
|
|
|
58
53
|
# The Right Way to use the Kubernetes module is to `import kubernetes` and then you get all your stuff as like ApiClient. But this doesn't work for the stubs: the stubs seem to only support importing things from the internal modules in `kubernetes` where they are actually defined. See for example <https://github.com/MaterializeInc/kubernetes-stubs/issues/9 and <https://github.com/MaterializeInc/kubernetes-stubs/issues/10>. So we just import all the things we use into our global namespace here.
|
|
59
54
|
from kubernetes.client import (
|
|
@@ -74,13 +69,13 @@ from kubernetes.client import (
|
|
|
74
69
|
V1NodeSelectorTerm,
|
|
75
70
|
V1ObjectMeta,
|
|
76
71
|
V1Pod,
|
|
72
|
+
V1PodSecurityContext,
|
|
77
73
|
V1PodSpec,
|
|
78
74
|
V1PodTemplateSpec,
|
|
79
75
|
V1PreferredSchedulingTerm,
|
|
80
76
|
V1ResourceRequirements,
|
|
81
77
|
V1SecretVolumeSource,
|
|
82
78
|
V1SecurityContext,
|
|
83
|
-
V1PodSecurityContext,
|
|
84
79
|
V1Toleration,
|
|
85
80
|
V1Volume,
|
|
86
81
|
V1VolumeMount,
|
|
@@ -114,7 +109,7 @@ from toil.options.common import SYS_MAX_SIZE
|
|
|
114
109
|
from toil.resource import Resource
|
|
115
110
|
|
|
116
111
|
logger = logging.getLogger(__name__)
|
|
117
|
-
retryable_kubernetes_errors: list[
|
|
112
|
+
retryable_kubernetes_errors: list[type[Exception] | ErrorCondition] = [
|
|
118
113
|
urllib3.exceptions.MaxRetryError,
|
|
119
114
|
urllib3.exceptions.ProtocolError,
|
|
120
115
|
ApiException,
|
|
@@ -168,7 +163,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
168
163
|
logging.getLogger("requests_oauthlib").setLevel(logging.ERROR)
|
|
169
164
|
|
|
170
165
|
# This will hold the last time our Kubernetes credentials were refreshed
|
|
171
|
-
self.credential_time:
|
|
166
|
+
self.credential_time: datetime.datetime | None = None
|
|
172
167
|
# And this will hold our cache of API objects
|
|
173
168
|
self._apis: KubernetesBatchSystem._ApiStorageDict = {}
|
|
174
169
|
|
|
@@ -177,10 +172,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
177
172
|
|
|
178
173
|
# Decide if we are going to mount a Kubernetes host path as the Toil
|
|
179
174
|
# work dir in the workers, for shared caching.
|
|
180
|
-
self.host_path:
|
|
175
|
+
self.host_path: str | None = config.kubernetes_host_path
|
|
181
176
|
|
|
182
177
|
# Get the service account name to use, if any.
|
|
183
|
-
self.service_account:
|
|
178
|
+
self.service_account: str | None = config.kubernetes_service_account
|
|
184
179
|
|
|
185
180
|
# Get how long we should wait for a pod that lands on a node to
|
|
186
181
|
# actually start.
|
|
@@ -208,7 +203,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
208
203
|
self.finished_job_ttl: int = 3600 # seconds
|
|
209
204
|
|
|
210
205
|
# Here is where we will store the user script resource object if we get one.
|
|
211
|
-
self.user_script:
|
|
206
|
+
self.user_script: Resource | None = None
|
|
212
207
|
|
|
213
208
|
# Ge the image to deploy from Toil's configuration
|
|
214
209
|
self.docker_image: str = applianceSelf()
|
|
@@ -234,9 +229,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
234
229
|
self.environment["TMPDIR"] = "/var/tmp"
|
|
235
230
|
|
|
236
231
|
# Get the name of the AWS secret, if any, to mount in containers.
|
|
237
|
-
self.aws_secret_name:
|
|
238
|
-
"TOIL_AWS_SECRET_NAME", None
|
|
239
|
-
)
|
|
232
|
+
self.aws_secret_name: str | None = os.environ.get("TOIL_AWS_SECRET_NAME", None)
|
|
240
233
|
|
|
241
234
|
# Set this to True to enable the experimental wait-for-job-update code
|
|
242
235
|
self.enable_watching: bool = os.environ.get("KUBE_WATCH_ENABLED", False)
|
|
@@ -323,7 +316,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
323
316
|
|
|
324
317
|
drop_boring(root_dict)
|
|
325
318
|
s = io.StringIO()
|
|
326
|
-
YAML = yaml.YAML(typ=
|
|
319
|
+
YAML = yaml.YAML(typ="safe")
|
|
327
320
|
YAML.dump(root_dict, s)
|
|
328
321
|
return s.getvalue()
|
|
329
322
|
|
|
@@ -332,7 +325,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
332
325
|
self,
|
|
333
326
|
kind: Literal["batch"],
|
|
334
327
|
max_age_seconds: float = 5 * 60,
|
|
335
|
-
errors:
|
|
328
|
+
errors: list[int] | None = None,
|
|
336
329
|
) -> BatchV1Api: ...
|
|
337
330
|
|
|
338
331
|
@overload
|
|
@@ -340,7 +333,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
340
333
|
self,
|
|
341
334
|
kind: Literal["core"],
|
|
342
335
|
max_age_seconds: float = 5 * 60,
|
|
343
|
-
errors:
|
|
336
|
+
errors: list[int] | None = None,
|
|
344
337
|
) -> CoreV1Api: ...
|
|
345
338
|
|
|
346
339
|
@overload
|
|
@@ -348,7 +341,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
348
341
|
self,
|
|
349
342
|
kind: Literal["customObjects"],
|
|
350
343
|
max_age_seconds: float = 5 * 60,
|
|
351
|
-
errors:
|
|
344
|
+
errors: list[int] | None = None,
|
|
352
345
|
) -> CustomObjectsApi: ...
|
|
353
346
|
|
|
354
347
|
@overload
|
|
@@ -358,15 +351,15 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
358
351
|
|
|
359
352
|
def _api(
|
|
360
353
|
self,
|
|
361
|
-
kind:
|
|
362
|
-
Literal["batch"]
|
|
363
|
-
Literal["core"]
|
|
364
|
-
Literal["customObjects"]
|
|
365
|
-
Literal["namespace"]
|
|
366
|
-
|
|
354
|
+
kind: (
|
|
355
|
+
Literal["batch"]
|
|
356
|
+
| Literal["core"]
|
|
357
|
+
| Literal["customObjects"]
|
|
358
|
+
| Literal["namespace"]
|
|
359
|
+
),
|
|
367
360
|
max_age_seconds: float = 5 * 60,
|
|
368
|
-
errors:
|
|
369
|
-
) ->
|
|
361
|
+
errors: list[int] | None = None,
|
|
362
|
+
) -> BatchV1Api | CoreV1Api | CustomObjectsApi | str:
|
|
370
363
|
"""
|
|
371
364
|
The Kubernetes module isn't clever enough to renew its credentials when
|
|
372
365
|
they are about to expire. See
|
|
@@ -545,7 +538,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
545
538
|
def __getitem__(self, name: Literal["raw_object"]) -> dict[str, Any]: ...
|
|
546
539
|
|
|
547
540
|
def __getitem__(
|
|
548
|
-
self, name:
|
|
541
|
+
self, name: Literal["type"] | Literal["object"] | Literal["raw_object"]
|
|
549
542
|
) -> Any: ...
|
|
550
543
|
|
|
551
544
|
P = ParamSpec("P")
|
|
@@ -773,7 +766,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
773
766
|
if preferred_scheduling_terms or required_selector_requirements:
|
|
774
767
|
# We prefer or require something about labels.
|
|
775
768
|
|
|
776
|
-
requirements_selector:
|
|
769
|
+
requirements_selector: V1NodeSelector | None = None
|
|
777
770
|
if required_selector_requirements:
|
|
778
771
|
# Make a term that says we match all the requirements
|
|
779
772
|
requirements_term = V1NodeSelectorTerm(
|
|
@@ -817,7 +810,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
817
810
|
class FakeResponse:
|
|
818
811
|
data: str
|
|
819
812
|
|
|
820
|
-
T = TypeVar(
|
|
813
|
+
T = TypeVar("T")
|
|
814
|
+
|
|
821
815
|
def _load_kubernetes_object(self, file: str, cls: type[T]) -> T:
|
|
822
816
|
"""
|
|
823
817
|
Deserialize a YAML representation into a Kubernetes object
|
|
@@ -825,20 +819,19 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
825
819
|
:param cls: Kubernetes API model type for deserialized object
|
|
826
820
|
:return: Deserialized object
|
|
827
821
|
"""
|
|
828
|
-
YAML = yaml.YAML(typ=
|
|
829
|
-
object_def = YAML.load(open(
|
|
822
|
+
YAML = yaml.YAML(typ="safe")
|
|
823
|
+
object_def = YAML.load(open("container.yaml").read())
|
|
830
824
|
# The kubernetes API does not have an actual deserializer, so this is a workaround
|
|
831
825
|
# See: https://github.com/kubernetes-client/python/issues/977
|
|
832
826
|
faked_response = self.FakeResponse()
|
|
833
827
|
faked_response.data = json.dumps(object_def)
|
|
834
828
|
return ApiClient().deserialize(faked_response, cls)
|
|
835
829
|
|
|
836
|
-
|
|
837
830
|
def _create_pod_spec(
|
|
838
831
|
self,
|
|
839
832
|
command: str,
|
|
840
833
|
job_desc: JobDescription,
|
|
841
|
-
job_environment:
|
|
834
|
+
job_environment: dict[str, str] | None = None,
|
|
842
835
|
) -> V1PodSpec:
|
|
843
836
|
"""
|
|
844
837
|
Make the specification for a pod that can execute the given job.
|
|
@@ -978,7 +971,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
978
971
|
)
|
|
979
972
|
|
|
980
973
|
if self.config.kubernetes_security_context:
|
|
981
|
-
container.security_context = self._load_kubernetes_object(
|
|
974
|
+
container.security_context = self._load_kubernetes_object(
|
|
975
|
+
self.config.kubernetes_security_context, V1SecurityContext
|
|
976
|
+
)
|
|
982
977
|
|
|
983
978
|
# In case security context rules are not allowed to be set, we only apply
|
|
984
979
|
# a security context at all if we need to turn on privileged mode.
|
|
@@ -993,7 +988,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
993
988
|
)
|
|
994
989
|
|
|
995
990
|
if self.config.kubernetes_pod_security_context:
|
|
996
|
-
pod_spec.security_context = self._load_kubernetes_object(
|
|
991
|
+
pod_spec.security_context = self._load_kubernetes_object(
|
|
992
|
+
self.config.kubernetes_pod_security_context, V1PodSecurityContext
|
|
993
|
+
)
|
|
997
994
|
|
|
998
995
|
# Tell the spec where to land
|
|
999
996
|
placement.apply(pod_spec)
|
|
@@ -1129,7 +1126,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1129
1126
|
self,
|
|
1130
1127
|
command: str,
|
|
1131
1128
|
job_desc: JobDescription,
|
|
1132
|
-
job_environment:
|
|
1129
|
+
job_environment: dict[str, str] | None = None,
|
|
1133
1130
|
) -> int:
|
|
1134
1131
|
# Try the job as local
|
|
1135
1132
|
localID = self.handleLocalJob(command, job_desc)
|
|
@@ -1255,7 +1252,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1255
1252
|
# There isn't one. We got everything.
|
|
1256
1253
|
break
|
|
1257
1254
|
|
|
1258
|
-
def _getPodForJob(self, jobObject: V1Job) ->
|
|
1255
|
+
def _getPodForJob(self, jobObject: V1Job) -> V1Pod | None:
|
|
1259
1256
|
"""
|
|
1260
1257
|
Get the pod that belongs to the given job, or None if the job's pod is
|
|
1261
1258
|
missing. The pod knows about things like the job's exit code.
|
|
@@ -1421,8 +1418,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1421
1418
|
def _isPodStuckWaiting(
|
|
1422
1419
|
self,
|
|
1423
1420
|
pod_object: V1Pod,
|
|
1424
|
-
reason:
|
|
1425
|
-
timeout:
|
|
1421
|
+
reason: str | None = None,
|
|
1422
|
+
timeout: float | None = None,
|
|
1426
1423
|
) -> bool:
|
|
1427
1424
|
"""
|
|
1428
1425
|
Return True if the pod looks to be in a waiting state, and false otherwise.
|
|
@@ -1477,7 +1474,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1477
1474
|
# Kubernetes "Terminating" is the same as having the deletion_timestamp
|
|
1478
1475
|
# set in the metadata of the object.
|
|
1479
1476
|
|
|
1480
|
-
deletion_timestamp:
|
|
1477
|
+
deletion_timestamp: datetime.datetime | None = getattr(
|
|
1481
1478
|
getattr(kube_thing, "metadata", None), "deletion_timestamp", None
|
|
1482
1479
|
)
|
|
1483
1480
|
# If the deletion timestamp is set to anything, it is in the process of
|
|
@@ -1498,7 +1495,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1498
1495
|
assert jobObject.metadata.name is not None
|
|
1499
1496
|
return int(jobObject.metadata.name[len(self.job_prefix) :])
|
|
1500
1497
|
|
|
1501
|
-
def getUpdatedBatchJob(self, maxWait: float) ->
|
|
1498
|
+
def getUpdatedBatchJob(self, maxWait: float) -> UpdatedBatchJobInfo | None:
|
|
1502
1499
|
|
|
1503
1500
|
entry = datetime.datetime.now()
|
|
1504
1501
|
|
|
@@ -1541,7 +1538,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1541
1538
|
succeeded_pods = jobObject.status.succeeded or 0
|
|
1542
1539
|
failed_pods = jobObject.status.failed or 0
|
|
1543
1540
|
# Fetch out the condition object that has info about how the job is going.
|
|
1544
|
-
condition:
|
|
1541
|
+
condition: V1JobCondition | None = None
|
|
1545
1542
|
if (
|
|
1546
1543
|
jobObject.status.conditions is not None
|
|
1547
1544
|
and len(jobObject.status.conditions) > 0
|
|
@@ -1650,7 +1647,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1650
1647
|
# When we get here, either we found something or we ran out of time
|
|
1651
1648
|
return result
|
|
1652
1649
|
|
|
1653
|
-
def _getUpdatedBatchJobImmediately(self) ->
|
|
1650
|
+
def _getUpdatedBatchJobImmediately(self) -> UpdatedBatchJobInfo | None:
|
|
1654
1651
|
"""
|
|
1655
1652
|
Return None if no updated (completed or failed) batch job is currently
|
|
1656
1653
|
available, and jobID, exitCode, runtime if such a job can be found.
|
|
@@ -2004,7 +2001,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
2004
2001
|
)
|
|
2005
2002
|
|
|
2006
2003
|
def _get_start_time(
|
|
2007
|
-
self, pod:
|
|
2004
|
+
self, pod: V1Pod | None = None, job: V1Job | None = None
|
|
2008
2005
|
) -> datetime.datetime:
|
|
2009
2006
|
"""
|
|
2010
2007
|
Get an actual or estimated start time for a pod.
|
|
@@ -2120,13 +2117,13 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
2120
2117
|
enforced.
|
|
2121
2118
|
"""
|
|
2122
2119
|
|
|
2123
|
-
kubernetes_host_path:
|
|
2120
|
+
kubernetes_host_path: str | None
|
|
2124
2121
|
kubernetes_owner: str
|
|
2125
|
-
kubernetes_service_account:
|
|
2122
|
+
kubernetes_service_account: str | None
|
|
2126
2123
|
kubernetes_pod_timeout: float
|
|
2127
2124
|
|
|
2128
2125
|
@classmethod
|
|
2129
|
-
def add_options(cls, parser:
|
|
2126
|
+
def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
|
|
2130
2127
|
parser.add_argument(
|
|
2131
2128
|
"--kubernetesHostPath",
|
|
2132
2129
|
dest="kubernetes_host_path",
|
|
@@ -2170,18 +2167,23 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
2170
2167
|
"privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
|
|
2171
2168
|
"this is set to True. (default: %(default)s)",
|
|
2172
2169
|
)
|
|
2173
|
-
parser.add_argument(
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2170
|
+
parser.add_argument(
|
|
2171
|
+
"--kubernetesPodSecurityContext",
|
|
2172
|
+
dest="kubernetes_pod_security_context",
|
|
2173
|
+
type=str,
|
|
2174
|
+
env_var="TOIL_KUBERNETES_POD_SECURITY_CONTEXT",
|
|
2175
|
+
default=None,
|
|
2176
|
+
help="Path to a YAML defining a pod security context to apply to all pods.",
|
|
2177
|
+
)
|
|
2178
|
+
parser.add_argument(
|
|
2179
|
+
"--kubernetesSecurityContext",
|
|
2180
|
+
dest="kubernetes_security_context",
|
|
2181
|
+
type=str,
|
|
2182
|
+
env_var="TOIL_KUBERNETES_SECURITY_CONTEXT",
|
|
2183
|
+
default=None,
|
|
2184
|
+
help="Path to a YAML defining a security context to apply to all containers.",
|
|
2185
|
+
)
|
|
2186
|
+
|
|
2185
2187
|
OptionType = TypeVar("OptionType")
|
|
2186
2188
|
|
|
2187
2189
|
@classmethod
|
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import logging
|
|
15
|
-
from typing import Optional
|
|
16
15
|
|
|
17
16
|
from toil.batchSystems.abstractBatchSystem import (
|
|
18
17
|
BatchSystemSupport,
|
|
@@ -40,7 +39,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
40
39
|
config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
|
|
41
40
|
)
|
|
42
41
|
|
|
43
|
-
def handleLocalJob(self, command: str, jobDesc: JobDescription) ->
|
|
42
|
+
def handleLocalJob(self, command: str, jobDesc: JobDescription) -> int | None:
|
|
44
43
|
"""
|
|
45
44
|
To be called by issueBatchJob.
|
|
46
45
|
|
|
@@ -78,7 +77,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
78
77
|
local_running: dict[int, float] = self.localBatch.getRunningBatchJobIDs()
|
|
79
78
|
return local_running
|
|
80
79
|
|
|
81
|
-
def getUpdatedLocalJob(self, maxWait: int) ->
|
|
80
|
+
def getUpdatedLocalJob(self, maxWait: int) -> UpdatedBatchJobInfo | None:
|
|
82
81
|
"""To be called by getUpdatedBatchJob()."""
|
|
83
82
|
return self.localBatch.getUpdatedBatchJob(maxWait)
|
|
84
83
|
|
toil/batchSystems/lsf.py
CHANGED
|
@@ -25,7 +25,6 @@ import re
|
|
|
25
25
|
import subprocess
|
|
26
26
|
from datetime import datetime
|
|
27
27
|
from random import randint
|
|
28
|
-
from typing import Optional, Union
|
|
29
28
|
|
|
30
29
|
from dateutil.parser import parse
|
|
31
30
|
from dateutil.tz import tzlocal
|
|
@@ -101,8 +100,8 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
101
100
|
jobID: int,
|
|
102
101
|
command: str,
|
|
103
102
|
jobName: str,
|
|
104
|
-
job_environment:
|
|
105
|
-
gpus:
|
|
103
|
+
job_environment: dict[str, str] | None = None,
|
|
104
|
+
gpus: int | None = None,
|
|
106
105
|
):
|
|
107
106
|
return (
|
|
108
107
|
self.prepareBsub(cpu, memory, jobID) + [command],
|
|
@@ -184,7 +183,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
184
183
|
|
|
185
184
|
def getJobExitCode(
|
|
186
185
|
self, lsfJobID
|
|
187
|
-
) ->
|
|
186
|
+
) -> int | tuple[int, BatchJobExitReason | None] | None:
|
|
188
187
|
# the task is set as part of the job ID if using getBatchSystemID()
|
|
189
188
|
if "NOT_SUBMITTED" in lsfJobID:
|
|
190
189
|
logger.error("bjobs detected job failed to submit")
|
|
@@ -217,7 +216,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
217
216
|
|
|
218
217
|
def parse_bjobs_record(
|
|
219
218
|
self, bjobs_record: dict, job: int
|
|
220
|
-
) ->
|
|
219
|
+
) -> int | tuple[int, BatchJobExitReason | None] | None:
|
|
221
220
|
"""
|
|
222
221
|
Helper functions for getJobExitCode and to parse the bjobs status record
|
|
223
222
|
"""
|
|
@@ -279,7 +278,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
279
278
|
|
|
280
279
|
def getJobExitCodeBACCT(
|
|
281
280
|
self, job
|
|
282
|
-
) ->
|
|
281
|
+
) -> int | tuple[int, BatchJobExitReason | None] | None:
|
|
283
282
|
# if not found in bjobs, then try bacct (slower than bjobs)
|
|
284
283
|
logger.debug("bjobs failed to detect job - trying bacct: " "{}".format(job))
|
|
285
284
|
|
|
@@ -301,7 +300,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
301
300
|
|
|
302
301
|
def fallbackGetJobExitCode(
|
|
303
302
|
self, job
|
|
304
|
-
) ->
|
|
303
|
+
) -> int | tuple[int, BatchJobExitReason | None] | None:
|
|
305
304
|
args = ["bjobs", "-l", str(job)]
|
|
306
305
|
logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
|
|
307
306
|
stdout = call_command(args)
|
|
@@ -22,7 +22,6 @@ import time
|
|
|
22
22
|
import traceback
|
|
23
23
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
24
24
|
from queue import Empty, Queue
|
|
25
|
-
from typing import Optional, Union
|
|
26
25
|
from urllib.parse import quote_plus
|
|
27
26
|
from urllib.request import urlopen
|
|
28
27
|
|
|
@@ -186,7 +185,7 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
|
|
|
186
185
|
self,
|
|
187
186
|
command: str,
|
|
188
187
|
jobNode: JobDescription,
|
|
189
|
-
job_environment:
|
|
188
|
+
job_environment: dict[str, str] | None = None,
|
|
190
189
|
) -> str:
|
|
191
190
|
"""
|
|
192
191
|
Issues the following command returning a unique jobID. Command is the string to run, memory
|
|
@@ -370,16 +369,21 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
|
|
|
370
369
|
# TODO: Use a condition instead of a spin wait.
|
|
371
370
|
|
|
372
371
|
if wait_count >= self.mesos_timeout:
|
|
373
|
-
error_message =
|
|
372
|
+
error_message = (
|
|
373
|
+
f"Could not connect to Mesos endpoint at {self.mesos_endpoint}"
|
|
374
|
+
)
|
|
374
375
|
log.error(error_message)
|
|
375
376
|
self.shutdown()
|
|
376
377
|
raise RuntimeError(error_message)
|
|
377
378
|
elif wait_count > 1 and wait_count % 10 == 0:
|
|
378
|
-
log.warning(
|
|
379
|
+
log.warning(
|
|
380
|
+
"Waiting for Mesos registration (try %s/%s)",
|
|
381
|
+
wait_count,
|
|
382
|
+
self.mesos_timeout,
|
|
383
|
+
)
|
|
379
384
|
time.sleep(1)
|
|
380
385
|
wait_count += 1
|
|
381
386
|
|
|
382
|
-
|
|
383
387
|
@staticmethod
|
|
384
388
|
def _resolveAddress(address):
|
|
385
389
|
"""
|
|
@@ -862,7 +866,7 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
|
|
|
862
866
|
return executor
|
|
863
867
|
|
|
864
868
|
def getNodes(
|
|
865
|
-
self, preemptible:
|
|
869
|
+
self, preemptible: bool | None = None, timeout: int | None = None
|
|
866
870
|
) -> dict[str, NodeInfo]:
|
|
867
871
|
"""
|
|
868
872
|
Return all nodes that match:
|
|
@@ -1008,7 +1012,7 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Sch
|
|
|
1008
1012
|
return f"{get_public_ip()}:5050"
|
|
1009
1013
|
|
|
1010
1014
|
@classmethod
|
|
1011
|
-
def add_options(cls, parser:
|
|
1015
|
+
def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
|
|
1012
1016
|
parser.add_argument(
|
|
1013
1017
|
"--mesosEndpoint",
|
|
1014
1018
|
"--mesosMaster",
|
|
@@ -7,7 +7,6 @@ from abc import ABCMeta, abstractmethod
|
|
|
7
7
|
from contextlib import closing
|
|
8
8
|
from shutil import which
|
|
9
9
|
from urllib.request import urlopen
|
|
10
|
-
from typing import Optional
|
|
11
10
|
|
|
12
11
|
from toil.lib.retry import retry
|
|
13
12
|
from toil.lib.threading import ExceptionalThread, cpu_count
|
|
@@ -26,7 +25,7 @@ class MesosTestSupport:
|
|
|
26
25
|
with closing(urlopen("http://127.0.0.1:5050/version")) as content:
|
|
27
26
|
content.read()
|
|
28
27
|
|
|
29
|
-
def _startMesos(self, numCores:
|
|
28
|
+
def _startMesos(self, numCores: int | None = None) -> None:
|
|
30
29
|
if numCores is None:
|
|
31
30
|
numCores = cpu_count()
|
|
32
31
|
shutil.rmtree("/tmp/mesos", ignore_errors=True)
|
toil/batchSystems/options.py
CHANGED
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
|
|
14
14
|
import logging
|
|
15
15
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
16
|
-
from
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
from typing import Any, Protocol, TypeVar
|
|
17
18
|
|
|
18
19
|
from toil.batchSystems.registry import (
|
|
19
20
|
DEFAULT_BATCH_SYSTEM,
|
|
@@ -38,17 +39,15 @@ class OptionSetter(Protocol):
|
|
|
38
39
|
def __call__(
|
|
39
40
|
self,
|
|
40
41
|
option_name: str,
|
|
41
|
-
parsing_function:
|
|
42
|
-
check_function:
|
|
43
|
-
default:
|
|
44
|
-
env:
|
|
45
|
-
old_names:
|
|
42
|
+
parsing_function: Callable[[Any], OptionType] | None = None,
|
|
43
|
+
check_function: Callable[[OptionType], None | bool] | None = None,
|
|
44
|
+
default: OptionType | None = None,
|
|
45
|
+
env: list[str] | None = None,
|
|
46
|
+
old_names: list[str] | None = None,
|
|
46
47
|
) -> bool: ...
|
|
47
48
|
|
|
48
49
|
|
|
49
|
-
def set_batchsystem_options(
|
|
50
|
-
batch_system: Optional[str], set_option: OptionSetter
|
|
51
|
-
) -> None:
|
|
50
|
+
def set_batchsystem_options(batch_system: str | None, set_option: OptionSetter) -> None:
|
|
52
51
|
"""
|
|
53
52
|
Call set_option for all the options for the given named batch system, or
|
|
54
53
|
all batch systems if no name is provided.
|
|
@@ -80,7 +79,7 @@ def set_batchsystem_options(
|
|
|
80
79
|
set_option("batch_logs_dir")
|
|
81
80
|
|
|
82
81
|
|
|
83
|
-
def add_all_batchsystem_options(parser:
|
|
82
|
+
def add_all_batchsystem_options(parser: ArgumentParser | _ArgumentGroup) -> None:
|
|
84
83
|
from toil.options.common import SYS_MAX_SIZE
|
|
85
84
|
|
|
86
85
|
# Do the global cross-batch-system arguments
|
toil/batchSystems/registry.py
CHANGED
|
@@ -13,13 +13,9 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import logging
|
|
16
|
-
import
|
|
17
|
-
import
|
|
18
|
-
from collections.abc import Sequence
|
|
19
|
-
from typing import TYPE_CHECKING, Callable
|
|
16
|
+
from collections.abc import Callable, Sequence
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
20
18
|
|
|
21
|
-
from toil.lib.compatibility import deprecated
|
|
22
|
-
from toil.lib.memoize import memoize
|
|
23
19
|
import toil.lib.plugins
|
|
24
20
|
|
|
25
21
|
if TYPE_CHECKING:
|
|
@@ -133,4 +129,4 @@ add_batch_system_factory("mesos", mesos_batch_system_factory)
|
|
|
133
129
|
add_batch_system_factory("slurm", slurm_batch_system_factory)
|
|
134
130
|
add_batch_system_factory("torque", torque_batch_system_factory)
|
|
135
131
|
add_batch_system_factory("htcondor", htcondor_batch_system_factory)
|
|
136
|
-
add_batch_system_factory("kubernetes", kubernetes_batch_system_factory)
|
|
132
|
+
add_batch_system_factory("kubernetes", kubernetes_batch_system_factory)
|
|
@@ -23,7 +23,6 @@ from argparse import ArgumentParser, _ArgumentGroup
|
|
|
23
23
|
from collections.abc import Sequence
|
|
24
24
|
from queue import Empty, Queue
|
|
25
25
|
from threading import Event, Lock, Thread
|
|
26
|
-
from typing import Optional, Union
|
|
27
26
|
|
|
28
27
|
import toil
|
|
29
28
|
from toil import worker as toil_worker
|
|
@@ -96,7 +95,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
96
95
|
maxCores: float,
|
|
97
96
|
maxMemory: float,
|
|
98
97
|
maxDisk: int,
|
|
99
|
-
max_jobs:
|
|
98
|
+
max_jobs: int | None = None,
|
|
100
99
|
) -> None:
|
|
101
100
|
self.config = config
|
|
102
101
|
|
|
@@ -219,7 +218,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
219
218
|
# Also takes care of resource accounting.
|
|
220
219
|
self.daddyThread = None
|
|
221
220
|
# If it breaks it will fill this in
|
|
222
|
-
self.daddyException:
|
|
221
|
+
self.daddyException: Exception | None = None
|
|
223
222
|
|
|
224
223
|
if self.debugWorker:
|
|
225
224
|
logger.debug("Started batch system %s in worker debug mode.", id(self))
|
|
@@ -614,9 +613,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
614
613
|
details=[f"The accelerator {problem} could not be provided."],
|
|
615
614
|
)
|
|
616
615
|
|
|
617
|
-
def _release_acquired_resources(
|
|
618
|
-
self, resources: list[Union[int, set[int]]]
|
|
619
|
-
) -> None:
|
|
616
|
+
def _release_acquired_resources(self, resources: list[int | set[int]]) -> None:
|
|
620
617
|
"""
|
|
621
618
|
Release all resources acquired for a job.
|
|
622
619
|
Assumes resources are in the order: core fractions, memory, disk, accelerators.
|
|
@@ -634,7 +631,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
634
631
|
self,
|
|
635
632
|
needed_accelerators: list[AcceleratorRequirement],
|
|
636
633
|
available_accelerator_ids: set[int],
|
|
637
|
-
) -> tuple[
|
|
634
|
+
) -> tuple[set[int] | None, AcceleratorRequirement | None]:
|
|
638
635
|
"""
|
|
639
636
|
Given the accelerator requirements of a job, and the set of available
|
|
640
637
|
accelerators out of our associated collection of accelerators, find a
|
|
@@ -709,7 +706,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
709
706
|
|
|
710
707
|
# And what do we want from each resource in self.resource_sources?
|
|
711
708
|
# We know they go job slot, cores, memory, disk, accelerators.
|
|
712
|
-
resource_requests: list[
|
|
709
|
+
resource_requests: list[int | set[int]] = [
|
|
713
710
|
1,
|
|
714
711
|
coreFractions,
|
|
715
712
|
jobMemory,
|
|
@@ -891,7 +888,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
891
888
|
self,
|
|
892
889
|
command: str,
|
|
893
890
|
job_desc: JobDescription,
|
|
894
|
-
job_environment:
|
|
891
|
+
job_environment: dict[str, str] | None = None,
|
|
895
892
|
) -> int:
|
|
896
893
|
"""Adds the command and resources to a queue to be run."""
|
|
897
894
|
|
|
@@ -995,7 +992,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
995
992
|
|
|
996
993
|
BatchSystemSupport.workerCleanup(self.workerCleanupInfo)
|
|
997
994
|
|
|
998
|
-
def getUpdatedBatchJob(self, maxWait: int) ->
|
|
995
|
+
def getUpdatedBatchJob(self, maxWait: int) -> UpdatedBatchJobInfo | None:
|
|
999
996
|
"""Returns a tuple of a no-longer-running job, the return value of its process, and its runtime, or None."""
|
|
1000
997
|
|
|
1001
998
|
self._checkOnDaddy()
|
|
@@ -1009,7 +1006,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
1009
1006
|
return item
|
|
1010
1007
|
|
|
1011
1008
|
@classmethod
|
|
1012
|
-
def add_options(cls, parser:
|
|
1009
|
+
def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
|
|
1013
1010
|
parser.add_argument(
|
|
1014
1011
|
"--scale",
|
|
1015
1012
|
dest="scale",
|