toil-7.0.0-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/awsBatch.py
CHANGED
(Note: in the diff below, removed 7.0.0 lines are shown truncated where the registry's side-by-side viewer elided the changed portion of the old line.)

@@ -34,15 +34,18 @@ import tempfile
 import time
 import uuid
 from argparse import ArgumentParser, _ArgumentGroup
-from
+from collections.abc import Iterator
+from typing import Any, Optional, Union

 from botocore.exceptions import ClientError

 from toil import applianceSelf
-from toil.batchSystems.abstractBatchSystem import (
-
-
-
+from toil.batchSystems.abstractBatchSystem import (
+    EXIT_STATUS_UNAVAILABLE_VALUE,
+    BatchJobExitReason,
+    InsufficientSystemResources,
+    UpdatedBatchJobInfo,
+)
 from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
 from toil.batchSystems.contained_executor import pack_job
 from toil.batchSystems.options import OptionSetter
@@ -60,9 +63,9 @@ logger = logging.getLogger(__name__)


 # Map from AWS Batch terminal states to Toil batch job exit reasons
-STATE_TO_EXIT_REASON:
-
-
+STATE_TO_EXIT_REASON: dict[str, BatchJobExitReason] = {
+    "SUCCEEDED": BatchJobExitReason.FINISHED,
+    "FAILED": BatchJobExitReason.FAILED,
 }

 # What's the max polling list size?
@@ -73,53 +76,62 @@ MIN_REQUESTABLE_MIB = 4
 # AWS batch won't accept API requests asking for less than this many CPUs.
 MIN_REQUESTABLE_CORES = 1

+
 class AWSBatchBatchSystem(BatchSystemCleanupSupport):
     @classmethod
     def supportsAutoDeployment(cls) -> bool:
         return True

-    def __init__(
+    def __init__(
+        self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+    ) -> None:
         super().__init__(config, maxCores, maxMemory, maxDisk)

         # Determine region to use.
         # Either it's set specifically or maybe we can get it from the "best" zone.
         # TODO: Parse it from a full queue ARN?
-        self.region = getattr(config,
+        self.region = getattr(config, "aws_batch_region")
         if self.region is None:
             self.region = get_current_aws_region()
             if self.region is None:
                 # Can't proceed without a real region
-                raise RuntimeError(
-
-
+                raise RuntimeError(
+                    "To use AWS Batch, specify --awsBatchRegion or "
+                    "TOIL_AWS_REGION or TOIL_AWS_ZONE, or configure "
+                    "a default zone in boto"
+                )

         # Connect to AWS Batch.
         # TODO: Use a global AWSConnectionManager so we can share a client
         # cache with provisioners, etc.
-        self.client = establish_boto3_session(self.region).client(
+        self.client = establish_boto3_session(self.region).client("batch")

         # Determine our batch queue
-        self.queue = getattr(config,
+        self.queue = getattr(config, "aws_batch_queue")
         if self.queue is None:
             # Make sure we actually have a queue
-            raise RuntimeError(
+            raise RuntimeError(
+                "To use AWS Batch, --awsBatchQueue or TOIL_AWS_BATCH_QUEUE must be set"
+            )
         # And the role, if any, jobs should assume
-        self.job_role_arn = getattr(config,
+        self.job_role_arn = getattr(config, "aws_batch_job_role_arn")
         # And the Owner tag value, if any, to apply to things we create
-        self.owner_tag = os.environ.get(
+        self.owner_tag = os.environ.get("TOIL_OWNER_TAG")

         # Try and guess what Toil work dir the workers will use.
         # We need to be able to provision (possibly shared) space there.
         # TODO: Deduplicate with Kubernetes batch system.
         self.worker_work_dir = Toil.getToilWorkDir(config.workDir)
-        if (
-
-
+        if (
+            config.workDir is None
+            and os.getenv("TOIL_WORKDIR") is None
+            and self.worker_work_dir == tempfile.gettempdir()
+        ):

             # We defaulted to the system temp directory. But we think the
             # worker Dockerfiles will make them use /var/lib/toil instead.
             # TODO: Keep this in sync with the Dockerfile.
-            self.worker_work_dir =
+            self.worker_work_dir = "/var/lib/toil"

         # We assign job names based on a numerical job ID. This functionality
         # is managed by the BatchSystemLocalSupport.
@@ -136,27 +148,39 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         self.job_definition: Optional[str] = None

         # We need a way to map between our batch system ID numbers, and AWS Batch job IDs from the server.
-        self.bs_id_to_aws_id:
-        self.aws_id_to_bs_id:
+        self.bs_id_to_aws_id: dict[int, str] = {}
+        self.aws_id_to_bs_id: dict[str, int] = {}
         # We need to track if jobs were killed so they don't come out as updated
-        self.killed_job_aws_ids:
+        self.killed_job_aws_ids: set[str] = set()

     def setUserScript(self, user_script: Resource) -> None:
-        logger.debug(f
+        logger.debug(f"Setting user script for deployment: {user_script}")
         self.user_script = user_script

     # setEnv is provided by BatchSystemSupport, updates self.environment

     def _check_accelerator_request(self, requirer: Requirer) -> None:
         for accelerator in requirer.accelerators:
-            if
+            if (
+                accelerator["kind"] != "gpu"
+                or accelerator.get("brand", "nvidia") != "nvidia"
+            ):
                 # We can only provide GPUs, and of those only nvidia ones.
-                raise InsufficientSystemResources(
-
-
-
-
-
+                raise InsufficientSystemResources(
+                    requirer,
+                    "accelerators",
+                    details=[
+                        f"The accelerator {accelerator} could not be provided.",
+                        "AWS Batch can only provide nvidia gpu accelerators.",
+                    ],
+                )
+
+    def issueBatchJob(
+        self,
+        command: str,
+        job_desc: JobDescription,
+        job_environment: Optional[dict[str, str]] = None,
+    ) -> int:
         # Try the job as local
         local_id = self.handleLocalJob(command, job_desc)
         if local_id is not None:
@@ -188,41 +212,54 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):

         # Compose a job spec to submit
         job_spec = {
-
-
-
-
-
-
-
-
-
-
-
+            "jobName": job_name,
+            "jobQueue": self.queue,
+            "jobDefinition": self._get_or_create_job_definition(),
+            "containerOverrides": {
+                "command": command_list,
+                "environment": [
+                    {"name": k, "value": v} for k, v in environment.items()
+                ],
+                "resourceRequirements": [
+                    {
+                        "type": "MEMORY",
+                        "value": str(
+                            max(
+                                MIN_REQUESTABLE_MIB,
+                                math.ceil(b_to_mib(job_desc.memory)),
+                            )
+                        ),
+                    },
+                    {
+                        "type": "VCPU",
+                        "value": str(
+                            max(MIN_REQUESTABLE_CORES, math.ceil(job_desc.cores))
+                        ),
+                    },
+                ],
+            },
         }
         gpus_needed = 0
         for accelerator in job_desc.accelerators:
-            if accelerator[
+            if accelerator["kind"] == "gpu":
                 # We just assume that all GPUs are equivalent when running
                 # on AWS Batch because there's no way to tell AWS Batch to
                 # send us to one or another.
-                gpus_needed += accelerator[
+                gpus_needed += accelerator["count"]
             # Other accelerators are rejected by check_resource_request
         if gpus_needed > 0:
             # We need some GPUs so ask for them.
-            job_spec[
-
-
-            })
+            job_spec["containerOverrides"]["resourceRequirements"].append(
+                {"type": "GPU", "value": gpus_needed}
+            )
         if self.owner_tag:
             # We are meant to tag everything with an owner
-            job_spec[
-
+            job_spec["tags"] = {"Owner": self.owner_tag}

         # Launch it and get back the AWS ID that we can use to poll the task.
         # TODO: retry!
         response = self.client.submit_job(**job_spec)
-        aws_id = response[
+        aws_id = response["jobId"]

         # Tie it to the numeric ID
         self.bs_id_to_aws_id[bs_id] = aws_id
@@ -230,8 +267,10 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):

         if self._outbox is not None:
             # Specify relationship between toil batch ID and aws ID in message bus
-            self._outbox.publish(
-
+            self._outbox.publish(
+                ExternalBatchIdMessage(bs_id, aws_id, self.__class__.__name__)
+            )
+        logger.debug("Launched job: %s", job_name)

         return bs_id

@@ -250,16 +289,16 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # Do replacements to enhance readability
         input_name = input_name.replace(" ", "-")
         # Keep only acceptable characters
-        kept_chars = [c for c in input_name if c.isalnum() or c ==
+        kept_chars = [c for c in input_name if c.isalnum() or c == "-" or c == "_"]
         if len(kept_chars) == 0 or not kept_chars[0].isalnum():
             # Make sure we start with something alphanumeric
-            kept_chars = [
+            kept_chars = ["j"] + kept_chars
         # Keep no more than the limit of them
         kept_chars = kept_chars[:128]
         # And re-compose them into a string
-        return
+        return "".join(kept_chars)

-    def _get_runtime(self, job_detail:
+    def _get_runtime(self, job_detail: dict[str, Any]) -> Optional[float]:
         """
         Internal function. Should not be called outside this class.

@@ -269,20 +308,25 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         Takes an AWS JobDetail as a dict.
         """

-        if
+        if "status" not in job_detail or job_detail["status"] not in [
+            "STARTING",
+            "RUNNING",
+            "SUCCEEDED",
+            "FAILED",
+        ]:
             # Job is not running yet.
             logger.info("Runtime unavailable because job is still waiting")
             return None

-        if
+        if "startedAt" not in job_detail:
             # Job has no known start time
             logger.info("Runtime unavailable because job has no start time")
             return None

-        start_ms = job_detail[
+        start_ms = job_detail["startedAt"]

-        if
-            end_ms = job_detail[
+        if "stoppedAt" in job_detail:
+            end_ms = job_detail["stoppedAt"]
         else:
             end_ms = unix_now_ms()

@@ -291,7 +335,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # Return the time it has been running for.
         return runtime

-    def _get_exit_code(self, job_detail:
+    def _get_exit_code(self, job_detail: dict[str, Any]) -> int:
         """
         Internal function. Should not be called outside this class.

@@ -299,12 +343,18 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         EXIT_STATUS_UNAVAILABLE_VALUE if it cannot be gotten.
         """

-        return int(
+        return int(
+            job_detail.get("container", {}).get(
+                "exitCode", EXIT_STATUS_UNAVAILABLE_VALUE
+            )
+        )

     def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
         # Remember when we started, for respecting the timeout
         entry = datetime.datetime.now()
-        while (
+        while (
+            datetime.datetime.now() - entry
+        ).total_seconds() < maxWait or not maxWait:
             result = self.getUpdatedLocalJob(0)
             if result:
                 return result
@@ -315,9 +365,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
             acknowledged = []

             for job_detail in self._describe_jobs_in_batches():
-                if job_detail.get(
+                if job_detail.get("status") in ["SUCCEEDED", "FAILED"]:
                     # This job is done!
-                    aws_id = job_detail[
+                    aws_id = job_detail["jobId"]
                     bs_id = self.aws_id_to_bs_id[aws_id]

                     # Acknowledge it
@@ -325,7 +375,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):

                     if aws_id in self.killed_job_aws_ids:
                         # Killed jobs aren't allowed to appear as updated.
-                        logger.debug(
+                        logger.debug("Job %s was killed so skipping it", bs_id)
                         continue

                     # Otherwise, it stopped running and it wasn't our fault.
@@ -334,21 +384,33 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
                     runtime = self._get_runtime(job_detail)

                     # Determine if it succeeded
-                    exit_reason = STATE_TO_EXIT_REASON[job_detail[
+                    exit_reason = STATE_TO_EXIT_REASON[job_detail["status"]]

                     # Get its exit code
                     exit_code = self._get_exit_code(job_detail)

-                    if
+                    if (
+                        job_detail["status"] == "FAILED"
+                        and "statusReason" in job_detail
+                    ):
                         # AWS knows why the job failed, so log the error
-                        logger.error(
+                        logger.error(
+                            "Job %s failed because: %s",
+                            bs_id,
+                            job_detail["statusReason"],
+                        )

                     # Compose a result
-                    return UpdatedBatchJobInfo(
+                    return UpdatedBatchJobInfo(
+                        jobID=bs_id,
+                        exitStatus=exit_code,
+                        wallTime=runtime,
+                        exitReason=exit_reason,
+                    )

         finally:
             # Drop all the records for tasks we acknowledged
-            for
+            for aws_id, bs_id in acknowledged:
                 del self.aws_id_to_bs_id[aws_id]
                 del self.bs_id_to_aws_id[bs_id]
                 if aws_id in self.killed_job_aws_ids:
@@ -357,7 +419,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):

         if maxWait:
             # Wait a bit and poll again
-            time.sleep(min(maxWait/2, 1.0))
+            time.sleep(min(maxWait / 2, 1.0))
         else:
             # Only poll once
             break
@@ -390,7 +452,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # later.
         self.killed_job_aws_ids.add(aws_id)
         # Kill the AWS Batch job
-        self.client.terminate_job(jobId=aws_id, reason=
+        self.client.terminate_job(jobId=aws_id, reason="Killed by Toil")

     @retry(errors=[ClientError])
     def _wait_until_stopped(self, aws_id: str) -> None:
@@ -406,16 +468,19 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         while True:
             # Poll the job
             response = self.client.describe_jobs(jobs=[aws_id])
-            jobs = response.get(
+            jobs = response.get("jobs", [])
             if len(jobs) == 0:
                 # Job no longer exists at all
                 return
             job = jobs[0]
-            if job.get(
+            if job.get("status") and job["status"] in ["SUCCEEDED", "FAILED"]:
                 # The job has stopped
                 return
             # Otherwise the job is still going. Wait for it to stop.
-            logger.info(
+            logger.info(
+                "Waiting for killed job %s to stop",
+                self.aws_id_to_bs_id.get(aws_id, aws_id),
+            )
             time.sleep(2)

     @retry(errors=[ClientError])
@@ -429,56 +494,76 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         if self.job_definition is None:
             # First work out what volume mounts to make, because the type
             # system is happiest this way
-            volumes:
-            mount_points:
-            for i, shared_path in enumerate(
-
-
-
-
-
-
-
-
+            volumes: list[dict[str, Union[str, dict[str, str]]]] = []
+            mount_points: list[dict[str, str]] = []
+            for i, shared_path in enumerate(
+                {
+                    "/var/lib/toil",
+                    "/var/lib/docker",
+                    "/var/lib/cwl",
+                    "/var/run/docker.sock",
+                    "/var/run/user",
+                    "/tmp",
+                    self.worker_work_dir,
+                }
+            ):
                 # For every path we want to be the same on the host and the
                 # container, choose a name
-                vol_name = f
+                vol_name = f"mnt{i}"
                 # Make a volume for that path
-                volumes.append({
+                volumes.append({"name": vol_name, "host": {"sourcePath": shared_path}})
                 # Mount the volume at that path
-                mount_points.append(
+                mount_points.append(
+                    {"containerPath": shared_path, "sourceVolume": vol_name}
+                )

             job_def_spec = {
-
-
-
-
-
-
+                "jobDefinitionName": "toil-" + str(uuid.uuid4()),
+                "type": "container",
+                "containerProperties": {
+                    "image": self.docker_image,
+                    "volumes": volumes,
+                    "mountPoints": mount_points,
                     # Requirements will always be overridden but must be present anyway
-
-                    {
-
+                    "resourceRequirements": [
+                        {
+                            "type": "MEMORY",
+                            "value": str(
+                                max(
+                                    MIN_REQUESTABLE_MIB,
+                                    math.ceil(b_to_mib(self.config.defaultMemory)),
+                                )
+                            ),
+                        },
+                        {
+                            "type": "VCPU",
+                            "value": str(
+                                max(
+                                    MIN_REQUESTABLE_CORES,
+                                    math.ceil(self.config.defaultCores),
+                                )
+                            ),
+                        },
                     ],
                     # Be privileged because we can. And we'd like Singularity
                     # to work even if we do have the Docker socket. See
                     # <https://github.com/moby/moby/issues/42441>.
-
+                    "privileged": True,
                 },
-
-
+                "retryStrategy": {"attempts": 1},
+                "propagateTags": True,  # This will propagate to ECS task but not to job!
             }
             if self.job_role_arn:
                 # We need to give the job a role.
                 # We might not be able to do much job store access without this!
-                container_properties = job_def_spec[
+                container_properties = job_def_spec["containerProperties"]
                 assert isinstance(container_properties, dict)
-                container_properties[
+                container_properties["jobRoleArn"] = self.job_role_arn
             if self.owner_tag:
                 # We are meant to tag everything with an owner
-                job_def_spec[
+                job_def_spec["tags"] = {"Owner": self.owner_tag}
             response = self.client.register_job_definition(**job_def_spec)
-            self.job_definition = response[
+            self.job_definition = response["jobDefinitionArn"]

         return self.job_definition

@@ -494,10 +579,10 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # TODO: How do we tolerate it not existing anymore?
         self.job_definition = None

-    def getIssuedBatchJobIDs(self) ->
+    def getIssuedBatchJobIDs(self) -> list[int]:
         return self.getIssuedLocalJobIDs() + list(self.bs_id_to_aws_id.keys())

-    def _describe_jobs_in_batches(self) -> Iterator[
+    def _describe_jobs_in_batches(self) -> Iterator[dict[str, Any]]:
         """
         Internal function. Should not be called outside this class.

@@ -506,28 +591,30 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         """

         # Get all the AWS IDs to poll
-        to_check = list(
+        to_check = list(
+            aws_and_bs_id[0] for aws_and_bs_id in self.aws_id_to_bs_id.items()
+        )

         while len(to_check) > 0:
             # Go through jobs we want to poll in batches of the max size
             check_batch = to_check[-MAX_POLL_COUNT:]
             # And pop them off the end of the list of jobs to check
-            to_check = to_check[
+            to_check = to_check[: -len(check_batch)]

             # TODO: retry
             response = self.client.describe_jobs(jobs=check_batch)

             # Yield each returned JobDetail
-            yield from response.get(
+            yield from response.get("jobs", [])

-    def getRunningBatchJobIDs(self) ->
+    def getRunningBatchJobIDs(self) -> dict[int, float]:
         # We need a dict from job_id (integer) to seconds it has been running
         bs_id_to_runtime = {}

         for job_detail in self._describe_jobs_in_batches():
-            if job_detail.get(
+            if job_detail.get("status") == "RUNNING":
                 runtime = self._get_runtime(job_detail)
-                aws_id = job_detail[
+                aws_id = job_detail["jobId"]
                 bs_id = self.aws_id_to_bs_id[aws_id]
                 if runtime:
                     # We can measure a runtime
@@ -535,12 +622,17 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
                 else:
                     # If we can't find a runtime, we can't say it's running
                     # because we can't say how long it has been running for.
-                    logger.warning(
+                    logger.warning(
+                        "Job %s is %s but has no runtime: %s",
+                        bs_id,
+                        job_detail["status"],
+                        job_detail,
+                    )

         # Give back the times all our running jobs have been running for.
         return bs_id_to_runtime

-    def killBatchJobs(self, job_ids:
+    def killBatchJobs(self, job_ids: list[int]) -> None:
         # Kill all the ones that are local
         self.killLocalJobs(job_ids)

@@ -559,14 +651,31 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):

     @classmethod
     def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
-        parser.add_argument(
-
-
-
-
-
-
-
+        parser.add_argument(
+            "--awsBatchRegion",
+            dest="aws_batch_region",
+            default=None,
+            env_var="TOIL_AWS_REGION",
+            help="The AWS region containing the AWS Batch queue to submit to.",
+        )
+        parser.add_argument(
+            "--awsBatchQueue",
+            dest="aws_batch_queue",
+            default=None,
+            env_var="TOIL_AWS_BATCH_QUEUE",
+            help="The name or ARN of the AWS Batch queue to submit to.",
+        )
+        parser.add_argument(
+            "--awsBatchJobRoleArn",
+            dest="aws_batch_job_role_arn",
+            default=None,
+            env_var="TOIL_AWS_BATCH_JOB_ROLE_ARN",
+            help=(
+                "The ARN of an IAM role to run AWS Batch jobs as, so they "
+                "can e.g. access a job store. Must be assumable by "
+                "ecs-tasks.amazonaws.com."
+            ),
+        )

     @classmethod
     def setOptions(cls, setOption: OptionSetter) -> None: