toil 8.2.0__py3-none-any.whl → 9.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +13 -5
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
- toil/batchSystems/kubernetes.py +13 -2
- toil/batchSystems/mesos/batchSystem.py +33 -2
- toil/batchSystems/registry.py +15 -118
- toil/batchSystems/slurm.py +191 -16
- toil/common.py +20 -1
- toil/cwl/cwltoil.py +97 -119
- toil/cwl/utils.py +103 -3
- toil/fileStores/__init__.py +1 -1
- toil/fileStores/abstractFileStore.py +5 -2
- toil/fileStores/cachingFileStore.py +1 -1
- toil/job.py +30 -14
- toil/jobStores/abstractJobStore.py +35 -255
- toil/jobStores/aws/jobStore.py +864 -1964
- toil/jobStores/aws/utils.py +24 -270
- toil/jobStores/fileJobStore.py +2 -1
- toil/jobStores/googleJobStore.py +32 -13
- toil/jobStores/utils.py +0 -327
- toil/leader.py +27 -22
- toil/lib/accelerators.py +1 -1
- toil/lib/aws/config.py +22 -0
- toil/lib/aws/s3.py +477 -9
- toil/lib/aws/utils.py +22 -33
- toil/lib/checksum.py +88 -0
- toil/lib/conversions.py +33 -31
- toil/lib/directory.py +217 -0
- toil/lib/ec2.py +97 -29
- toil/lib/exceptions.py +2 -1
- toil/lib/expando.py +2 -2
- toil/lib/generatedEC2Lists.py +138 -19
- toil/lib/io.py +33 -2
- toil/lib/memoize.py +21 -7
- toil/lib/misc.py +1 -1
- toil/lib/pipes.py +385 -0
- toil/lib/plugins.py +106 -0
- toil/lib/retry.py +1 -1
- toil/lib/threading.py +1 -1
- toil/lib/url.py +320 -0
- toil/lib/web.py +4 -5
- toil/options/cwl.py +13 -1
- toil/options/runner.py +17 -10
- toil/options/wdl.py +12 -1
- toil/provisioners/__init__.py +5 -2
- toil/provisioners/aws/__init__.py +43 -36
- toil/provisioners/aws/awsProvisioner.py +47 -15
- toil/provisioners/node.py +60 -12
- toil/resource.py +3 -13
- toil/server/app.py +12 -6
- toil/server/cli/wes_cwl_runner.py +2 -2
- toil/server/wes/abstract_backend.py +21 -43
- toil/server/wes/toil_backend.py +2 -2
- toil/test/__init__.py +16 -18
- toil/test/batchSystems/batchSystemTest.py +2 -9
- toil/test/batchSystems/batch_system_plugin_test.py +7 -0
- toil/test/batchSystems/test_slurm.py +103 -14
- toil/test/cwl/cwlTest.py +181 -8
- toil/test/cwl/staging_cat.cwl +27 -0
- toil/test/cwl/staging_make_file.cwl +25 -0
- toil/test/cwl/staging_workflow.cwl +43 -0
- toil/test/cwl/zero_default.cwl +61 -0
- toil/test/docs/scripts/tutorial_staging.py +17 -8
- toil/test/docs/scriptsTest.py +2 -1
- toil/test/jobStores/jobStoreTest.py +23 -133
- toil/test/lib/aws/test_iam.py +7 -7
- toil/test/lib/aws/test_s3.py +30 -33
- toil/test/lib/aws/test_utils.py +9 -9
- toil/test/lib/test_url.py +69 -0
- toil/test/lib/url_plugin_test.py +105 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +60 -7
- toil/test/provisioners/clusterTest.py +15 -2
- toil/test/provisioners/gceProvisionerTest.py +1 -1
- toil/test/server/serverTest.py +78 -36
- toil/test/src/autoDeploymentTest.py +2 -3
- toil/test/src/fileStoreTest.py +89 -87
- toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
- toil/test/utils/toilKillTest.py +35 -28
- toil/test/wdl/md5sum/md5sum-gs.json +1 -1
- toil/test/wdl/md5sum/md5sum.json +1 -1
- toil/test/wdl/testfiles/read_file.wdl +18 -0
- toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
- toil/test/wdl/wdltoil_test.py +171 -162
- toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
- toil/utils/toilDebugFile.py +6 -3
- toil/utils/toilSshCluster.py +23 -0
- toil/utils/toilStats.py +17 -2
- toil/utils/toilUpdateEC2Instances.py +1 -0
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +1179 -825
- toil/worker.py +16 -8
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/METADATA +32 -32
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/RECORD +97 -85
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/WHEEL +1 -1
- toil/lib/iterables.py +0 -112
- toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
toil/lib/aws/s3.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-
+# Copyright (C) 2015-2023 Regents of the University of California
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,21 +11,489 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import hashlib
+import itertools
+import urllib.parse
 import logging
 
-from
+from io import BytesIO
+from datetime import timedelta
+from contextlib import contextmanager
+from typing import (Any,
+                    Dict,
+                    List,
+                    Optional,
+                    Union,
+                    Tuple,
+                    cast,
+                    Literal,
+                    Iterator,
+                    IO)
+
+from toil.lib.retry import retry, get_error_status, ErrorCondition
+from toil.lib.misc import printq
+from toil.lib.aws import AWSServerErrors, session, build_tag_dict_from_env
+from toil.lib.aws.utils import enable_public_objects, flatten_tags
+from toil.lib.conversions import modify_url, MB, MIB, TB
+from toil.lib.pipes import WritablePipe, ReadablePipe, HashingPipe
+
+# This file cannot be imported without the botocore/boto3 modules. We need
+# these types to set up @retry annotations, and @retry annotations really need
+# to be passed the real types at import time, since they do annotation-time
+# type checking internally.
+from botocore.exceptions import ClientError
+
+from mypy_boto3_s3 import S3Client, S3ServiceResource
+from mypy_boto3_s3.literals import BucketLocationConstraintType
+from mypy_boto3_s3.service_resource import Bucket, Object
+from mypy_boto3_s3.type_defs import ListMultipartUploadsOutputTypeDef, HeadObjectOutputTypeDef, GetObjectOutputTypeDef, PutObjectOutputTypeDef, ObjectTypeDef
 
-from toil.lib.aws import AWSServerErrors, session
-from toil.lib.retry import retry
 
 logger = logging.getLogger(__name__)
 
 
+# AWS Defined Limits
+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+AWS_MAX_MULTIPART_COUNT = 10000
+AWS_MAX_CHUNK_SIZE = 5 * TB
+AWS_MIN_CHUNK_SIZE = 5 * MB
+# Note: There is no minimum size limit on the last part of a multipart upload.
+
+# The chunk size we chose arbitrarily, but it must be consistent for etags
+DEFAULT_AWS_CHUNK_SIZE = 128 * MIB
+assert AWS_MAX_CHUNK_SIZE > DEFAULT_AWS_CHUNK_SIZE > AWS_MIN_CHUNK_SIZE
+
+
+class AWSKeyNotFoundError(Exception):
+    pass
+
+
+class AWSKeyAlreadyExistsError(Exception):
+    pass
+
+
+class AWSBadEncryptionKeyError(Exception):
+    pass
+
+
+@retry(errors=[ClientError])
+def create_s3_bucket(
+    s3_resource: S3ServiceResource,
+    bucket_name: str,
+    region: Union[BucketLocationConstraintType, Literal["us-east-1"]],
+    tags: Optional[Dict[str, str]] = None,
+    public: bool = True
+) -> "Bucket":
+    """
+    Create an AWS S3 bucket, using the given Boto3 S3 session, with the
+    given name, in the given region.
+
+    Supports the us-east-1 region, where bucket creation is special.
+
+    *ALL* S3 bucket creation should use this function.
+    """
+    logger.info("Creating bucket '%s' in region %s.", bucket_name, region)
+    if region == "us-east-1":  # see https://github.com/boto/boto3/issues/125
+        bucket = s3_resource.create_bucket(Bucket=bucket_name)
+    else:
+        bucket = s3_resource.create_bucket(
+            Bucket=bucket_name,
+            CreateBucketConfiguration={"LocationConstraint": region},
+        )
+    # wait until the bucket exists before adding tags
+    bucket.wait_until_exists()
+
+    tags = build_tag_dict_from_env() if tags is None else tags
+    bucket_tagging = s3_resource.BucketTagging(bucket_name)
+    bucket_tagging.put(Tagging={'TagSet': flatten_tags(tags)})  # type: ignore
+
+    # enabling public objects is the historical default
+    if public:
+        enable_public_objects(bucket_name)
+
+    return bucket
+
+
+@retry(errors=[ClientError])
+def delete_s3_bucket(
+    s3_resource: S3ServiceResource,
+    bucket_name: str,
+    quiet: bool = True
+) -> None:
+    """
+    Delete the bucket with 'bucket_name'.
+
+    Note: 'quiet' is False when used for a clean up utility script (contrib/admin/cleanup_aws_resources.py)
+    that prints progress rather than logging. Logging should be used for all other internal Toil usage.
+    """
+    assert isinstance(bucket_name, str), f'{bucket_name} is not a string ({type(bucket_name)}).'
+    logger.debug("Deleting bucket '%s'.", bucket_name)
+    printq(f'\n * Deleting s3 bucket: {bucket_name}\n\n', quiet)
+
+    s3_client = s3_resource.meta.client
+
+    try:
+        for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get('Uploads', []):
+            s3_client.abort_multipart_upload(
+                Bucket=bucket_name,
+                Key=u["Key"],
+                UploadId=u["UploadId"]
+            )
+
+        paginator = s3_client.get_paginator('list_object_versions')
+        for response in paginator.paginate(Bucket=bucket_name):
+            # Versions and delete markers can both go in here to be deleted.
+            # They both have Key and VersionId, but there's no shared base type
+            # defined for them in the stubs to express that. See
+            # <https://github.com/vemel/mypy_boto3_builder/issues/123>. So we
+            # have to do gymnastics to get them into the same list.
+            to_delete: List[Dict[str, Any]] = cast(List[Dict[str, Any]], response.get('Versions', [])) + \
+                                              cast(List[Dict[str, Any]], response.get('DeleteMarkers', []))
+            for entry in to_delete:
+                printq(f" Deleting {entry['Key']} version {entry['VersionId']}", quiet)
+                s3_client.delete_object(
+                    Bucket=bucket_name,
+                    Key=entry['Key'],
+                    VersionId=entry['VersionId']
+                )
+        bucket = s3_resource.Bucket(bucket_name)
+        bucket.objects.all().delete()
+        bucket.object_versions.delete()
+        bucket.delete()
+        printq(f'\n * Deleted s3 bucket successfully: {bucket_name}\n\n', quiet)
+        logger.debug("Deleted s3 bucket successfully '%s'.", bucket_name)
+    except s3_client.exceptions.NoSuchBucket:
+        printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+        logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
+    except ClientError as e:
+        if get_error_status(e) != 404:
+            raise
+        printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+        logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
+
+
 @retry(errors=[AWSServerErrors])
-def
-
-
+def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> Union[bool, Bucket]:
+    s3_client = s3_resource.meta.client
+    try:
+        s3_client.head_bucket(Bucket=bucket)
+        return s3_resource.Bucket(bucket)
+    except (ClientError, s3_client.exceptions.NoSuchBucket) as e:
+        error_code = e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
+        if error_code == 404:
+            return False
+        else:
+            raise
+
+
+@retry(errors=[AWSServerErrors])
+def head_s3_object(bucket: str, key: str, header: Dict[str, Any], region: Optional[str] = None) -> HeadObjectOutputTypeDef:
+    """
+    Attempt to HEAD an s3 object and return its response.
+
+    :param bucket: AWS bucket name
+    :param key: AWS Key name for the s3 object
+    :param header: Headers to include (mostly for encryption).
+        See: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/head_object.html
+    :param region: Region that we want to look for the bucket in
+    """
     s3_client = session.client("s3", region_name=region)
-    return s3_client.
-
+    return s3_client.head_object(Bucket=bucket, Key=key, **header)
+
+
+@retry(errors=[AWSServerErrors])
+def list_multipart_uploads(bucket: str, region: str, prefix: str, max_uploads: int = 1) -> ListMultipartUploadsOutputTypeDef:
+    s3_client = session.client("s3", region_name=region)
+    return s3_client.list_multipart_uploads(Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix)
+
+
+@retry(errors=[AWSServerErrors])
+def copy_s3_to_s3(s3_resource: S3ServiceResource, src_bucket: str, src_key: str, dst_bucket: str, dst_key: str, extra_args: Optional[Dict[Any, Any]] = None) -> None:
+    source = {'Bucket': src_bucket, 'Key': src_key}
+    # Note: this may have errors if using sse-c because of
+    # a bug with encryption using copy_object and copy (which uses copy_object for files <5GB):
+    # https://github.com/aws/aws-cli/issues/6012
+    # this will only happen if we attempt to copy a file previously encrypted with sse-c
+    # copying an unencrypted file and encrypting it as sse-c seems to work fine though
+    kwargs = dict(CopySource=source, Bucket=dst_bucket, Key=dst_key, ExtraArgs=extra_args)
+    s3_resource.meta.client.copy(**kwargs)  # type: ignore
+
+
+# TODO: Determine specific retries
+@retry(errors=[AWSServerErrors])
+def copy_local_to_s3(
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    dst_bucket: str,
+    dst_key: str,
+    extra_args: Optional[Dict[Any, Any]] = None
+) -> None:
+    s3_client = s3_resource.meta.client
+    s3_client.upload_file(local_file_path, dst_bucket, dst_key, ExtraArgs=extra_args)
+
+
+# TODO: Determine specific retries
+@retry(errors=[AWSServerErrors])
+def copy_s3_to_local(
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    src_bucket: str,
+    src_key: str,
+    extra_args: Optional[Dict[Any, Any]] = None
+) -> None:
+    s3_client = s3_resource.meta.client
+    s3_client.download_file(src_bucket, src_key, local_file_path, ExtraArgs=extra_args)
+
+
+class MultiPartPipe(WritablePipe):
+    def __init__(self, part_size: int, s3_resource: S3ServiceResource, bucket_name: str, file_id: str, encryption_args: Optional[Dict[Any, Any]], encoding: Optional[str] = None, errors: Optional[str] = None) -> None:
+        super(MultiPartPipe, self).__init__()
+        self.encoding = encoding
+        self.errors = errors
+        self.part_size = part_size
+        self.s3_client = s3_resource.meta.client
+        self.bucket_name = bucket_name
+        self.file_id = file_id
+        self.encryption_args = encryption_args or dict()
+
+    def readFrom(self, readable: IO[Any]) -> None:
+        # Get the first block of data we want to put
+        buf = readable.read(self.part_size)
+        assert isinstance(buf, bytes)
+
+        # We will compute a checksum
+        hasher = hashlib.sha1()
+        hasher.update(buf)
+
+        # low-level clients are thread safe
+        response = self.s3_client.create_multipart_upload(Bucket=self.bucket_name,
+                                                          Key=self.file_id,
+                                                          **self.encryption_args)
+        upload_id = response['UploadId']
+        parts = []
+        try:
+            for part_num in itertools.count():
+                logger.debug(f'[{upload_id}] Uploading part %d of %d bytes', part_num + 1, len(buf))
+                # TODO: include the Content-MD5 header:
+                # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
+                part = self.s3_client.upload_part(Bucket=self.bucket_name,
+                                                  Key=self.file_id,
+                                                  PartNumber=part_num + 1,
+                                                  UploadId=upload_id,
+                                                  Body=BytesIO(buf),
+                                                  **self.encryption_args)
+                parts.append({"PartNumber": part_num + 1, "ETag": part["ETag"]})
+
+                # Get the next block of data we want to put
+                buf = readable.read(self.part_size)
+                if len(buf) == 0:
+                    # Don't allow any part other than the very first to be empty.
+                    break
+                hasher.update(buf)
+        except:
+            self.s3_client.abort_multipart_upload(Bucket=self.bucket_name,
+                                                  Key=self.file_id,
+                                                  UploadId=upload_id)
+        else:
+            # Save the checksum
+            checksum = f'sha1${hasher.hexdigest()}'
+            # Encryption information is not needed here because the upload was
+            # not started with a checksum.
+            response = self.s3_client.complete_multipart_upload(Bucket=self.bucket_name,  # type: ignore
+                                                                Key=self.file_id,
+                                                                UploadId=upload_id,
+                                                                MultipartUpload={"Parts": parts})  # type: ignore
+            logger.debug(f'[{upload_id}] Upload of {self.file_id} complete...')
+
+
+def parse_s3_uri(uri: str) -> Tuple[str, str]:
+    # does not support s3/gs: https://docs.python.org/3/library/urllib.parse.html
+    # use regex instead?
+    uri = urllib.parse.urlparse(uri)  # type: ignore
+    if uri.scheme.lower() != 's3':  # type: ignore
+        raise ValueError(f'Invalid schema. Expecting s3 prefix, not: {uri}')
+    bucket_name, key_name = uri.netloc.strip('/'), uri.path.strip('/')  # type: ignore
+    return bucket_name, key_name
+
+
+def list_s3_items(s3_resource: S3ServiceResource, bucket: str, prefix: str, startafter: Optional[str] = None) -> Iterator[ObjectTypeDef]:
+    s3_client = s3_resource.meta.client
+    paginator = s3_client.get_paginator('list_objects_v2')
+    kwargs = dict(Bucket=bucket, Prefix=prefix)
+    if startafter:
+        kwargs['StartAfter'] = startafter
+    for page in paginator.paginate(**kwargs):  # type: ignore
+        for key in page.get('Contents', []):
+            yield key
+
+
+@retry(errors=[AWSServerErrors])
+def upload_to_s3(readable: IO[Any],
+                 s3_resource: S3ServiceResource,
+                 bucket: str,
+                 key: str,
+                 extra_args: Optional[Dict[Any, Any]] = None) -> None:
+    """
+    Upload a readable object to s3, using multipart uploading if applicable.
+
+    :param readable: a readable stream or a local file path to upload to s3
+    :param S3.Resource resource: boto3 resource
+    :param str bucket: name of the bucket to upload to
+    :param str key: the name of the file to upload to
+    :param dict extra_args: http headers to use when uploading - generally used for encryption purposes
+    :param int partSize: max size of each part in the multipart upload, in bytes
+    :return: version of the newly uploaded file
+    """
+    if extra_args is None:
+        extra_args = {}
+
+    s3_client = s3_resource.meta.client
+    from boto3.s3.transfer import TransferConfig
+    config = TransferConfig(
+        multipart_threshold=DEFAULT_AWS_CHUNK_SIZE,
+        multipart_chunksize=DEFAULT_AWS_CHUNK_SIZE,
+        use_threads=True
     )
+    logger.debug("Uploading %s", key)
+    # these methods use multipart if necessary
+    if isinstance(readable, str):
+        s3_client.upload_file(Filename=readable,
+                              Bucket=bucket,
+                              Key=key,
+                              ExtraArgs=extra_args,
+                              Config=config)
+    else:
+        s3_client.upload_fileobj(Fileobj=readable,
+                                 Bucket=bucket,
+                                 Key=key,
+                                 ExtraArgs=extra_args,
+                                 Config=config)
+
+    object_summary = s3_resource.ObjectSummary(bucket, key)
+    object_summary.wait_until_exists(**extra_args)
+
+
+@contextmanager
+def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, checksum_to_verify: Optional[str] = None,
+                    extra_args: Optional[Dict[Any, Any]] = None, encoding: Optional[str] = None, errors: Optional[str] = None) -> Iterator[IO[Any]]:
+    """Context manager that gives out a download stream to download data."""
+    bucket_obj: Bucket = s3_resource.Bucket(bucket)
+
+    class DownloadPipe(ReadablePipe):
+        def writeTo(self, writable: IO[Any]) -> None:
+            kwargs = dict(Key=key, Fileobj=writable, ExtraArgs=extra_args)
+            if not extra_args:
+                del kwargs['ExtraArgs']
+            bucket_obj.download_fileobj(**kwargs)  # type: ignore
+
+    try:
+        if checksum_to_verify:
+            with DownloadPipe(encoding=encoding, errors=errors) as readable:
+                # Interpose a pipe to check the hash
+                with HashingPipe(readable, encoding=encoding, errors=errors) as verified:
+                    yield verified
+        else:
+            # Readable end of pipe produces text mode output if encoding specified
+            with DownloadPipe(encoding=encoding, errors=errors) as readable:
+                # No true checksum available, so don't hash
+                yield readable
+    except s3_resource.meta.client.exceptions.NoSuchKey:
+        raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+    except ClientError as e:
+        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                e.operation_name == 'HeadObject':
+            # An error occurred (400) when calling the HeadObject operation: Bad Request
+            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                           f"(HeadObject operation on key '{key}': Bad Request).")
+        raise
+
+
+def download_fileobject(s3_resource: S3ServiceResource, bucket: Bucket, key: str, fileobj: BytesIO, extra_args: Optional[Dict[Any, Any]] = None) -> None:
+    try:
+        bucket.download_fileobj(Key=key, Fileobj=fileobj, ExtraArgs=extra_args)
+    except s3_resource.meta.client.exceptions.NoSuchKey:
+        raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+    except ClientError as e:
+        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                e.operation_name == 'HeadObject':
+            # An error occurred (400) when calling the HeadObject operation: Bad Request
+            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                           f"(HeadObject operation on key '{key}': Bad Request).")
+        raise
+
+
+def s3_key_exists(s3_resource: S3ServiceResource, bucket: str, key: str, check: bool = False, extra_args: Optional[Dict[Any, Any]] = None) -> bool:
+    """Return True if the s3 obect exists, and False if not. Will error if encryption args are incorrect."""
+    extra_args = extra_args or {}
+    s3_client = s3_resource.meta.client
+    try:
+        s3_client.head_object(Bucket=bucket, Key=key, **extra_args)
+        return True
+    except s3_client.exceptions.NoSuchKey:
+        if check:
+            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+        return False
+    except ClientError as e:
+        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+            if check:
+                raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+            return False
+        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                e.operation_name == 'HeadObject':
+            # An error occurred (400) when calling the HeadObject operation: Bad Request
+            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                           f"(HeadObject operation on key '{key}': Bad Request).")
+        else:
+            raise
+
+
+def get_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, extra_args: Optional[Dict[Any, Any]] = None) -> GetObjectOutputTypeDef:
+    if extra_args is None:
+        extra_args = dict()
+    s3_client = s3_resource.meta.client
+    return s3_client.get_object(Bucket=bucket, Key=key, **extra_args)
+
+
+def put_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, body: Union[str, bytes], extra_args: Optional[Dict[Any, Any]] = None) -> PutObjectOutputTypeDef:
+    if extra_args is None:
+        extra_args = dict()
+    s3_client = s3_resource.meta.client
+    return s3_client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
+
+
+def generate_presigned_url(s3_resource: S3ServiceResource, bucket: str, key_name: str, expiration: int) -> str:
+    s3_client = s3_resource.meta.client
+    return s3_client.generate_presigned_url(
+        'get_object',
+        Params={'Bucket': bucket, 'Key': key_name},
+        ExpiresIn=expiration)
+
+
+def create_public_url(s3_resource: S3ServiceResource, bucket: str, key: str) -> str:
+    bucket_obj = s3_resource.Bucket(bucket)
+    bucket_obj.Object(key).Acl().put(ACL='public-read')  # TODO: do we need to generate a signed url after doing this?
+    url = generate_presigned_url(s3_resource=s3_resource,
+                                 bucket=bucket,
+                                 key_name=key,
+                                 # One year should be sufficient to finish any pipeline ;-)
+                                 expiration=int(timedelta(days=365).total_seconds()))
+    # boto doesn't properly remove the x-amz-security-token parameter when
+    # query_auth is False when using an IAM role (see issue #2043). Including the
+    # x-amz-security-token parameter without the access key results in a 403,
+    # even if the resource is public, so we need to remove it.
+    # TODO: verify that this is still the case
+    return modify_url(url, remove=['x-amz-security-token', 'AWSAccessKeyId', 'Signature'])
+
+
+def get_s3_bucket_region(s3_resource: S3ServiceResource, bucket: str) -> str:
+    s3_client = s3_resource.meta.client
+    # AWS returns None for the default of 'us-east-1'
+    return s3_client.get_bucket_location(Bucket=bucket).get('LocationConstraint', None) or 'us-east-1'
toil/lib/aws/utils.py
CHANGED
@@ -16,8 +16,17 @@ import logging
 import os
 import socket
 from collections.abc import Iterable, Iterator
-from typing import
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Literal,
+    Optional,
+    Union,
+    cast,
+)
+from urllib.parse import ParseResult, urlparse
 
 # To import toil.lib.aws.session, the AWS libraries must be installed
 from toil.lib.aws import AWSRegionName, AWSServerErrors, session
@@ -37,7 +46,6 @@ if TYPE_CHECKING:
     from mypy_boto3_s3 import S3ServiceResource
     from mypy_boto3_s3.service_resource import Bucket
     from mypy_boto3_s3.service_resource import Object as S3Object
-    from mypy_boto3_sdb.type_defs import AttributeTypeDef
 
 from botocore.exceptions import ClientError, EndpointConnectionError
 
@@ -131,6 +139,11 @@ def delete_s3_bucket(
 ) -> None:
     """
     Delete the given S3 bucket.
+
+    Note that S3 bucket deletion is only eventually-consistent, and anyone can
+    register any S3 bucket name. For both of these reasons, a bucket name might
+    not be immediately (or ever) available for re-use after this function
+    returns.
     """
     printq(f"Deleting s3 bucket: {bucket}", quiet)
 
@@ -153,7 +166,9 @@ def delete_s3_bucket(
                 Bucket=bucket, Key=entry["Key"], VersionId=entry["VersionId"]
             )
         s3_resource.Bucket(bucket).delete()
-
+        # S3 bucket deletion is only eventually-consistent. See
+        # <https://docs.aws.amazon.com/AmazonS3/latest/userguide/delete-bucket.html>
+        printq(f"\n * S3 bucket successfully scheduled for deletion: {bucket}\n\n", quiet)
     except s3_resource.meta.client.exceptions.NoSuchBucket:
         printq(f"\n * S3 bucket no longer exists: {bucket}\n\n", quiet)
 
@@ -323,17 +338,13 @@ def get_bucket_region(
             raise
         except KeyError as e:
             # If we get a weird head response we will have a KeyError
-            logger.debug(
-                "Strategy %d to get bucket location did not work: %s", i + 1, e
-            )
+            logger.debug("Strategy %d to get bucket location did not work: %s", i + 1, e)
             error_logs.append((i + 1, str(e)))
             last_error = e
 
     error_messages = []
     for rank, message in error_logs:
-        error_messages.append(
-            f"Strategy {rank} failed to get bucket location because: {message}"
-        )
+        error_messages.append(f"Strategy {rank} failed to get bucket location because: {message}")
     # If we get here we ran out of attempts.
     raise NoBucketLocationError(
         "Could not get bucket location: " + "\n".join(error_messages)
@@ -385,7 +396,6 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None, anonym
     :raises RuntimeError: when existing is False but the object exists.
     :raises PermissionError: when we are not authorized to look at the object.
     """
-
     key_name = url.path[1:]
     bucket_name = url.netloc
 
@@ -458,7 +468,6 @@ def list_objects_for_url(url: ParseResult, anonymous: Optional[bool] = None) ->
 
     :raises PermissionError: when we are not authorized to do the list operation.
     """
-
     key_name = url.path[1:]
     bucket_name = url.netloc
 
@@ -509,7 +518,7 @@ def list_objects_for_url(url: ParseResult, anonymous: Optional[bool] = None) ->
     return listing
 
 
-def flatten_tags(tags: dict[str, str]) -> list[dict[
+def flatten_tags(tags: dict[str, str]) -> list[dict[Union[Literal["Key"], Literal["Value"]], str]]:
     """
     Convert tags from a key to value dict into a list of 'Key': xxx, 'Value': xxx dicts.
     """
@@ -537,23 +546,3 @@ def boto3_pager(
     for page in paginator.paginate(**kwargs):
         # Invoke it and go through the pages, yielding from them
         yield from page.get(result_attribute_name, [])
-
-
-def get_item_from_attributes(attributes: list["AttributeTypeDef"], name: str) -> Any:
-    """
-    Given a list of attributes, find the attribute associated with the name and return its corresponding value.
-
-    The `attribute_list` will be a list of TypedDict's (which boto3 SDB functions commonly return),
-    where each TypedDict has a "Name" and "Value" key value pair.
-    This function grabs the value out of the associated TypedDict.
-
-    If the attribute with the name does not exist, the function will return None.
-
-    :param attributes: list of attributes
-    :param name: name of the attribute
-    :return: value of the attribute
-    """
-    return next(
-        (attribute["Value"] for attribute in attributes if attribute["Name"] == name),
-        None,
-    )