toil 9.1.1__py3-none-any.whl → 9.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- toil/__init__.py +5 -9
- toil/batchSystems/abstractBatchSystem.py +23 -22
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -12
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +4 -4
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/gridengine.py +3 -4
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +65 -63
- toil/batchSystems/local_support.py +2 -3
- toil/batchSystems/lsf.py +6 -7
- toil/batchSystems/mesos/batchSystem.py +11 -7
- toil/batchSystems/mesos/test/__init__.py +1 -2
- toil/batchSystems/options.py +9 -10
- toil/batchSystems/registry.py +3 -7
- toil/batchSystems/singleMachine.py +8 -11
- toil/batchSystems/slurm.py +49 -38
- toil/batchSystems/torque.py +3 -4
- toil/bus.py +36 -34
- toil/common.py +129 -89
- toil/cwl/cwltoil.py +857 -729
- toil/cwl/utils.py +44 -35
- toil/fileStores/__init__.py +3 -1
- toil/fileStores/abstractFileStore.py +28 -30
- toil/fileStores/cachingFileStore.py +8 -8
- toil/fileStores/nonCachingFileStore.py +10 -21
- toil/job.py +159 -158
- toil/jobStores/abstractJobStore.py +68 -69
- toil/jobStores/aws/jobStore.py +249 -213
- toil/jobStores/aws/utils.py +13 -24
- toil/jobStores/fileJobStore.py +28 -22
- toil/jobStores/googleJobStore.py +21 -17
- toil/jobStores/utils.py +3 -7
- toil/leader.py +17 -22
- toil/lib/accelerators.py +6 -4
- toil/lib/aws/__init__.py +9 -10
- toil/lib/aws/ami.py +33 -19
- toil/lib/aws/iam.py +6 -6
- toil/lib/aws/s3.py +259 -157
- toil/lib/aws/session.py +76 -76
- toil/lib/aws/utils.py +51 -43
- toil/lib/checksum.py +19 -15
- toil/lib/compatibility.py +3 -2
- toil/lib/conversions.py +45 -18
- toil/lib/directory.py +29 -26
- toil/lib/docker.py +93 -99
- toil/lib/dockstore.py +77 -50
- toil/lib/ec2.py +39 -38
- toil/lib/ec2nodes.py +11 -4
- toil/lib/exceptions.py +8 -5
- toil/lib/ftp_utils.py +9 -14
- toil/lib/generatedEC2Lists.py +161 -20
- toil/lib/history.py +141 -97
- toil/lib/history_submission.py +163 -72
- toil/lib/io.py +27 -17
- toil/lib/memoize.py +2 -1
- toil/lib/misc.py +15 -11
- toil/lib/pipes.py +40 -25
- toil/lib/plugins.py +12 -8
- toil/lib/resources.py +1 -0
- toil/lib/retry.py +32 -38
- toil/lib/threading.py +12 -12
- toil/lib/throttle.py +1 -2
- toil/lib/trs.py +113 -51
- toil/lib/url.py +14 -23
- toil/lib/web.py +7 -2
- toil/options/common.py +18 -15
- toil/options/cwl.py +2 -2
- toil/options/runner.py +9 -5
- toil/options/wdl.py +1 -3
- toil/provisioners/__init__.py +9 -9
- toil/provisioners/abstractProvisioner.py +22 -20
- toil/provisioners/aws/__init__.py +20 -14
- toil/provisioners/aws/awsProvisioner.py +10 -8
- toil/provisioners/clusterScaler.py +19 -18
- toil/provisioners/gceProvisioner.py +2 -3
- toil/provisioners/node.py +11 -13
- toil/realtimeLogger.py +4 -4
- toil/resource.py +5 -5
- toil/server/app.py +2 -2
- toil/server/cli/wes_cwl_runner.py +11 -11
- toil/server/utils.py +18 -21
- toil/server/wes/abstract_backend.py +9 -8
- toil/server/wes/amazon_wes_utils.py +3 -3
- toil/server/wes/tasks.py +3 -5
- toil/server/wes/toil_backend.py +17 -21
- toil/server/wsgi_app.py +3 -3
- toil/serviceManager.py +3 -4
- toil/statsAndLogging.py +12 -13
- toil/test/__init__.py +33 -24
- toil/test/batchSystems/batchSystemTest.py +12 -11
- toil/test/batchSystems/batch_system_plugin_test.py +3 -5
- toil/test/batchSystems/test_slurm.py +38 -24
- toil/test/cwl/conftest.py +5 -6
- toil/test/cwl/cwlTest.py +194 -78
- toil/test/cwl/download_file_uri.json +6 -0
- toil/test/cwl/download_file_uri_no_hostname.json +6 -0
- toil/test/docs/scripts/tutorial_staging.py +1 -0
- toil/test/jobStores/jobStoreTest.py +9 -7
- toil/test/lib/aws/test_iam.py +1 -3
- toil/test/lib/aws/test_s3.py +1 -1
- toil/test/lib/dockerTest.py +9 -9
- toil/test/lib/test_ec2.py +12 -11
- toil/test/lib/test_history.py +4 -4
- toil/test/lib/test_trs.py +16 -14
- toil/test/lib/test_url.py +7 -6
- toil/test/lib/url_plugin_test.py +12 -18
- toil/test/provisioners/aws/awsProvisionerTest.py +10 -8
- toil/test/provisioners/clusterScalerTest.py +2 -5
- toil/test/provisioners/clusterTest.py +1 -3
- toil/test/server/serverTest.py +13 -4
- toil/test/sort/restart_sort.py +2 -6
- toil/test/sort/sort.py +3 -8
- toil/test/src/deferredFunctionTest.py +7 -7
- toil/test/src/environmentTest.py +1 -2
- toil/test/src/fileStoreTest.py +5 -5
- toil/test/src/importExportFileTest.py +5 -6
- toil/test/src/jobServiceTest.py +22 -14
- toil/test/src/jobTest.py +121 -25
- toil/test/src/miscTests.py +5 -7
- toil/test/src/promisedRequirementTest.py +8 -7
- toil/test/src/regularLogTest.py +2 -3
- toil/test/src/resourceTest.py +5 -8
- toil/test/src/restartDAGTest.py +5 -6
- toil/test/src/resumabilityTest.py +2 -2
- toil/test/src/retainTempDirTest.py +3 -3
- toil/test/src/systemTest.py +3 -3
- toil/test/src/threadingTest.py +1 -1
- toil/test/src/workerTest.py +1 -2
- toil/test/utils/toilDebugTest.py +6 -4
- toil/test/utils/toilKillTest.py +1 -1
- toil/test/utils/utilsTest.py +15 -14
- toil/test/wdl/wdltoil_test.py +247 -124
- toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
- toil/toilState.py +2 -3
- toil/utils/toilDebugFile.py +3 -8
- toil/utils/toilDebugJob.py +1 -2
- toil/utils/toilLaunchCluster.py +1 -2
- toil/utils/toilSshCluster.py +2 -0
- toil/utils/toilStats.py +19 -24
- toil/utils/toilStatus.py +11 -14
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +313 -209
- toil/worker.py +18 -12
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/METADATA +11 -14
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/RECORD +150 -153
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/WHEEL +1 -1
- toil/test/cwl/staging_cat.cwl +0 -27
- toil/test/cwl/staging_make_file.cwl +0 -25
- toil/test/cwl/staging_workflow.cwl +0 -43
- toil/test/cwl/zero_default.cwl +0 -61
- toil/test/utils/ABCWorkflowDebug/ABC.txt +0 -1
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/entry_points.txt +0 -0
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/licenses/LICENSE +0 -0
- {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/top_level.txt +0 -0
toil/lib/aws/s3.py
CHANGED
@@ -14,41 +14,37 @@
 
 import hashlib
 import itertools
-import urllib.parse
 import logging
-
-from
-from datetime import timedelta
+import urllib.parse
+from collections.abc import Iterator, MutableSequence
 from contextlib import contextmanager
-from
-
-
-    Optional,
-    Union,
-    Tuple,
-    cast,
-    Literal,
-    Iterator,
-    IO)
-
-from toil.lib.retry import retry, get_error_status, ErrorCondition
-from toil.lib.misc import printq
-from toil.lib.aws import AWSServerErrors, session, build_tag_dict_from_env
-from toil.lib.aws.utils import enable_public_objects, flatten_tags
-from toil.lib.conversions import modify_url, MB, MIB, TB
-from toil.lib.pipes import WritablePipe, ReadablePipe, HashingPipe
+from datetime import timedelta
+from io import BytesIO
+from typing import IO, Any, Literal, cast
 
 # This file cannot be imported without the botocore/boto3 modules. We need
 # these types to set up @retry annotations, and @retry annotations really need
 # to be passed the real types at import time, since they do annotation-time
 # type checking internally.
 from botocore.exceptions import ClientError
-
-from mypy_boto3_s3 import S3Client, S3ServiceResource
+from mypy_boto3_s3 import S3ServiceResource
 from mypy_boto3_s3.literals import BucketLocationConstraintType
-from mypy_boto3_s3.service_resource import Bucket
-from mypy_boto3_s3.type_defs import
-
+from mypy_boto3_s3.service_resource import Bucket
+from mypy_boto3_s3.type_defs import (
+    CompletedPartTypeDef,
+    GetObjectOutputTypeDef,
+    HeadObjectOutputTypeDef,
+    ListMultipartUploadsOutputTypeDef,
+    ObjectTypeDef,
+    PutObjectOutputTypeDef,
+)
+
+from toil.lib.aws import AWSServerErrors, build_tag_dict_from_env, session
+from toil.lib.aws.utils import enable_public_objects, flatten_tags
+from toil.lib.conversions import MB, MIB, TB, modify_url
+from toil.lib.misc import printq
+from toil.lib.pipes import HashingPipe, ReadablePipe, WritablePipe
+from toil.lib.retry import get_error_status, retry
 
 logger = logging.getLogger(__name__)
 
@@ -81,9 +77,9 @@ class AWSBadEncryptionKeyError(Exception):
 def create_s3_bucket(
     s3_resource: S3ServiceResource,
     bucket_name: str,
-    region:
-    tags:
-    public: bool = True
+    region: BucketLocationConstraintType | Literal["us-east-1"],
+    tags: dict[str, str] | None = None,
+    public: bool = True,
 ) -> "Bucket":
     """
     Create an AWS S3 bucket, using the given Boto3 S3 session, with the
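
Note (illustrative only, not part of the diff): the bucket-creation helper now uses PEP 604 unions and built-in generics in its signature. A minimal sketch of a call against the new signature, with made-up bucket name, region, and tags, using boto3.resource("s3") to obtain the S3ServiceResource:

    import boto3
    from toil.lib.aws.s3 import create_s3_bucket

    s3 = boto3.resource("s3")  # an S3ServiceResource
    bucket = create_s3_bucket(
        s3,
        bucket_name="example-toil-bucket",  # hypothetical bucket name
        region="us-west-2",
        tags={"Owner": "toil"},
        public=False,
    )
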
@@ -106,7 +102,7 @@ def create_s3_bucket(
 
     tags = build_tag_dict_from_env() if tags is None else tags
     bucket_tagging = s3_resource.BucketTagging(bucket_name)
-    bucket_tagging.put(Tagging={
+    bucket_tagging.put(Tagging={"TagSet": flatten_tags(tags)})  # type: ignore
 
     # enabling public objects is the historical default
     if public:
@@ -117,9 +113,7 @@
 
 @retry(errors=[ClientError])
 def delete_s3_bucket(
-    s3_resource: S3ServiceResource,
-    bucket_name: str,
-    quiet: bool = True
+    s3_resource: S3ServiceResource, bucket_name: str, quiet: bool = True
 ) -> None:
     """
     Delete the bucket with 'bucket_name'.
@@ -127,60 +121,63 @@
     Note: 'quiet' is False when used for a clean up utility script (contrib/admin/cleanup_aws_resources.py)
     that prints progress rather than logging. Logging should be used for all other internal Toil usage.
     """
-    assert isinstance(
+    assert isinstance(
+        bucket_name, str
+    ), f"{bucket_name} is not a string ({type(bucket_name)})."
     logger.debug("Deleting bucket '%s'.", bucket_name)
-    printq(f
+    printq(f"\n * Deleting s3 bucket: {bucket_name}\n\n", quiet)
 
     s3_client = s3_resource.meta.client
 
     try:
-        for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get(
+        for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get(
+            "Uploads", []
+        ):
             s3_client.abort_multipart_upload(
-                Bucket=bucket_name,
-                Key=u["Key"],
-                UploadId=u["UploadId"]
+                Bucket=bucket_name, Key=u["Key"], UploadId=u["UploadId"]
             )
 
-        paginator = s3_client.get_paginator(
+        paginator = s3_client.get_paginator("list_object_versions")
         for response in paginator.paginate(Bucket=bucket_name):
             # Versions and delete markers can both go in here to be deleted.
             # They both have Key and VersionId, but there's no shared base type
             # defined for them in the stubs to express that. See
             # <https://github.com/vemel/mypy_boto3_builder/issues/123>. So we
             # have to do gymnastics to get them into the same list.
-            to_delete:
-
+            to_delete: list[dict[str, Any]] = cast(
+                list[dict[str, Any]], response.get("Versions", [])
+            ) + cast(list[dict[str, Any]], response.get("DeleteMarkers", []))
             for entry in to_delete:
-                printq(
+                printq(
+                    f" Deleting {entry['Key']} version {entry['VersionId']}", quiet
+                )
                 s3_client.delete_object(
-                    Bucket=bucket_name,
-                    Key=entry['Key'],
-                    VersionId=entry['VersionId']
+                    Bucket=bucket_name, Key=entry["Key"], VersionId=entry["VersionId"]
                 )
         bucket = s3_resource.Bucket(bucket_name)
         bucket.objects.all().delete()
        bucket.object_versions.delete()
         bucket.delete()
-        printq(f
+        printq(f"\n * Deleted s3 bucket successfully: {bucket_name}\n\n", quiet)
         logger.debug("Deleted s3 bucket successfully '%s'.", bucket_name)
     except s3_client.exceptions.NoSuchBucket:
-        printq(f
+        printq(f"\n * S3 bucket no longer exists: {bucket_name}\n\n", quiet)
         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
     except ClientError as e:
         if get_error_status(e) != 404:
             raise
-        printq(f
+        printq(f"\n * S3 bucket no longer exists: {bucket_name}\n\n", quiet)
         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
 
 
 @retry(errors=[AWSServerErrors])
-def bucket_exists(s3_resource: S3ServiceResource, bucket: str) ->
+def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> bool | Bucket:
     s3_client = s3_resource.meta.client
     try:
         s3_client.head_bucket(Bucket=bucket)
         return s3_resource.Bucket(bucket)
     except (ClientError, s3_client.exceptions.NoSuchBucket) as e:
-        error_code = e.response.get(
+        error_code = e.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
         if error_code == 404:
             return False
         else:
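
Note (illustrative only, not part of the diff): a short sketch of the two helpers reformatted above; bucket_exists returns the Bucket object (truthy) or False, so it can gate the delete. The bucket name is invented:

    import boto3
    from toil.lib.aws.s3 import bucket_exists, delete_s3_bucket

    s3 = boto3.resource("s3")
    if bucket_exists(s3, "example-toil-bucket"):  # Bucket object or False
        delete_s3_bucket(s3, "example-toil-bucket", quiet=False)  # prints progress
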
@@ -188,7 +185,9 @@ def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> Union[bool, Bu
 
 
 @retry(errors=[AWSServerErrors])
-def head_s3_object(
+def head_s3_object(
+    bucket: str, key: str, header: dict[str, Any], region: str | None = None
+) -> HeadObjectOutputTypeDef:
     """
     Attempt to HEAD an s3 object and return its response.
 
@@ -203,31 +202,44 @@ def head_s3_object(bucket: str, key: str, header: Dict[str, Any], region: Option
 
 
 @retry(errors=[AWSServerErrors])
-def list_multipart_uploads(
+def list_multipart_uploads(
+    bucket: str, region: str, prefix: str, max_uploads: int = 1
+) -> ListMultipartUploadsOutputTypeDef:
     s3_client = session.client("s3", region_name=region)
-    return s3_client.list_multipart_uploads(
+    return s3_client.list_multipart_uploads(
+        Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix
+    )
 
 
 @retry(errors=[AWSServerErrors])
-def copy_s3_to_s3(
-
+def copy_s3_to_s3(
+    s3_resource: S3ServiceResource,
+    src_bucket: str,
+    src_key: str,
+    dst_bucket: str,
+    dst_key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
+    source = {"Bucket": src_bucket, "Key": src_key}
     # Note: this may have errors if using sse-c because of
     # a bug with encryption using copy_object and copy (which uses copy_object for files <5GB):
     # https://github.com/aws/aws-cli/issues/6012
     # this will only happen if we attempt to copy a file previously encrypted with sse-c
     # copying an unencrypted file and encrypting it as sse-c seems to work fine though
-    kwargs = dict(
+    kwargs = dict(
+        CopySource=source, Bucket=dst_bucket, Key=dst_key, ExtraArgs=extra_args
+    )
     s3_resource.meta.client.copy(**kwargs)  # type: ignore
 
 
 # TODO: Determine specific retries
 @retry(errors=[AWSServerErrors])
 def copy_local_to_s3(
-
-
-
-
-
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    dst_bucket: str,
+    dst_key: str,
+    extra_args: dict[Any, Any] | None = None,
 ) -> None:
     s3_client = s3_resource.meta.client
     s3_client.upload_file(local_file_path, dst_bucket, dst_key, ExtraArgs=extra_args)
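
Note (illustrative only, not part of the diff): the copy helpers keep their behavior; only the argument lists are reflowed. A sketch using invented bucket, key, and path values:

    import boto3
    from toil.lib.aws.s3 import copy_local_to_s3, copy_s3_to_s3

    s3 = boto3.resource("s3")
    copy_local_to_s3(s3, "/tmp/report.txt", "example-bucket", "reports/report.txt")
    copy_s3_to_s3(
        s3,
        src_bucket="example-bucket",
        src_key="reports/report.txt",
        dst_bucket="example-archive",
        dst_key="reports/report.txt",
    )
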
@@ -236,19 +248,28 @@ def copy_local_to_s3(
 # TODO: Determine specific retries
 @retry(errors=[AWSServerErrors])
 def copy_s3_to_local(
-
-
-
-
-
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    src_bucket: str,
+    src_key: str,
+    extra_args: dict[Any, Any] | None = None,
 ) -> None:
     s3_client = s3_resource.meta.client
     s3_client.download_file(src_bucket, src_key, local_file_path, ExtraArgs=extra_args)
 
 
 class MultiPartPipe(WritablePipe):
-    def __init__(
-
+    def __init__(
+        self,
+        part_size: int,
+        s3_resource: S3ServiceResource,
+        bucket_name: str,
+        file_id: str,
+        encryption_args: dict[Any, Any] | None,
+        encoding: str | None = None,
+        errors: str | None = None,
+    ) -> None:
+        super().__init__()
         self.encoding = encoding
         self.errors = errors
         self.part_size = part_size
@@ -267,22 +288,28 @@ class MultiPartPipe(WritablePipe):
         hasher.update(buf)
 
         # low-level clients are thread safe
-        response = self.s3_client.create_multipart_upload(
-
-
-        upload_id = response[
-        parts = []
+        response = self.s3_client.create_multipart_upload(
+            Bucket=self.bucket_name, Key=self.file_id, **self.encryption_args
+        )
+        upload_id = response["UploadId"]
+        parts: MutableSequence[CompletedPartTypeDef] = []
         try:
             for part_num in itertools.count():
-                logger.debug(
+                logger.debug(
+                    f"[{upload_id}] Uploading part %d of %d bytes",
+                    part_num + 1,
+                    len(buf),
+                )
                 # TODO: include the Content-MD5 header:
                 # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
-                part = self.s3_client.upload_part(
-
-
-
-
-
+                part = self.s3_client.upload_part(
+                    Bucket=self.bucket_name,
+                    Key=self.file_id,
+                    PartNumber=part_num + 1,
+                    UploadId=upload_id,
+                    Body=BytesIO(buf),
+                    **self.encryption_args,
+                )
                 parts.append({"PartNumber": part_num + 1, "ETag": part["ETag"]})
 
                 # Get the next block of data we want to put
@@ -292,48 +319,56 @@ class MultiPartPipe(WritablePipe):
                     break
                 hasher.update(buf)
         except:
-            self.s3_client.abort_multipart_upload(
-
-
+            self.s3_client.abort_multipart_upload(
+                Bucket=self.bucket_name, Key=self.file_id, UploadId=upload_id
+            )
         else:
             # Save the checksum
-            checksum = f
+            checksum = f"sha1${hasher.hexdigest()}"
             # Encryption information is not needed here because the upload was
             # not started with a checksum.
-            response = self.s3_client.complete_multipart_upload(
-
-
-
-
+            response = self.s3_client.complete_multipart_upload(
+                Bucket=self.bucket_name,
+                Key=self.file_id,
+                UploadId=upload_id,
+                MultipartUpload={"Parts": parts},
+            )  # type: ignore
+            logger.debug(f"[{upload_id}] Upload of {self.file_id} complete...")
 
 
-def parse_s3_uri(uri: str) ->
+def parse_s3_uri(uri: str) -> tuple[str, str]:
     # does not support s3/gs: https://docs.python.org/3/library/urllib.parse.html
     # use regex instead?
     uri = urllib.parse.urlparse(uri)  # type: ignore
-    if uri.scheme.lower() !=
-        raise ValueError(f
-    bucket_name, key_name = uri.netloc.strip(
+    if uri.scheme.lower() != "s3":  # type: ignore
+        raise ValueError(f"Invalid schema. Expecting s3 prefix, not: {uri}")
+    bucket_name, key_name = uri.netloc.strip("/"), uri.path.strip("/")  # type: ignore
     return bucket_name, key_name
 
 
-def list_s3_items(
+def list_s3_items(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    prefix: str,
+    startafter: str | None = None,
+) -> Iterator[ObjectTypeDef]:
     s3_client = s3_resource.meta.client
-    paginator = s3_client.get_paginator(
+    paginator = s3_client.get_paginator("list_objects_v2")
     kwargs = dict(Bucket=bucket, Prefix=prefix)
     if startafter:
-        kwargs[
+        kwargs["StartAfter"] = startafter
     for page in paginator.paginate(**kwargs):  # type: ignore
-
-        yield key
+        yield from page.get("Contents", [])
 
 
 @retry(errors=[AWSServerErrors])
-def upload_to_s3(
-
-
-
-
+def upload_to_s3(
+    readable: IO[Any],
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
     """
     Upload a readable object to s3, using multipart uploading if applicable.
 
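
Note (illustrative only, not part of the diff): list_s3_items now yields the ObjectTypeDef dicts from each list_objects_v2 "Contents" page directly. A sketch with an invented URI:

    import boto3
    from toil.lib.aws.s3 import list_s3_items, parse_s3_uri

    s3 = boto3.resource("s3")
    bucket, prefix = parse_s3_uri("s3://example-bucket/some/prefix")
    for obj in list_s3_items(s3, bucket=bucket, prefix=prefix):
        print(obj["Key"], obj["Size"])  # fields of a list_objects_v2 "Contents" entry
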
@@ -350,33 +385,45 @@ def upload_to_s3(readable: IO[Any],
 
     s3_client = s3_resource.meta.client
     from boto3.s3.transfer import TransferConfig
+
     config = TransferConfig(
         multipart_threshold=DEFAULT_AWS_CHUNK_SIZE,
         multipart_chunksize=DEFAULT_AWS_CHUNK_SIZE,
-        use_threads=True
+        use_threads=True,
     )
     logger.debug("Uploading %s", key)
     # these methods use multipart if necessary
     if isinstance(readable, str):
-        s3_client.upload_file(
-
-
-
-
+        s3_client.upload_file(
+            Filename=readable,
+            Bucket=bucket,
+            Key=key,
+            ExtraArgs=extra_args,
+            Config=config,
+        )
     else:
-        s3_client.upload_fileobj(
-
-
-
-
+        s3_client.upload_fileobj(
+            Fileobj=readable,
+            Bucket=bucket,
+            Key=key,
+            ExtraArgs=extra_args,
+            Config=config,
+        )
 
     object_summary = s3_resource.ObjectSummary(bucket, key)
     object_summary.wait_until_exists(**extra_args)
 
 
 @contextmanager
-def download_stream(
-
+def download_stream(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    checksum_to_verify: str | None = None,
+    extra_args: dict[Any, Any] | None = None,
+    encoding: str | None = None,
+    errors: str | None = None,
+) -> Iterator[IO[Any]]:
     """Context manager that gives out a download stream to download data."""
     bucket_obj: Bucket = s3_resource.Bucket(bucket)
 
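
Note (illustrative only, not part of the diff): download_stream is a context manager yielding a readable file object (text mode when an encoding is given). A sketch with invented bucket and key values:

    import boto3
    from toil.lib.aws.s3 import download_stream

    s3 = boto3.resource("s3")
    with download_stream(s3, "example-bucket", "reports/report.txt", encoding="utf-8") as readable:
        for line in readable:
            print(line.rstrip())
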
@@ -384,14 +431,16 @@ def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, check
         def writeTo(self, writable: IO[Any]) -> None:
             kwargs = dict(Key=key, Fileobj=writable, ExtraArgs=extra_args)
             if not extra_args:
-                del kwargs[
+                del kwargs["ExtraArgs"]
             bucket_obj.download_fileobj(**kwargs)  # type: ignore
 
     try:
         if checksum_to_verify:
             with DownloadPipe(encoding=encoding, errors=errors) as readable:
                 # Interpose a pipe to check the hash
-                with HashingPipe(
+                with HashingPipe(
+                    readable, encoding=encoding, errors=errors
+                ) as verified:
                     yield verified
         else:
             # Readable end of pipe produces text mode output if encoding specified
@@ -401,35 +450,59 @@ def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, check
     except s3_resource.meta.client.exceptions.NoSuchKey:
         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
     except ClientError as e:
-        if e.response.get(
-            raise AWSKeyNotFoundError(
-
-
-
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError(
-
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         raise
 
 
-def download_fileobject(
+def download_fileobject(
+    s3_resource: S3ServiceResource,
+    bucket: Bucket,
+    key: str,
+    fileobj: BytesIO,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
     try:
         bucket.download_fileobj(Key=key, Fileobj=fileobj, ExtraArgs=extra_args)
     except s3_resource.meta.client.exceptions.NoSuchKey:
         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
     except ClientError as e:
-        if e.response.get(
-            raise AWSKeyNotFoundError(
-
-
-
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError(
-
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         raise
 
 
-def s3_key_exists(
+def s3_key_exists(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    check: bool = False,
+    extra_args: dict[Any, Any] | None = None,
+) -> bool:
     """Return True if the s3 obect exists, and False if not. Will error if encryption args are incorrect."""
     extra_args = extra_args or {}
     s3_client = s3_resource.meta.client
@@ -438,62 +511,91 @@ def s3_key_exists(s3_resource: S3ServiceResource, bucket: str, key: str, check:
         return True
     except s3_client.exceptions.NoSuchKey:
         if check:
-            raise AWSKeyNotFoundError(
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
         return False
     except ClientError as e:
-        if e.response.get(
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
             if check:
-                raise AWSKeyNotFoundError(
+                raise AWSKeyNotFoundError(
+                    f"Key '{key}' does not exist in bucket '{bucket}'."
+                )
             return False
-        elif
-
-
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError(
-
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         else:
             raise
 
 
-def get_s3_object(
+def get_s3_object(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> GetObjectOutputTypeDef:
     if extra_args is None:
         extra_args = dict()
     s3_client = s3_resource.meta.client
     return s3_client.get_object(Bucket=bucket, Key=key, **extra_args)
 
 
-def put_s3_object(
+def put_s3_object(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    body: str | bytes,
+    extra_args: dict[Any, Any] | None = None,
+) -> PutObjectOutputTypeDef:
     if extra_args is None:
         extra_args = dict()
     s3_client = s3_resource.meta.client
     return s3_client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
 
 
-def generate_presigned_url(
+def generate_presigned_url(
+    s3_resource: S3ServiceResource, bucket: str, key_name: str, expiration: int
+) -> str:
     s3_client = s3_resource.meta.client
     return s3_client.generate_presigned_url(
-
-
-        ExpiresIn=expiration)
+        "get_object", Params={"Bucket": bucket, "Key": key_name}, ExpiresIn=expiration
+    )
 
 
 def create_public_url(s3_resource: S3ServiceResource, bucket: str, key: str) -> str:
     bucket_obj = s3_resource.Bucket(bucket)
-    bucket_obj.Object(key).Acl().put(
-
-
-
-
-
+    bucket_obj.Object(key).Acl().put(
+        ACL="public-read"
+    )  # TODO: do we need to generate a signed url after doing this?
+    url = generate_presigned_url(
+        s3_resource=s3_resource,
+        bucket=bucket,
+        key_name=key,
+        # One year should be sufficient to finish any pipeline ;-)
+        expiration=int(timedelta(days=365).total_seconds()),
+    )
     # boto doesn't properly remove the x-amz-security-token parameter when
     # query_auth is False when using an IAM role (see issue #2043). Including the
     # x-amz-security-token parameter without the access key results in a 403,
     # even if the resource is public, so we need to remove it.
     # TODO: verify that this is still the case
-    return modify_url(
+    return modify_url(
+        url, remove=["x-amz-security-token", "AWSAccessKeyId", "Signature"]
+    )
 
 
 def get_s3_bucket_region(s3_resource: S3ServiceResource, bucket: str) -> str:
     s3_client = s3_resource.meta.client
     # AWS returns None for the default of 'us-east-1'
-    return
+    return (
+        s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", None)
+        or "us-east-1"
+    )