toil 8.2.0__py3-none-any.whl → 9.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/registry.py +15 -118
  6. toil/batchSystems/slurm.py +191 -16
  7. toil/common.py +20 -1
  8. toil/cwl/cwltoil.py +97 -119
  9. toil/cwl/utils.py +103 -3
  10. toil/fileStores/__init__.py +1 -1
  11. toil/fileStores/abstractFileStore.py +5 -2
  12. toil/fileStores/cachingFileStore.py +1 -1
  13. toil/job.py +30 -14
  14. toil/jobStores/abstractJobStore.py +35 -255
  15. toil/jobStores/aws/jobStore.py +864 -1964
  16. toil/jobStores/aws/utils.py +24 -270
  17. toil/jobStores/fileJobStore.py +2 -1
  18. toil/jobStores/googleJobStore.py +32 -13
  19. toil/jobStores/utils.py +0 -327
  20. toil/leader.py +27 -22
  21. toil/lib/accelerators.py +1 -1
  22. toil/lib/aws/config.py +22 -0
  23. toil/lib/aws/s3.py +477 -9
  24. toil/lib/aws/utils.py +22 -33
  25. toil/lib/checksum.py +88 -0
  26. toil/lib/conversions.py +33 -31
  27. toil/lib/directory.py +217 -0
  28. toil/lib/ec2.py +97 -29
  29. toil/lib/exceptions.py +2 -1
  30. toil/lib/expando.py +2 -2
  31. toil/lib/generatedEC2Lists.py +138 -19
  32. toil/lib/io.py +33 -2
  33. toil/lib/memoize.py +21 -7
  34. toil/lib/misc.py +1 -1
  35. toil/lib/pipes.py +385 -0
  36. toil/lib/plugins.py +106 -0
  37. toil/lib/retry.py +1 -1
  38. toil/lib/threading.py +1 -1
  39. toil/lib/url.py +320 -0
  40. toil/lib/web.py +4 -5
  41. toil/options/cwl.py +13 -1
  42. toil/options/runner.py +17 -10
  43. toil/options/wdl.py +12 -1
  44. toil/provisioners/__init__.py +5 -2
  45. toil/provisioners/aws/__init__.py +43 -36
  46. toil/provisioners/aws/awsProvisioner.py +47 -15
  47. toil/provisioners/node.py +60 -12
  48. toil/resource.py +3 -13
  49. toil/server/app.py +12 -6
  50. toil/server/cli/wes_cwl_runner.py +2 -2
  51. toil/server/wes/abstract_backend.py +21 -43
  52. toil/server/wes/toil_backend.py +2 -2
  53. toil/test/__init__.py +16 -18
  54. toil/test/batchSystems/batchSystemTest.py +2 -9
  55. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  56. toil/test/batchSystems/test_slurm.py +103 -14
  57. toil/test/cwl/cwlTest.py +181 -8
  58. toil/test/cwl/staging_cat.cwl +27 -0
  59. toil/test/cwl/staging_make_file.cwl +25 -0
  60. toil/test/cwl/staging_workflow.cwl +43 -0
  61. toil/test/cwl/zero_default.cwl +61 -0
  62. toil/test/docs/scripts/tutorial_staging.py +17 -8
  63. toil/test/docs/scriptsTest.py +2 -1
  64. toil/test/jobStores/jobStoreTest.py +23 -133
  65. toil/test/lib/aws/test_iam.py +7 -7
  66. toil/test/lib/aws/test_s3.py +30 -33
  67. toil/test/lib/aws/test_utils.py +9 -9
  68. toil/test/lib/test_url.py +69 -0
  69. toil/test/lib/url_plugin_test.py +105 -0
  70. toil/test/provisioners/aws/awsProvisionerTest.py +60 -7
  71. toil/test/provisioners/clusterTest.py +15 -2
  72. toil/test/provisioners/gceProvisionerTest.py +1 -1
  73. toil/test/server/serverTest.py +78 -36
  74. toil/test/src/autoDeploymentTest.py +2 -3
  75. toil/test/src/fileStoreTest.py +89 -87
  76. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  77. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  78. toil/test/utils/toilKillTest.py +35 -28
  79. toil/test/wdl/md5sum/md5sum-gs.json +1 -1
  80. toil/test/wdl/md5sum/md5sum.json +1 -1
  81. toil/test/wdl/testfiles/read_file.wdl +18 -0
  82. toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
  83. toil/test/wdl/wdltoil_test.py +171 -162
  84. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  85. toil/utils/toilDebugFile.py +6 -3
  86. toil/utils/toilSshCluster.py +23 -0
  87. toil/utils/toilStats.py +17 -2
  88. toil/utils/toilUpdateEC2Instances.py +1 -0
  89. toil/version.py +10 -10
  90. toil/wdl/wdltoil.py +1179 -825
  91. toil/worker.py +16 -8
  92. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/METADATA +32 -32
  93. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/RECORD +97 -85
  94. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/WHEEL +1 -1
  95. toil/lib/iterables.py +0 -112
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  97. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
  98. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
  99. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
toil/lib/aws/s3.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2015-2024 Regents of the University of California
+ # Copyright (C) 2015-2023 Regents of the University of California
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -11,21 +11,489 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
+ import hashlib
+ import itertools
+ import urllib.parse
  import logging

- from mypy_boto3_s3.type_defs import ListMultipartUploadsOutputTypeDef
+ from io import BytesIO
+ from datetime import timedelta
+ from contextlib import contextmanager
+ from typing import (Any,
+                     Dict,
+                     List,
+                     Optional,
+                     Union,
+                     Tuple,
+                     cast,
+                     Literal,
+                     Iterator,
+                     IO)
+
+ from toil.lib.retry import retry, get_error_status, ErrorCondition
+ from toil.lib.misc import printq
+ from toil.lib.aws import AWSServerErrors, session, build_tag_dict_from_env
+ from toil.lib.aws.utils import enable_public_objects, flatten_tags
+ from toil.lib.conversions import modify_url, MB, MIB, TB
+ from toil.lib.pipes import WritablePipe, ReadablePipe, HashingPipe
+
+ # This file cannot be imported without the botocore/boto3 modules. We need
+ # these types to set up @retry annotations, and @retry annotations really need
+ # to be passed the real types at import time, since they do annotation-time
+ # type checking internally.
+ from botocore.exceptions import ClientError
+
+ from mypy_boto3_s3 import S3Client, S3ServiceResource
+ from mypy_boto3_s3.literals import BucketLocationConstraintType
+ from mypy_boto3_s3.service_resource import Bucket, Object
+ from mypy_boto3_s3.type_defs import ListMultipartUploadsOutputTypeDef, HeadObjectOutputTypeDef, GetObjectOutputTypeDef, PutObjectOutputTypeDef, ObjectTypeDef

- from toil.lib.aws import AWSServerErrors, session
- from toil.lib.retry import retry

  logger = logging.getLogger(__name__)


+ # AWS Defined Limits
+ # https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+ AWS_MAX_MULTIPART_COUNT = 10000
+ AWS_MAX_CHUNK_SIZE = 5 * TB
+ AWS_MIN_CHUNK_SIZE = 5 * MB
+ # Note: There is no minimum size limit on the last part of a multipart upload.
+
+ # The chunk size we chose arbitrarily, but it must be consistent for etags
+ DEFAULT_AWS_CHUNK_SIZE = 128 * MIB
+ assert AWS_MAX_CHUNK_SIZE > DEFAULT_AWS_CHUNK_SIZE > AWS_MIN_CHUNK_SIZE
+
+
+ class AWSKeyNotFoundError(Exception):
+     pass
+
+
+ class AWSKeyAlreadyExistsError(Exception):
+     pass
+
+
+ class AWSBadEncryptionKeyError(Exception):
+     pass
+
+
+ @retry(errors=[ClientError])
+ def create_s3_bucket(
+     s3_resource: S3ServiceResource,
+     bucket_name: str,
+     region: Union[BucketLocationConstraintType, Literal["us-east-1"]],
+     tags: Optional[Dict[str, str]] = None,
+     public: bool = True
+ ) -> "Bucket":
+     """
+     Create an AWS S3 bucket, using the given Boto3 S3 session, with the
+     given name, in the given region.
+
+     Supports the us-east-1 region, where bucket creation is special.
+
+     *ALL* S3 bucket creation should use this function.
+     """
+     logger.info("Creating bucket '%s' in region %s.", bucket_name, region)
+     if region == "us-east-1":  # see https://github.com/boto/boto3/issues/125
+         bucket = s3_resource.create_bucket(Bucket=bucket_name)
+     else:
+         bucket = s3_resource.create_bucket(
+             Bucket=bucket_name,
+             CreateBucketConfiguration={"LocationConstraint": region},
+         )
+     # wait until the bucket exists before adding tags
+     bucket.wait_until_exists()
+
+     tags = build_tag_dict_from_env() if tags is None else tags
+     bucket_tagging = s3_resource.BucketTagging(bucket_name)
+     bucket_tagging.put(Tagging={'TagSet': flatten_tags(tags)})  # type: ignore
+
+     # enabling public objects is the historical default
+     if public:
+         enable_public_objects(bucket_name)
+
+     return bucket
+
+
+ @retry(errors=[ClientError])
+ def delete_s3_bucket(
+     s3_resource: S3ServiceResource,
+     bucket_name: str,
+     quiet: bool = True
+ ) -> None:
+     """
+     Delete the bucket with 'bucket_name'.
+
+     Note: 'quiet' is False when used for a clean up utility script (contrib/admin/cleanup_aws_resources.py)
+     that prints progress rather than logging. Logging should be used for all other internal Toil usage.
+     """
+     assert isinstance(bucket_name, str), f'{bucket_name} is not a string ({type(bucket_name)}).'
+     logger.debug("Deleting bucket '%s'.", bucket_name)
+     printq(f'\n * Deleting s3 bucket: {bucket_name}\n\n', quiet)
+
+     s3_client = s3_resource.meta.client
+
+     try:
+         for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get('Uploads', []):
+             s3_client.abort_multipart_upload(
+                 Bucket=bucket_name,
+                 Key=u["Key"],
+                 UploadId=u["UploadId"]
+             )
+
+         paginator = s3_client.get_paginator('list_object_versions')
+         for response in paginator.paginate(Bucket=bucket_name):
+             # Versions and delete markers can both go in here to be deleted.
+             # They both have Key and VersionId, but there's no shared base type
+             # defined for them in the stubs to express that. See
+             # <https://github.com/vemel/mypy_boto3_builder/issues/123>. So we
+             # have to do gymnastics to get them into the same list.
+             to_delete: List[Dict[str, Any]] = cast(List[Dict[str, Any]], response.get('Versions', [])) + \
+                 cast(List[Dict[str, Any]], response.get('DeleteMarkers', []))
+             for entry in to_delete:
+                 printq(f" Deleting {entry['Key']} version {entry['VersionId']}", quiet)
+                 s3_client.delete_object(
+                     Bucket=bucket_name,
+                     Key=entry['Key'],
+                     VersionId=entry['VersionId']
+                 )
+         bucket = s3_resource.Bucket(bucket_name)
+         bucket.objects.all().delete()
+         bucket.object_versions.delete()
+         bucket.delete()
+         printq(f'\n * Deleted s3 bucket successfully: {bucket_name}\n\n', quiet)
+         logger.debug("Deleted s3 bucket successfully '%s'.", bucket_name)
+     except s3_client.exceptions.NoSuchBucket:
+         printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
+     except ClientError as e:
+         if get_error_status(e) != 404:
+             raise
+         printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
+
+
  @retry(errors=[AWSServerErrors])
- def list_multipart_uploads(
-     bucket: str, region: str, prefix: str, max_uploads: int = 1
- ) -> ListMultipartUploadsOutputTypeDef:
+ def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> Union[bool, Bucket]:
+     s3_client = s3_resource.meta.client
+     try:
+         s3_client.head_bucket(Bucket=bucket)
+         return s3_resource.Bucket(bucket)
+     except (ClientError, s3_client.exceptions.NoSuchBucket) as e:
+         error_code = e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
+         if error_code == 404:
+             return False
+         else:
+             raise
+
+
+ @retry(errors=[AWSServerErrors])
+ def head_s3_object(bucket: str, key: str, header: Dict[str, Any], region: Optional[str] = None) -> HeadObjectOutputTypeDef:
+     """
+     Attempt to HEAD an s3 object and return its response.
+
+     :param bucket: AWS bucket name
+     :param key: AWS Key name for the s3 object
+     :param header: Headers to include (mostly for encryption).
+         See: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/head_object.html
+     :param region: Region that we want to look for the bucket in
+     """
      s3_client = session.client("s3", region_name=region)
-     return s3_client.list_multipart_uploads(
-         Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix
+     return s3_client.head_object(Bucket=bucket, Key=key, **header)
+
+
+ @retry(errors=[AWSServerErrors])
+ def list_multipart_uploads(bucket: str, region: str, prefix: str, max_uploads: int = 1) -> ListMultipartUploadsOutputTypeDef:
+     s3_client = session.client("s3", region_name=region)
+     return s3_client.list_multipart_uploads(Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix)
+
+
+ @retry(errors=[AWSServerErrors])
+ def copy_s3_to_s3(s3_resource: S3ServiceResource, src_bucket: str, src_key: str, dst_bucket: str, dst_key: str, extra_args: Optional[Dict[Any, Any]] = None) -> None:
+     source = {'Bucket': src_bucket, 'Key': src_key}
+     # Note: this may have errors if using sse-c because of
+     # a bug with encryption using copy_object and copy (which uses copy_object for files <5GB):
+     # https://github.com/aws/aws-cli/issues/6012
+     # this will only happen if we attempt to copy a file previously encrypted with sse-c
+     # copying an unencrypted file and encrypting it as sse-c seems to work fine though
+     kwargs = dict(CopySource=source, Bucket=dst_bucket, Key=dst_key, ExtraArgs=extra_args)
+     s3_resource.meta.client.copy(**kwargs)  # type: ignore
+
+
+ # TODO: Determine specific retries
+ @retry(errors=[AWSServerErrors])
+ def copy_local_to_s3(
+     s3_resource: S3ServiceResource,
+     local_file_path: str,
+     dst_bucket: str,
+     dst_key: str,
+     extra_args: Optional[Dict[Any, Any]] = None
+ ) -> None:
+     s3_client = s3_resource.meta.client
+     s3_client.upload_file(local_file_path, dst_bucket, dst_key, ExtraArgs=extra_args)
+
+
+ # TODO: Determine specific retries
+ @retry(errors=[AWSServerErrors])
+ def copy_s3_to_local(
+     s3_resource: S3ServiceResource,
+     local_file_path: str,
+     src_bucket: str,
+     src_key: str,
+     extra_args: Optional[Dict[Any, Any]] = None
+ ) -> None:
+     s3_client = s3_resource.meta.client
+     s3_client.download_file(src_bucket, src_key, local_file_path, ExtraArgs=extra_args)
+
+
+ class MultiPartPipe(WritablePipe):
+     def __init__(self, part_size: int, s3_resource: S3ServiceResource, bucket_name: str, file_id: str, encryption_args: Optional[Dict[Any, Any]], encoding: Optional[str] = None, errors: Optional[str] = None) -> None:
+         super(MultiPartPipe, self).__init__()
+         self.encoding = encoding
+         self.errors = errors
+         self.part_size = part_size
+         self.s3_client = s3_resource.meta.client
+         self.bucket_name = bucket_name
+         self.file_id = file_id
+         self.encryption_args = encryption_args or dict()
+
+     def readFrom(self, readable: IO[Any]) -> None:
+         # Get the first block of data we want to put
+         buf = readable.read(self.part_size)
+         assert isinstance(buf, bytes)
+
+         # We will compute a checksum
+         hasher = hashlib.sha1()
+         hasher.update(buf)
+
+         # low-level clients are thread safe
+         response = self.s3_client.create_multipart_upload(Bucket=self.bucket_name,
+                                                           Key=self.file_id,
+                                                           **self.encryption_args)
+         upload_id = response['UploadId']
+         parts = []
+         try:
+             for part_num in itertools.count():
+                 logger.debug(f'[{upload_id}] Uploading part %d of %d bytes', part_num + 1, len(buf))
+                 # TODO: include the Content-MD5 header:
+                 # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
+                 part = self.s3_client.upload_part(Bucket=self.bucket_name,
+                                                   Key=self.file_id,
+                                                   PartNumber=part_num + 1,
+                                                   UploadId=upload_id,
+                                                   Body=BytesIO(buf),
+                                                   **self.encryption_args)
+                 parts.append({"PartNumber": part_num + 1, "ETag": part["ETag"]})
+
+                 # Get the next block of data we want to put
+                 buf = readable.read(self.part_size)
+                 if len(buf) == 0:
+                     # Don't allow any part other than the very first to be empty.
+                     break
+                 hasher.update(buf)
+         except:
+             self.s3_client.abort_multipart_upload(Bucket=self.bucket_name,
+                                                   Key=self.file_id,
+                                                   UploadId=upload_id)
+         else:
+             # Save the checksum
+             checksum = f'sha1${hasher.hexdigest()}'
+             # Encryption information is not needed here because the upload was
+             # not started with a checksum.
+             response = self.s3_client.complete_multipart_upload(Bucket=self.bucket_name,  # type: ignore
+                                                                 Key=self.file_id,
+                                                                 UploadId=upload_id,
+                                                                 MultipartUpload={"Parts": parts})  # type: ignore
+             logger.debug(f'[{upload_id}] Upload of {self.file_id} complete...')
+
+
+ def parse_s3_uri(uri: str) -> Tuple[str, str]:
+     # does not support s3/gs: https://docs.python.org/3/library/urllib.parse.html
+     # use regex instead?
+     uri = urllib.parse.urlparse(uri)  # type: ignore
+     if uri.scheme.lower() != 's3':  # type: ignore
+         raise ValueError(f'Invalid schema. Expecting s3 prefix, not: {uri}')
+     bucket_name, key_name = uri.netloc.strip('/'), uri.path.strip('/')  # type: ignore
+     return bucket_name, key_name
+
+
+ def list_s3_items(s3_resource: S3ServiceResource, bucket: str, prefix: str, startafter: Optional[str] = None) -> Iterator[ObjectTypeDef]:
+     s3_client = s3_resource.meta.client
+     paginator = s3_client.get_paginator('list_objects_v2')
+     kwargs = dict(Bucket=bucket, Prefix=prefix)
+     if startafter:
+         kwargs['StartAfter'] = startafter
+     for page in paginator.paginate(**kwargs):  # type: ignore
+         for key in page.get('Contents', []):
+             yield key
+
+
+ @retry(errors=[AWSServerErrors])
+ def upload_to_s3(readable: IO[Any],
+                  s3_resource: S3ServiceResource,
+                  bucket: str,
+                  key: str,
+                  extra_args: Optional[Dict[Any, Any]] = None) -> None:
+     """
+     Upload a readable object to s3, using multipart uploading if applicable.
+
+     :param readable: a readable stream or a local file path to upload to s3
+     :param S3.Resource resource: boto3 resource
+     :param str bucket: name of the bucket to upload to
+     :param str key: the name of the file to upload to
+     :param dict extra_args: http headers to use when uploading - generally used for encryption purposes
+     :param int partSize: max size of each part in the multipart upload, in bytes
+     :return: version of the newly uploaded file
+     """
+     if extra_args is None:
+         extra_args = {}
+
+     s3_client = s3_resource.meta.client
+     from boto3.s3.transfer import TransferConfig
+     config = TransferConfig(
+         multipart_threshold=DEFAULT_AWS_CHUNK_SIZE,
+         multipart_chunksize=DEFAULT_AWS_CHUNK_SIZE,
+         use_threads=True
      )
+     logger.debug("Uploading %s", key)
+     # these methods use multipart if necessary
+     if isinstance(readable, str):
+         s3_client.upload_file(Filename=readable,
+                               Bucket=bucket,
+                               Key=key,
+                               ExtraArgs=extra_args,
+                               Config=config)
+     else:
+         s3_client.upload_fileobj(Fileobj=readable,
+                                  Bucket=bucket,
+                                  Key=key,
+                                  ExtraArgs=extra_args,
+                                  Config=config)
+
+     object_summary = s3_resource.ObjectSummary(bucket, key)
+     object_summary.wait_until_exists(**extra_args)
+
+
+ @contextmanager
+ def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, checksum_to_verify: Optional[str] = None,
+                     extra_args: Optional[Dict[Any, Any]] = None, encoding: Optional[str] = None, errors: Optional[str] = None) -> Iterator[IO[Any]]:
+     """Context manager that gives out a download stream to download data."""
+     bucket_obj: Bucket = s3_resource.Bucket(bucket)
+
+     class DownloadPipe(ReadablePipe):
+         def writeTo(self, writable: IO[Any]) -> None:
+             kwargs = dict(Key=key, Fileobj=writable, ExtraArgs=extra_args)
+             if not extra_args:
+                 del kwargs['ExtraArgs']
+             bucket_obj.download_fileobj(**kwargs)  # type: ignore
+
+     try:
+         if checksum_to_verify:
+             with DownloadPipe(encoding=encoding, errors=errors) as readable:
+                 # Interpose a pipe to check the hash
+                 with HashingPipe(readable, encoding=encoding, errors=errors) as verified:
+                     yield verified
+         else:
+             # Readable end of pipe produces text mode output if encoding specified
+             with DownloadPipe(encoding=encoding, errors=errors) as readable:
+                 # No true checksum available, so don't hash
+                 yield readable
+     except s3_resource.meta.client.exceptions.NoSuchKey:
+         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+     except ClientError as e:
+         if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+             raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+         elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                 e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                 e.operation_name == 'HeadObject':
+             # An error occurred (400) when calling the HeadObject operation: Bad Request
+             raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                            f"(HeadObject operation on key '{key}': Bad Request).")
+         raise
+
+
+ def download_fileobject(s3_resource: S3ServiceResource, bucket: Bucket, key: str, fileobj: BytesIO, extra_args: Optional[Dict[Any, Any]] = None) -> None:
+     try:
+         bucket.download_fileobj(Key=key, Fileobj=fileobj, ExtraArgs=extra_args)
+     except s3_resource.meta.client.exceptions.NoSuchKey:
+         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+     except ClientError as e:
+         if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+             raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+         elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                 e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                 e.operation_name == 'HeadObject':
+             # An error occurred (400) when calling the HeadObject operation: Bad Request
+             raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                            f"(HeadObject operation on key '{key}': Bad Request).")
+         raise
+
+
+ def s3_key_exists(s3_resource: S3ServiceResource, bucket: str, key: str, check: bool = False, extra_args: Optional[Dict[Any, Any]] = None) -> bool:
+     """Return True if the s3 obect exists, and False if not. Will error if encryption args are incorrect."""
+     extra_args = extra_args or {}
+     s3_client = s3_resource.meta.client
+     try:
+         s3_client.head_object(Bucket=bucket, Key=key, **extra_args)
+         return True
+     except s3_client.exceptions.NoSuchKey:
+         if check:
+             raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+         return False
+     except ClientError as e:
+         if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+             if check:
+                 raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+             return False
+         elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
+                 e.response.get('Error', {}).get('Message') == 'Bad Request' and \
+                 e.operation_name == 'HeadObject':
+             # An error occurred (400) when calling the HeadObject operation: Bad Request
+             raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
+                                            f"(HeadObject operation on key '{key}': Bad Request).")
+         else:
+             raise
+
+
+ def get_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, extra_args: Optional[Dict[Any, Any]] = None) -> GetObjectOutputTypeDef:
+     if extra_args is None:
+         extra_args = dict()
+     s3_client = s3_resource.meta.client
+     return s3_client.get_object(Bucket=bucket, Key=key, **extra_args)
+
+
+ def put_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, body: Union[str, bytes], extra_args: Optional[Dict[Any, Any]] = None) -> PutObjectOutputTypeDef:
+     if extra_args is None:
+         extra_args = dict()
+     s3_client = s3_resource.meta.client
+     return s3_client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
+
+
+ def generate_presigned_url(s3_resource: S3ServiceResource, bucket: str, key_name: str, expiration: int) -> str:
+     s3_client = s3_resource.meta.client
+     return s3_client.generate_presigned_url(
+         'get_object',
+         Params={'Bucket': bucket, 'Key': key_name},
+         ExpiresIn=expiration)
+
+
+ def create_public_url(s3_resource: S3ServiceResource, bucket: str, key: str) -> str:
+     bucket_obj = s3_resource.Bucket(bucket)
+     bucket_obj.Object(key).Acl().put(ACL='public-read')  # TODO: do we need to generate a signed url after doing this?
+     url = generate_presigned_url(s3_resource=s3_resource,
+                                  bucket=bucket,
+                                  key_name=key,
+                                  # One year should be sufficient to finish any pipeline ;-)
+                                  expiration=int(timedelta(days=365).total_seconds()))
+     # boto doesn't properly remove the x-amz-security-token parameter when
+     # query_auth is False when using an IAM role (see issue #2043). Including the
+     # x-amz-security-token parameter without the access key results in a 403,
+     # even if the resource is public, so we need to remove it.
+     # TODO: verify that this is still the case
+     return modify_url(url, remove=['x-amz-security-token', 'AWSAccessKeyId', 'Signature'])
+
+
+ def get_s3_bucket_region(s3_resource: S3ServiceResource, bucket: str) -> str:
+     s3_client = s3_resource.meta.client
+     # AWS returns None for the default of 'us-east-1'
+     return s3_client.get_bucket_location(Bucket=bucket).get('LocationConstraint', None) or 'us-east-1'
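
For orientation, the sketch below shows how the helpers added to this module might be wired together. It is illustrative only: the bucket and key names are placeholders, and a plain boto3 resource is used for brevity where Toil's own code would obtain one through toil.lib.aws.session.

import boto3

from toil.lib.aws.s3 import create_s3_bucket, download_stream, upload_to_s3

# Placeholder resource and names; not taken from the package.
s3_resource = boto3.resource("s3")
create_s3_bucket(s3_resource, "example-toil-bucket", region="us-west-2", public=False)

# upload_to_s3 hands the stream to boto3's transfer manager, which switches to
# multipart uploading once the object exceeds DEFAULT_AWS_CHUNK_SIZE.
with open("results.tar.gz", "rb") as upload_source:
    upload_to_s3(upload_source, s3_resource, bucket="example-toil-bucket", key="runs/results.tar.gz")

# download_stream yields a readable pipe; passing checksum_to_verify would
# interpose a HashingPipe that checks the hash as the data streams through.
with download_stream(s3_resource, bucket="example-toil-bucket", key="runs/results.tar.gz") as stream:
    data = stream.read()
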
toil/lib/aws/utils.py CHANGED
@@ -16,8 +16,17 @@ import logging
  import os
  import socket
  from collections.abc import Iterable, Iterator
- from typing import TYPE_CHECKING, Any, Callable, ContextManager, Optional, cast
- from urllib.parse import ParseResult
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     ContextManager,
+     Literal,
+     Optional,
+     Union,
+     cast,
+ )
+ from urllib.parse import ParseResult, urlparse

  # To import toil.lib.aws.session, the AWS libraries must be installed
  from toil.lib.aws import AWSRegionName, AWSServerErrors, session
@@ -37,7 +46,6 @@ if TYPE_CHECKING:
      from mypy_boto3_s3 import S3ServiceResource
      from mypy_boto3_s3.service_resource import Bucket
      from mypy_boto3_s3.service_resource import Object as S3Object
-     from mypy_boto3_sdb.type_defs import AttributeTypeDef

  from botocore.exceptions import ClientError, EndpointConnectionError

@@ -131,6 +139,11 @@ def delete_s3_bucket(
  ) -> None:
      """
      Delete the given S3 bucket.
+
+     Note that S3 bucket deletion is only eventually-consistent, and anyone can
+     register any S3 bucket name. For both of these reasons, a bucket name might
+     not be immediately (or ever) available for re-use after this function
+     returns.
      """
      printq(f"Deleting s3 bucket: {bucket}", quiet)

@@ -153,7 +166,9 @@ def delete_s3_bucket(
                      Bucket=bucket, Key=entry["Key"], VersionId=entry["VersionId"]
                  )
          s3_resource.Bucket(bucket).delete()
-         printq(f"\n * Deleted s3 bucket successfully: {bucket}\n\n", quiet)
+         # S3 bucket deletion is only eventually-consistent. See
+         # <https://docs.aws.amazon.com/AmazonS3/latest/userguide/delete-bucket.html>
+         printq(f"\n * S3 bucket successfully scheduled for deletion: {bucket}\n\n", quiet)
      except s3_resource.meta.client.exceptions.NoSuchBucket:
          printq(f"\n * S3 bucket no longer exists: {bucket}\n\n", quiet)

@@ -323,17 +338,13 @@ def get_bucket_region(
              raise
          except KeyError as e:
              # If we get a weird head response we will have a KeyError
-             logger.debug(
-                 "Strategy %d to get bucket location did not work: %s", i + 1, e
-             )
+             logger.debug("Strategy %d to get bucket location did not work: %s", i + 1, e)
              error_logs.append((i + 1, str(e)))
              last_error = e

      error_messages = []
      for rank, message in error_logs:
-         error_messages.append(
-             f"Strategy {rank} failed to get bucket location because: {message}"
-         )
+         error_messages.append(f"Strategy {rank} failed to get bucket location because: {message}")
      # If we get here we ran out of attempts.
      raise NoBucketLocationError(
          "Could not get bucket location: " + "\n".join(error_messages)
@@ -385,7 +396,6 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None, anonym
      :raises RuntimeError: when existing is False but the object exists.
      :raises PermissionError: when we are not authorized to look at the object.
      """
-
      key_name = url.path[1:]
      bucket_name = url.netloc

@@ -458,7 +468,6 @@ def list_objects_for_url(url: ParseResult, anonymous: Optional[bool] = None) ->

      :raises PermissionError: when we are not authorized to do the list operation.
      """
-
      key_name = url.path[1:]
      bucket_name = url.netloc

@@ -509,7 +518,7 @@ def list_objects_for_url(url: ParseResult, anonymous: Optional[bool] = None) ->
      return listing


- def flatten_tags(tags: dict[str, str]) -> list[dict[str, str]]:
+ def flatten_tags(tags: dict[str, str]) -> list[dict[Union[Literal["Key"], Literal["Value"]], str]]:
      """
      Convert tags from a key to value dict into a list of 'Key': xxx, 'Value': xxx dicts.
      """
@@ -537,23 +546,3 @@ def boto3_pager(
      for page in paginator.paginate(**kwargs):
          # Invoke it and go through the pages, yielding from them
          yield from page.get(result_attribute_name, [])
-
-
- def get_item_from_attributes(attributes: list["AttributeTypeDef"], name: str) -> Any:
-     """
-     Given a list of attributes, find the attribute associated with the name and return its corresponding value.
-
-     The `attribute_list` will be a list of TypedDict's (which boto3 SDB functions commonly return),
-     where each TypedDict has a "Name" and "Value" key value pair.
-     This function grabs the value out of the associated TypedDict.
-
-     If the attribute with the name does not exist, the function will return None.
-
-     :param attributes: list of attributes
-     :param name: name of the attribute
-     :return: value of the attribute
-     """
-     return next(
-         (attribute["Value"] for attribute in attributes if attribute["Name"] == name),
-         None,
-     )
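
A side note on the flatten_tags signature change above: the return type is now expressed with Literal keys, but the documented behavior is unchanged. The snippet below is a minimal standalone sketch of that conversion, using hypothetical names rather than the packaged implementation.

from typing import Literal, Union

TagEntryKey = Union[Literal["Key"], Literal["Value"]]

def flatten_tags_sketch(tags: dict[str, str]) -> list[dict[TagEntryKey, str]]:
    # Convert {'owner': 'me'} into [{'Key': 'owner', 'Value': 'me'}],
    # mirroring what the docstring quoted above describes.
    return [{"Key": name, "Value": value} for name, value in tags.items()]

assert flatten_tags_sketch({"owner": "me"}) == [{"Key": "owner", "Value": "me"}]
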