toil 9.1.1__py3-none-any.whl → 9.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. toil/__init__.py +5 -9
  2. toil/batchSystems/abstractBatchSystem.py +23 -22
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -12
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +4 -4
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/gridengine.py +3 -4
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +65 -63
  10. toil/batchSystems/local_support.py +2 -3
  11. toil/batchSystems/lsf.py +6 -7
  12. toil/batchSystems/mesos/batchSystem.py +11 -7
  13. toil/batchSystems/mesos/test/__init__.py +1 -2
  14. toil/batchSystems/options.py +9 -10
  15. toil/batchSystems/registry.py +3 -7
  16. toil/batchSystems/singleMachine.py +8 -11
  17. toil/batchSystems/slurm.py +49 -38
  18. toil/batchSystems/torque.py +3 -4
  19. toil/bus.py +36 -34
  20. toil/common.py +129 -89
  21. toil/cwl/cwltoil.py +857 -729
  22. toil/cwl/utils.py +44 -35
  23. toil/fileStores/__init__.py +3 -1
  24. toil/fileStores/abstractFileStore.py +28 -30
  25. toil/fileStores/cachingFileStore.py +8 -8
  26. toil/fileStores/nonCachingFileStore.py +10 -21
  27. toil/job.py +159 -158
  28. toil/jobStores/abstractJobStore.py +68 -69
  29. toil/jobStores/aws/jobStore.py +249 -213
  30. toil/jobStores/aws/utils.py +13 -24
  31. toil/jobStores/fileJobStore.py +28 -22
  32. toil/jobStores/googleJobStore.py +21 -17
  33. toil/jobStores/utils.py +3 -7
  34. toil/leader.py +17 -22
  35. toil/lib/accelerators.py +6 -4
  36. toil/lib/aws/__init__.py +9 -10
  37. toil/lib/aws/ami.py +33 -19
  38. toil/lib/aws/iam.py +6 -6
  39. toil/lib/aws/s3.py +259 -157
  40. toil/lib/aws/session.py +76 -76
  41. toil/lib/aws/utils.py +51 -43
  42. toil/lib/checksum.py +19 -15
  43. toil/lib/compatibility.py +3 -2
  44. toil/lib/conversions.py +45 -18
  45. toil/lib/directory.py +29 -26
  46. toil/lib/docker.py +93 -99
  47. toil/lib/dockstore.py +77 -50
  48. toil/lib/ec2.py +39 -38
  49. toil/lib/ec2nodes.py +11 -4
  50. toil/lib/exceptions.py +8 -5
  51. toil/lib/ftp_utils.py +9 -14
  52. toil/lib/generatedEC2Lists.py +161 -20
  53. toil/lib/history.py +141 -97
  54. toil/lib/history_submission.py +163 -72
  55. toil/lib/io.py +27 -17
  56. toil/lib/memoize.py +2 -1
  57. toil/lib/misc.py +15 -11
  58. toil/lib/pipes.py +40 -25
  59. toil/lib/plugins.py +12 -8
  60. toil/lib/resources.py +1 -0
  61. toil/lib/retry.py +32 -38
  62. toil/lib/threading.py +12 -12
  63. toil/lib/throttle.py +1 -2
  64. toil/lib/trs.py +113 -51
  65. toil/lib/url.py +14 -23
  66. toil/lib/web.py +7 -2
  67. toil/options/common.py +18 -15
  68. toil/options/cwl.py +2 -2
  69. toil/options/runner.py +9 -5
  70. toil/options/wdl.py +1 -3
  71. toil/provisioners/__init__.py +9 -9
  72. toil/provisioners/abstractProvisioner.py +22 -20
  73. toil/provisioners/aws/__init__.py +20 -14
  74. toil/provisioners/aws/awsProvisioner.py +10 -8
  75. toil/provisioners/clusterScaler.py +19 -18
  76. toil/provisioners/gceProvisioner.py +2 -3
  77. toil/provisioners/node.py +11 -13
  78. toil/realtimeLogger.py +4 -4
  79. toil/resource.py +5 -5
  80. toil/server/app.py +2 -2
  81. toil/server/cli/wes_cwl_runner.py +11 -11
  82. toil/server/utils.py +18 -21
  83. toil/server/wes/abstract_backend.py +9 -8
  84. toil/server/wes/amazon_wes_utils.py +3 -3
  85. toil/server/wes/tasks.py +3 -5
  86. toil/server/wes/toil_backend.py +17 -21
  87. toil/server/wsgi_app.py +3 -3
  88. toil/serviceManager.py +3 -4
  89. toil/statsAndLogging.py +12 -13
  90. toil/test/__init__.py +33 -24
  91. toil/test/batchSystems/batchSystemTest.py +12 -11
  92. toil/test/batchSystems/batch_system_plugin_test.py +3 -5
  93. toil/test/batchSystems/test_slurm.py +38 -24
  94. toil/test/cwl/conftest.py +5 -6
  95. toil/test/cwl/cwlTest.py +194 -78
  96. toil/test/cwl/download_file_uri.json +6 -0
  97. toil/test/cwl/download_file_uri_no_hostname.json +6 -0
  98. toil/test/docs/scripts/tutorial_staging.py +1 -0
  99. toil/test/jobStores/jobStoreTest.py +9 -7
  100. toil/test/lib/aws/test_iam.py +1 -3
  101. toil/test/lib/aws/test_s3.py +1 -1
  102. toil/test/lib/dockerTest.py +9 -9
  103. toil/test/lib/test_ec2.py +12 -11
  104. toil/test/lib/test_history.py +4 -4
  105. toil/test/lib/test_trs.py +16 -14
  106. toil/test/lib/test_url.py +7 -6
  107. toil/test/lib/url_plugin_test.py +12 -18
  108. toil/test/provisioners/aws/awsProvisionerTest.py +10 -8
  109. toil/test/provisioners/clusterScalerTest.py +2 -5
  110. toil/test/provisioners/clusterTest.py +1 -3
  111. toil/test/server/serverTest.py +13 -4
  112. toil/test/sort/restart_sort.py +2 -6
  113. toil/test/sort/sort.py +3 -8
  114. toil/test/src/deferredFunctionTest.py +7 -7
  115. toil/test/src/environmentTest.py +1 -2
  116. toil/test/src/fileStoreTest.py +5 -5
  117. toil/test/src/importExportFileTest.py +5 -6
  118. toil/test/src/jobServiceTest.py +22 -14
  119. toil/test/src/jobTest.py +121 -25
  120. toil/test/src/miscTests.py +5 -7
  121. toil/test/src/promisedRequirementTest.py +8 -7
  122. toil/test/src/regularLogTest.py +2 -3
  123. toil/test/src/resourceTest.py +5 -8
  124. toil/test/src/restartDAGTest.py +5 -6
  125. toil/test/src/resumabilityTest.py +2 -2
  126. toil/test/src/retainTempDirTest.py +3 -3
  127. toil/test/src/systemTest.py +3 -3
  128. toil/test/src/threadingTest.py +1 -1
  129. toil/test/src/workerTest.py +1 -2
  130. toil/test/utils/toilDebugTest.py +6 -4
  131. toil/test/utils/toilKillTest.py +1 -1
  132. toil/test/utils/utilsTest.py +15 -14
  133. toil/test/wdl/wdltoil_test.py +247 -124
  134. toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
  135. toil/toilState.py +2 -3
  136. toil/utils/toilDebugFile.py +3 -8
  137. toil/utils/toilDebugJob.py +1 -2
  138. toil/utils/toilLaunchCluster.py +1 -2
  139. toil/utils/toilSshCluster.py +2 -0
  140. toil/utils/toilStats.py +19 -24
  141. toil/utils/toilStatus.py +11 -14
  142. toil/version.py +10 -10
  143. toil/wdl/wdltoil.py +313 -209
  144. toil/worker.py +18 -12
  145. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/METADATA +11 -14
  146. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/RECORD +150 -153
  147. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/WHEEL +1 -1
  148. toil/test/cwl/staging_cat.cwl +0 -27
  149. toil/test/cwl/staging_make_file.cwl +0 -25
  150. toil/test/cwl/staging_workflow.cwl +0 -43
  151. toil/test/cwl/zero_default.cwl +0 -61
  152. toil/test/utils/ABCWorkflowDebug/ABC.txt +0 -1
  153. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/entry_points.txt +0 -0
  154. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/licenses/LICENSE +0 -0
  155. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/top_level.txt +0 -0
toil/lib/aws/s3.py CHANGED
@@ -14,41 +14,37 @@
 
 import hashlib
 import itertools
-import urllib.parse
 import logging
-
-from io import BytesIO
-from datetime import timedelta
+import urllib.parse
+from collections.abc import Iterator, MutableSequence
 from contextlib import contextmanager
-from typing import (Any,
-                    Dict,
-                    List,
-                    Optional,
-                    Union,
-                    Tuple,
-                    cast,
-                    Literal,
-                    Iterator,
-                    IO)
-
-from toil.lib.retry import retry, get_error_status, ErrorCondition
-from toil.lib.misc import printq
-from toil.lib.aws import AWSServerErrors, session, build_tag_dict_from_env
-from toil.lib.aws.utils import enable_public_objects, flatten_tags
-from toil.lib.conversions import modify_url, MB, MIB, TB
-from toil.lib.pipes import WritablePipe, ReadablePipe, HashingPipe
+from datetime import timedelta
+from io import BytesIO
+from typing import IO, Any, Literal, cast
 
 # This file cannot be imported without the botocore/boto3 modules. We need
 # these types to set up @retry annotations, and @retry annotations really need
 # to be passed the real types at import time, since they do annotation-time
 # type checking internally.
 from botocore.exceptions import ClientError
-
-from mypy_boto3_s3 import S3Client, S3ServiceResource
+from mypy_boto3_s3 import S3ServiceResource
 from mypy_boto3_s3.literals import BucketLocationConstraintType
-from mypy_boto3_s3.service_resource import Bucket, Object
-from mypy_boto3_s3.type_defs import ListMultipartUploadsOutputTypeDef, HeadObjectOutputTypeDef, GetObjectOutputTypeDef, PutObjectOutputTypeDef, ObjectTypeDef
-
+from mypy_boto3_s3.service_resource import Bucket
+from mypy_boto3_s3.type_defs import (
+    CompletedPartTypeDef,
+    GetObjectOutputTypeDef,
+    HeadObjectOutputTypeDef,
+    ListMultipartUploadsOutputTypeDef,
+    ObjectTypeDef,
+    PutObjectOutputTypeDef,
+)
+
+from toil.lib.aws import AWSServerErrors, build_tag_dict_from_env, session
+from toil.lib.aws.utils import enable_public_objects, flatten_tags
+from toil.lib.conversions import MB, MIB, TB, modify_url
+from toil.lib.misc import printq
+from toil.lib.pipes import HashingPipe, ReadablePipe, WritablePipe
+from toil.lib.retry import get_error_status, retry
 
 logger = logging.getLogger(__name__)
 
@@ -81,9 +77,9 @@ class AWSBadEncryptionKeyError(Exception):
 def create_s3_bucket(
     s3_resource: S3ServiceResource,
     bucket_name: str,
-    region: Union[BucketLocationConstraintType, Literal["us-east-1"]],
-    tags: Optional[Dict[str, str]] = None,
-    public: bool = True
+    region: BucketLocationConstraintType | Literal["us-east-1"],
+    tags: dict[str, str] | None = None,
+    public: bool = True,
 ) -> "Bucket":
     """
     Create an AWS S3 bucket, using the given Boto3 S3 session, with the
@@ -106,7 +102,7 @@ def create_s3_bucket(
 
     tags = build_tag_dict_from_env() if tags is None else tags
     bucket_tagging = s3_resource.BucketTagging(bucket_name)
-    bucket_tagging.put(Tagging={'TagSet': flatten_tags(tags)})  # type: ignore
+    bucket_tagging.put(Tagging={"TagSet": flatten_tags(tags)})  # type: ignore
 
     # enabling public objects is the historical default
     if public:
@@ -117,9 +113,7 @@ def create_s3_bucket(
 
 @retry(errors=[ClientError])
 def delete_s3_bucket(
-        s3_resource: S3ServiceResource,
-        bucket_name: str,
-        quiet: bool = True
+    s3_resource: S3ServiceResource, bucket_name: str, quiet: bool = True
 ) -> None:
     """
     Delete the bucket with 'bucket_name'.
@@ -127,60 +121,63 @@ def delete_s3_bucket(
     Note: 'quiet' is False when used for a clean up utility script (contrib/admin/cleanup_aws_resources.py)
     that prints progress rather than logging. Logging should be used for all other internal Toil usage.
     """
-    assert isinstance(bucket_name, str), f'{bucket_name} is not a string ({type(bucket_name)}).'
+    assert isinstance(
+        bucket_name, str
+    ), f"{bucket_name} is not a string ({type(bucket_name)})."
     logger.debug("Deleting bucket '%s'.", bucket_name)
-    printq(f'\n * Deleting s3 bucket: {bucket_name}\n\n', quiet)
+    printq(f"\n * Deleting s3 bucket: {bucket_name}\n\n", quiet)
 
     s3_client = s3_resource.meta.client
 
     try:
-        for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get('Uploads', []):
+        for u in s3_client.list_multipart_uploads(Bucket=bucket_name).get(
+            "Uploads", []
+        ):
             s3_client.abort_multipart_upload(
-                Bucket=bucket_name,
-                Key=u["Key"],
-                UploadId=u["UploadId"]
+                Bucket=bucket_name, Key=u["Key"], UploadId=u["UploadId"]
             )
 
-        paginator = s3_client.get_paginator('list_object_versions')
+        paginator = s3_client.get_paginator("list_object_versions")
         for response in paginator.paginate(Bucket=bucket_name):
             # Versions and delete markers can both go in here to be deleted.
             # They both have Key and VersionId, but there's no shared base type
             # defined for them in the stubs to express that. See
             # <https://github.com/vemel/mypy_boto3_builder/issues/123>. So we
             # have to do gymnastics to get them into the same list.
-            to_delete: List[Dict[str, Any]] = cast(List[Dict[str, Any]], response.get('Versions', [])) + \
-                                              cast(List[Dict[str, Any]], response.get('DeleteMarkers', []))
+            to_delete: list[dict[str, Any]] = cast(
+                list[dict[str, Any]], response.get("Versions", [])
+            ) + cast(list[dict[str, Any]], response.get("DeleteMarkers", []))
             for entry in to_delete:
-                printq(f" Deleting {entry['Key']} version {entry['VersionId']}", quiet)
+                printq(
+                    f" Deleting {entry['Key']} version {entry['VersionId']}", quiet
+                )
                 s3_client.delete_object(
-                    Bucket=bucket_name,
-                    Key=entry['Key'],
-                    VersionId=entry['VersionId']
+                    Bucket=bucket_name, Key=entry["Key"], VersionId=entry["VersionId"]
                 )
         bucket = s3_resource.Bucket(bucket_name)
         bucket.objects.all().delete()
         bucket.object_versions.delete()
         bucket.delete()
-        printq(f'\n * Deleted s3 bucket successfully: {bucket_name}\n\n', quiet)
+        printq(f"\n * Deleted s3 bucket successfully: {bucket_name}\n\n", quiet)
         logger.debug("Deleted s3 bucket successfully '%s'.", bucket_name)
     except s3_client.exceptions.NoSuchBucket:
-        printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+        printq(f"\n * S3 bucket no longer exists: {bucket_name}\n\n", quiet)
         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
     except ClientError as e:
         if get_error_status(e) != 404:
             raise
-        printq(f'\n * S3 bucket no longer exists: {bucket_name}\n\n', quiet)
+        printq(f"\n * S3 bucket no longer exists: {bucket_name}\n\n", quiet)
         logger.debug("S3 bucket no longer exists '%s'.", bucket_name)
 
 
 @retry(errors=[AWSServerErrors])
-def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> Union[bool, Bucket]:
+def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> bool | Bucket:
     s3_client = s3_resource.meta.client
     try:
         s3_client.head_bucket(Bucket=bucket)
         return s3_resource.Bucket(bucket)
     except (ClientError, s3_client.exceptions.NoSuchBucket) as e:
-        error_code = e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
+        error_code = e.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
         if error_code == 404:
             return False
         else:
@@ -188,7 +185,9 @@ def bucket_exists(s3_resource: S3ServiceResource, bucket: str) -> Union[bool, Bu
 
 
 @retry(errors=[AWSServerErrors])
-def head_s3_object(bucket: str, key: str, header: Dict[str, Any], region: Optional[str] = None) -> HeadObjectOutputTypeDef:
+def head_s3_object(
+    bucket: str, key: str, header: dict[str, Any], region: str | None = None
+) -> HeadObjectOutputTypeDef:
     """
     Attempt to HEAD an s3 object and return its response.
 
@@ -203,31 +202,44 @@ def head_s3_object(bucket: str, key: str, header: Dict[str, Any], region: Option
 
 
 @retry(errors=[AWSServerErrors])
-def list_multipart_uploads(bucket: str, region: str, prefix: str, max_uploads: int = 1) -> ListMultipartUploadsOutputTypeDef:
+def list_multipart_uploads(
+    bucket: str, region: str, prefix: str, max_uploads: int = 1
+) -> ListMultipartUploadsOutputTypeDef:
     s3_client = session.client("s3", region_name=region)
-    return s3_client.list_multipart_uploads(Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix)
+    return s3_client.list_multipart_uploads(
+        Bucket=bucket, MaxUploads=max_uploads, Prefix=prefix
+    )
 
 
 @retry(errors=[AWSServerErrors])
-def copy_s3_to_s3(s3_resource: S3ServiceResource, src_bucket: str, src_key: str, dst_bucket: str, dst_key: str, extra_args: Optional[Dict[Any, Any]] = None) -> None:
-    source = {'Bucket': src_bucket, 'Key': src_key}
+def copy_s3_to_s3(
+    s3_resource: S3ServiceResource,
+    src_bucket: str,
+    src_key: str,
+    dst_bucket: str,
+    dst_key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
+    source = {"Bucket": src_bucket, "Key": src_key}
     # Note: this may have errors if using sse-c because of
     # a bug with encryption using copy_object and copy (which uses copy_object for files <5GB):
     # https://github.com/aws/aws-cli/issues/6012
     # this will only happen if we attempt to copy a file previously encrypted with sse-c
     # copying an unencrypted file and encrypting it as sse-c seems to work fine though
-    kwargs = dict(CopySource=source, Bucket=dst_bucket, Key=dst_key, ExtraArgs=extra_args)
+    kwargs = dict(
+        CopySource=source, Bucket=dst_bucket, Key=dst_key, ExtraArgs=extra_args
+    )
     s3_resource.meta.client.copy(**kwargs)  # type: ignore
 
 
 # TODO: Determine specific retries
 @retry(errors=[AWSServerErrors])
 def copy_local_to_s3(
-        s3_resource: S3ServiceResource,
-        local_file_path: str,
-        dst_bucket: str,
-        dst_key: str,
-        extra_args: Optional[Dict[Any, Any]] = None
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    dst_bucket: str,
+    dst_key: str,
+    extra_args: dict[Any, Any] | None = None,
 ) -> None:
     s3_client = s3_resource.meta.client
     s3_client.upload_file(local_file_path, dst_bucket, dst_key, ExtraArgs=extra_args)
@@ -236,19 +248,28 @@ def copy_local_to_s3(
 # TODO: Determine specific retries
 @retry(errors=[AWSServerErrors])
 def copy_s3_to_local(
-        s3_resource: S3ServiceResource,
-        local_file_path: str,
-        src_bucket: str,
-        src_key: str,
-        extra_args: Optional[Dict[Any, Any]] = None
+    s3_resource: S3ServiceResource,
+    local_file_path: str,
+    src_bucket: str,
+    src_key: str,
+    extra_args: dict[Any, Any] | None = None,
 ) -> None:
     s3_client = s3_resource.meta.client
     s3_client.download_file(src_bucket, src_key, local_file_path, ExtraArgs=extra_args)
 
 
 class MultiPartPipe(WritablePipe):
-    def __init__(self, part_size: int, s3_resource: S3ServiceResource, bucket_name: str, file_id: str, encryption_args: Optional[Dict[Any, Any]], encoding: Optional[str] = None, errors: Optional[str] = None) -> None:
-        super(MultiPartPipe, self).__init__()
+    def __init__(
+        self,
+        part_size: int,
+        s3_resource: S3ServiceResource,
+        bucket_name: str,
+        file_id: str,
+        encryption_args: dict[Any, Any] | None,
+        encoding: str | None = None,
+        errors: str | None = None,
+    ) -> None:
+        super().__init__()
         self.encoding = encoding
         self.errors = errors
         self.part_size = part_size
@@ -267,22 +288,28 @@ class MultiPartPipe(WritablePipe):
             hasher.update(buf)
 
         # low-level clients are thread safe
-        response = self.s3_client.create_multipart_upload(Bucket=self.bucket_name,
-                                                          Key=self.file_id,
-                                                          **self.encryption_args)
-        upload_id = response['UploadId']
-        parts = []
+        response = self.s3_client.create_multipart_upload(
+            Bucket=self.bucket_name, Key=self.file_id, **self.encryption_args
+        )
+        upload_id = response["UploadId"]
+        parts: MutableSequence[CompletedPartTypeDef] = []
         try:
             for part_num in itertools.count():
-                logger.debug(f'[{upload_id}] Uploading part %d of %d bytes', part_num + 1, len(buf))
+                logger.debug(
+                    f"[{upload_id}] Uploading part %d of %d bytes",
+                    part_num + 1,
+                    len(buf),
+                )
                 # TODO: include the Content-MD5 header:
                 # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
-                part = self.s3_client.upload_part(Bucket=self.bucket_name,
-                                                  Key=self.file_id,
-                                                  PartNumber=part_num + 1,
-                                                  UploadId=upload_id,
-                                                  Body=BytesIO(buf),
-                                                  **self.encryption_args)
+                part = self.s3_client.upload_part(
+                    Bucket=self.bucket_name,
+                    Key=self.file_id,
+                    PartNumber=part_num + 1,
+                    UploadId=upload_id,
+                    Body=BytesIO(buf),
+                    **self.encryption_args,
+                )
                 parts.append({"PartNumber": part_num + 1, "ETag": part["ETag"]})
 
                 # Get the next block of data we want to put
@@ -292,48 +319,56 @@ class MultiPartPipe(WritablePipe):
                     break
                 hasher.update(buf)
         except:
-            self.s3_client.abort_multipart_upload(Bucket=self.bucket_name,
-                                                  Key=self.file_id,
-                                                  UploadId=upload_id)
+            self.s3_client.abort_multipart_upload(
+                Bucket=self.bucket_name, Key=self.file_id, UploadId=upload_id
+            )
         else:
             # Save the checksum
-            checksum = f'sha1${hasher.hexdigest()}'
+            checksum = f"sha1${hasher.hexdigest()}"
             # Encryption information is not needed here because the upload was
             # not started with a checksum.
-            response = self.s3_client.complete_multipart_upload(Bucket=self.bucket_name,  # type: ignore
-                                                                Key=self.file_id,
-                                                                UploadId=upload_id,
-                                                                MultipartUpload={"Parts": parts})  # type: ignore
-            logger.debug(f'[{upload_id}] Upload of {self.file_id} complete...')
+            response = self.s3_client.complete_multipart_upload(
+                Bucket=self.bucket_name,
+                Key=self.file_id,
+                UploadId=upload_id,
+                MultipartUpload={"Parts": parts},
+            )  # type: ignore
+            logger.debug(f"[{upload_id}] Upload of {self.file_id} complete...")
 
 
-def parse_s3_uri(uri: str) -> Tuple[str, str]:
+def parse_s3_uri(uri: str) -> tuple[str, str]:
     # does not support s3/gs: https://docs.python.org/3/library/urllib.parse.html
     # use regex instead?
     uri = urllib.parse.urlparse(uri)  # type: ignore
-    if uri.scheme.lower() != 's3':  # type: ignore
-        raise ValueError(f'Invalid schema. Expecting s3 prefix, not: {uri}')
-    bucket_name, key_name = uri.netloc.strip('/'), uri.path.strip('/')  # type: ignore
+    if uri.scheme.lower() != "s3":  # type: ignore
+        raise ValueError(f"Invalid schema. Expecting s3 prefix, not: {uri}")
+    bucket_name, key_name = uri.netloc.strip("/"), uri.path.strip("/")  # type: ignore
     return bucket_name, key_name
 
 
-def list_s3_items(s3_resource: S3ServiceResource, bucket: str, prefix: str, startafter: Optional[str] = None) -> Iterator[ObjectTypeDef]:
+def list_s3_items(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    prefix: str,
+    startafter: str | None = None,
+) -> Iterator[ObjectTypeDef]:
     s3_client = s3_resource.meta.client
-    paginator = s3_client.get_paginator('list_objects_v2')
+    paginator = s3_client.get_paginator("list_objects_v2")
     kwargs = dict(Bucket=bucket, Prefix=prefix)
     if startafter:
-        kwargs['StartAfter'] = startafter
+        kwargs["StartAfter"] = startafter
     for page in paginator.paginate(**kwargs):  # type: ignore
-        for key in page.get('Contents', []):
-            yield key
+        yield from page.get("Contents", [])
 
 
 @retry(errors=[AWSServerErrors])
-def upload_to_s3(readable: IO[Any],
-                 s3_resource: S3ServiceResource,
-                 bucket: str,
-                 key: str,
-                 extra_args: Optional[Dict[Any, Any]] = None) -> None:
+def upload_to_s3(
+    readable: IO[Any],
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
     """
     Upload a readable object to s3, using multipart uploading if applicable.
 
@@ -350,33 +385,45 @@ def upload_to_s3(readable: IO[Any],
 
     s3_client = s3_resource.meta.client
     from boto3.s3.transfer import TransferConfig
+
     config = TransferConfig(
         multipart_threshold=DEFAULT_AWS_CHUNK_SIZE,
         multipart_chunksize=DEFAULT_AWS_CHUNK_SIZE,
-        use_threads=True
+        use_threads=True,
     )
     logger.debug("Uploading %s", key)
     # these methods use multipart if necessary
     if isinstance(readable, str):
-        s3_client.upload_file(Filename=readable,
-                              Bucket=bucket,
-                              Key=key,
-                              ExtraArgs=extra_args,
-                              Config=config)
+        s3_client.upload_file(
+            Filename=readable,
+            Bucket=bucket,
+            Key=key,
+            ExtraArgs=extra_args,
+            Config=config,
+        )
     else:
-        s3_client.upload_fileobj(Fileobj=readable,
-                                 Bucket=bucket,
-                                 Key=key,
-                                 ExtraArgs=extra_args,
-                                 Config=config)
+        s3_client.upload_fileobj(
+            Fileobj=readable,
+            Bucket=bucket,
+            Key=key,
+            ExtraArgs=extra_args,
+            Config=config,
+        )
 
     object_summary = s3_resource.ObjectSummary(bucket, key)
     object_summary.wait_until_exists(**extra_args)
 
 
 @contextmanager
-def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, checksum_to_verify: Optional[str] = None,
-                    extra_args: Optional[Dict[Any, Any]] = None, encoding: Optional[str] = None, errors: Optional[str] = None) -> Iterator[IO[Any]]:
+def download_stream(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    checksum_to_verify: str | None = None,
+    extra_args: dict[Any, Any] | None = None,
+    encoding: str | None = None,
+    errors: str | None = None,
+) -> Iterator[IO[Any]]:
     """Context manager that gives out a download stream to download data."""
     bucket_obj: Bucket = s3_resource.Bucket(bucket)
 
@@ -384,14 +431,16 @@ def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, check
         def writeTo(self, writable: IO[Any]) -> None:
             kwargs = dict(Key=key, Fileobj=writable, ExtraArgs=extra_args)
             if not extra_args:
-                del kwargs['ExtraArgs']
+                del kwargs["ExtraArgs"]
             bucket_obj.download_fileobj(**kwargs)  # type: ignore
 
     try:
         if checksum_to_verify:
             with DownloadPipe(encoding=encoding, errors=errors) as readable:
                 # Interpose a pipe to check the hash
-                with HashingPipe(readable, encoding=encoding, errors=errors) as verified:
+                with HashingPipe(
+                    readable, encoding=encoding, errors=errors
+                ) as verified:
                     yield verified
         else:
             # Readable end of pipe produces text mode output if encoding specified
@@ -401,35 +450,59 @@ def download_stream(s3_resource: S3ServiceResource, bucket: str, key: str, check
     except s3_resource.meta.client.exceptions.NoSuchKey:
         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
     except ClientError as e:
-        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
-            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
-        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
-                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
-                e.operation_name == 'HeadObject':
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
-                                           f"(HeadObject operation on key '{key}': Bad Request).")
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         raise
 
 
-def download_fileobject(s3_resource: S3ServiceResource, bucket: Bucket, key: str, fileobj: BytesIO, extra_args: Optional[Dict[Any, Any]] = None) -> None:
+def download_fileobject(
+    s3_resource: S3ServiceResource,
+    bucket: Bucket,
+    key: str,
+    fileobj: BytesIO,
+    extra_args: dict[Any, Any] | None = None,
+) -> None:
     try:
         bucket.download_fileobj(Key=key, Fileobj=fileobj, ExtraArgs=extra_args)
     except s3_resource.meta.client.exceptions.NoSuchKey:
         raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
     except ClientError as e:
-        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
-            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
-        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
-                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
-                e.operation_name == 'HeadObject':
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
-                                           f"(HeadObject operation on key '{key}': Bad Request).")
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         raise
 
 
-def s3_key_exists(s3_resource: S3ServiceResource, bucket: str, key: str, check: bool = False, extra_args: Optional[Dict[Any, Any]] = None) -> bool:
+def s3_key_exists(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    check: bool = False,
+    extra_args: dict[Any, Any] | None = None,
+) -> bool:
     """Return True if the s3 obect exists, and False if not. Will error if encryption args are incorrect."""
     extra_args = extra_args or {}
     s3_client = s3_resource.meta.client
@@ -438,62 +511,91 @@ def s3_key_exists(s3_resource: S3ServiceResource, bucket: str, key: str, check:
         return True
     except s3_client.exceptions.NoSuchKey:
         if check:
-            raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+            raise AWSKeyNotFoundError(
+                f"Key '{key}' does not exist in bucket '{bucket}'."
+            )
         return False
     except ClientError as e:
-        if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
+        if e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 404:
             if check:
-                raise AWSKeyNotFoundError(f"Key '{key}' does not exist in bucket '{bucket}'.")
+                raise AWSKeyNotFoundError(
+                    f"Key '{key}' does not exist in bucket '{bucket}'."
+                )
             return False
-        elif e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 400 and \
-                e.response.get('Error', {}).get('Message') == 'Bad Request' and \
-                e.operation_name == 'HeadObject':
+        elif (
+            e.response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 400
+            and e.response.get("Error", {}).get("Message") == "Bad Request"
+            and e.operation_name == "HeadObject"
+        ):
             # An error occurred (400) when calling the HeadObject operation: Bad Request
-            raise AWSBadEncryptionKeyError("Your AWS encryption key is most likely configured incorrectly "
-                                           f"(HeadObject operation on key '{key}': Bad Request).")
+            raise AWSBadEncryptionKeyError(
+                "Your AWS encryption key is most likely configured incorrectly "
+                f"(HeadObject operation on key '{key}': Bad Request)."
+            )
         else:
             raise
 
 
-def get_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, extra_args: Optional[Dict[Any, Any]] = None) -> GetObjectOutputTypeDef:
+def get_s3_object(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    extra_args: dict[Any, Any] | None = None,
+) -> GetObjectOutputTypeDef:
     if extra_args is None:
         extra_args = dict()
     s3_client = s3_resource.meta.client
     return s3_client.get_object(Bucket=bucket, Key=key, **extra_args)
 
 
-def put_s3_object(s3_resource: S3ServiceResource, bucket: str, key: str, body: Union[str, bytes], extra_args: Optional[Dict[Any, Any]] = None) -> PutObjectOutputTypeDef:
+def put_s3_object(
+    s3_resource: S3ServiceResource,
+    bucket: str,
+    key: str,
+    body: str | bytes,
+    extra_args: dict[Any, Any] | None = None,
+) -> PutObjectOutputTypeDef:
     if extra_args is None:
         extra_args = dict()
     s3_client = s3_resource.meta.client
     return s3_client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
 
 
-def generate_presigned_url(s3_resource: S3ServiceResource, bucket: str, key_name: str, expiration: int) -> str:
+def generate_presigned_url(
+    s3_resource: S3ServiceResource, bucket: str, key_name: str, expiration: int
+) -> str:
     s3_client = s3_resource.meta.client
     return s3_client.generate_presigned_url(
-        'get_object',
-        Params={'Bucket': bucket, 'Key': key_name},
-        ExpiresIn=expiration)
+        "get_object", Params={"Bucket": bucket, "Key": key_name}, ExpiresIn=expiration
+    )
 
 
 def create_public_url(s3_resource: S3ServiceResource, bucket: str, key: str) -> str:
     bucket_obj = s3_resource.Bucket(bucket)
-    bucket_obj.Object(key).Acl().put(ACL='public-read')  # TODO: do we need to generate a signed url after doing this?
-    url = generate_presigned_url(s3_resource=s3_resource,
-                                 bucket=bucket,
-                                 key_name=key,
-                                 # One year should be sufficient to finish any pipeline ;-)
-                                 expiration=int(timedelta(days=365).total_seconds()))
+    bucket_obj.Object(key).Acl().put(
+        ACL="public-read"
+    )  # TODO: do we need to generate a signed url after doing this?
+    url = generate_presigned_url(
+        s3_resource=s3_resource,
+        bucket=bucket,
+        key_name=key,
+        # One year should be sufficient to finish any pipeline ;-)
+        expiration=int(timedelta(days=365).total_seconds()),
+    )
     # boto doesn't properly remove the x-amz-security-token parameter when
     # query_auth is False when using an IAM role (see issue #2043). Including the
     # x-amz-security-token parameter without the access key results in a 403,
     # even if the resource is public, so we need to remove it.
     # TODO: verify that this is still the case
-    return modify_url(url, remove=['x-amz-security-token', 'AWSAccessKeyId', 'Signature'])
+    return modify_url(
+        url, remove=["x-amz-security-token", "AWSAccessKeyId", "Signature"]
+    )
 
 
 def get_s3_bucket_region(s3_resource: S3ServiceResource, bucket: str) -> str:
     s3_client = s3_resource.meta.client
     # AWS returns None for the default of 'us-east-1'
-    return s3_client.get_bucket_location(Bucket=bucket).get('LocationConstraint', None) or 'us-east-1'
+    return (
+        s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", None)
+        or "us-east-1"
+    )