amapy-plugin-s3 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. amapy-plugin-s3-1.0.0/PKG-INFO +16 -0
  2. amapy-plugin-s3-1.0.0/README.md +1 -0
  3. amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/PKG-INFO +16 -0
  4. amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/SOURCES.txt +42 -0
  5. amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/dependency_links.txt +1 -0
  6. amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/requires.txt +5 -0
  7. amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/top_level.txt +1 -0
  8. amapy-plugin-s3-1.0.0/asset_plugin_s3/__init__.py +1 -0
  9. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_auth.py +79 -0
  10. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_blob.py +231 -0
  11. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_http_handler.py +222 -0
  12. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_mount_handler.py +96 -0
  13. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_storage.py +57 -0
  14. amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_storage_mixin.py +44 -0
  15. amapy-plugin-s3-1.0.0/asset_plugin_s3/bucket_cors.py +67 -0
  16. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/__init__.py +0 -0
  17. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/async_mount_copy.py +48 -0
  18. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/async_mount_transporter.py +60 -0
  19. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/mounted_resource.py +20 -0
  20. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/mounted_url.py +25 -0
  21. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/tests/__init__.py +0 -0
  22. amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/tests/test_mounted_url.py +73 -0
  23. amapy-plugin-s3-1.0.0/asset_plugin_s3/s3_proxy.py +58 -0
  24. amapy-plugin-s3-1.0.0/asset_plugin_s3/signed_url.py +41 -0
  25. amapy-plugin-s3-1.0.0/asset_plugin_s3/tests/__init__.py +0 -0
  26. amapy-plugin-s3-1.0.0/asset_plugin_s3/tests/test_aws_storage.py +185 -0
  27. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/__init__.py +2 -0
  28. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/__init__.py +0 -0
  29. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_aws_transporter.py +68 -0
  30. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_copy.py +272 -0
  31. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_download.py +70 -0
  32. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_update_blob.py +37 -0
  33. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_upload.py +53 -0
  34. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/tests/__init__.py +0 -0
  35. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/tests/test_async_aws_transporter.py +184 -0
  36. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/aws_transport_resource.py +154 -0
  37. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/__init__.py +0 -0
  38. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/async_download.py +72 -0
  39. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/async_upload.py +59 -0
  40. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/legacy_aws_transporter.py +20 -0
  41. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/tests/__init__.py +0 -0
  42. amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/tests/test_legacy_aws_transporter.py +131 -0
  43. amapy-plugin-s3-1.0.0/pyproject.toml +28 -0
  44. amapy-plugin-s3-1.0.0/setup.cfg +4 -0
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.1
2
+ Name: amapy-plugin-s3
3
+ Version: 1.0.0
4
+ Summary: The S3 plugin for the asset manager project. It provides functionalities to manage and integrate Amazon S3 storage within the asset management system.
5
+ Author-email: Swarup Mahanti <swarup.mahanti@roche.com>
6
+ Maintainer-email: Swarup Mahanti <swarup.mahanti@roche.com>
7
+ License: Copyright (c) 2024 Roche Diagnostics Computation Science & Informatics
8
+ Requires-Python: <3.11,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: aioboto3<=10.2.0
11
+ Requires-Dist: amapy-contents==1.*
12
+ Requires-Dist: amapy-pluggy==1.*
13
+ Requires-Dist: amapy-utils==1.*
14
+ Requires-Dist: backoff==2.2.*
15
+
16
+ asset-plugin-s3
@@ -0,0 +1 @@
1
+ asset-plugin-s3
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.1
2
+ Name: amapy-plugin-s3
3
+ Version: 1.0.0
4
+ Summary: The S3 plugin for the asset manager project. It provides functionalities to manage and integrate Amazon S3 storage within the asset management system.
5
+ Author-email: Swarup Mahanti <swarup.mahanti@roche.com>
6
+ Maintainer-email: Swarup Mahanti <swarup.mahanti@roche.com>
7
+ License: Copyright (c) 2024 Roche Diagnostics Computation Science & Informatics
8
+ Requires-Python: <3.11,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: aioboto3<=10.2.0
11
+ Requires-Dist: amapy-contents==1.*
12
+ Requires-Dist: amapy-pluggy==1.*
13
+ Requires-Dist: amapy-utils==1.*
14
+ Requires-Dist: backoff==2.2.*
15
+
16
+ asset-plugin-s3
@@ -0,0 +1,42 @@
1
+ README.md
2
+ pyproject.toml
3
+ amapy_plugin_s3.egg-info/PKG-INFO
4
+ amapy_plugin_s3.egg-info/SOURCES.txt
5
+ amapy_plugin_s3.egg-info/dependency_links.txt
6
+ amapy_plugin_s3.egg-info/requires.txt
7
+ amapy_plugin_s3.egg-info/top_level.txt
8
+ asset_plugin_s3/__init__.py
9
+ asset_plugin_s3/aws_auth.py
10
+ asset_plugin_s3/aws_blob.py
11
+ asset_plugin_s3/aws_http_handler.py
12
+ asset_plugin_s3/aws_mount_handler.py
13
+ asset_plugin_s3/aws_storage.py
14
+ asset_plugin_s3/aws_storage_mixin.py
15
+ asset_plugin_s3/bucket_cors.py
16
+ asset_plugin_s3/s3_proxy.py
17
+ asset_plugin_s3/signed_url.py
18
+ asset_plugin_s3/mounted_bucket/__init__.py
19
+ asset_plugin_s3/mounted_bucket/async_mount_copy.py
20
+ asset_plugin_s3/mounted_bucket/async_mount_transporter.py
21
+ asset_plugin_s3/mounted_bucket/mounted_resource.py
22
+ asset_plugin_s3/mounted_bucket/mounted_url.py
23
+ asset_plugin_s3/mounted_bucket/tests/__init__.py
24
+ asset_plugin_s3/mounted_bucket/tests/test_mounted_url.py
25
+ asset_plugin_s3/tests/__init__.py
26
+ asset_plugin_s3/tests/test_aws_storage.py
27
+ asset_plugin_s3/transporter/__init__.py
28
+ asset_plugin_s3/transporter/aws_transport_resource.py
29
+ asset_plugin_s3/transporter/async_aws/__init__.py
30
+ asset_plugin_s3/transporter/async_aws/async_aws_transporter.py
31
+ asset_plugin_s3/transporter/async_aws/async_copy.py
32
+ asset_plugin_s3/transporter/async_aws/async_download.py
33
+ asset_plugin_s3/transporter/async_aws/async_update_blob.py
34
+ asset_plugin_s3/transporter/async_aws/async_upload.py
35
+ asset_plugin_s3/transporter/async_aws/tests/__init__.py
36
+ asset_plugin_s3/transporter/async_aws/tests/test_async_aws_transporter.py
37
+ asset_plugin_s3/transporter/legacy_aws/__init__.py
38
+ asset_plugin_s3/transporter/legacy_aws/async_download.py
39
+ asset_plugin_s3/transporter/legacy_aws/async_upload.py
40
+ asset_plugin_s3/transporter/legacy_aws/legacy_aws_transporter.py
41
+ asset_plugin_s3/transporter/legacy_aws/tests/__init__.py
42
+ asset_plugin_s3/transporter/legacy_aws/tests/test_legacy_aws_transporter.py
@@ -0,0 +1,5 @@
1
+ aioboto3<=10.2.0
2
+ amapy-contents==1.*
3
+ amapy-pluggy==1.*
4
+ amapy-utils==1.*
5
+ backoff==2.2.*
@@ -0,0 +1 @@
1
+ asset_plugin_s3
@@ -0,0 +1 @@
1
+ from .aws_storage import AwsStoragePlugin
@@ -0,0 +1,79 @@
1
+ import hmac
2
+ import os
3
+ from datetime import datetime
4
+ from hashlib import sha256
5
+
6
+ from botocore.auth import S3SigV4Auth
7
+
8
+ from asset_pluggy.storage.storage_credentials import StorageCredentials
9
+ from asset_utils.common import exceptions
10
+
11
+
12
def get_aws_id_k_date() -> dict:
    """Return the access key id plus the derived k_date for SigV4 signing.

    TODO: will move it into the server, added for testing purpose only

    Returns
    -------
    dict
        {"aws_access_key_id": <key id from shared credentials>,
         "k_date": <hex HMAC-SHA256 of today's UTC date keyed by "AWS4" + secret>}
    """
    secret_key = StorageCredentials.shared().credentials.get("aws_secret_access_key")
    # always use the UTC time for k_date (utcnow is fine: package pins Python <3.11)
    today = datetime.utcnow().strftime('%Y%m%d')
    # first step of the SigV4 key-derivation chain: kDate = HMAC("AWS4"+secret, date)
    hex_k_date = hmac.new(f"AWS4{secret_key}".encode("utf-8"), today.encode("utf-8"), sha256).hexdigest()
    return {"aws_access_key_id": StorageCredentials.shared().credentials.get("aws_access_key_id"),
            "k_date": hex_k_date}
25
+
26
+
27
class AwsAuth(S3SigV4Auth):
    """S3 SigV4 signer with a K_DATE fallback.

    Behaves exactly like botocore's ``S3SigV4Auth`` when a secret key is
    available; otherwise it derives the signing key from the pre-computed
    K_DATE exposed through the ``ASSET_K_DATE`` environment variable.
    """

    def signature(self, string_to_sign, request):
        """Compute the SigV4 signature for the given string and request.

        Parameters
        ----------
        string_to_sign : str
            The canonical string to sign.
        request : obj
            The request being signed.

        Returns
        -------
        str
            The hex-encoded signature.
        """
        # no secret key -> fall back to the K_DATE-based derivation
        if not self.credentials.secret_key:
            return self._sign_with_k_date(string_to_sign)
        return super().signature(string_to_sign, request)

    def _sign_with_k_date(self, string_to_sign):
        """Sign using the K_DATE taken from the environment.

        Parameters
        ----------
        string_to_sign : str
            The canonical string to sign.

        Returns
        -------
        str
            The hex-encoded signature derived from K_DATE.

        Raises
        ------
        AssetException
            If ``ASSET_K_DATE`` is not present in the environment.
        """
        hex_k_date = os.environ.get("ASSET_K_DATE")
        if not hex_k_date:
            raise exceptions.AssetException("ASSET_K_DATE not found in environment")
        # continue the standard SigV4 derivation chain:
        # kDate -> kRegion -> kService -> kSigning
        k_region = self._sign(bytes.fromhex(hex_k_date), self._region_name)
        k_service = self._sign(k_region, self._service_name)
        k_signing = self._sign(k_service, "aws4_request")
        return self._sign(k_signing, string_to_sign, hex=True)
@@ -0,0 +1,231 @@
1
+ from typing import Any
2
+
3
+ from asset_pluggy.storage import BlobData
4
+ from asset_pluggy.storage import BlobStoreURL
5
+ from asset_utils.common.exceptions import AssetException
6
+ from asset_utils.utils.file_utils import FileUtils
7
+
8
+
9
class AwsBlob(BlobData):
    """Class to handle AWS S3 blob data.

    Wraps one of three backing payloads set in ``initialize``: a plain dict
    (json from S3Proxy), an s3.Object, or an s3.ObjectSummary.
    """
    # backing payload: dict, s3.Object or s3.ObjectSummary (set in initialize)
    _aws_obj = None
    # part size of a multipart upload; lazily set (see multipart_size property)
    _multipart_size = None

    def initialize(self, data: Any, url_object: BlobStoreURL):
        """Initializes the AwsBlob object with data from an S3 object or S3 object summary.

        Parameters
        ----------
        data : Any
            A dict or s3.ObjectSummary or s3.object.
        url_object : BlobStoreURL
            The URL of the blob.

        Raises
        ------
        AssetException
            If the payload cannot be parsed into blob attributes.
        """
        self._aws_obj = data
        try:
            if type(self._aws_obj) is dict:  # json data from S3Proxy
                # dict payload already carries host/url, so we return early
                self._initialize_from_dict(data=self._aws_obj)
                return
            elif hasattr(self._aws_obj, 'content_type'):  # s3.Object
                self._initialize_from_s3_object(s3_obj=self._aws_obj)
            else:
                self._initialize_from_s3_summary_object(s3_summary_obj=self._aws_obj)
        except Exception as e:
            raise AssetException(f"Error initializing blob from {url_object.url}") from e
        self.host = url_object.host
        self.url = url_object.url_for_blob(host=self.host, bucket=self.bucket, name=self.name)

    def _initialize_from_s3_object(self, s3_obj: Any):
        """Initializes the AwsBlob object with data from an S3 object.

        Parameters
        ----------
        s3_obj : Any
            The S3.object.
        """
        self.bucket = s3_obj.bucket_name
        self.name = s3_obj.key
        self.size = s3_obj.content_length
        # s3 mostly returns content-type as binary/octet-stream so we try to guess it
        # self.content_type = FileUtils.mime_type(self.name) or s3_obj.content_type
        # aws returns files + directory unlike gs which returns only files
        # self.is_file = bool(self.content_type != 'application/x-directory')
        # check all possible checksums, the structure here is different than s3.ObjectSummary
        # note: aws_blob returns inconsistent check_sum_algorithm keys, sometimes the keys are present
        # if the value is null - so we add an extra protection to ensure hash can not be null
        if hasattr(s3_obj, "checksum_crc32") and getattr(s3_obj, "checksum_crc32"):
            self.hashes["crc32"] = getattr(s3_obj, "checksum_crc32")
        if hasattr(s3_obj, "checksum_crc32_c") and getattr(s3_obj, "checksum_crc32_c"):
            self.hashes["crc32_c"] = getattr(s3_obj, "checksum_crc32_c")
        if hasattr(s3_obj, "checksum_sha1") and getattr(s3_obj, "checksum_sha1"):
            self.hashes["sha1"] = getattr(s3_obj, "checksum_sha1")
        if hasattr(s3_obj, "checksum_sha256") and getattr(s3_obj, "checksum_sha256"):
            self.hashes["sha256"] = getattr(s3_obj, "checksum_sha256")

        self.hashes["etag"] = s3_obj.e_tag
        if '-' in s3_obj.e_tag and self._parse_etag(s3_obj.e_tag)[1] == '1':
            # if number of parts is 1, then self.size is the multipart size
            self._multipart_size = self.size

    def _initialize_from_s3_summary_object(self, s3_summary_obj: Any):
        """Initializes the AwsBlob object with data from an S3 object summary.

        Parameters
        ----------
        s3_summary_obj : Any
            The s3.ObjectSummary.
        """
        self.bucket = s3_summary_obj.bucket_name
        self.name = s3_summary_obj.key
        self.size = s3_summary_obj.size
        # if hasattr(s3_summary_obj, "checksum_algorithm"):
        #     # todo: verify whats the checksum value when this attribute is present in the object.
        #     hash_type = getattr(s3_summary_obj, "checksum_algorithm") or "md5"
        # e-tag has extra quotes, so we strip quotes
        # etag has 2 parts separated by "-", first part is the md5,
        # second part is the number of parts in which the file was uploaded
        self.hashes["etag"] = s3_summary_obj.e_tag
        if '-' in s3_summary_obj.e_tag and self._parse_etag(s3_summary_obj.e_tag)[1] == '1':
            # if number of parts is 1, then self.size is the multipart size
            self._multipart_size = self.size

    def _initialize_from_dict(self, data: dict):
        """Initialize the AwsBlob object from a dictionary.

        Parameters
        ----------
        data : dict
            The dictionary containing the blob data.
        """
        self.bucket = data.get("bucket")
        self.hashes = data.get("hashes")
        self.host = data.get("host")
        self.name = data.get("name")
        self.path_in_asset = data.get("path_in_asset")
        self.size = data.get("size")
        self.url = data.get("url")

    @property
    def content_type(self):
        """Get the content type of the blob.

        Refactored into a property to avoid extra network calls involved in getting the content-type of s3

        Returns the content type of the blob. If the blob is an S3 object, it returns the content type of
        the S3 object. If the blob is an S3 object summary, it returns the MIME type of the blob based on
        its name or the content type of the S3 object summary.

        Returns
        -------
        str
            The content type of the blob.
        """
        if hasattr(self._aws_obj, 'content_type'):  # s3.Object
            return getattr(self._aws_obj, 'content_type')
        else:
            # note: this is refactored into a separate property for better list-blobs performance,
            # self._parse_content_type(obj=s3_summary_obj)
            # makes this slower by 0.6 seconds per call, so if you are checking 500 objects,
            # it would take about 4 minutes which is painfully slow
            return FileUtils.mime_type(self.name) or self._parse_content_type(obj=self._aws_obj)

    @property
    def is_file(self):
        """Check if the blob is a file.

        Refactored into property because of content_type refactoring above.

        Returns
        -------
        bool
            True if the blob is a file, False otherwise.

        Notes
        -----
        aws returns files + directory unlike gs which returns only files.
        """
        # self.is_file = bool(self.content_type != 'application/x-directory')
        return bool(self.content_type != 'application/x-directory')

    @property
    def multipart_size(self) -> int:
        """Get the part size of the multipart upload.

        Returns
        -------
        int
            The part size of the multipart upload.

        Raises
        ------
        AssetException
            If the blob is a multipart upload but the multipart_size is not initialized.
        """
        if self.is_multipart and not self._multipart_size:
            raise AssetException("Multipart size is not initialized. Please set the multipart size.")
        return self._multipart_size

    @multipart_size.setter
    def multipart_size(self, size: int):
        """Set the part size of the multipart upload.

        Parameters
        ----------
        size : int
            The part size of the multipart upload.
        """
        self._multipart_size = size

    @property
    def is_multipart(self) -> bool:
        """Check if the blob is a multipart upload.

        Returns True if the ETag of the blob indicates a multipart upload, False otherwise.

        Returns
        -------
        bool
            True if the blob is a multipart upload, False otherwise.
        """
        # multipart etags have the form "<md5>-<num_parts>"
        return bool('-' in self.hashes.get("etag", ""))

    def _parse_etag(self, etag: str) -> list:
        """Parse the ETag of the blob if it is multipart.

        Parameters
        ----------
        etag : str
            The ETag of the blob.

        Returns
        -------
        list
            A list containing the ETag and the number of parts in the upload.
        """
        # remove extra quotes (S3 etags arrive wrapped in double quotes)
        return etag[1:-1].split("-")

    def _parse_content_type(self, obj: Any):
        """Get the content-type of an S3 object summary.

        Makes a network call (``obj.get()``), hence used only as a fallback.

        Parameters
        ----------
        obj : Any
            The S3 object summary.

        Returns
        -------
        str
            The content-type of the S3 object summary, or None on failure.
        """
        try:
            return obj.get()['ContentType']
        except Exception as e:
            # best-effort: log and fall through, content-type stays unknown
            self.log.info(str(e))
            return None

    def compute_hash(self) -> tuple:
        # hashes come from S3 metadata; computing locally is not supported
        raise NotImplementedError

    def get_hash_preferences(self) -> list:
        # extend the base preferences with the S3-specific etag hash
        return [*super().get_hash_preferences(), "etag"]
@@ -0,0 +1,222 @@
1
+ import os
2
+ from functools import cached_property
3
+ from typing import Union
4
+
5
+ import boto3
6
+ import botocore
7
+ from botocore.errorfactory import ClientError
8
+
9
+ from asset_pluggy.storage import StorageData, StorageURL, BlobStoreURL
10
+ from asset_pluggy.storage import storage_utils
11
+ from asset_plugin_s3.aws_auth import AwsAuth, get_aws_id_k_date
12
+ from asset_plugin_s3.aws_blob import AwsBlob
13
+ from asset_plugin_s3.transporter import AsyncAwsTransporter
14
+ from asset_utils.utils import utils
15
+
16
+
17
class AwsHttpHandler:
    """AWS Http Handler class for handling AWS S3 operations using boto3 http requests."""

    def __init__(self, credentials: dict = None):
        # raw credentials passed by the caller; resolved lazily in `credentials`
        self.cred = credentials

    @cached_property
    def credentials(self) -> dict:
        """Fetches and returns AWS credentials.

        This method first checks if the AWS credentials are provided by the user. If not, it fetches the
        aws_access_key_id and k_date from the server. It also updates the auth type map to redirect to the
        custom AwsAuth class.

        Returns
        -------
        dict
            A dictionary containing AWS credentials.
        """
        cred = self.cred
        # update the auth type map to redirect to the custom AwsAuth class
        botocore.auth.AUTH_TYPE_MAPS.update({"s3v4": AwsAuth})
        # Credentials are complete only when BOTH aws_access_key_id and aws_secret_access_key
        # are provided by the user; otherwise fetch aws_access_key_id and k_date.
        # fix: the original `A not in cred and B not in cred` parsed as "both missing",
        # so a dict with only one of the two keys was wrongly treated as complete.
        if not cred or "aws_access_key_id" not in cred or "aws_secret_access_key" not in cred:
            server_creds = get_aws_id_k_date()  # TODO: fetch it from the server instead of using a test function
            cred = {} if cred is None else cred
            cred["aws_access_key_id"] = server_creds["aws_access_key_id"]
            # empty secret makes AwsAuth fall back to the K_DATE signing path
            cred["aws_secret_access_key"] = ""
            os.environ["ASSET_K_DATE"] = server_creds["k_date"]  # set the k_date in the environment variable
        return cred

    def allows_object_add(self):
        """Checks if object addition is allowed."""
        return True

    def allows_proxy(self):
        """Checks if proxy is allowed."""
        return True

    def get_transporter(self) -> AsyncAwsTransporter:
        """Returns an instance of AsyncAwsTransporter.

        The AsyncAwsTransporter is initialized with the AWS credentials.

        Returns
        -------
        Transporter
            An instance of AsyncAwsTransporter.
        """
        return AsyncAwsTransporter.shared(credentials=self.credentials)

    @property
    def s3_client(self):
        # a fresh client per access: cheap to build, avoids stale credentials
        return boto3.client('s3', **self.credentials)

    @property
    def s3_resource(self):
        # a fresh resource per access, same rationale as s3_client
        return boto3.resource('s3', **self.credentials)

    def get_storage_url(self, url_string: str, ignore: str = None) -> StorageURL:
        """Wraps a raw URL string into a BlobStoreURL."""
        return BlobStoreURL(url=url_string, ignore=ignore)

    def get_blob(self, url_string: str) -> AwsBlob:
        """Get the blob instance from the given URL."""
        aws_url = BlobStoreURL(url=url_string)
        blob_data = self.fetch_blob_data(url=aws_url)
        return AwsBlob(data=blob_data, url_object=aws_url)

    def fetch_blob_data(self, url: BlobStoreURL):
        """Fetches the s3.Object for the given parsed URL."""
        return self.fetch_data_from_bucket(bucket_name=url.bucket, blob_name=url.path)

    def fetch_data_from_bucket(self, bucket_name, blob_name):
        """Returns the s3.Object handle for bucket/key (no network call yet)."""
        return self.s3_resource.Object(bucket_name, blob_name)

    def url_is_file(self, url: Union[StorageURL, str]) -> bool:
        """Checks if the URL is a file.

        Blobs are files, so if a blob exists then it's a file
        else either the url doesn't exist or it's a directory
        """
        if isinstance(url, str):
            url = BlobStoreURL(url=url)
        return self.check_if_blob_exists(url)

    def blob_exists(self, url_string: str) -> bool:
        """Checks if a blob exists at the given URL."""
        return self.check_if_blob_exists(url=BlobStoreURL(url=url_string))

    def check_if_blob_exists(self, url: BlobStoreURL) -> bool:
        """Returns True if a HEAD request succeeds for the blob at `url`."""
        # no blob path means the url is a bucket
        if not url.path:
            return False
        try:
            exists = self.s3_client.head_object(Bucket=url.bucket, Key=url.path)
            return bool(exists)
        except ClientError:
            # head_object raises (e.g. 404) when the key does not exist
            return False

    def list_blobs(self, url: Union[str, StorageURL], ignore: str = None) -> [StorageData]:
        """Returns a list of AwsBlobs located at the url."""
        if isinstance(url, str):
            url = BlobStoreURL(url=url, ignore=ignore)
        blobs_list = self.fetch_blobs_list(url=url)
        return list(map(lambda x: AwsBlob(data=x, url_object=url), blobs_list))

    def fetch_blobs_list(self, url: BlobStoreURL):
        """Lists objects under `url.path` in the bucket, filtered by pattern/ignore."""
        aws_bucket = self.s3_resource.Bucket(url.bucket)
        # fetch blobs from the bucket filtered by prefix
        blobs = list(aws_bucket.objects.filter(Prefix=url.path))
        # filter blobs based on pattern and ignore
        return storage_utils.filter_blobs(blobs=blobs,
                                          name_key="key",
                                          pattern=url.pattern,
                                          ignore=url.ignore)

    def delete_blobs(self, url_strings: [str]) -> None:
        """Deletes blobs at the given URLs."""
        self.delete_blob_urls(urls=list(map(lambda x: BlobStoreURL(url=x), url_strings)))

    def delete_blob_urls(self, urls: [BlobStoreURL]):
        """Deletes the given parsed URLs, batching the deletes per bucket."""
        # group by bucket
        groups = {}
        for url in urls:
            keys_list = groups.get(url.bucket, [])
            keys_list.append(url.path)
            groups[url.bucket] = keys_list
        # delete batch by batch
        for bucket in groups:
            self.batch_delete_s3(s3_client=self.s3_client, bucket=bucket, keys_list=groups.get(bucket))

    def batch_delete_s3(self, s3_client, bucket: str, keys_list: list):
        """Issues delete_objects calls in batches of 100 keys."""
        for batch in utils.batch(keys_list, batch_size=100):
            batch_to_delete = list(map(lambda x: {"Key": x}, batch))
            s3_client.delete_objects(Bucket=bucket, Delete={'Objects': batch_to_delete, 'Quiet': True})

    def filter_duplicate_blobs(self, src_blobs: [StorageData], dst_blobs: [StorageData]) -> (list, list):
        """Filters the source blobs to determine which blobs are new and which need to be replaced in the destination.

        If a blob in `src_blobs` has the same path_in_asset as a blob in `dst_blobs`, it compares their hashes.
        If the hashes are different, the blob is added to the replace_blobs list. If the path_in_asset is not
        found in `dst_blobs`, the blob is considered new and is added to the new_blobs list. For upload and download
        operations, the multipart sizes of the blobs are updated before hash comparison.

        Parameters
        ----------
        src_blobs : list
            A list of source blobs.
        dst_blobs : list
            A list of destination blobs.

        Returns
        -------
        tuple
            A tuple containing two lists: new_blobs and replace_blobs. new_blobs is a list of blobs that are new and
            replace_blobs is a list of blobs that need to be replaced in the destination.
        """
        # TODO: improve the overall implementation
        if not dst_blobs:  # nothing to filter against
            return src_blobs, []
        if all(isinstance(blob, AwsBlob) for blob in src_blobs) and all(
                isinstance(blob, AwsBlob) for blob in dst_blobs):  # asset cp remote copy
            new_blobs, replace_blobs = [], []
            # compare the path_in_asset and hash of the blobs
            dst_blob_map = {obj.path_in_asset: obj for obj in dst_blobs}
            for src_blob in src_blobs:
                if src_blob.path_in_asset in dst_blob_map:
                    # no need to update the multipart sizes before hash comparison
                    if not src_blob.compare_hash(dst_blob_map[src_blob.path_in_asset]):
                        replace_blobs.append(src_blob)
                else:
                    # new path_in_asset new object
                    new_blobs.append(src_blob)
            return new_blobs, replace_blobs

        # src_blobs or dst_blobs must be PosixBlob objects
        new_blobs, replace_blobs = [], []
        # compare the path_in_asset and hash of the blobs
        dst_blob_map = {obj.path_in_asset: obj for obj in dst_blobs}
        need_hash_compare = []
        for src_blob in src_blobs:
            if src_blob.path_in_asset in dst_blob_map:
                # need to compare hash
                need_hash_compare.append(src_blob)
            else:
                # new path_in_asset new object
                new_blobs.append(src_blob)

        if all(isinstance(blob, AwsBlob) for blob in need_hash_compare):  # asset cp download
            # update the multipart sizes of the blobs that need hash comparison
            self.get_transporter().update_multipart_blobs(blobs=need_hash_compare)
            for src_blob in need_hash_compare:
                posix_blob = dst_blob_map[src_blob.path_in_asset]
                if not posix_blob.compare_hash(src_blob):
                    replace_blobs.append(src_blob)
        else:  # asset cp upload
            # dst_blobs must be AwsBlob objects, update the multipart sizes
            self.get_transporter().update_multipart_blobs(
                blobs=[dst_blob_map[obj.path_in_asset] for obj in need_hash_compare])
            for posix_blob in need_hash_compare:
                dst_blob = dst_blob_map[posix_blob.path_in_asset]
                if not posix_blob.compare_hash(dst_blob):
                    replace_blobs.append(posix_blob)

        return new_blobs, replace_blobs