amapy-plugin-s3 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amapy-plugin-s3-1.0.0/PKG-INFO +16 -0
- amapy-plugin-s3-1.0.0/README.md +1 -0
- amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/PKG-INFO +16 -0
- amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/SOURCES.txt +42 -0
- amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/dependency_links.txt +1 -0
- amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/requires.txt +5 -0
- amapy-plugin-s3-1.0.0/amapy_plugin_s3.egg-info/top_level.txt +1 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/__init__.py +1 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_auth.py +79 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_blob.py +231 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_http_handler.py +222 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_mount_handler.py +96 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_storage.py +57 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/aws_storage_mixin.py +44 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/bucket_cors.py +67 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/async_mount_copy.py +48 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/async_mount_transporter.py +60 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/mounted_resource.py +20 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/mounted_url.py +25 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/tests/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/mounted_bucket/tests/test_mounted_url.py +73 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/s3_proxy.py +58 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/signed_url.py +41 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/tests/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/tests/test_aws_storage.py +185 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/__init__.py +2 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_aws_transporter.py +68 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_copy.py +272 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_download.py +70 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_update_blob.py +37 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/async_upload.py +53 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/tests/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/async_aws/tests/test_async_aws_transporter.py +184 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/aws_transport_resource.py +154 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/async_download.py +72 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/async_upload.py +59 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/legacy_aws_transporter.py +20 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/tests/__init__.py +0 -0
- amapy-plugin-s3-1.0.0/asset_plugin_s3/transporter/legacy_aws/tests/test_legacy_aws_transporter.py +131 -0
- amapy-plugin-s3-1.0.0/pyproject.toml +28 -0
- amapy-plugin-s3-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: amapy-plugin-s3
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: The S3 plugin for the asset manager project. It provides functionalities to manage and integrate Amazon S3 storage within the asset management system.
|
|
5
|
+
Author-email: Swarup Mahanti <swarup.mahanti@roche.com>
|
|
6
|
+
Maintainer-email: Swarup Mahanti <swarup.mahanti@roche.com>
|
|
7
|
+
License: Copyright (c) 2024 Roche Diagnostics Computation Science & Informatics
|
|
8
|
+
Requires-Python: <3.11,>=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: aioboto3<=10.2.0
|
|
11
|
+
Requires-Dist: amapy-contents==1.*
|
|
12
|
+
Requires-Dist: amapy-pluggy==1.*
|
|
13
|
+
Requires-Dist: amapy-utils==1.*
|
|
14
|
+
Requires-Dist: backoff==2.2.*
|
|
15
|
+
|
|
16
|
+
asset-plugin-s3
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
asset-plugin-s3
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: amapy-plugin-s3
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: The S3 plugin for the asset manager project. It provides functionalities to manage and integrate Amazon S3 storage within the asset management system.
|
|
5
|
+
Author-email: Swarup Mahanti <swarup.mahanti@roche.com>
|
|
6
|
+
Maintainer-email: Swarup Mahanti <swarup.mahanti@roche.com>
|
|
7
|
+
License: Copyright (c) 2024 Roche Diagnostics Computation Science & Informatics
|
|
8
|
+
Requires-Python: <3.11,>=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: aioboto3<=10.2.0
|
|
11
|
+
Requires-Dist: amapy-contents==1.*
|
|
12
|
+
Requires-Dist: amapy-pluggy==1.*
|
|
13
|
+
Requires-Dist: amapy-utils==1.*
|
|
14
|
+
Requires-Dist: backoff==2.2.*
|
|
15
|
+
|
|
16
|
+
asset-plugin-s3
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
amapy_plugin_s3.egg-info/PKG-INFO
|
|
4
|
+
amapy_plugin_s3.egg-info/SOURCES.txt
|
|
5
|
+
amapy_plugin_s3.egg-info/dependency_links.txt
|
|
6
|
+
amapy_plugin_s3.egg-info/requires.txt
|
|
7
|
+
amapy_plugin_s3.egg-info/top_level.txt
|
|
8
|
+
asset_plugin_s3/__init__.py
|
|
9
|
+
asset_plugin_s3/aws_auth.py
|
|
10
|
+
asset_plugin_s3/aws_blob.py
|
|
11
|
+
asset_plugin_s3/aws_http_handler.py
|
|
12
|
+
asset_plugin_s3/aws_mount_handler.py
|
|
13
|
+
asset_plugin_s3/aws_storage.py
|
|
14
|
+
asset_plugin_s3/aws_storage_mixin.py
|
|
15
|
+
asset_plugin_s3/bucket_cors.py
|
|
16
|
+
asset_plugin_s3/s3_proxy.py
|
|
17
|
+
asset_plugin_s3/signed_url.py
|
|
18
|
+
asset_plugin_s3/mounted_bucket/__init__.py
|
|
19
|
+
asset_plugin_s3/mounted_bucket/async_mount_copy.py
|
|
20
|
+
asset_plugin_s3/mounted_bucket/async_mount_transporter.py
|
|
21
|
+
asset_plugin_s3/mounted_bucket/mounted_resource.py
|
|
22
|
+
asset_plugin_s3/mounted_bucket/mounted_url.py
|
|
23
|
+
asset_plugin_s3/mounted_bucket/tests/__init__.py
|
|
24
|
+
asset_plugin_s3/mounted_bucket/tests/test_mounted_url.py
|
|
25
|
+
asset_plugin_s3/tests/__init__.py
|
|
26
|
+
asset_plugin_s3/tests/test_aws_storage.py
|
|
27
|
+
asset_plugin_s3/transporter/__init__.py
|
|
28
|
+
asset_plugin_s3/transporter/aws_transport_resource.py
|
|
29
|
+
asset_plugin_s3/transporter/async_aws/__init__.py
|
|
30
|
+
asset_plugin_s3/transporter/async_aws/async_aws_transporter.py
|
|
31
|
+
asset_plugin_s3/transporter/async_aws/async_copy.py
|
|
32
|
+
asset_plugin_s3/transporter/async_aws/async_download.py
|
|
33
|
+
asset_plugin_s3/transporter/async_aws/async_update_blob.py
|
|
34
|
+
asset_plugin_s3/transporter/async_aws/async_upload.py
|
|
35
|
+
asset_plugin_s3/transporter/async_aws/tests/__init__.py
|
|
36
|
+
asset_plugin_s3/transporter/async_aws/tests/test_async_aws_transporter.py
|
|
37
|
+
asset_plugin_s3/transporter/legacy_aws/__init__.py
|
|
38
|
+
asset_plugin_s3/transporter/legacy_aws/async_download.py
|
|
39
|
+
asset_plugin_s3/transporter/legacy_aws/async_upload.py
|
|
40
|
+
asset_plugin_s3/transporter/legacy_aws/legacy_aws_transporter.py
|
|
41
|
+
asset_plugin_s3/transporter/legacy_aws/tests/__init__.py
|
|
42
|
+
asset_plugin_s3/transporter/legacy_aws/tests/test_legacy_aws_transporter.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
asset_plugin_s3
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .aws_storage import AwsStoragePlugin
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import hmac
|
|
2
|
+
import os
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from hashlib import sha256
|
|
5
|
+
|
|
6
|
+
from botocore.auth import S3SigV4Auth
|
|
7
|
+
|
|
8
|
+
from asset_pluggy.storage.storage_credentials import StorageCredentials
|
|
9
|
+
from asset_utils.common import exceptions
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_aws_id_k_date() -> dict:
    """Compute the AWS SigV4 date key (k_date) for the current UTC day.

    TODO: will move it into the server, added for testing purpose only.

    Returns
    -------
    dict
        A dict with two keys:
        - "aws_access_key_id": the access key id from the shared credentials
        - "k_date": hex-encoded HMAC-SHA256 of today's UTC date (YYYYMMDD),
          keyed with "AWS4" + the secret access key (the first step of the
          SigV4 signing-key derivation)
    """
    # local import keeps the module-level import block untouched
    from datetime import timezone

    credentials = StorageCredentials.shared().credentials
    secret_key = credentials.get("aws_secret_access_key")
    # always use the UTC time for k_date; datetime.utcnow() is deprecated since Python 3.12
    today = datetime.now(timezone.utc).strftime('%Y%m%d')
    hex_k_date = hmac.new(f"AWS4{secret_key}".encode("utf-8"), today.encode("utf-8"), sha256).hexdigest()
    return {"aws_access_key_id": credentials.get("aws_access_key_id"),
            "k_date": hex_k_date}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AwsAuth(S3SigV4Auth):
    """S3 SigV4 signer that can fall back to a pre-derived K_DATE.

    When no secret key is configured, the SigV4 key derivation is resumed
    from the K_DATE hex string exposed through the ASSET_K_DATE environment
    variable instead of starting from the raw secret key.
    """

    def signature(self, string_to_sign, request):
        """Return the SigV4 signature for the given string and request.

        Parameters
        ----------
        string_to_sign : str
            The string to sign
        request : obj
            The request object

        Returns
        -------
        str
            The signature for the given string and request
        """
        # with a real secret key, defer to the standard botocore signing path
        if self.credentials.secret_key:
            return super().signature(string_to_sign, request)
        return self._sign_with_k_date(string_to_sign)

    def _sign_with_k_date(self, string_to_sign):
        """Return the signature for the given string using K_DATE from the environment.

        Parameters
        ----------
        string_to_sign : str
            The string to sign

        Returns
        -------
        str
            The signature for the given string using K_DATE

        Raises
        ------
        AssetException
            If K_DATE is not found in the environment
        """
        env_k_date = os.environ.get("ASSET_K_DATE")
        if not env_k_date:
            raise exceptions.AssetException("ASSET_K_DATE not found in environment")
        # resume the SigV4 derivation chain: k_date -> k_region -> k_service -> k_signing
        key = bytes.fromhex(env_k_date)
        for message in (self._region_name, self._service_name, "aws4_request"):
            key = self._sign(key, message)
        return self._sign(key, string_to_sign, hex=True)
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from asset_pluggy.storage import BlobData
|
|
4
|
+
from asset_pluggy.storage import BlobStoreURL
|
|
5
|
+
from asset_utils.common.exceptions import AssetException
|
|
6
|
+
from asset_utils.utils.file_utils import FileUtils
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AwsBlob(BlobData):
    """Blob data wrapper for objects stored in AWS S3.

    Wraps one of three backing representations: a plain dict (JSON from
    S3Proxy), an s3.Object, or an s3.ObjectSummary.
    """
    _aws_obj = None  # backing object: dict, s3.Object or s3.ObjectSummary
    _multipart_size = None  # part size of a multipart upload, set lazily

    # s3.Object checksum attributes mapped to our hash keys.
    # note: aws returns inconsistent checksum keys - sometimes an attribute is
    # present but its value is null - so each value is re-checked before use
    _CHECKSUM_ATTRS = (
        ("crc32", "checksum_crc32"),
        ("crc32_c", "checksum_crc32_c"),
        ("sha1", "checksum_sha1"),
        ("sha256", "checksum_sha256"),
    )

    def initialize(self, data: Any, url_object: BlobStoreURL):
        """Initializes the AwsBlob object with data from an S3 object or S3 object summary.

        Parameters
        ----------
        data : Any
            A dict or s3.ObjectSummary or s3.Object.
        url_object : BlobStoreURL
            The URL of the blob.

        Raises
        ------
        AssetException
            If the blob can not be initialized from the given data.
        """
        self._aws_obj = data
        try:
            if isinstance(self._aws_obj, dict):  # json data from S3Proxy
                self._initialize_from_dict(data=self._aws_obj)
                return  # dict payload already carries host and url
            elif hasattr(self._aws_obj, 'content_type'):  # s3.Object
                self._initialize_from_s3_object(s3_obj=self._aws_obj)
            else:  # s3.ObjectSummary
                self._initialize_from_s3_summary_object(s3_summary_obj=self._aws_obj)
        except Exception as e:
            raise AssetException(f"Error initializing blob from {url_object.url}") from e
        self.host = url_object.host
        self.url = url_object.url_for_blob(host=self.host, bucket=self.bucket, name=self.name)

    def _initialize_from_s3_object(self, s3_obj: Any):
        """Initializes the AwsBlob object with data from an S3 object.

        Parameters
        ----------
        s3_obj : Any
            The s3.Object.
        """
        self.bucket = s3_obj.bucket_name
        self.name = s3_obj.key
        self.size = s3_obj.content_length
        # s3 mostly returns content-type as binary/octet-stream, so content_type
        # is resolved lazily in the content_type property instead of here.
        # check all possible checksums; the structure here is different than s3.ObjectSummary
        for hash_key, attr_name in self._CHECKSUM_ATTRS:
            value = getattr(s3_obj, attr_name, None)
            if value:  # extra protection to ensure a hash can not be null
                self.hashes[hash_key] = value
        self._record_etag(s3_obj.e_tag)

    def _initialize_from_s3_summary_object(self, s3_summary_obj: Any):
        """Initializes the AwsBlob object with data from an S3 object summary.

        Parameters
        ----------
        s3_summary_obj : Any
            The s3.ObjectSummary.
        """
        self.bucket = s3_summary_obj.bucket_name
        self.name = s3_summary_obj.key
        self.size = s3_summary_obj.size
        self._record_etag(s3_summary_obj.e_tag)

    def _record_etag(self, e_tag: str):
        """Store the etag hash and, for single-part multipart etags, the part size.

        A multipart etag has 2 parts separated by "-": the first part is the
        md5, the second is the number of parts the file was uploaded in.

        Parameters
        ----------
        e_tag : str
            The raw (quoted) ETag value from S3.
        """
        self.hashes["etag"] = e_tag
        if '-' in e_tag and self._parse_etag(e_tag)[1] == '1':
            # if the number of parts is 1, then self.size is the multipart size
            self._multipart_size = self.size

    def _initialize_from_dict(self, data: dict):
        """Initialize the AwsBlob object from a dictionary.

        Parameters
        ----------
        data : dict
            The dictionary containing the blob data.
        """
        self.bucket = data.get("bucket")
        self.hashes = data.get("hashes")
        self.host = data.get("host")
        self.name = data.get("name")
        self.path_in_asset = data.get("path_in_asset")
        self.size = data.get("size")
        self.url = data.get("url")

    @property
    def content_type(self):
        """Get the content type of the blob.

        Refactored into a property to avoid extra network calls involved in getting the content-type of s3.

        Returns the content type of the blob. If the blob is an S3 object, it returns the content type of
        the S3 object. If the blob is an S3 object summary, it returns the MIME type of the blob based on
        its name or the content type of the S3 object summary.

        Returns
        -------
        str
            The content type of the blob.
        """
        if hasattr(self._aws_obj, 'content_type'):  # s3.Object
            return getattr(self._aws_obj, 'content_type')
        # note: this is refactored into a separate property for better list-blobs
        # performance; calling self._parse_content_type(obj=...) eagerly makes
        # this slower by 0.6 seconds per call, so checking 500 objects would
        # take about 4 minutes which is painfully slow
        return FileUtils.mime_type(self.name) or self._parse_content_type(obj=self._aws_obj)

    @property
    def is_file(self):
        """Check if the blob is a file.

        Refactored into property because of content_type refactoring above.

        Returns
        -------
        bool
            True if the blob is a file, False otherwise.

        Notes
        -----
        aws returns files + directory unlike gs which returns only files.
        """
        return bool(self.content_type != 'application/x-directory')

    @property
    def multipart_size(self) -> int:
        """Get the part size of the multipart upload.

        Returns
        -------
        int
            The part size of the multipart upload.

        Raises
        ------
        AssetException
            If the blob is a multipart upload but the multipart_size is not initialized.
        """
        if self.is_multipart and not self._multipart_size:
            raise AssetException("Multipart size is not initialized. Please set the multipart size.")
        return self._multipart_size

    @multipart_size.setter
    def multipart_size(self, size: int):
        """Set the part size of the multipart upload.

        Parameters
        ----------
        size : int
            The part size of the multipart upload.
        """
        self._multipart_size = size

    @property
    def is_multipart(self) -> bool:
        """Check if the blob is a multipart upload.

        Returns True if the ETag of the blob indicates a multipart upload, False otherwise.

        Returns
        -------
        bool
            True if the blob is a multipart upload, False otherwise.
        """
        return bool('-' in self.hashes.get("etag", ""))

    def _parse_etag(self, etag: str) -> list:
        """Parse the ETag of the blob if it is multipart.

        Parameters
        ----------
        etag : str
            The ETag of the blob.

        Returns
        -------
        list
            A list containing the ETag and the number of parts in the upload.
        """
        # remove extra quotes, then split "<md5>-<num_parts>"
        return etag[1:-1].split("-")

    def _parse_content_type(self, obj: Any):
        """Get the content-type of an S3 object summary.

        Parameters
        ----------
        obj : Any
            The S3 object summary.

        Returns
        -------
        str
            The content-type of the S3 object summary, or None if it can not be fetched.
        """
        try:
            return obj.get()['ContentType']
        except Exception as e:
            # best-effort lookup: log and fall through to None rather than fail
            self.log.info(str(e))
            return None

    def compute_hash(self) -> tuple:
        """Not supported for remote S3 blobs; hashes come from object metadata."""
        raise NotImplementedError

    def get_hash_preferences(self) -> list:
        """Return the base hash preferences extended with the S3 etag."""
        return [*super().get_hash_preferences(), "etag"]
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import cached_property
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
import boto3
|
|
6
|
+
import botocore
|
|
7
|
+
from botocore.errorfactory import ClientError
|
|
8
|
+
|
|
9
|
+
from asset_pluggy.storage import StorageData, StorageURL, BlobStoreURL
|
|
10
|
+
from asset_pluggy.storage import storage_utils
|
|
11
|
+
from asset_plugin_s3.aws_auth import AwsAuth, get_aws_id_k_date
|
|
12
|
+
from asset_plugin_s3.aws_blob import AwsBlob
|
|
13
|
+
from asset_plugin_s3.transporter import AsyncAwsTransporter
|
|
14
|
+
from asset_utils.utils import utils
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AwsHttpHandler:
    """AWS Http Handler class for handling AWS S3 operations using boto3 http requests."""

    def __init__(self, credentials: dict = None):
        # raw credentials as supplied by the caller; may be None, in which case
        # the `credentials` property falls back to the server-provided k_date flow
        self.cred = credentials

    @cached_property
    def credentials(self) -> dict:
        """Fetches and returns AWS credentials.

        This method first checks if the AWS credentials are provided by the user. If not, it fetches the
        aws_access_key_id and k_date from the server. It also updates the auth type map to redirect to the
        custom AwsAuth class.

        Returns
        -------
        dict
            A dictionary containing AWS credentials.
        """
        cred = self.cred
        # cred = None  # for testing purposes
        # update the auth type map to redirect to the custom AwsAuth class
        botocore.auth.AUTH_TYPE_MAPS.update({"s3v4": AwsAuth})
        # if the aws_access_key_id and aws_secret_access_key are there it means the user has
        # provided the credentials; if not, we fetch the aws_access_key_id and k_date from the server.
        # NOTE(review): because `and` binds tighter than `or`, this reads as
        # "not cred or (BOTH keys missing)" - a dict with only one of the two
        # keys skips the server fetch; confirm this is the intended behavior.
        if not cred or "aws_access_key_id" not in cred and "aws_secret_access_key" not in cred:
            server_creds = get_aws_id_k_date()  # TODO: fetch it from the server instead of using a test function
            cred = {} if cred is None else cred
            cred["aws_access_key_id"] = server_creds["aws_access_key_id"]
            # empty secret key makes AwsAuth fall back to the K_DATE signing path
            cred["aws_secret_access_key"] = ""
            os.environ["ASSET_K_DATE"] = server_creds["k_date"]  # set the k_date in the environment variable
        return cred

    def allows_object_add(self):
        """Checks if object addition is allowed."""
        return True

    def allows_proxy(self):
        """Checks if proxy is allowed."""
        return True

    def get_transporter(self) -> AsyncAwsTransporter:
        """Returns an instance of AsyncAwsTransporter.

        The AsyncAwsTransporter is initialized with the AWS credentials.

        Returns
        -------
        Transporter
            An instance of AsyncAwsTransporter.
        """
        return AsyncAwsTransporter.shared(credentials=self.credentials)

    @property
    def s3_client(self):
        # low-level boto3 client built from the (cached) credentials dict
        return boto3.client('s3', **self.credentials)

    @property
    def s3_resource(self):
        # high-level boto3 resource built from the (cached) credentials dict
        return boto3.resource('s3', **self.credentials)

    def get_storage_url(self, url_string: str, ignore: str = None) -> StorageURL:
        """Wrap the url string in a BlobStoreURL, optionally with an ignore pattern."""
        return BlobStoreURL(url=url_string, ignore=ignore)

    def get_blob(self, url_string: str) -> AwsBlob:
        """Get the blob instance from the given URL."""
        aws_url = BlobStoreURL(url=url_string)
        blob_data = self.fetch_blob_data(url=aws_url)
        return AwsBlob(data=blob_data, url_object=aws_url)

    def fetch_blob_data(self, url: BlobStoreURL):
        """Return the raw s3 object handle for the given blob url."""
        return self.fetch_data_from_bucket(bucket_name=url.bucket, blob_name=url.path)

    def fetch_data_from_bucket(self, bucket_name, blob_name):
        """Return an s3.Object handle for the given bucket and key."""
        return self.s3_resource.Object(bucket_name, blob_name)

    def url_is_file(self, url: Union[StorageURL, str]) -> bool:
        """Checks if the URL is a file.

        Blobs are files, so if a blob exists then it's a file
        else either the url doesn't exist or it's a directory
        """
        if type(url) is str:
            url = BlobStoreURL(url=url)
        return self.check_if_blob_exists(url)

    def blob_exists(self, url_string: str) -> bool:
        """Checks if a blob exists at the given URL."""
        return self.check_if_blob_exists(url=BlobStoreURL(url=url_string))

    def check_if_blob_exists(self, url: BlobStoreURL) -> bool:
        """Return True if a head_object call for the url's bucket/key succeeds."""
        # no blob path means the url is a bucket
        if not url.path:
            return False
        try:
            exists = self.s3_client.head_object(Bucket=url.bucket, Key=url.path)
            return bool(exists)
        except ClientError:
            # any client error (e.g. missing key or no access) is treated as "not found"
            return False

    def list_blobs(self, url: Union[str, StorageURL], ignore: str = None) -> [StorageData]:
        """Returns a list of AwsBlobs located at the url."""
        if type(url) is str:
            url = BlobStoreURL(url=url, ignore=ignore)
        blobs_list = self.fetch_blobs_list(url=url)
        return list(map(lambda x: AwsBlob(data=x, url_object=url), blobs_list))

    def fetch_blobs_list(self, url: BlobStoreURL):
        """List the raw s3 objects under the url prefix, filtered by pattern and ignore."""
        aws_bucket = self.s3_resource.Bucket(url.bucket)
        # fetch blobs from the bucket filtered by prefix
        blobs = list(aws_bucket.objects.filter(Prefix=url.path))
        # filter blobs based on pattern and ignore
        return storage_utils.filter_blobs(blobs=blobs,
                                          name_key="key",
                                          pattern=url.pattern,
                                          ignore=url.ignore)

    def delete_blobs(self, url_strings: [str]) -> None:
        """Deletes blobs at the given URLs."""
        self.delete_blob_urls(urls=list(map(lambda x: BlobStoreURL(url=x), url_strings)))

    def delete_blob_urls(self, urls: [BlobStoreURL]):
        """Delete the given blob urls, batching the delete calls per bucket."""
        # group by bucket
        groups = {}
        for url in urls:
            keys_list = groups.get(url.bucket, [])
            keys_list.append(url.path)
            groups[url.bucket] = keys_list
        # delete batch by batch
        for bucket in groups:
            self.batch_delete_s3(s3_client=self.s3_client, bucket=bucket, keys_list=groups.get(bucket))

    def batch_delete_s3(self, s3_client, bucket: str, keys_list: list):
        """Delete the given keys from the bucket in batches of 100 using delete_objects."""
        for batch in utils.batch(keys_list, batch_size=100):
            batch_to_delete = list(map(lambda x: {"Key": x}, batch))
            s3_client.delete_objects(Bucket=bucket, Delete={'Objects': batch_to_delete, 'Quiet': True})

    def filter_duplicate_blobs(self, src_blobs: [StorageData], dst_blobs: [StorageData]) -> (list, list):
        """Filters the source blobs to determine which blobs are new and which need to be replaced in the destination.

        If a blob in `src_blobs` has the same path_in_asset as a blob in `dst_blobs`, it compares their hashes.
        If the hashes are different, the blob is added to the replace_blobs list. If the path_in_asset is not
        found in `dst_blobs`, the blob is considered new and is added to the new_blobs list. For upload and download
        operations, the multipart sizes of the blobs are updated before hash comparison.

        Parameters
        ----------
        src_blobs : list
            A list of source blobs.
        dst_blobs : list
            A list of destination blobs.

        Returns
        -------
        tuple
            A tuple containing two lists: new_blobs and replace_blobs. new_blobs is a list of blobs that are new and
            replace_blobs is a list of blobs that need to be replaced in the destination.
        """
        # TODO: improve the overall implementation
        if not dst_blobs:  # nothing to filter against
            return src_blobs, []
        if all(isinstance(blob, AwsBlob) for blob in src_blobs) and all(
                isinstance(blob, AwsBlob) for blob in dst_blobs):  # asset cp remote copy
            new_blobs, replace_blobs = [], []
            # compare the path_in_asset and hash of the blobs
            dst_blob_map = {obj.path_in_asset: obj for obj in dst_blobs}
            for src_blob in src_blobs:
                if src_blob.path_in_asset in dst_blob_map:
                    # no need to update the multipart sizes before hash comparison
                    if not src_blob.compare_hash(dst_blob_map[src_blob.path_in_asset]):
                        replace_blobs.append(src_blob)
                else:
                    # new path_in_asset new object
                    new_blobs.append(src_blob)
            return new_blobs, replace_blobs

        # src_blobs or dst_blobs must be PosixBlob objects
        new_blobs, replace_blobs = [], []
        # compare the path_in_asset and hash of the blobs
        dst_blob_map = {obj.path_in_asset: obj for obj in dst_blobs}
        need_hash_compare = []
        for src_blob in src_blobs:
            if src_blob.path_in_asset in dst_blob_map:
                # need to compare hash
                need_hash_compare.append(src_blob)
            else:
                # new path_in_asset new object
                new_blobs.append(src_blob)

        if all(isinstance(blob, AwsBlob) for blob in need_hash_compare):  # asset cp download
            # update the multipart sizes of the blobs that need hash comparison
            self.get_transporter().update_multipart_blobs(blobs=need_hash_compare)
            for src_blob in need_hash_compare:
                posix_blob = dst_blob_map[src_blob.path_in_asset]
                if not posix_blob.compare_hash(src_blob):
                    replace_blobs.append(src_blob)
        else:  # asset cp upload
            # dst_blobs must be AwsBlob objects, update the multipart sizes
            self.get_transporter().update_multipart_blobs(
                blobs=[dst_blob_map[obj.path_in_asset] for obj in need_hash_compare])
            for posix_blob in need_hash_compare:
                dst_blob = dst_blob_map[posix_blob.path_in_asset]
                if not posix_blob.compare_hash(dst_blob):
                    replace_blobs.append(posix_blob)

        return new_blobs, replace_blobs
|