pydatamax 0.1.13-py3-none-any.whl → 0.1.15-py3-none-any.whl
This diff reflects the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/__init__.py
CHANGED
@@ -1 +1 @@
-from .parser import DataMax
+from .parser import DataMax
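This re-export defines the package's public entry point; a minimal import sketch, assuming pydatamax 0.1.15 is installed:

from datamax import DataMax  # re-exported from datamax.parser by datamax/__init__.py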
datamax/loader/core.py
CHANGED
@@ -1,119 +1,119 @@
 import os
 from typing import List
-from datamax.loader.MinioHandler import MinIOClient
-from datamax.loader.OssHandler import OssClient
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient


 class DataLoader:
     def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
                  bucket_name: str = None, source: str = None):
         if source and source == 'Oss':
             self.oss = OssClient(
                 oss_endpoint=endpoint,
                 oss_access_key_secret=secret_key,
                 oss_access_key_id=access_key,
                 oss_bucket_name=bucket_name
             )
         elif source and source == 'MinIO':
             self.mi = MinIOClient(
                 endpoint=endpoint,
                 secret_key=secret_key,
                 access_key=access_key,
                 bucket_name=bucket_name
             )
         self.download_path = str('./download_file')
         self.source = source
         self.bucket_name = bucket_name

     @staticmethod
     def load_from_file(local_file_path) -> List[str]:
         if os.path.isfile(local_file_path):
             if os.path.exists(local_file_path):
                 if os.access(local_file_path, os.R_OK):
                     return [local_file_path]
                 else:
                     return []
             else:
                 return []
         elif os.path.isdir(local_file_path):
             access_path = []
             for root, dirs, files in os.walk(local_file_path):
                 for file in files:
                     file_path = os.path.join(root, file)
                     if os.path.exists(file_path):
                         if os.access(file_path, os.R_OK):
                             access_path.append(file_path)
                         else:
                             continue
                     else:
                         continue
             return access_path
         else:
             return []

     def load_from_oss_source(self, oss_path: str) -> List[str]:
         if not os.path.exists(self.download_path):
             os.makedirs(self.download_path)

         self.download(oss_path=oss_path)

         file_list = []
         for root, dirs, files in os.walk(self.download_path):
             for file in files:
                 file_path = os.path.join(self.download_path, file)
                 file_list.append(file_path)

         success_file_list = []
         for file_path in file_list:
             if self.load_from_file(file_path):
                 success_file_list.append(file_path)

         return success_file_list

     def download(self, oss_path: str):
         if self.source == 'MinIO':
             file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
             for path in file_list:
                 self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
                                       file_path=f'{self.download_path}/{path.split("/")[-1]}')
         elif self.source == "Oss":
             keys = self.oss.get_objects_in_folders(prefix=oss_path)
             for path in keys:
                 self.oss.get_object_to_file(object_name=path,
                                             file_path=f'{self.download_path}/{path.split("/")[-1]}')

     def upload(self, local_file_path: str, save_prefix: str):
         if self.source == 'MinIO':
             if os.path.isdir(local_file_path):
                 for root, dirs, files in os.walk(local_file_path):
                     for file in files:
                         file_path = os.path.join(root, file)
                         self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
                                             file_path=file_path)
             elif os.path.isfile(local_file_path):
                 self.mi.upload_file(bucket_name=self.bucket_name,
                                     object_name=save_prefix + os.path.basename(local_file_path),
                                     file_path=local_file_path)
             else:
                 pass

         elif self.source == "Oss":
             if os.path.isdir(local_file_path):
                 self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
             elif os.path.isfile(local_file_path):
                 self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
                                               file_path=local_file_path)
             else:
                 pass

     def share(self, oss_path: str,
               expires: int = None,
               aliyun_oss_url_prefix: str = None,
               csnt_url_prefix: str = None):
         if self.source == 'MinIO':
             return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
         elif self.source == "Oss":
             return self.oss.get_oss_url(object_name=oss_path,
                                         url_expires_time=expires,
                                         aliyun_oss_url_prefix=aliyun_oss_url_prefix,
                                         csnt_url_prefix=csnt_url_prefix)
datamax/loader/{MinioHandler.py → minio_handler.py}
RENAMED
@@ -1,172 +1,172 @@
 import os
 from dotenv import load_dotenv
 from datetime import timedelta
 from minio import Minio
 from minio.commonconfig import Tags
 from minio.error import S3Error
 from loguru import logger
 import re

 load_dotenv()


 class MinIOClient:
     def __init__(self, endpoint, access_key, secret_key, bucket_name, secure=False):
         self.endpoint = os.getenv("ENDPOINT_MINIO", endpoint)
         self.access_key = os.getenv("ACCESS_KEY_MINIO", access_key)
         self.secret_key = os.getenv("SECRET_KEY_MINIO", secret_key)
         self.bucket_name = os.getenv("BUCKET_NAME_MINIO", bucket_name)
         self.secure = secure
         self.client = self._initialize_client()

     def _initialize_client(self):
         try:
             client = Minio(
                 self.endpoint,
                 access_key=self.access_key,
                 secret_key=self.secret_key,
                 secure=self.secure
             )
             return client
         except S3Error as e:
             raise

     @staticmethod
     def bytes_to_mb(bytes_value):
         return bytes_value / (1024 * 1024)

     def create_bucket(self, bucket_name):
         if self.client:
             try:
                 self.client.make_bucket(bucket_name)
                 logger.info(f"Bucket '{bucket_name}' created successfully.")
             except S3Error as e:
                 raise

     def remove_bucket(self, bucket_name):
         if self.client:
             try:
                 self.client.remove_bucket(bucket_name)
                 logger.info(f"Bucket '{bucket_name}' removed successfully.")
             except S3Error as e:
                 raise

     def upload_file(self, file_path, bucket_name, object_name):
         if self.client:
             try:
                 self.client.fput_object(bucket_name, object_name, file_path)
                 logger.info(f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'.")
             except S3Error as e:
                 raise

     def download_file(self, bucket_name, object_name, file_path):
         if self.client:
             try:
                 self.client.fget_object(bucket_name, object_name, file_path)
                 logger.info(f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'.")
                 return file_path
             except Exception as e:
                 try:
                     illegal_chars = r'[\/:*?"<>|]'
                     file_path = re.sub(illegal_chars, '_', file_path)
                     self.client.fget_object(bucket_name, object_name, file_path)
                     logger.info(
                         f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'.")
                     return file_path
                 except Exception as e:
                     raise

     def list_objects(self, bucket_name, prefix=None):
         if self.client:
             try:
                 result_list = []
                 if prefix:
                     objects = self.client.list_objects(bucket_name, recursive=True, prefix=prefix)
                 else:
                     objects = self.client.list_objects(bucket_name, recursive=True)
                 logger.info(f"Objects in bucket '{bucket_name}':")
                 for obj in objects:
                     result_list.append(obj.object_name)
                 return result_list
             except S3Error as e:
                 raise

     def remove_object(self, bucket_name, object_name):
         if self.client:
             try:
                 self.client.remove_object(bucket_name, object_name)
             except S3Error as e:
                 raise

     def calculate_bucket_stats(self, bucket_name, prefix):
         objects = self.client.list_objects(bucket_name,
                                            prefix=prefix, recursive=True)
         total_size = 0
         object_count = 0

         for obj in objects:
             object_count += 1
             total_size += obj.size

         total_size = self.bytes_to_mb(total_size)

         return object_count, total_size

     def get_objects(self, bucket_name, object_name):
         try:
             response = self.client.get_object(bucket_name, object_name)
             content = response.read().decode('utf-8')
             return content
         except Exception as e:
             raise

     def get_object_tag(self, bucket_name, object_name):
         try:
             tags = self.client.get_object_tags(bucket_name=bucket_name, object_name=object_name)
             return tags
         except Exception as e:
             raise

     def update_object_tag(self, bucket_name, object_name, tags):
         try:
             tags_obj = Tags.new_object_tags()
             tag_info = self.get_object_tag(bucket_name=bucket_name, object_name=object_name)
             if tag_info is None:
                 tag_info = {}
                 for tag_dict in tags:
                     for tag_key, tag_value in tag_dict.items():
                         if tag_key in tag_info:
                             tag_info[tag_key] = tag_value
                         else:
                             tag_info[tag_key] = tag_value

                 for k, v in tag_info.items():
                     tags_obj[k] = v
                 self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
             else:
                 for tag_dict in tags:
                     for tag_key, tag_value in tag_dict.items():
                         if tag_key in tag_info:
                             tag_info[tag_key] = tag_value
                         else:
                             tag_info[tag_key] = tag_value

                 for k, v in tag_info.items():
                     tags_obj[k] = v
                 self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
             return tag_info
         except Exception as e:
             raise

     def reset_object_tag(self, bucket_name, object_name):
         try:
             self.client.delete_object_tags(bucket_name=bucket_name, object_name=object_name)
             return True
         except Exception as e:
             raise

     def get_object_tmp_link(self, bucket_name, object_name, expires):
         try:
             return self.client.presigned_get_object(bucket_name, object_name, expires=timedelta(days=expires))
         except Exception as e:
             raise