pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff compares publicly released versions of the package as published to their public registry, and is provided for informational purposes only.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/loader/core.py
CHANGED
```diff
@@ -1,27 +1,34 @@
 import os
 from typing import List
+
 from datamax.loader.minio_handler import MinIOClient
 from datamax.loader.oss_handler import OssClient
 
 
 class DataLoader:
-    def __init__(
-
-
+    def __init__(
+        self,
+        endpoint: str = None,
+        secret_key: str = None,
+        access_key: str = None,
+        bucket_name: str = None,
+        source: str = None,
+    ):
+        if source and source == "Oss":
             self.oss = OssClient(
                 oss_endpoint=endpoint,
                 oss_access_key_secret=secret_key,
                 oss_access_key_id=access_key,
-                oss_bucket_name=bucket_name
+                oss_bucket_name=bucket_name,
             )
-        elif source and source ==
+        elif source and source == "MinIO":
             self.mi = MinIOClient(
                 endpoint=endpoint,
                 secret_key=secret_key,
                 access_key=access_key,
-                bucket_name=bucket_name
+                bucket_name=bucket_name,
             )
-        self.download_path = str(
+        self.download_path = str("./download_file")
         self.source = source
         self.bucket_name = bucket_name
 
@@ -37,16 +44,11 @@ class DataLoader:
             return []
         elif os.path.isdir(local_file_path):
             access_path = []
-
-
-
-
-
-                        access_path.append(file_path)
-                    else:
-                        continue
-                else:
-                    continue
+            # Recursively process all files and subdirectories under the current directory.
+            for item in os.listdir(local_file_path):
+                item_path = os.path.join(local_file_path, item)
+                item_results = DataLoader.load_from_file(item_path)
+                access_path.extend(item_results)
             return access_path
         else:
             return []
```
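The rewritten `load_from_file` replaces the old walk-and-`continue` logic with plain recursion: each directory entry is joined onto the current path and fed back into `DataLoader.load_from_file`, so arbitrarily nested trees flatten into one list. A minimal standalone sketch of the same pattern (the function name and demo path are illustrative, not part of the package API):

```python
import os
from typing import List


def collect_files(path: str) -> List[str]:
    """Recursively flatten a directory tree into a list of file paths."""
    if os.path.isfile(path):
        return [path]
    if os.path.isdir(path):
        results: List[str] = []
        for item in os.listdir(path):
            # Recurse into files and subdirectories alike.
            results.extend(collect_files(os.path.join(path, item)))
        return results
    return []  # anything else (broken symlink, missing path) contributes nothing


print(collect_files("./download_file"))
```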
```diff
@@ -71,49 +73,72 @@ class DataLoader:
         return success_file_list
 
     def download(self, oss_path: str):
-        if self.source ==
-            file_list = self.mi.list_objects(
+        if self.source == "MinIO":
+            file_list = self.mi.list_objects(
+                bucket_name=self.bucket_name, prefix=oss_path
+            )
             for path in file_list:
-                self.mi.download_file(
-
+                self.mi.download_file(
+                    bucket_name=self.bucket_name,
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
         elif self.source == "Oss":
             keys = self.oss.get_objects_in_folders(prefix=oss_path)
             for path in keys:
-                self.oss.get_object_to_file(
-
+                self.oss.get_object_to_file(
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
 
     def upload(self, local_file_path: str, save_prefix: str):
-        if self.source ==
+        if self.source == "MinIO":
             if os.path.isdir(local_file_path):
                 for root, dirs, files in os.walk(local_file_path):
                     for file in files:
                         file_path = os.path.join(root, file)
-                        self.mi.upload_file(
-
+                        self.mi.upload_file(
+                            bucket_name=self.bucket_name,
+                            object_name=save_prefix + f"{file}",
+                            file_path=file_path,
+                        )
             elif os.path.isfile(local_file_path):
-                self.mi.upload_file(
-
-
+                self.mi.upload_file(
+                    bucket_name=self.bucket_name,
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
             else:
                 pass
 
         elif self.source == "Oss":
             if os.path.isdir(local_file_path):
-                self.oss.put_object_from_folder(
+                self.oss.put_object_from_folder(
+                    object_folder_name=save_prefix, local_folder_path=local_file_path
+                )
             elif os.path.isfile(local_file_path):
-                self.oss.put_object_from_file(
-
+                self.oss.put_object_from_file(
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
             else:
                 pass
 
-    def share(
-
-
-
-
-
+    def share(
+        self,
+        oss_path: str,
+        expires: int = None,
+        aliyun_oss_url_prefix: str = None,
+        csnt_url_prefix: str = None,
+    ):
+        if self.source == "MinIO":
+            return self.mi.get_object_tmp_link(
+                bucket_name=self.bucket_name, object_name=oss_path, expires=expires
+            )
         elif self.source == "Oss":
-            return self.oss.get_oss_url(
-
-
-
+            return self.oss.get_oss_url(
+                object_name=oss_path,
+                url_expires_time=expires,
+                aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+                csnt_url_prefix=csnt_url_prefix,
+            )
```
datamax/loader/minio_handler.py
CHANGED
```diff
@@ -1,11 +1,12 @@
 import os
-
+import re
 from datetime import timedelta
+
+from dotenv import load_dotenv
+from loguru import logger
 from minio import Minio
 from minio.commonconfig import Tags
 from minio.error import S3Error
-from loguru import logger
-import re
 
 load_dotenv()
 
@@ -25,7 +26,7 @@ class MinIOClient:
                 self.endpoint,
                 access_key=self.access_key,
                 secret_key=self.secret_key,
-                secure=self.secure
+                secure=self.secure,
             )
             return client
         except S3Error as e:
@@ -55,7 +56,9 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fput_object(bucket_name, object_name, file_path)
-                logger.info(
+                logger.info(
+                    f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'."
+                )
             except S3Error as e:
                 raise
 
@@ -63,15 +66,18 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fget_object(bucket_name, object_name, file_path)
-                logger.info(
+                logger.info(
+                    f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'."
+                )
                 return file_path
             except Exception as e:
                 try:
                     illegal_chars = r'[\/:*?"<>|]'
-                    file_path = re.sub(illegal_chars,
+                    file_path = re.sub(illegal_chars, "_", file_path)
                     self.client.fget_object(bucket_name, object_name, file_path)
                     logger.info(
-                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'."
+                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'."
+                    )
                     return file_path
                 except Exception as e:
                     raise
@@ -81,7 +87,9 @@ class MinIOClient:
         try:
             result_list = []
             if prefix:
-                objects = self.client.list_objects(
+                objects = self.client.list_objects(
+                    bucket_name, recursive=True, prefix=prefix
+                )
             else:
                 objects = self.client.list_objects(bucket_name, recursive=True)
             logger.info(f"Objects in bucket '{bucket_name}':")
@@ -99,8 +107,7 @@ class MinIOClient:
             raise
 
     def calculate_bucket_stats(self, bucket_name, prefix):
-        objects = self.client.list_objects(bucket_name,
-                                           prefix=prefix, recursive=True)
+        objects = self.client.list_objects(bucket_name, prefix=prefix, recursive=True)
         total_size = 0
         object_count = 0
 
@@ -115,14 +122,16 @@ class MinIOClient:
     def get_objects(self, bucket_name, object_name):
         try:
             response = self.client.get_object(bucket_name, object_name)
-            content = response.read().decode(
+            content = response.read().decode("utf-8")
             return content
         except Exception as e:
             raise
 
     def get_object_tag(self, bucket_name, object_name):
         try:
-            tags = self.client.get_object_tags(
+            tags = self.client.get_object_tags(
+                bucket_name=bucket_name, object_name=object_name
+            )
             return tags
         except Exception as e:
             raise
@@ -130,7 +139,9 @@ class MinIOClient:
     def update_object_tag(self, bucket_name, object_name, tags):
         try:
             tags_obj = Tags.new_object_tags()
-            tag_info = self.get_object_tag(
+            tag_info = self.get_object_tag(
+                bucket_name=bucket_name, object_name=object_name
+            )
             if tag_info is None:
                 tag_info = {}
             for tag_dict in tags:
@@ -142,7 +153,9 @@ class MinIOClient:
 
                 for k, v in tag_info.items():
                     tags_obj[k] = v
-                self.client.set_object_tags(
+                self.client.set_object_tags(
+                    bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+                )
             else:
                 for tag_dict in tags:
                     for tag_key, tag_value in tag_dict.items():
@@ -153,20 +166,26 @@ class MinIOClient:
 
                 for k, v in tag_info.items():
                     tags_obj[k] = v
-                self.client.set_object_tags(
+                self.client.set_object_tags(
+                    bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+                )
             return tag_info
         except Exception as e:
             raise
 
     def reset_object_tag(self, bucket_name, object_name):
        try:
-            self.client.delete_object_tags(
+            self.client.delete_object_tags(
+                bucket_name=bucket_name, object_name=object_name
+            )
             return True
         except Exception as e:
             raise
 
     def get_object_tmp_link(self, bucket_name, object_name, expires):
         try:
-            return self.client.presigned_get_object(
+            return self.client.presigned_get_object(
+                bucket_name, object_name, expires=timedelta(days=expires)
+            )
         except Exception as e:
-            raise
+            raise
```
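One behavioral detail in `download_file` worth noting: if the first `fget_object` fails, the handler strips characters that are illegal in filenames from the local path and retries. A sketch of that sanitization step (the standalone helper name is illustrative):

```python
import re

ILLEGAL_CHARS = r'[\/:*?"<>|]'  # the same pattern the handler builds inline


def sanitize_filename(file_path: str) -> str:
    """Replace characters that are invalid in local filenames with '_'."""
    return re.sub(ILLEGAL_CHARS, "_", file_path)


print(sanitize_filename("report:v2?*.pdf"))  # -> report_v2__.pdf
```

Because the pattern also matches `/`, applying it to a full path (as the handler does) flattens directory separators too, so the retry writes to a single sanitized filename.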
datamax/parser/__init__.py
CHANGED
datamax/parser/base.py
CHANGED
```diff
@@ -1,7 +1,9 @@
 import os
 from datetime import datetime
 from pathlib import Path
-from typing import List,
+from typing import Dict, List, Union
+
+from datamax.utils.lifecycle_types import LifeType
 from datamax.utils.tokenizer import DashScopeClient
 
 
@@ -10,7 +12,9 @@ class LifeCycle:
     Life cycle class
     """
 
-    def __init__(
+    def __init__(
+        self, update_time: str, life_type: list, life_metadata: Dict[str, str]
+    ):
         self.update_time = update_time  # Update time
         self.life_type = life_type  # Life cycle type
         self.life_metadata = life_metadata  # Life cycle metadata
@@ -21,14 +25,14 @@ class LifeCycle:
         self.life_metadata.update(life_metadata)
 
     def __str__(self):
-        metadata_str =
-        return f
+        metadata_str = ", ".join(f"{k}: {v}" for k, v in self.life_metadata.items())
+        return f"update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}"
 
     def to_dict(self):
         return {
-
-
-
+            "update_time": self.update_time,
+            "life_type": self.life_type,
+            "life_metadata": self.life_metadata,
         }
 
 
@@ -37,8 +41,8 @@ class MarkdownOutputVo:
     Markdown output conversion
     """
 
-    def __init__(self,
-        self.
+    def __init__(self, extension: str, content: str):
+        self.extension: str = extension  # File type
         self.content: str = content  # Markdown content
         self.lifecycle: List[LifeCycle] = []  # Life cycle data
 
@@ -47,9 +51,9 @@ class MarkdownOutputVo:
 
     def to_dict(self):
         data_dict = {
-
-
-
+            "extension": self.extension,
+            "content": self.content,
+            "lifecycle": [lc.to_dict() for lc in self.lifecycle],
         }
         return data_dict
 
@@ -58,20 +62,40 @@ class BaseLife:
     tk_client = DashScopeClient()
 
     @staticmethod
-    def generate_lifecycle(
+    def generate_lifecycle(
+        source_file: str,
+        domain: str,
+        life_type: Union[LifeType, str, List[Union[LifeType, str]]],
+        usage_purpose: str,
+    ) -> LifeCycle:
+        """
+        Build a LifeCycle record; accepts a single enum/string or a mixed list of them.
+        """
+        # 1) Normalize the input to a list first
+        if isinstance(life_type, (list, tuple)):
+            raw = list(life_type)
+        else:
+            raw = [life_type]
+
+        # 2) If an item is an enum, take its value
+        life_list: List[str] = [
+            lt.value if isinstance(lt, LifeType) else lt for lt in raw
+        ]
+
         update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-
+        try:
+            storage = os.path.getsize(source_file)
+        except Exception:
+            storage = 0
         life_metadata = {
-
-            "
-            "
-            "
-            "usage_purpose": usage_purpose  # Usage purpose
+            "storage_size": storage,
+            "source_file": source_file,
+            "domain": domain,
+            "usage_purpose": usage_purpose,
         }
-        return LifeCycle(update_time,
+        return LifeCycle(update_time, life_list, life_metadata)
 
     @staticmethod
     def get_file_extension(file_path):
         file_path = Path(file_path)
-        return file_path.suffix[1:].lower()
+        return file_path.suffix[1:].lower()
```
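`generate_lifecycle` now accepts a `LifeType` enum member, a plain string, or a mixed list of either, and normalizes everything to a list of strings before building the `LifeCycle`. The new `datamax/utils/lifecycle_types.py` is not shown in this diff, so the enum members below are placeholders; only the normalization pattern is taken from the code above:

```python
from enum import Enum
from typing import List, Union


class LifeType(Enum):
    # Placeholder members; the real ones live in datamax/utils/lifecycle_types.py,
    # whose contents are not captured in this diff.
    DATA_PROCESSING = "DATA_PROCESSING"
    DATA_CLEANING = "DATA_CLEANING"


def normalize(
    life_type: Union[LifeType, str, List[Union[LifeType, str]]]
) -> List[str]:
    """Coerce a single enum/string or a mixed list into a list of strings."""
    raw = list(life_type) if isinstance(life_type, (list, tuple)) else [life_type]
    return [lt.value if isinstance(lt, LifeType) else lt for lt in raw]


print(normalize(LifeType.DATA_CLEANING))                  # ['DATA_CLEANING']
print(normalize([LifeType.DATA_PROCESSING, "custom"]))    # ['DATA_PROCESSING', 'custom']
```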