pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/pptx_parser.py
CHANGED
```diff
@@ -1,11 +1,13 @@
-import os
 from typing import Union
+
+from loguru import logger
 from pptx import Presentation
-
-from datamax.parser.base import MarkdownOutputVo
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 
-class PPtxParser(BaseLife):
+class PptxParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
         super().__init__()
         self.file_path = file_path
@@ -13,33 +15,59 @@ class PPtxParser(BaseLife):
     @staticmethod
     def read_ppt_file(file_path: str):
         try:
-            content =
+            content = ""
             prs = Presentation(file_path)
             for slide in prs.slides:
                 for shape in slide.shapes:
                     if shape.has_text_frame:
-                        content += shape.text +
-                    # if shape.shape_type == 13:
-                    #     if not os.path.exists("extracted_images"):
-                    #         os.makedirs("extracted_images")
-                    #     image = shape.image
-                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
-                    #     with open(image_filename, 'wb') as img_file:
-                    #         img_file.write(image.blob)
-                    #     content += ('[' + image_filename + ']')
+                        content += shape.text + "\n"
             return content
         except Exception:
             raise
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
+        # —— Lifecycle: start processing the PPTX —— #
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING lifecycle generated")
+
         try:
-
+            extension = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
             mk_content = content
-
-
-
-
+
+            # —— Lifecycle: processing finished —— #
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            logger.debug("⚙️ DATA_PROCESSED lifecycle generated")
+
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
-
-
+
+        except Exception as e:
+            # —— Lifecycle: processing failed —— #
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+
+            raise Exception(
+                {
+                    "error": str(e),
+                    "file_path": file_path,
+                    "lifecycle": [lc_fail.to_dict()],
+                }
+            )
```
datamax/parser/txt_parser.py
CHANGED
```diff
@@ -1,8 +1,10 @@
-import chardet
 from typing import Union
-
-
-
+
+import chardet
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
+
 
 class TxtParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
@@ -12,9 +14,9 @@ class TxtParser(BaseLife):
     @staticmethod
     def detect_encoding(file_path: str):
         try:
-            with open(file_path,
+            with open(file_path, "rb") as f:
                 result = chardet.detect(f.read())
-            return result[
+            return result["encoding"]
         except Exception as e:
             raise e
 
@@ -27,20 +29,49 @@
         """
         try:
             encoding = TxtParser.detect_encoding(file_path)
-            with open(file_path,
+            with open(file_path, "r", encoding=encoding) as file:
                 return file.read()
         except Exception as e:
             raise e
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
        try:
-
-
+            extension = self.get_file_extension(file_path)
+
+            # 1) Start processing
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
+
+            # 2) Read the file content
+            content = self.read_txt_file(file_path=file_path)
             mk_content = content
-
-
-            output_vo = MarkdownOutputVo(
-            output_vo.add_lifecycle(
+
+            # 3) Build the output object and attach the start lifecycle
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+
+            # 4) Processing finished
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            output_vo.add_lifecycle(lc_end)
+
             return output_vo.to_dict()
+
         except Exception as e:
-
+            # 5) Processing failed
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # (Optional) To also return a VO on failure, build one here with empty content and attach lc_fail
+            raise
```
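
`TxtParser` keeps its two-pass approach: sniff the encoding with `chardet` on the raw bytes, then reopen the file with the detected encoding. A condensed sketch of the same idea; the `utf-8` fallback is an assumption for the case where `chardet.detect` returns `{"encoding": None}`:

```python
import chardet


def read_text_autodetect(path: str) -> str:
    # Read raw bytes once, let chardet guess the encoding, then decode.
    with open(path, "rb") as f:
        raw = f.read()
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    return raw.decode(encoding)
```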
datamax/parser/xls_parser.py
CHANGED
```diff
@@ -1,8 +1,10 @@
-from datamax.parser.base import MarkdownOutputVo
-from datamax.parser.base import BaseLife
-import pandas as pd
 import warnings
 
+import pandas as pd
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
+
 warnings.filterwarnings("ignore")
 
 
@@ -15,12 +17,38 @@ class XlsParser(BaseLife):
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
+            # 🏷️ Parsing started
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
+
+            # 📊 Read the Excel file and generate Markdown
             df = pd.read_excel(file_path)
             mk_content = df.to_markdown(index=False)
-
-
+
+            # 🏷️ Parsing finished
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+
             output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-            output_vo.add_lifecycle(
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
+
         except Exception as e:
+            # ❌ Parsing failed
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # Do not return an empty VO here; raise so the framework can catch and report it
            raise e
```
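
The conversion itself in `XlsParser` is a single pandas call. Note that `DataFrame.to_markdown` delegates to the optional `tabulate` package, which must be installed for this parser to work:

```python
import pandas as pd

df = pd.DataFrame({"sheet": ["a", "b"], "rows": [10, 20]})
print(df.to_markdown(index=False))
# Prints a pipe-style Markdown table, roughly:
# | sheet   |   rows |
# |:--------|-------:|
# | a       |     10 |
# | b       |     20 |
```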
datamax/parser/xlsx_parser.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-from loguru import logger
 import multiprocessing
 import os
 import time
@@ -6,8 +5,10 @@ import warnings
 from multiprocessing import Queue
 
 import pandas as pd
+from loguru import logger
 
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 warnings.filterwarnings("ignore")
 
@@ -15,11 +16,10 @@ warnings.filterwarnings("ignore")
 class XlsxParser(BaseLife):
     """XLSX parser - reads with pandas, converts to Markdown, and supports multiprocessing"""
 
-    def __init__(self, file_path
+    def __init__(self, file_path):
         super().__init__()
         self.file_path = file_path
-
-        logger.info(f"🚀 XlsxParser initialized - file path: {file_path}, timeout: {timeout}s")
+        logger.info(f"🚀 XlsxParser initialized - file path: {file_path}")
 
     def _parse_with_pandas(self, file_path: str) -> str:
         """Read the Excel file with pandas and convert it to Markdown"""
@@ -85,7 +85,9 @@ class XlsxParser(BaseLife):
             markdown_content = "*Worksheet is empty*"
             logger.warning("⚠️ Worksheet is empty")
 
-        logger.info(
+        logger.info(
+            f"🎊 pandas conversion finished, markdown content length: {len(markdown_content)} characters"
+        )
         logger.debug(f"👀 Preview of the first 200 characters: {markdown_content[:200]}...")
 
         return markdown_content
@@ -107,6 +109,15 @@ class XlsxParser(BaseLife):
        """Core method that parses the Excel file"""
        logger.info(f"🎬 Start parsing the Excel file: {file_path}")
 
+        # —— Lifecycle: processing started —— #
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING lifecycle generated")
+
        try:
            # Parse the Excel file with pandas
            logger.info("🐼 Parsing Excel in pandas mode")
@@ -119,19 +130,20 @@ class XlsxParser(BaseLife):
 
             logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
 
-            #
-
+            # —— Lifecycle: processing finished —— #
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
                 usage_purpose="Documentation",
-                life_type=
+                life_type=LifeType.DATA_PROCESSED,
             )
-            logger.debug("⚙️
+            logger.debug("⚙️ DATA_PROCESSED lifecycle generated")
 
-            #
-
-            output_vo = MarkdownOutputVo(
-            output_vo.add_lifecycle(
+            # Create the output object and attach both lifecycle entries
+            extension = self.get_file_extension(file_path)
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
 
             result = output_vo.to_dict()
             result_queue.put(result)
@@ -142,15 +154,46 @@ class XlsxParser(BaseLife):
             return result
 
         except Exception as e:
+            # —— Lifecycle: processing failed —— #
+            try:
+                lc_fail = self.generate_lifecycle(
+                    source_file=file_path,
+                    domain="Technology",
+                    usage_purpose="Documentation",
+                    life_type=LifeType.DATA_PROCESS_FAILED,
+                )
+                logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+                # If needed, it can also be attached to error_result:
+                # error_result = {"error": str(e), "file_path": file_path, "lifecycle":[lc_fail.to_dict()]}
+            except Exception:
+                pass
+
+            # —— Lifecycle: processing failed —— #
+            try:
+                lc_fail = self.generate_lifecycle(
+                    source_file=file_path,
+                    domain="Technology",
+                    usage_purpose="Documentation",
+                    life_type=LifeType.DATA_PROCESS_FAILED,
+                )
+                logger.debug("⚙️ DATA_PROCESS_FAILED lifecycle generated")
+            except Exception:
+                pass
+
             logger.error(f"💀 Failed to parse the Excel file: {file_path}, error: {str(e)}")
             # Put the error into the queue as well
-            error_result = {
+            error_result = {
+                "error": str(e),
+                "file_path": file_path,
+                # Also return the failed lifecycle; tests can optionally verify it
+                "lifecycle": [lc_fail.to_dict()] if "lc_fail" in locals() else [],
+            }
             result_queue.put(error_result)
             raise
 
     def parse(self, file_path: str) -> dict:
         """Parse the Excel file - supports multiprocessing and timeout control"""
-        logger.info(f"🚀 Starting the Excel parsing process - file: {file_path}
+        logger.info(f"🚀 Starting the Excel parsing process - file: {file_path}")
 
         try:
             # Verify that the file exists
@@ -169,42 +212,6 @@ class XlsxParser(BaseLife):
             process.start()
             logger.debug(f"⚡ Child process started, PID: {process.pid}")
 
-            start_time = time.time()
-
-            # Wait for parsing to finish or time out
-            while time.time() - start_time < self.timeout:
-                elapsed_time = int(time.time() - start_time)
-                logger.debug(f"⏱️ Waiting for parsing to finish... {elapsed_time}s")
-
-                if not process.is_alive():
-                    logger.debug("✅ Child process finished")
-                    break
-
-                if not result_queue.empty():
-                    result = result_queue.get()
-                    process.join()  # Wait for the process to end normally
-
-                    # Check whether the result is an error
-                    if "error" in result:
-                        logger.error(f"💥 Child process returned an error: {result['error']}")
-                        raise Exception(result["error"])
-
-                    logger.info(f"🎉 Excel parsing completed successfully, took {elapsed_time}s")
-                    return result
-
-                time.sleep(1)
-            else:
-                # Timeout handling
-                logger.error(f"⏰ Parsing timed out ({self.timeout}s), terminating the process")
-                process.terminate()
-                process.join(timeout=5)  # Give the process 5 seconds to exit gracefully
-
-                if process.is_alive():
-                    logger.error("💀 Force-killing the process")
-                    process.kill()
-
-                raise TimeoutError(f"Excel parsing timed out: {file_path}")
-
         except Exception as e:
             logger.error(
                 f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error message: {str(e)}"
```
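
Besides the lifecycle additions, this diff removes the one-second polling loop (and the `timeout` constructor parameter) that previously watched the worker process. A generic sketch, not the package's code, of the same watchdog expressed with `Process.join(timeout=...)` instead of a sleep loop:

```python
import multiprocessing


def run_with_timeout(target, file_path: str, timeout: float) -> dict:
    result_queue: multiprocessing.Queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=target, args=(file_path, result_queue))
    process.start()
    process.join(timeout)           # block until the child finishes or the timeout expires
    if process.is_alive():          # timed out: terminate, then force-kill if needed
        process.terminate()
        process.join(timeout=5)     # allow a graceful exit
        if process.is_alive():
            process.kill()
        raise TimeoutError(f"parse timed out: {file_path}")
    result = result_queue.get()
    if "error" in result:           # the child reported a failure via the queue
        raise Exception(result["error"])
    return result
```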
datamax/utils/__init__.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from datamax.utils.data_cleaner import (
 )
 from datamax.utils.env_setup import setup_environment
 
-#
+# Conditionally import the UNO processor
 try:
     from datamax.utils.uno_handler import (
         HAS_UNO,
@@ -23,6 +23,7 @@ except ImportError:
     cleanup_uno_manager = None
     uno_manager_context = None
 
+
 def clean_original_text(text):
     """
     Clean the original text.
```
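
The newly labeled block is the optional-dependency pattern: the UNO imports sit inside a `try`, and callers feature-test `HAS_UNO` instead of importing `uno_handler` directly. A sketch of a guarded call site; `convert_document` is a hypothetical consumer, only `HAS_UNO` comes from the diff:

```python
try:
    from datamax.utils.uno_handler import HAS_UNO
except ImportError:
    HAS_UNO = False


def convert_document(path: str):
    # Hypothetical caller: fail fast when the optional UNO stack is absent.
    if not HAS_UNO:
        raise RuntimeError("UNO support unavailable; install LibreOffice/python-uno")
    ...  # delegate to the uno_handler-based conversion
```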
datamax/utils/data_cleaner.py
CHANGED
```diff
@@ -31,15 +31,15 @@ class AbnormalCleaner:
         """
         Extract reference entries and assign to self.parsed_data
         (Original text will be replaced with extracted references, each item on a separate line)
-
+
         Returns:
             str: Extracted reference text (same as self.parsed_data)
         """
         patterns = [
-            r
-            r
-            r
-            r
+            r"([A-Z][a-z]+(?:, [A-Z](?:\.[a-z]*)?)+(?: et al\.)? \(\d{4}\)[^\n]+)",  # APA format
+            r"(\[\d+\][^\n]+)",  # Numbered references like [1]
+            r"(DOI:\s?\S+|https?://\S+)",  # DOI/URL
+            r"([A-Z][a-z]+, [A-Z]\.?,? & [A-Z][a-z]+, [A-Z]\. \d{4}[^\n]+)",  # Multi-author APA
         ]
         references = []
         for pattern in patterns:
@@ -47,9 +47,11 @@
                 references.extend(re.findall(pattern, self.parsed_data))
             except re.error as e:
                 print(f"Regex error {pattern}: {e}")
-
+
         # Assign extraction results to parsed_data (each item on a separate line)
-        self.parsed_data = "\n".join(
+        self.parsed_data = "\n".join(
+            list(set(references))
+        )  # Deduplicate and merge into string
         return self.parsed_data
 
 # Exception cleaning class
@@ -164,19 +166,19 @@ class TextFilter:
         """Filter by word repetition rate"""
         if not isinstance(self.parsed_data, str):
             return False
-
+
         text = str(self.parsed_data)
-        bi_grams = [text[i:i+2] for i in range(0, len(text)-1, 2)]
+        bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
         word_count = len(bi_grams)
         if word_count == 0:
             print("No words found.")
             return False
-
+
         word_freq = Counter(bi_grams)
         most_common_word, most_common_count = word_freq.most_common(1)[0]
         repetition_rate = most_common_count / word_count
         print(f"Word repetition rate: {repetition_rate}")
-
+
         return repetition_rate <= threshold
 
     def filter_by_char_count(self, min_chars=30, max_chars=500000):
@@ -227,22 +229,34 @@ class PrivacyDesensitization:
         # Customer service hotlines are not easy to match and are not considered private data
         self.parsed_data = re.sub(r"\d+-\d+-\d+", token, self.parsed_data)
         return self.parsed_data
-
+
     def replace_bank_id(self, token="COSCO_NUMBER"):
         # Match bank card numbers and replace
-
-
-        )
+        BANK_ID_PATTERN = r"\b(?:(?:\d{4}[ -]?){4}\d{3}|(?:\d{4}[ -]?){3}\d{4}|(?:4\d{3}|5[1-5]\d{2}|6[045]\d{2})(?:[ -]?\d{4}){3}|3[47]\d{2}[ -]?\d{6}[ -]?\d{5})\b"
+
+        def luhn_check(card_number):
+            digits = [int(d) for d in card_number if d.isdigit()]
+            if len(digits) not in (13, 15, 16, 19):
+                return False
+            checksum = sum(digits[-1::-2])
+            checksum += sum(sum(divmod(d * 2, 10)) for d in digits[-2::-2])
+            return checksum % 10 == 0
+
+        bank_card_numbers = re.findall(BANK_ID_PATTERN, self.parsed_data)
+
+        for card_number in bank_card_numbers:
+            if luhn_check(card_number):
+                self.parsed_data = re.sub(card_number, token, self.parsed_data)
         return self.parsed_data
-
+
     def replace_phone_number(self, token="COSCO_NUMBER"):
         # Match phone numbers and replace
         self.parsed_data = jio.replace_phone_number(self.parsed_data, token)
         return self.parsed_data
-
+
     def replace_qq(self, token="COSCO_NUMBER"):
         # Match QQ numbers and replace
-        self.parsed_data = jio.replace_qq(self.parsed_data,token)
+        self.parsed_data = jio.replace_qq(self.parsed_data, token)
         return self.parsed_data
 
     def replace_id_card(self, token="COSCO_NUMBER"):
@@ -252,6 +266,10 @@ class PrivacyDesensitization:
 
     def replace_number(self):
         # Replace all types of numeric private data
+        # Bank card
+        self.parsed_data = self.replace_bank_id(
+            token="BANK_ID"
+        )  # nosec B106 - this is a data-masking token, not a password
 
         # Landline + mobile phone
         self.parsed_data = jio.replace_phone_number(self.parsed_data, "COSCO_NUMBER")
@@ -259,10 +277,6 @@ class PrivacyDesensitization:
         self.parsed_data = jio.replace_qq(self.parsed_data, "COSCO_NUMBER")
         # ID card
         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
-        # Bank card
-        self.parsed_data = self.replace_bank_id(
-            self.parsed_data, token="COSCO_NUMBER"
-        )  # nosec B106 - this is a data-masking token, not a password
 
         return self.parsed_data
 
```
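
The rewritten `replace_bank_id` no longer masks every regex hit: a candidate must also pass a Luhn checksum, which filters out arbitrary digit runs that merely look like card numbers. A worked example of the same check on the canonical Visa test number:

```python
def luhn_check(card_number: str) -> bool:
    digits = [int(d) for d in card_number if d.isdigit()]
    if len(digits) not in (13, 15, 16, 19):  # common card lengths
        return False
    checksum = sum(digits[-1::-2])  # digits in odd positions from the right
    # Double every second digit from the right; divmod sums the two digits of d*2
    checksum += sum(sum(divmod(d * 2, 10)) for d in digits[-2::-2])
    return checksum % 10 == 0


print(luhn_check("4111 1111 1111 1111"))  # True  -> would be masked as BANK_ID
print(luhn_check("4111 1111 1111 1112"))  # False -> left untouched
```

Because the doubling scheme changes the checksum for any single-digit alteration, a one-digit typo in an otherwise well-formed number fails validation, which keeps false positives low.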
datamax/utils/env_setup.py
CHANGED
```diff
@@ -1,11 +1,12 @@
+import importlib.metadata
+import os
 import subprocess
 import sys
-
-import importlib.metadata
+
 
 class EnvironmentSetup:
-    """
-
+    """Responsible for setting up the correct environment,
+    including checking GPU support and installing the necessary packages
     """
 
     def __init__(self, use_gpu: bool = False):
@@ -18,36 +19,40 @@ class EnvironmentSetup:
         if self._gpu_available is None:
             try:
                 # Check whether CUDA is available
-                subprocess.check_output([
+                subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
                 self._gpu_available = True
             except (subprocess.CalledProcessError, FileNotFoundError):
                 self._gpu_available = False
         return self._gpu_available
 
     def is_conda(self):
-        """
-        return os.path.exists(os.path.join(sys.prefix,
+        """Check whether the current environment is a Conda environment"""
+        return os.path.exists(os.path.join(sys.prefix, "conda-meta"))
 
     def install_package(self, package_name):
-        """
-        installer =
-        if installer ==
+        """Select pip or conda or other installation specified package according to the environment"""
+        installer = "conda" if self.is_conda() else "pip"
+        if installer == "conda":
             print(f"Detected Conda environment. Installing {package_name} with conda.")
             try:
-                subprocess.check_call([
+                subprocess.check_call(["pip", "install", package_name])
                 print(f"Successfully installed {package_name} with conda.")
             except subprocess.CalledProcessError as e:
                 print(f"Failed to install {package_name} with conda: {e}")
-        elif installer ==
+        elif installer == "pip":
             print(f"Using pip to install {package_name}.")
             try:
                 # Invoke the pip installation package using the Python interpreter
-                subprocess.check_call(
+                subprocess.check_call(
+                    [sys.executable, "-m", "pip", "install", package_name]
+                )
                 print(f"Successfully installed {package_name} with pip.")
             except subprocess.CalledProcessError as e:
                 print(f"Failed to install {package_name} with pip: {e}")
         else:
-            print(
+            print(
+                "Unable to determine the package manager. Please install the package manually."
+            )
 
     def check_and_install(self):
         """Check and install appropriate packages based on user's choice and GPU availability"""
@@ -56,12 +61,14 @@ class EnvironmentSetup:
 
         # Override GPU detection with the use_gpu parameter
         if self.use_gpu:
-            pkg_name =
+            pkg_name = "paddlepaddle-gpu" if self.is_gpu_available() else "paddlepaddle"
         else:
-            pkg_name =
+            pkg_name = "paddlepaddle"
 
         try:
-            _ = importlib.metadata.version(
+            _ = importlib.metadata.version(
+                pkg_name.split()[0]
+            )  # Check if paddlepaddle is installed
             # print(f"{pkg_name} version {1} is already installed.")
         except importlib.metadata.PackageNotFoundError:
             print(f"{pkg_name} is not installed. Installing now...")
@@ -77,4 +84,4 @@ env_setup = EnvironmentSetup()  # Set this flag as needed
 def setup_environment(use_gpu: bool = False):
     """Used to set the environment when the program starts"""
     env_setup.use_gpu = use_gpu
-    env_setup.check_and_install()
\ No newline at end of file
+    env_setup.check_and_install()
```
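
Entry-point usage is unchanged: `setup_environment` sets the flag and runs the check-and-install step, choosing `paddlepaddle-gpu` only when `use_gpu=True` and `nvcc` is detected, and falling back to the CPU build otherwise:

```python
from datamax.utils.env_setup import setup_environment

# Install/verify the appropriate paddlepaddle build at program startup.
setup_environment(use_gpu=True)
```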