pydatamax 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +2 -2
- datamax/parser/doc_parser.py +60 -52
- datamax/parser/docx_parser.py +70 -58
- datamax/parser/xlsx_parser.py +53 -46
- pydatamax-0.1.14.dist-info/METADATA +228 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/RECORD +11 -11
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- /datamax/loader/{MinioHandler.py → minio_handler.py} +0 -0
- /datamax/loader/{OssHandler.py → oss_handler.py} +0 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/top_level.txt +0 -0
datamax/loader/core.py
CHANGED
@@ -1,7 +1,7 @@
import os
from typing import List
-from datamax.loader.MinioHandler import MinIOClient
-from datamax.loader.OssHandler import OssClient
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient


class DataLoader:
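The handler modules move to PEP 8 snake_case names; the identical sha256 entries in the RECORD diff further below confirm a pure rename with no content change. Downstream code that imports the handlers directly only needs the new module paths — a minimal before/after sketch (class names are unchanged):

```python
# pydatamax <= 0.1.13 (CamelCase module names):
# from datamax.loader.MinioHandler import MinIOClient
# from datamax.loader.OssHandler import OssClient

# pydatamax >= 0.1.14 (snake_case module names; class names are the same):
from datamax.loader.minio_handler import MinIOClient
from datamax.loader.oss_handler import OssClient
```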
datamax/parser/doc_parser.py
CHANGED
@@ -1,14 +1,14 @@
+import logging
import os
import shutil
import subprocess
import tempfile
-import chardet
-import logging
from pathlib import Path
from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo

+import chardet
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo

# 配置日志
logger = logging.getLogger(__name__)
@@ -24,37 +24,41 @@ class DocParser(BaseLife):
    def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
        """将.doc文件转换为.txt文件"""
        logger.info(f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}")
-
+
        try:
            cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
            logger.debug(f"⚡ 执行转换命令: {cmd}")
-
-            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            process = subprocess.Popen(
+                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
            stdout, stderr = process.communicate()
            exit_code = process.returncode
-
+
            if exit_code == 0:
                logger.info(f"✅ DOC到TXT转换成功 - 退出码: {exit_code}")
                if stdout:
                    logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
            else:
-                encoding = chardet.detect(stderr)['encoding']
+                encoding = chardet.detect(stderr)["encoding"]
                if encoding is None:
-                    encoding = 'utf-8'
-                error_msg = stderr.decode(encoding, errors='replace')
+                    encoding = "utf-8"
+                error_msg = stderr.decode(encoding, errors="replace")
                logger.error(f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
-                raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
-
+                raise Exception(
+                    f"Error Output (detected encoding: {encoding}): {error_msg}"
+                )
+
            fname = str(Path(doc_path).stem)
-            txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+            txt_path = os.path.join(dir_path, f"{fname}.txt")
+
            if not os.path.exists(txt_path):
                logger.error(f"❌ 转换后的TXT文件不存在: {txt_path}")
                raise Exception(f"文件转换失败 {doc_path} ==> {txt_path}")
            else:
                logger.info(f"🎉 TXT文件转换成功,文件路径: {txt_path}")
                return txt_path
-
+
        except subprocess.SubprocessError as e:
            logger.error(f"💥 subprocess执行失败: {str(e)}")
            raise Exception(f"执行转换命令时发生错误: {str(e)}")
@@ -65,25 +69,25 @@ class DocParser(BaseLife):
    def read_txt_file(self, txt_path: str) -> str:
        """读取txt文件内容"""
        logger.info(f"📖 开始读取TXT文件: {txt_path}")
-
+
        try:
            # 检测文件编码
-            with open(txt_path, 'rb') as f:
+            with open(txt_path, "rb") as f:
                raw_data = f.read()
-                encoding = chardet.detect(raw_data)['encoding']
+                encoding = chardet.detect(raw_data)["encoding"]
                if encoding is None:
-                    encoding = 'utf-8'
+                    encoding = "utf-8"
                logger.debug(f"🔍 检测到文件编码: {encoding}")
-
+
            # 读取文件内容
-            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                content = f.read()
-
+
            logger.info(f"📄 TXT文件读取完成 - 内容长度: {len(content)} 字符")
            logger.debug(f"👀 前100字符预览: {content[:100]}...")
-
+
            return content
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 TXT文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {txt_path}")
@@ -94,27 +98,27 @@ class DocParser(BaseLife):
    def read_doc_file(self, doc_path: str) -> str:
        """读取doc文件并转换为文本"""
        logger.info(f"📖 开始读取DOC文件 - 文件: {doc_path}")
-
+
        try:
            with tempfile.TemporaryDirectory() as temp_path:
                logger.debug(f"📁 创建临时目录: {temp_path}")
-
+
                temp_dir = Path(temp_path)
-
+
                file_path = temp_dir / "tmp.doc"
                shutil.copy(doc_path, file_path)
                logger.debug(f"📋 复制文件到临时目录: {doc_path} -> {file_path}")
-
+
                # 转换DOC为TXT
                txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
                logger.info(f"🎯 DOC转TXT完成: {txt_file_path}")
-
+
                # 读取TXT文件内容
                content = self.read_txt_file(txt_file_path)
                logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")
-
+
                return content
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {doc_path}")
@@ -128,24 +132,24 @@ class DocParser(BaseLife):
    def parse(self, file_path: str):
        """解析DOC文件"""
        logger.info(f"🎬 开始解析DOC文件: {file_path}")
-
+
        try:
            # 验证文件存在
            if not os.path.exists(file_path):
                logger.error(f"🚫 文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
-
+
            # 验证文件大小
            file_size = os.path.getsize(file_path)
            logger.info(f"📏 文件大小: {file_size} 字节")
-
+
            title = self.get_file_extension(file_path)
            logger.debug(f"🏷️ 提取文件标题: {title}")
-
+
            # 使用soffice转换为txt后读取内容
            logger.info("📝 使用soffice转换DOC为TXT并读取内容")
            content = self.read_doc_file(doc_path=file_path)
-
+
            # 根据to_markdown参数决定是否保持原格式还是处理为markdown格式
            if self.to_markdown:
                # 简单的文本到markdown转换(保持段落结构)
@@ -154,22 +158,26 @@
            else:
                mk_content = content
                logger.info("📝 保持原始文本格式")
-
+
            logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
-
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
            logger.debug("⚙️ 生成lifecycle信息完成")
-
+
            output_vo = MarkdownOutputVo(title, mk_content)
            output_vo.add_lifecycle(lifecycle)
-
+
            result = output_vo.to_dict()
            logger.info(f"🏆 DOC文件解析完成: {file_path}")
            logger.debug(f"🔑 返回结果键: {list(result.keys())}")
-
+
            return result
-
+
        except Exception as e:
            logger.error(f"💀 解析DOC文件失败: {file_path}, 错误: {str(e)}")
            raise
@@ -178,18 +186,18 @@ class DocParser(BaseLife):
        """将纯文本格式化为简单的markdown格式"""
        if not content.strip():
            return content
-
-        lines = content.split('\n')
+
+        lines = content.split("\n")
        formatted_lines = []
-
+
        for line in lines:
            line = line.strip()
            if not line:
-                formatted_lines.append('')
+                formatted_lines.append("")
                continue
-
+
            # 简单的markdown格式化规则
            # 可以根据需要扩展更多规则
            formatted_lines.append(line)
-
-        return '\n'.join(formatted_lines)
+
+        return "\n".join(formatted_lines)
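Nearly all of the churn above is mechanical formatter cleanup: double-quoted strings, long calls wrapped, trailing whitespace stripped, and imports regrouped; the conversion logic itself is untouched. For reference, here is a condensed, self-contained sketch of the pattern DocParser is built on — shell out to headless LibreOffice, then use chardet to pick a safe encoding before decoding stderr. Names are simplified for illustration; this is not the library's public API:

```python
import subprocess
from pathlib import Path

import chardet


def office_convert_to_txt(src_path: str, out_dir: str) -> str:
    """Convert a .doc/.docx file to .txt via headless LibreOffice."""
    cmd = f'soffice --headless --convert-to txt "{src_path}" --outdir "{out_dir}"'
    process = subprocess.Popen(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        # stderr encoding is not guaranteed, so detect it before decoding
        encoding = chardet.detect(stderr)["encoding"] or "utf-8"
        raise Exception(
            f"Error Output (detected encoding: {encoding}): "
            f"{stderr.decode(encoding, errors='replace')}"
        )
    # soffice writes <stem>.txt into the output directory
    txt_path = Path(out_dir) / f"{Path(src_path).stem}.txt"
    if not txt_path.exists():
        raise Exception(f"Conversion failed: {src_path} ==> {txt_path}")
    return str(txt_path)
```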
datamax/parser/docx_parser.py
CHANGED
@@ -1,14 +1,14 @@
+import logging
import os
import shutil
import subprocess
import tempfile
-import chardet
-import logging
from pathlib import Path
from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo

+import chardet
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo

# 配置日志
logger = logging.getLogger(__name__)
@@ -19,42 +19,48 @@ class DocxParser(BaseLife):
        super().__init__()
        self.file_path = file_path
        self.to_markdown = to_markdown
-        logger.info(f"🚀 DocxParser初始化完成 - 文件路径: {file_path}, 转换为markdown: {to_markdown}")
+        logger.info(
+            f"🚀 DocxParser初始化完成 - 文件路径: {file_path}, 转换为markdown: {to_markdown}"
+        )

    def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
        """将.docx文件转换为.txt文件"""
        logger.info(f"🔄 开始转换DOCX文件为TXT - 源文件: {docx_path}, 输出目录: {dir_path}")
-
+
        try:
            cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
            logger.debug(f"⚡ 执行转换命令: {cmd}")
-
-            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            process = subprocess.Popen(
+                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
            stdout, stderr = process.communicate()
            exit_code = process.returncode
-
+
            if exit_code == 0:
                logger.info(f"✅ DOCX到TXT转换成功 - 退出码: {exit_code}")
                if stdout:
                    logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
            else:
-                encoding = chardet.detect(stderr)['encoding']
+                encoding = chardet.detect(stderr)["encoding"]
                if encoding is None:
-                    encoding = 'utf-8'
-                error_msg = stderr.decode(encoding, errors='replace')
+                    encoding = "utf-8"
+                error_msg = stderr.decode(encoding, errors="replace")
                logger.error(f"❌ DOCX到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
-                raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
-
+                raise Exception(
+                    f"Error Output (detected encoding: {encoding}): {error_msg}"
+                )
+
            fname = str(Path(docx_path).stem)
-            txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+            txt_path = os.path.join(dir_path, f"{fname}.txt")
+
            if not os.path.exists(txt_path):
                logger.error(f"❌ 转换后的TXT文件不存在: {txt_path}")
                raise Exception(f"文件转换失败 {docx_path} ==> {txt_path}")
            else:
                logger.info(f"🎉 TXT文件转换成功,文件路径: {txt_path}")
                return txt_path
-
+
        except subprocess.SubprocessError as e:
            logger.error(f"💥 subprocess执行失败: {str(e)}")
            raise Exception(f"执行转换命令时发生错误: {str(e)}")
@@ -65,25 +71,25 @@ class DocxParser(BaseLife):
    def read_txt_file(self, txt_path: str) -> str:
        """读取txt文件内容"""
        logger.info(f"📖 开始读取TXT文件: {txt_path}")
-
+
        try:
            # 检测文件编码
-            with open(txt_path, 'rb') as f:
+            with open(txt_path, "rb") as f:
                raw_data = f.read()
-                encoding = chardet.detect(raw_data)['encoding']
+                encoding = chardet.detect(raw_data)["encoding"]
                if encoding is None:
-                    encoding = 'utf-8'
+                    encoding = "utf-8"
                logger.debug(f"🔍 检测到文件编码: {encoding}")
-
+
            # 读取文件内容
-            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                content = f.read()
-
+
            logger.info(f"📄 TXT文件读取完成 - 内容长度: {len(content)} 字符")
            logger.debug(f"👀 前100字符预览: {content[:100]}...")
-
+
            return content
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 TXT文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {txt_path}")
@@ -94,27 +100,27 @@ class DocxParser(BaseLife):
    def read_docx_file(self, docx_path: str) -> str:
        """读取docx文件并转换为文本"""
        logger.info(f"📖 开始读取DOCX文件 - 文件: {docx_path}")
-
+
        try:
            with tempfile.TemporaryDirectory() as temp_path:
                logger.debug(f"📁 创建临时目录: {temp_path}")
-
+
                temp_dir = Path(temp_path)
-
+
                file_path = temp_dir / "tmp.docx"
                shutil.copy(docx_path, file_path)
                logger.debug(f"📋 复制文件到临时目录: {docx_path} -> {file_path}")
-
+
                # 转换DOCX为TXT
                txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
                logger.info(f"🎯 DOCX转TXT完成: {txt_file_path}")
-
+
                # 读取TXT文件内容
                content = self.read_txt_file(txt_file_path)
                logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")
-
+
                return content
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {docx_path}")
@@ -128,31 +134,31 @@ class DocxParser(BaseLife):
    def parse(self, file_path: str):
        """解析DOCX文件"""
        logger.info(f"🎬 开始解析DOCX文件: {file_path}")
-
+
        try:
            # 验证文件存在
            if not os.path.exists(file_path):
                logger.error(f"🚫 文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
-
+
            # 验证文件扩展名
-            if not file_path.lower().endswith('.docx'):
+            if not file_path.lower().endswith(".docx"):
                logger.warning(f"⚠️ 文件扩展名不是.docx: {file_path}")
-
+
            # 验证文件大小
            file_size = os.path.getsize(file_path)
            logger.info(f"📏 文件大小: {file_size} 字节")
-
+
            if file_size == 0:
                logger.warning(f"⚠️ 文件大小为0字节: {file_path}")
-
+
            title = self.get_file_extension(file_path)
            logger.debug(f"🏷️ 提取文件标题: {title}")
-
+
            # 使用soffice转换为txt后读取内容
            logger.info("📝 使用soffice转换DOCX为TXT并读取内容")
            content = self.read_docx_file(docx_path=file_path)
-
+
            # 根据to_markdown参数决定是否保持原格式还是处理为markdown格式
            if self.to_markdown:
                # 简单的文本到markdown转换(保持段落结构)
@@ -161,26 +167,30 @@ class DocxParser(BaseLife):
            else:
                mk_content = content
                logger.info("📝 保持原始文本格式")
-
+
            logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
-
+
            # 检查内容是否为空
            if not mk_content.strip():
                logger.warning(f"⚠️ 解析出的内容为空: {file_path}")
-
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
            logger.debug("⚙️ 生成lifecycle信息完成")
-
+
            output_vo = MarkdownOutputVo(title, mk_content)
            output_vo.add_lifecycle(lifecycle)
-
+
            result = output_vo.to_dict()
            logger.info(f"🏆 DOCX文件解析完成: {file_path}")
            logger.debug(f"🔑 返回结果键: {list(result.keys())}")
-
+
            return result
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 文件不存在错误: {str(e)}")
            raise
@@ -188,25 +198,27 @@ class DocxParser(BaseLife):
            logger.error(f"🔒 文件权限错误: {str(e)}")
            raise Exception(f"无权限访问文件: {file_path}")
        except Exception as e:
-            logger.error(f"💀 解析DOCX文件失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}")
+            logger.error(
+                f"💀 解析DOCX文件失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}"
+            )
            raise

    def format_as_markdown(self, content: str) -> str:
        """将纯文本格式化为简单的markdown格式"""
        if not content.strip():
            return content
-
-        lines = content.split('\n')
+
+        lines = content.split("\n")
        formatted_lines = []
-
+
        for line in lines:
            line = line.strip()
            if not line:
-                formatted_lines.append('')
+                formatted_lines.append("")
                continue
-
+
            # 简单的markdown格式化规则
            # 可以根据需要扩展更多规则
            formatted_lines.append(line)
-
-        return '\n'.join(formatted_lines)
+
+        return "\n".join(formatted_lines)
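DocxParser receives the same formatting pass, plus an empty-content warning in parse(). The public surface is unchanged, so existing callers keep working; a minimal usage sketch based on the constructor and parse() signature visible in this diff (the file name is a placeholder):

```python
from datamax.parser.docx_parser import DocxParser

parser = DocxParser(file_path="report.docx", to_markdown=True)
result = parser.parse(file_path="report.docx")

# parse() returns MarkdownOutputVo.to_dict(): the markdown content plus
# the generated lifecycle metadata (domain, usage_purpose, life_type).
print(result.keys())
```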
datamax/parser/xlsx_parser.py
CHANGED
@@ -1,12 +1,13 @@
-import multiprocessing
-import time
import logging
+import multiprocessing
import os
+import time
+import warnings
from multiprocessing import Queue
-
-from datamax.parser.base import BaseLife
+
import pandas as pd
-
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo

warnings.filterwarnings("ignore")

@@ -26,42 +27,44 @@ class XlsxParser(BaseLife):
    def _parse_with_pandas(self, file_path: str) -> str:
        """使用pandas读取Excel并转换为markdown"""
        logger.info(f"🐼 开始使用pandas读取Excel文件: {file_path}")
-
+
        try:
            # 验证文件存在
            if not os.path.exists(file_path):
                logger.error(f"🚫 Excel文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
-
+
            # 验证文件大小
            file_size = os.path.getsize(file_path)
            logger.info(f"📏 文件大小: {file_size} 字节")
-
+
            if file_size == 0:
                logger.warning(f"⚠️ 文件大小为0字节: {file_path}")
                return "*文件为空*"
-
+
            # 使用pandas读取Excel文件
            logger.debug("📊 正在读取Excel数据...")
            df = pd.read_excel(file_path, sheet_name=None)  # 读取所有工作表
-
+
            markdown_content = ""
-
+
            if isinstance(df, dict):
                # 多个工作表
                logger.info(f"📑 检测到多个工作表,共 {len(df)} 个")
                for sheet_name, sheet_df in df.items():
                    logger.debug(f"📋 处理工作表: {sheet_name}, 形状: {sheet_df.shape}")
                    markdown_content += f"## 工作表: {sheet_name}\n\n"
-
+
                    if not sheet_df.empty:
                        # 清理数据:移除完全为空的行和列
-                        sheet_df = sheet_df.dropna(how='all').dropna(axis=1, how='all')
-
+                        sheet_df = sheet_df.dropna(how="all").dropna(axis=1, how="all")
+
                        if not sheet_df.empty:
                            sheet_markdown = sheet_df.to_markdown(index=False)
                            markdown_content += sheet_markdown + "\n\n"
-                            logger.debug(f"✅ 工作表 {sheet_name} 转换完成,有效数据形状: {sheet_df.shape}")
+                            logger.debug(
+                                f"✅ 工作表 {sheet_name} 转换完成,有效数据形状: {sheet_df.shape}"
+                            )
                        else:
                            markdown_content += "*该工作表无有效数据*\n\n"
                            logger.warning(f"⚠️ 工作表 {sheet_name} 清理后无有效数据")
@@ -73,8 +76,8 @@ class XlsxParser(BaseLife):
                logger.info(f"📄 单个工作表,形状: {df.shape}")
                if not df.empty:
                    # 清理数据:移除完全为空的行和列
-                    df = df.dropna(how='all').dropna(axis=1, how='all')
-
+                    df = df.dropna(how="all").dropna(axis=1, how="all")
+
                    if not df.empty:
                        markdown_content = df.to_markdown(index=False)
                        logger.info(f"✅ 工作表转换完成,有效数据形状: {df.shape}")
@@ -84,12 +87,12 @@ class XlsxParser(BaseLife):
            else:
                markdown_content = "*工作表为空*"
                logger.warning("⚠️ 工作表为空")
-
+
            logger.info(f"🎊 pandas转换完成,markdown内容长度: {len(markdown_content)} 字符")
            logger.debug(f"👀 前200字符预览: {markdown_content[:200]}...")
-
+
            return markdown_content
-
+
        except FileNotFoundError as e:
            logger.error(f"🚫 文件未找到: {str(e)}")
            raise
@@ -106,41 +109,41 @@ class XlsxParser(BaseLife):
    def _parse(self, file_path: str, result_queue: Queue) -> dict:
        """解析Excel文件的核心方法"""
        logger.info(f"🎬 开始解析Excel文件: {file_path}")
-
+
        try:
            # 使用pandas解析Excel
            logger.info("🐼 使用pandas模式解析Excel")
            mk_content = self._parse_with_pandas(file_path)
-
+
            # 检查内容是否为空
            if not mk_content.strip():
                logger.warning(f"⚠️ 解析出的内容为空: {file_path}")
                mk_content = "*无法解析文件内容*"
-
+
            logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
-
+
            # 生成lifecycle信息
            lifecycle = self.generate_lifecycle(
                source_file=file_path,
                domain="Technology",
                usage_purpose="Documentation",
-                life_type="LLM_ORIGIN"
+                life_type="LLM_ORIGIN",
            )
            logger.debug("⚙️ 生成lifecycle信息完成")
-
+
            # 创建输出对象
            title = self.get_file_extension(file_path)
            output_vo = MarkdownOutputVo(title, mk_content)
            output_vo.add_lifecycle(lifecycle)
-
+
            result = output_vo.to_dict()
            result_queue.put(result)
            logger.info(f"🏆 Excel文件解析完成: {file_path}")
            logger.debug(f"🔑 返回结果键: {list(result.keys())}")
-
+
            time.sleep(0.5)  # 给队列一点时间
            return result
-
+
        except Exception as e:
            logger.error(f"💀 解析Excel文件失败: {file_path}, 错误: {str(e)}")
            # 将错误也放入队列
@@ -151,58 +154,62 @@ class XlsxParser(BaseLife):
    def parse(self, file_path: str) -> dict:
        """解析Excel文件 - 支持多进程和超时控制"""
        logger.info(f"🚀 启动Excel解析进程 - 文件: {file_path}, 超时: {self.timeout}s")
-
+
        try:
            # 验证文件存在
            if not os.path.exists(file_path):
                logger.error(f"🚫 文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
-
+
            # 验证文件扩展名
-            if not file_path.lower().endswith(('.xlsx', '.xls')):
+            if not file_path.lower().endswith((".xlsx", ".xls")):
                logger.warning(f"⚠️ 文件扩展名不是Excel格式: {file_path}")
-
+
            result_queue = Queue()
-            process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+            process = multiprocessing.Process(
+                target=self._parse, args=(file_path, result_queue)
+            )
            process.start()
            logger.debug(f"⚡ 启动子进程,PID: {process.pid}")
-
+
            start_time = time.time()
-
+
            # 等待解析完成或超时
            while time.time() - start_time < self.timeout:
                elapsed_time = int(time.time() - start_time)
                logger.debug(f"⏱️ 等待解析完成... {elapsed_time}s")
-
+
                if not process.is_alive():
                    logger.debug("✅ 子进程已完成")
                    break
-
+
                if not result_queue.empty():
                    result = result_queue.get()
                    process.join()  # 等待进程正常结束
-
+
                    # 检查是否是错误结果
                    if "error" in result:
                        logger.error(f"💥 子进程返回错误: {result['error']}")
                        raise Exception(result["error"])
-
+
                    logger.info(f"🎉 Excel解析成功完成,耗时: {elapsed_time}s")
                    return result
-
+
                time.sleep(1)
            else:
                # 超时处理
                logger.error(f"⏰ 解析超时 ({self.timeout}s),终止进程")
                process.terminate()
                process.join(timeout=5)  # 给进程5秒时间优雅退出
-
+
                if process.is_alive():
                    logger.error("💀 强制杀死进程")
                    process.kill()
-
+
                raise TimeoutError(f"Excel解析超时: {file_path}")
-
+
        except Exception as e:
-            logger.error(f"💀 Excel解析失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}")
-
+            logger.error(
+                f"💀 Excel解析失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}"
+            )
+            raise
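Two functional details stand out amid the formatting changes: MarkdownOutputVo is now imported explicitly alongside BaseLife, and the final except block gains a re-raise instead of only logging. The surrounding technique — run the parser in a child process, poll a Queue, and escalate from terminate() to kill() on timeout — generalizes; a stripped-down sketch, assuming a worker(path, queue) callable that puts either a result dict or {"error": ...} (names here are hypothetical, not the library's API):

```python
import multiprocessing
import time
from multiprocessing import Queue


def run_with_timeout(worker, path: str, timeout: float) -> dict:
    """Run worker(path, queue) in a child process and kill it if it overruns."""
    queue: Queue = Queue()
    process = multiprocessing.Process(target=worker, args=(path, queue))
    process.start()

    deadline = time.time() + timeout
    while time.time() < deadline:
        if not queue.empty():
            result = queue.get()
            process.join()  # let the child exit cleanly
            if "error" in result:
                raise Exception(result["error"])
            return result
        if not process.is_alive():
            break  # child exited; drain the queue below
        time.sleep(1)

    # The child may have finished between polls
    if not queue.empty():
        result = queue.get()
        if "error" in result:
            raise Exception(result["error"])
        return result

    process.terminate()
    process.join(timeout=5)  # give it five seconds to exit gracefully
    if process.is_alive():
        process.kill()  # last resort if terminate() is ignored
    raise TimeoutError(f"parse timed out after {timeout}s: {path}")
```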
pydatamax-0.1.14.dist-info/METADATA
ADDED
@@ -0,0 +1,228 @@
+Metadata-Version: 2.4
+Name: pydatamax
+Version: 0.1.14
+Summary: A library for parsing and converting various file formats.
+Home-page: https://github.com/Hi-Dolphin/datamax
+Author: ccy
+Author-email: cy.kron@foxmail.com
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: oss2<3.0.0,>=2.19.1
+Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+Requires-Dist: crcmod<2.0.0,>=1.7
+Requires-Dist: langdetect<2.0.0,>=1.0.9
+Requires-Dist: loguru<1.0.0,>=0.7.3
+Requires-Dist: python-docx<2.0.0,>=1.1.2
+Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+Requires-Dist: pymupdf<2.0.0,>=1.26.0
+Requires-Dist: pypdf<6.0.0,>=5.5.0
+Requires-Dist: openpyxl<4.0.0,>=3.1.5
+Requires-Dist: pandas<3.0.0,>=2.2.3
+Requires-Dist: numpy<3.0.0,>=2.2.6
+Requires-Dist: requests<3.0.0,>=2.32.3
+Requires-Dist: tqdm<5.0.0,>=4.67.1
+Requires-Dist: pydantic<3.0.0,>=2.11.5
+Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+Requires-Dist: python-magic<1.0.0,>=0.4.27
+Requires-Dist: PyYAML<7.0.0,>=6.0.2
+Requires-Dist: Pillow<12.0.0,>=11.2.1
+Requires-Dist: packaging<25.0,>=24.2
+Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+Requires-Dist: minio<8.0.0,>=7.2.15
+Requires-Dist: openai<2.0.0,>=1.82.0
+Requires-Dist: jionlp<2.0.0,>=1.5.23
+Requires-Dist: chardet<6.0.0,>=5.2.0
+Requires-Dist: python-pptx<2.0.0,>=1.0.2
+Requires-Dist: tiktoken<1.0.0,>=0.9.0
+Requires-Dist: markitdown<1.0.0,>=0.1.1
+Requires-Dist: xlrd<3.0.0,>=2.0.1
+Requires-Dist: tabulate<1.0.0,>=0.9.0
+Requires-Dist: unstructured<1.0.0,>=0.17.2
+Requires-Dist: markdown<4.0.0,>=3.8
+Requires-Dist: langchain<1.0.0,>=0.3.0
+Requires-Dist: langchain-community<1.0.0,>=0.3.0
+Requires-Dist: ebooklib==0.19
+Requires-Dist: setuptools
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# DataMax
+
+<div align="center">
+
+[中文](README_zh.md) | **English**
+
+[](https://badge.fury.io/py/pydatamax) [](https://www.python.org/downloads/) [](https://opensource.org/licenses/MIT)
+
+</div>
+
+A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
+
+## ✨ Core Features
+
+- 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
+- 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
+- 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
+- ⚡ **Batch Processing**: Efficient multi-file parallel processing
+- 🎯 **Easy Integration**: Clean API design, ready to use out of the box
+
+## 🚀 Quick Start
+
+### Installation
+
+```bash
+pip install pydatamax
+```
+
+### Basic Usage
+
+```python
+from datamax import DataMax
+
+# Parse a single file
+dm = DataMax(file_path="document.pdf")
+data = dm.get_data()
+
+# Batch processing
+dm = DataMax(file_path=["file1.docx", "file2.pdf"])
+data = dm.get_data()
+
+# Data cleaning
+cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
+
+# AI annotation
+qa_data = dm.get_pre_label(
+    api_key="your-api-key",
+    base_url="https://api.openai.com/v1",
+    model_name="gpt-3.5-turbo"
+)
+```
+
+## 📖 Detailed Documentation
+
+### File Parsing
+
+#### Supported Formats
+
+| Format | Extensions | Special Features |
+|--------|------------|------------------|
+| Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
+| Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
+| Presentations | `.pptx`, `.ppt` | Slide content extraction |
+| Web | `.html`, `.epub` | Tag parsing |
+| Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
+| Text | `.txt` | Automatic encoding detection |
+
+#### Advanced Features
+
+```python
+# Advanced PDF parsing (requires MinerU)
+dm = DataMax(file_path="complex.pdf", use_mineru=True)
+
+# Word to Markdown conversion
+dm = DataMax(file_path="document.docx", to_markdown=True)
+
+# Image OCR
+dm = DataMax(file_path="image.jpg", use_ocr=True)
+```
+
+### Data Cleaning
+
+```python
+# Three cleaning modes
+dm.clean_data(method_list=[
+    "abnormal",  # Anomaly data processing
+    "private",   # Privacy information masking
+    "filter"     # Text filtering and normalization
+])
+```
+
+### AI Annotation
+
+```python
+# Custom annotation tasks
+qa_data = dm.get_pre_label(
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,      # Text chunk size
+    chunk_overlap=100,   # Overlap length
+    question_number=5,   # Questions per chunk
+    max_workers=5        # Concurrency
+)
+```
+
+## ⚙️ Environment Setup
+
+### Optional Dependencies
+
+#### LibreOffice (DOC file support)
+
+**Ubuntu/Debian:**
+```bash
+sudo apt-get install libreoffice
+```
+
+**Windows:**
+1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
+2. Add to environment variables: `C:\Program Files\LibreOffice\program`
+
+#### MinerU (Advanced PDF parsing)
+
+```bash
+# Create virtual environment
+conda create -n mineru python=3.10
+conda activate mineru
+
+# Install MinerU
+pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
+```
+
+For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
+
+## 🛠️ Development
+
+### Local Installation
+
+```bash
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+pip install -r requirements.txt
+python setup.py install
+```
+
+## 📋 System Requirements
+
+- Python >= 3.10
+- Supports Windows, macOS, Linux
+
+## 🤝 Contributing
+
+Issues and Pull Requests are welcome!
+
+## 📄 License
+
+This project is licensed under the [MIT License](LICENSE).
+
+## 📞 Contact Us
+
+- 📧 Email: cy.kron@foxmail.com
+- 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
+- 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
+
+---
+
+⭐ If this project helps you, please give us a star!
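Relative to 0.1.13, the declared dependency set gains `ebooklib==0.19` and `setuptools`; everything else carries over unchanged (compare the removed METADATA further below). A standard-library sketch to confirm what an installed wheel actually declares:

```python
from importlib.metadata import requires, version

print(version("pydatamax"))  # "0.1.14" once the new wheel is installed
for req in requires("pydatamax") or []:
    if req.startswith(("ebooklib", "setuptools")):
        print(req)  # the two requirements new in this release
```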
{pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/RECORD
RENAMED
@@ -1,14 +1,14 @@
datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
-datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
-datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamax/loader/core.py,sha256=…
+datamax/loader/core.py,sha256=C5VhDXkv0HTRcF6WWtztatUZHURZBD-KoZpKyqXfD7U,5100
+datamax/loader/minio_handler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
+datamax/loader/oss_handler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
-datamax/parser/doc_parser.py,sha256=…
-datamax/parser/docx_parser.py,sha256=…
+datamax/parser/doc_parser.py,sha256=wVNYn7dkPI12pW_YImDgoceLFWS3RvFpzbFQwVlrnNo,7936
+datamax/parser/docx_parser.py,sha256=o9_1VBDc8nBCmsEMv0sKsVcPMxiAxY5pl0mUvOcoOJc,8796
datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
@@ -19,7 +19,7 @@ datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,
datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
-datamax/parser/xlsx_parser.py,sha256=…
+datamax/parser/xlsx_parser.py,sha256=Vw6XfoQyu6aQUSIueR-krByMA_WOb5fasf4VmKxjVio,8905
datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
@@ -30,10 +30,10 @@ datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-
datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
-pydatamax-0.1.13.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
+pydatamax-0.1.14.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
-pydatamax-0.1.13.dist-info/METADATA,sha256=…
-pydatamax-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-pydatamax-0.1.13.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
-pydatamax-0.1.13.dist-info/RECORD,,
+pydatamax-0.1.14.dist-info/METADATA,sha256=P-wz8Log3gcUMftKTd2qrcmNuzpp-HOn8gVVe8cTceM,6314
+pydatamax-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydatamax-0.1.14.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
+pydatamax-0.1.14.dist-info/RECORD,,
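The hash column is what makes the renames above verifiable: minio_handler.py and oss_handler.py keep the exact digests of MinioHandler.py and OssHandler.py, so the contents are byte-identical. Per the wheel spec (PEP 376/427), a RECORD hash is the urlsafe base64 of the file's SHA-256 digest with padding stripped; a small sketch to recompute one locally:

```python
import base64
import hashlib


def record_hash(path: str) -> str:
    """RECORD-style hash: sha256=<urlsafe-b64(sha256 digest), no '=' padding>."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Compare against the RECORD line for the shipped file, e.g.:
# record_hash("datamax/loader/minio_handler.py")
# -> "sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So"
```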
pydatamax-0.1.13.dist-info/METADATA
DELETED
@@ -1,280 +0,0 @@
-Metadata-Version: 2.4
-Name: pydatamax
-Version: 0.1.13
-Summary: A library for parsing and converting various file formats.
-Home-page: https://github.com/Hi-Dolphin/datamax
-Author: ccy
-Author-email: cy.kron@foxmail.com
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: oss2<3.0.0,>=2.19.1
-Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
-Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
-Requires-Dist: crcmod<2.0.0,>=1.7
-Requires-Dist: langdetect<2.0.0,>=1.0.9
-Requires-Dist: loguru<1.0.0,>=0.7.3
-Requires-Dist: python-docx<2.0.0,>=1.1.2
-Requires-Dist: python-dotenv<2.0.0,>=1.1.0
-Requires-Dist: pymupdf<2.0.0,>=1.26.0
-Requires-Dist: pypdf<6.0.0,>=5.5.0
-Requires-Dist: openpyxl<4.0.0,>=3.1.5
-Requires-Dist: pandas<3.0.0,>=2.2.3
-Requires-Dist: numpy<3.0.0,>=2.2.6
-Requires-Dist: requests<3.0.0,>=2.32.3
-Requires-Dist: tqdm<5.0.0,>=4.67.1
-Requires-Dist: pydantic<3.0.0,>=2.11.5
-Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
-Requires-Dist: python-magic<1.0.0,>=0.4.27
-Requires-Dist: PyYAML<7.0.0,>=6.0.2
-Requires-Dist: Pillow<12.0.0,>=11.2.1
-Requires-Dist: packaging<25.0,>=24.2
-Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
-Requires-Dist: minio<8.0.0,>=7.2.15
-Requires-Dist: openai<2.0.0,>=1.82.0
-Requires-Dist: jionlp<2.0.0,>=1.5.23
-Requires-Dist: chardet<6.0.0,>=5.2.0
-Requires-Dist: python-pptx<2.0.0,>=1.0.2
-Requires-Dist: tiktoken<1.0.0,>=0.9.0
-Requires-Dist: markitdown<1.0.0,>=0.1.1
-Requires-Dist: xlrd<3.0.0,>=2.0.1
-Requires-Dist: tabulate<1.0.0,>=0.9.0
-Requires-Dist: unstructured<1.0.0,>=0.17.2
-Requires-Dist: markdown<4.0.0,>=3.8
-Requires-Dist: langchain<1.0.0,>=0.3.0
-Requires-Dist: langchain-community<1.0.0,>=0.3.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# DataMax
-
-## Overview
-DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
-
-## Key Features
-
-### File Processing Capabilities
-Currently supports reading, conversion, and extraction from:
-- PDF, HTML
-- DOCX/DOC, PPT/PPTX
-- EPUB
-- Images
-- XLS/XLSX spreadsheets
-- Plain text (TXT)
-
-### Data Cleaning Pipeline
-Three-tiered cleaning process:
-1. Anomaly detection and handling
-2. Privacy protection processing
-3. Text filtering and normalization
-
-### AI-Powered Data Annotation
-Implements an LLM+Prompt to:
-- Continuously generate pre-labeled datasets
-- Provide optimized training data for model fine-tuning
-
-
-## Installation Guide (Key Dependencies)
-Dependencies include libreoffice, datamax, and MinerU.
-
-### 1. Installing libreoffice Dependency
-**Note:** Without datamax, .doc files will not be supported.
-
-#### Linux (Debian/Ubuntu)
-```bash
-sudo apt-get update
-sudo apt-get install libreoffice
-```
-### Windows
-```text
-Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
-Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
-```
-### Checking LibreOffice Installation
-```bash
-soffice --version
-```
-
-## 2. Installing MinerU Dependency
-Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
-### Create a Virtual Environment and Install Basic Dependencies
-```bash
-conda create -n mineru python=3.10
-conda activate mineru
-pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
-```
-### Installing Model Weight Files
-https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
-```bash
-pip install modelscope
-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
-python download_models.py
-```
-
-### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
-```json
-{
-  "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
-  "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
-  "device-mode": "cpu",
-  ...
-}
-```
-
-## 3. Installing Basic Dependencies for datamax
-1. Clone the repository to your local machine:
-```bash
-git clone <repository-url>
-```
-2. Install dependencies into conda:
-```bash
-cd datamax
-pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
-
-
-## Features
-- **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
-- **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
-- **Data Conversion**: Supports converting processed data into markdown format for further analysis.
-- **Batch Processing**: Can handle multiple files at once, improving work efficiency.
-- **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
-- **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
-
-
-## Technology Stack
-
-- **Programming Language**: Python >= 3.10
-- **Dependency Libraries**:
-  - PyMuPDF: For PDF file parsing.
-  - BeautifulSoup: For HTML file parsing.
-  - python-docx: For DOCX file parsing.
-  - pandas: For data processing and conversion.
-  - paddleocr: For parsing scanned PDFs, tables, and images.
-- **Development Environment**: Visual Studio Code or PyCharm
-- **Version Control**: Git
-
-## Usage Instructions
-### Installing the SDK
-- **Installation Commands**:
-```bash
-## Local Installation
-python setup.py sdist bdist_wheel
-pip install dist/datamax-0.1.3-py3-none-any.whl
-
-## Pip Installation
-pip install pydatamax
-```
-
-
-- **Importing the Code**:
-```python
-# File Parsing
-from datamax import DataMax
-
-## Handling a Single File in Two Ways
-# 1. Using a List of Length 1
-data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
-data = data.get_data()
-
-# 2. Using a String
-data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
-data = data.get_data()
-
-## Handling Multiple Files
-# 1. Using a List of Length n
-data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
-data = data.get_data()
-
-# 2. Passing a Folder Path as a String
-data = DataMax(file_path=r"docx_files_example/")
-data = data.get_data()
-
-# Data Cleaning
-"""
-Cleaning rules can be found in datamax/utils/data_cleaner.py
-abnormal: Abnormal cleaning
-private: Privacy processing
-filter: Text filtering
-"""
-# Direct Use: Clean the text parameter directly and return a string
-dm = DataMax()
-data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
-
-# Process Use: Use after get_data() to return the complete data structure
-dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
-data2 = dm.get_data()
-cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
-
-# Large Model Pre-annotation Supporting any model that can be called via OpenAI SDK
-data = DataMax(file_path=r"path\to\xxx.docx")
-parsed_data = data.get_data()
-# If no custom messages are passed, the default messages in the SDK will be used
-messages = [
-    {'role': 'system', 'content': 'You are a helpful assistant.'},
-    {'role': 'user', 'content': 'Who are you?'}
-]
-qa_datas = data.get_pre_label(
-    api_key="sk-xxx",
-    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
-    model_name="qwen-max",
-    chunk_size=500,
-    chunk_overlap=100,
-    question_number=5,
-    max_workers=5,
-    # message=[]
-)
-print(f'Annotated result:{qa_datas}')
-```
-
-
-## Examples
-```python
-## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
-from datamax import DataMax
-data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
-"""
-Parameters:
-file_path: Relative file path / Absolute file path
-to_markdown: Whether to convert to markdown (default value False, directly returns text) This parameter only supports word files (doc | docx)
-"""
-
-## jpg | jpeg | png | ...(image types)
-data = DataMax(file_path=r"image.jpg", use_mineru=True)
-"""
-Parameters:
-file_path: Relative file path / Absolute file path
-use_mineru: Whether to use MinerU enhancement
-"""
-
-## pdf
-from datamax import DataMax
-data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
-"""
-Parameters:
-file_path: Relative file path / Absolute file path
-use_mineru: Whether to use MinerU enhancement
-"""
-```
-
-## Contribution Guide
-We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
-## License
-This project is licensed under the MIT License. For more details, see the LICENSE file.
-
-## Contact Information
-If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
-- Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
-- Project Homepage: GitHub Project Link
-
/datamax/loader/{MinioHandler.py → minio_handler.py}
RENAMED
File without changes

/datamax/loader/{OssHandler.py → oss_handler.py}
RENAMED
File without changes

{pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/WHEEL
RENAMED
File without changes

{pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/licenses/LICENSE
RENAMED
File without changes

{pydatamax-0.1.13.dist-info → pydatamax-0.1.14.dist-info}/top_level.txt
RENAMED
File without changes