pydatamax 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datamax/parser/doc_parser.py +159 -41
- datamax/parser/docx_parser.py +186 -17
- datamax/parser/xlsx_parser.py +185 -48
- {pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/METADATA +39 -30
- {pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/RECORD +8 -8
- {pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/top_level.txt +0 -0
datamax/parser/doc_parser.py
CHANGED
@@ -3,75 +3,193 @@ import shutil
 import subprocess
 import tempfile
 import chardet
-import …
+import logging
 from pathlib import Path
 from typing import Union
-from docx import Document
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
 
 
+# Configure logging
+logger = logging.getLogger(__name__)
+
+
 class DocParser(BaseLife):
     def __init__(self, file_path: Union[str, list], to_markdown: bool = False):
         super().__init__()
         self.file_path = file_path
         self.to_markdown = to_markdown
+        logger.info(f"🚀 DocParser initialized - file path: {file_path}, to_markdown: {to_markdown}")
+
+    def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
+        """Convert a .doc file to a .txt file."""
+        logger.info(f"🔄 Converting DOC to TXT - source: {doc_path}, output dir: {dir_path}")
+
+        try:
+            cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
+            logger.debug(f"⚡ Running conversion command: {cmd}")
+
+            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = process.communicate()
+            exit_code = process.returncode
+
+            if exit_code == 0:
+                logger.info(f"✅ DOC-to-TXT conversion succeeded - exit code: {exit_code}")
+                if stdout:
+                    logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
+            else:
+                encoding = chardet.detect(stderr)['encoding']
+                if encoding is None:
+                    encoding = 'utf-8'
+                error_msg = stderr.decode(encoding, errors='replace')
+                logger.error(f"❌ DOC-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
+                raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
+
+            fname = str(Path(doc_path).stem)
+            txt_path = os.path.join(dir_path, f'{fname}.txt')
+
+            if not os.path.exists(txt_path):
+                logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
+                raise Exception(f"File conversion failed {doc_path} ==> {txt_path}")
+            else:
+                logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                return txt_path
+
+        except subprocess.SubprocessError as e:
+            logger.error(f"💥 subprocess execution failed: {str(e)}")
+            raise Exception(f"Error while executing the conversion command: {str(e)}")
+        except Exception as e:
+            logger.error(f"💥 Unknown error during DOC-to-TXT conversion: {str(e)}")
+            raise
 
-    def …
-        …
+    def read_txt_file(self, txt_path: str) -> str:
+        """Read the contents of a txt file."""
+        logger.info(f"📖 Reading TXT file: {txt_path}")
+
+        try:
+            # Detect the file encoding
+            with open(txt_path, 'rb') as f:
+                raw_data = f.read()
+                encoding = chardet.detect(raw_data)['encoding']
+                if encoding is None:
+                    encoding = 'utf-8'
+                logger.debug(f"🔍 Detected file encoding: {encoding}")
+
+            # Read the file contents
+            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+                content = f.read()
+
+            logger.info(f"📄 TXT file read - content length: {len(content)} characters")
+            logger.debug(f"👀 Preview of the first 100 characters: {content[:100]}...")
+
+            return content
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 TXT file not found: {str(e)}")
+            raise Exception(f"File not found: {txt_path}")
+        except Exception as e:
+            logger.error(f"💥 Error while reading TXT file: {str(e)}")
+            raise
 
-    def …
+    def read_doc_file(self, doc_path: str) -> str:
+        """Read a doc file and convert it to text."""
+        logger.info(f"📖 Reading DOC file - file: {doc_path}")
+
         try:
             with tempfile.TemporaryDirectory() as temp_path:
+                logger.debug(f"📁 Created temporary directory: {temp_path}")
+
                 temp_dir = Path(temp_path)
-                …
-                media_dir.mkdir()
+
                 file_path = temp_dir / "tmp.doc"
                 shutil.copy(doc_path, file_path)
-                …
+                logger.debug(f"📋 Copied file into temporary directory: {doc_path} -> {file_path}")
+
+                # Convert DOC to TXT
+                txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
+                logger.info(f"🎯 DOC-to-TXT conversion finished: {txt_file_path}")
+
+                # Read the TXT file contents
+                content = self.read_txt_file(txt_file_path)
+                logger.info(f"✨ TXT contents read, content length: {len(content)} characters")
+
+                return content
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 File not found: {str(e)}")
+            raise Exception(f"File not found: {doc_path}")
+        except PermissionError as e:
+            logger.error(f"🔒 File permission error: {str(e)}")
+            raise Exception(f"No permission to access file: {doc_path}")
         except Exception as e:
-            …
+            logger.error(f"💥 Error while reading DOC file: {str(e)}")
+            raise
 
     def parse(self, file_path: str):
+        """Parse a DOC file."""
+        logger.info(f"🎬 Parsing DOC file: {file_path}")
+
         try:
+            # Verify that the file exists
+            if not os.path.exists(file_path):
+                logger.error(f"🚫 File does not exist: {file_path}")
+                raise FileNotFoundError(f"File does not exist: {file_path}")
+
+            # Check the file size
+            file_size = os.path.getsize(file_path)
+            logger.info(f"📏 File size: {file_size} bytes")
+
             title = self.get_file_extension(file_path)
+            logger.debug(f"🏷️ Extracted file title: {title}")
+
+            # Convert to txt via soffice, then read the contents
+            logger.info("📝 Converting DOC to TXT via soffice and reading the contents")
+            content = self.read_doc_file(doc_path=file_path)
+
+            # Depending on to_markdown, keep the original format or convert to markdown
             if self.to_markdown:
-                …
+                # Simple text-to-markdown conversion (preserves paragraph structure)
+                mk_content = self.format_as_markdown(content)
+                logger.info("🎨 Content formatted as markdown")
             else:
-                content = self.read_docx_file(doc_path=file_path, to_mk=False)
                 mk_content = content
+                logger.info("📝 Keeping the original text format")
+
+            logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            logger.debug("⚙️ Lifecycle information generated")
+
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
-            …
+
+            result = output_vo.to_dict()
+            logger.info(f"🏆 DOC file parsing finished: {file_path}")
+            logger.debug(f"🔑 Keys in the returned result: {list(result.keys())}")
+
+            return result
+
         except Exception as e:
-            …
+            logger.error(f"💀 Failed to parse DOC file: {file_path}, error: {str(e)}")
+            raise
+
+    def format_as_markdown(self, content: str) -> str:
+        """Format plain text as simple markdown."""
+        if not content.strip():
+            return content
+
+        lines = content.split('\n')
+        formatted_lines = []
+
+        for line in lines:
+            line = line.strip()
+            if not line:
+                formatted_lines.append('')
+                continue
+
+            # Simple markdown formatting rules
+            # More rules can be added as needed
+            formatted_lines.append(line)
+
+        return '\n'.join(formatted_lines)
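For orientation, a minimal usage sketch of the reworked parser. It assumes LibreOffice's soffice binary is installed and on PATH (the converter shells out to it), and "sample.doc" is a hypothetical path used only for illustration:

# Hedged usage sketch, not part of the package's test suite.
import logging

from datamax.parser.doc_parser import DocParser

logging.basicConfig(level=logging.INFO)  # surfaces the new emoji log lines

parser = DocParser(file_path="sample.doc", to_markdown=True)
result = parser.parse("sample.doc")  # dict produced by MarkdownOutputVo.to_dict()
print(list(result.keys()))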
datamax/parser/docx_parser.py
CHANGED
@@ -1,43 +1,212 @@
 import os
-import …
-…
+import shutil
+import subprocess
+import tempfile
+import chardet
+import logging
+from pathlib import Path
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
 
 
+# Configure logging
+logger = logging.getLogger(__name__)
+
+
 class DocxParser(BaseLife):
     def __init__(self, file_path: Union[str, list], to_markdown: bool = False):
         super().__init__()
         self.file_path = file_path
         self.to_markdown = to_markdown
+        logger.info(f"🚀 DocxParser initialized - file path: {file_path}, to_markdown: {to_markdown}")
 
-    …
+    def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
+        """Convert a .docx file to a .txt file."""
+        logger.info(f"🔄 Converting DOCX to TXT - source: {docx_path}, output dir: {dir_path}")
+
         try:
-            …
+            cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
+            logger.debug(f"⚡ Running conversion command: {cmd}")
+
+            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = process.communicate()
+            exit_code = process.returncode
+
+            if exit_code == 0:
+                logger.info(f"✅ DOCX-to-TXT conversion succeeded - exit code: {exit_code}")
+                if stdout:
+                    logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
+            else:
+                encoding = chardet.detect(stderr)['encoding']
+                if encoding is None:
+                    encoding = 'utf-8'
+                error_msg = stderr.decode(encoding, errors='replace')
+                logger.error(f"❌ DOCX-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
+                raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
+
+            fname = str(Path(docx_path).stem)
+            txt_path = os.path.join(dir_path, f'{fname}.txt')
+
+            if not os.path.exists(txt_path):
+                logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
+                raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
+            else:
+                logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                return txt_path
+
+        except subprocess.SubprocessError as e:
+            logger.error(f"💥 subprocess execution failed: {str(e)}")
+            raise Exception(f"Error while executing the conversion command: {str(e)}")
         except Exception as e:
-            …
+            logger.error(f"💥 Unknown error during DOCX-to-TXT conversion: {str(e)}")
+            raise
+
+    def read_txt_file(self, txt_path: str) -> str:
+        """Read the contents of a txt file."""
+        logger.info(f"📖 Reading TXT file: {txt_path}")
+
+        try:
+            # Detect the file encoding
+            with open(txt_path, 'rb') as f:
+                raw_data = f.read()
+                encoding = chardet.detect(raw_data)['encoding']
+                if encoding is None:
+                    encoding = 'utf-8'
+                logger.debug(f"🔍 Detected file encoding: {encoding}")
+
+            # Read the file contents
+            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+                content = f.read()
+
+            logger.info(f"📄 TXT file read - content length: {len(content)} characters")
+            logger.debug(f"👀 Preview of the first 100 characters: {content[:100]}...")
+
+            return content
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 TXT file not found: {str(e)}")
+            raise Exception(f"File not found: {txt_path}")
+        except Exception as e:
+            logger.error(f"💥 Error while reading TXT file: {str(e)}")
+            raise
+
+    def read_docx_file(self, docx_path: str) -> str:
+        """Read a docx file and convert it to text."""
+        logger.info(f"📖 Reading DOCX file - file: {docx_path}")
+
+        try:
+            with tempfile.TemporaryDirectory() as temp_path:
+                logger.debug(f"📁 Created temporary directory: {temp_path}")
+
+                temp_dir = Path(temp_path)
+
+                file_path = temp_dir / "tmp.docx"
+                shutil.copy(docx_path, file_path)
+                logger.debug(f"📋 Copied file into temporary directory: {docx_path} -> {file_path}")
+
+                # Convert DOCX to TXT
+                txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
+                logger.info(f"🎯 DOCX-to-TXT conversion finished: {txt_file_path}")
+
+                # Read the TXT file contents
+                content = self.read_txt_file(txt_file_path)
+                logger.info(f"✨ TXT contents read, content length: {len(content)} characters")
+
+                return content
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 File not found: {str(e)}")
+            raise Exception(f"File not found: {docx_path}")
+        except PermissionError as e:
+            logger.error(f"🔒 File permission error: {str(e)}")
+            raise Exception(f"No permission to access file: {docx_path}")
+        except Exception as e:
+            logger.error(f"💥 Error while reading DOCX file: {str(e)}")
+            raise
 
     def parse(self, file_path: str):
+        """Parse a DOCX file."""
+        logger.info(f"🎬 Parsing DOCX file: {file_path}")
+
         try:
+            # Verify that the file exists
+            if not os.path.exists(file_path):
+                logger.error(f"🚫 File does not exist: {file_path}")
+                raise FileNotFoundError(f"File does not exist: {file_path}")
+
+            # Check the file extension
+            if not file_path.lower().endswith('.docx'):
+                logger.warning(f"⚠️ File extension is not .docx: {file_path}")
+
+            # Check the file size
+            file_size = os.path.getsize(file_path)
+            logger.info(f"📏 File size: {file_size} bytes")
+
+            if file_size == 0:
+                logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
+
             title = self.get_file_extension(file_path)
+            logger.debug(f"🏷️ Extracted file title: {title}")
+
+            # Convert to txt via soffice, then read the contents
+            logger.info("📝 Converting DOCX to TXT via soffice and reading the contents")
+            content = self.read_docx_file(docx_path=file_path)
+
+            # Depending on to_markdown, keep the original format or convert to markdown
             if self.to_markdown:
-                …
-                os.makedirs(output_dir)
-                docx2markdown.docx_to_markdown(file_path, output_md_dir)
-                mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
+                # Simple text-to-markdown conversion (preserves paragraph structure)
+                mk_content = self.format_as_markdown(content)
+                logger.info("🎨 Content formatted as markdown")
             else:
-                content = self.read_docx_file(file_path=file_path)
                 mk_content = content
+                logger.info("📝 Keeping the original text format")
+
+            logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
+            # Check whether the content is empty
+            if not mk_content.strip():
+                logger.warning(f"⚠️ Parsed content is empty: {file_path}")
+
             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            logger.debug("⚙️ Lifecycle information generated")
+
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
-            …
+
+            result = output_vo.to_dict()
+            logger.info(f"🏆 DOCX file parsing finished: {file_path}")
+            logger.debug(f"🔑 Keys in the returned result: {list(result.keys())}")
+
+            return result
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 File-not-found error: {str(e)}")
+            raise
+        except PermissionError as e:
+            logger.error(f"🔒 File permission error: {str(e)}")
+            raise Exception(f"No permission to access file: {file_path}")
         except Exception as e:
-            …
+            logger.error(f"💀 Failed to parse DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}")
+            raise
+
+    def format_as_markdown(self, content: str) -> str:
+        """Format plain text as simple markdown."""
+        if not content.strip():
+            return content
+
+        lines = content.split('\n')
+        formatted_lines = []
+
+        for line in lines:
+            line = line.strip()
+            if not line:
+                formatted_lines.append('')
+                continue
+
+            # Simple markdown formatting rules
+            # More rules can be added as needed
+            formatted_lines.append(line)
+
+        return '\n'.join(formatted_lines)
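Both rewritten parsers funnel the soffice output through the same two-step read: sniff the encoding from the raw bytes with chardet, then decode with errors='replace' so undecodable bytes cannot abort parsing. The pattern as a standalone sketch (the helper name is ours, not part of the package):

import chardet

def read_text_with_detected_encoding(path: str) -> str:
    """Sketch of the parsers' encoding-tolerant read (hypothetical helper)."""
    with open(path, 'rb') as f:
        raw = f.read()
    # chardet may return None for the encoding; fall back to utf-8 as the diff does
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    with open(path, 'r', encoding=encoding, errors='replace') as f:
        return f.read()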
datamax/parser/xlsx_parser.py
CHANGED
@@ -1,71 +1,208 @@
 import multiprocessing
 import time
+import logging
+import os
 from multiprocessing import Queue
 from datamax.parser.base import MarkdownOutputVo
 from datamax.parser.base import BaseLife
-…
+import pandas as pd
 import warnings
-from markitdown import MarkItDown
 
 warnings.filterwarnings("ignore")
 
-…
+# Configure logging
+logger = logging.getLogger(__name__)
 
-_markitdown_instance = None
 
-…
-        if cls._markitdown_instance is None:
-            cls._markitdown_instance = MarkItDown()
-        return cls._markitdown_instance
+class XlsxParser(BaseLife):
+    """XLSX parser - reads with pandas, converts to markdown, and supports multiprocessing."""
 
     def __init__(self, file_path, timeout):
         super().__init__()
         self.file_path = file_path
         self.timeout = timeout
-        …
+        logger.info(f"🚀 XlsxParser initialized - file path: {file_path}, timeout: {timeout}s")
+
+    def _parse_with_pandas(self, file_path: str) -> str:
+        """Read an Excel file with pandas and convert it to markdown."""
+        logger.info(f"🐼 Reading Excel file with pandas: {file_path}")
+
+        try:
+            # Verify that the file exists
+            if not os.path.exists(file_path):
+                logger.error(f"🚫 Excel file does not exist: {file_path}")
+                raise FileNotFoundError(f"File does not exist: {file_path}")
+
+            # Check the file size
+            file_size = os.path.getsize(file_path)
+            logger.info(f"📏 File size: {file_size} bytes")
+
+            if file_size == 0:
+                logger.warning(f"⚠️ File size is 0 bytes: {file_path}")
+                return "*File is empty*"
+
+            # Read the Excel file with pandas
+            logger.debug("📊 Reading Excel data...")
+            df = pd.read_excel(file_path, sheet_name=None)  # read all sheets
+
+            markdown_content = ""
+
+            if isinstance(df, dict):
+                # Multiple sheets
+                logger.info(f"📑 Detected multiple sheets, {len(df)} in total")
+                for sheet_name, sheet_df in df.items():
+                    logger.debug(f"📋 Processing sheet: {sheet_name}, shape: {sheet_df.shape}")
+                    markdown_content += f"## Sheet: {sheet_name}\n\n"
+
+                    if not sheet_df.empty:
+                        # Clean the data: drop fully empty rows and columns
+                        sheet_df = sheet_df.dropna(how='all').dropna(axis=1, how='all')
+
+                        if not sheet_df.empty:
+                            sheet_markdown = sheet_df.to_markdown(index=False)
+                            markdown_content += sheet_markdown + "\n\n"
+                            logger.debug(f"✅ Sheet {sheet_name} converted, effective data shape: {sheet_df.shape}")
+                        else:
+                            markdown_content += "*This sheet has no valid data*\n\n"
+                            logger.warning(f"⚠️ Sheet {sheet_name} has no valid data after cleaning")
+                    else:
+                        markdown_content += "*This sheet is empty*\n\n"
+                        logger.warning(f"⚠️ Sheet {sheet_name} is empty")
+            else:
+                # Single sheet
+                logger.info(f"📄 Single sheet, shape: {df.shape}")
+                if not df.empty:
+                    # Clean the data: drop fully empty rows and columns
+                    df = df.dropna(how='all').dropna(axis=1, how='all')
+
+                    if not df.empty:
+                        markdown_content = df.to_markdown(index=False)
+                        logger.info(f"✅ Sheet converted, effective data shape: {df.shape}")
+                    else:
+                        markdown_content = "*Sheet has no valid data*"
+                        logger.warning("⚠️ Sheet has no valid data after cleaning")
+                else:
+                    markdown_content = "*Sheet is empty*"
+                    logger.warning("⚠️ Sheet is empty")
+
+            logger.info(f"🎊 pandas conversion finished, markdown content length: {len(markdown_content)} characters")
+            logger.debug(f"👀 Preview of the first 200 characters: {markdown_content[:200]}...")
+
+            return markdown_content
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 File not found: {str(e)}")
+            raise
+        except PermissionError as e:
+            logger.error(f"🔒 File permission error: {str(e)}")
+            raise Exception(f"No permission to access file: {file_path}")
+        except pd.errors.EmptyDataError as e:
+            logger.error(f"📭 Excel file is empty: {str(e)}")
+            raise Exception(f"Excel file is empty or unreadable: {file_path}")
+        except Exception as e:
+            logger.error(f"💥 pandas failed to read the Excel file: {str(e)}")
+            raise
 
     def _parse(self, file_path: str, result_queue: Queue) -> dict:
+        """Core routine for parsing an Excel file."""
+        logger.info(f"🎬 Parsing Excel file: {file_path}")
+
         try:
-            …
+            # Parse the Excel file with pandas
+            logger.info("🐼 Parsing Excel in pandas mode")
+            mk_content = self._parse_with_pandas(file_path)
+
+            # Check whether the content is empty
+            if not mk_content.strip():
+                logger.warning(f"⚠️ Parsed content is empty: {file_path}")
+                mk_content = "*Unable to parse file content*"
+
+            logger.info(f"🎊 File content parsed, final content length: {len(mk_content)} characters")
+
+            # Generate lifecycle information
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN"
             )
-            …
+            logger.debug("⚙️ Lifecycle information generated")
+
+            # Build the output object
+            title = self.get_file_extension(file_path)
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+
+            result = output_vo.to_dict()
+            result_queue.put(result)
+            logger.info(f"🏆 Excel file parsing finished: {file_path}")
+            logger.debug(f"🔑 Keys in the returned result: {list(result.keys())}")
+
+            time.sleep(0.5)  # give the queue a moment
+            return result
+
         except Exception as e:
-            …
-            domain="Technology",
-            usage_purpose="Documentation",
-            life_type="LLM_ORIGIN"
-            )
-            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            result_queue.put(output_vo.to_dict())
-            time.sleep(0.5)
-            return output_vo.to_dict()
+            logger.error(f"💀 Failed to parse Excel file: {file_path}, error: {str(e)}")
+            # Put the error on the queue as well
+            error_result = {"error": str(e), "file_path": file_path}
+            result_queue.put(error_result)
+            raise
 
     def parse(self, file_path: str) -> dict:
-        …
-        process.…
-        …
+        """Parse an Excel file - with multiprocessing and timeout control."""
+        logger.info(f"🚀 Starting Excel parsing process - file: {file_path}, timeout: {self.timeout}s")
+
+        try:
+            # Verify that the file exists
+            if not os.path.exists(file_path):
+                logger.error(f"🚫 File does not exist: {file_path}")
+                raise FileNotFoundError(f"File does not exist: {file_path}")
+
+            # Check the file extension
+            if not file_path.lower().endswith(('.xlsx', '.xls')):
+                logger.warning(f"⚠️ File extension is not an Excel format: {file_path}")
+
+            result_queue = Queue()
+            process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+            process.start()
+            logger.debug(f"⚡ Started child process, PID: {process.pid}")
+
+            start_time = time.time()
+
+            # Wait for parsing to finish or time out
+            while time.time() - start_time < self.timeout:
+                elapsed_time = int(time.time() - start_time)
+                logger.debug(f"⏱️ Waiting for parsing to finish... {elapsed_time}s")
+
+                if not process.is_alive():
+                    logger.debug("✅ Child process has finished")
+                    break
+
+                if not result_queue.empty():
+                    result = result_queue.get()
+                    process.join()  # wait for the process to exit normally
+
+                    # Check whether the result is an error
+                    if "error" in result:
+                        logger.error(f"💥 Child process returned an error: {result['error']}")
+                        raise Exception(result["error"])
+
+                    logger.info(f"🎉 Excel parsing completed successfully in {elapsed_time}s")
+                    return result
+
+                time.sleep(1)
+            else:
+                # Timeout handling
+                logger.error(f"⏰ Parsing timed out ({self.timeout}s), terminating the process")
+                process.terminate()
+                process.join(timeout=5)  # give the process 5 seconds to exit gracefully
+
+                if process.is_alive():
+                    logger.error("💀 Force-killing the process")
+                    process.kill()
+
+                raise TimeoutError(f"Excel parsing timed out: {file_path}")
+
+        except Exception as e:
+            logger.error(f"💀 Excel parsing failed: {file_path}, error type: {type(e).__name__}, error: {str(e)}")
+            raise
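The MarkItDown singleton is gone: XlsxParser now reads every sheet with pandas and renders each as a markdown table, which is why tabulate (required by DataFrame.to_markdown) appears in the new dependency list below. The core conversion reduced to a sketch, with "book.xlsx" as a hypothetical path:

import pandas as pd

# sheet_name=None returns a dict mapping sheet name -> DataFrame
sheets = pd.read_excel("book.xlsx", sheet_name=None)
for name, df in sheets.items():
    # drop fully empty rows and columns, as the parser does
    df = df.dropna(how='all').dropna(axis=1, how='all')
    print(f"## Sheet: {name}\n")
    print(df.to_markdown(index=False), "\n")  # needs the tabulate package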
{pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/METADATA
CHANGED
@@ -1,42 +1,51 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.11
+Version: 0.1.13
 Summary: A library for parsing and converting various file formats.
-Home-page: https://github.com/…
-Author: …
-Author-email: …
+Home-page: https://github.com/Hi-Dolphin/datamax
+Author: ccy
+Author-email: cy.kron@foxmail.com
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: …
-Requires-Dist: python-…
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: loguru
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
+Requires-Dist: oss2<3.0.0,>=2.19.1
+Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+Requires-Dist: crcmod<2.0.0,>=1.7
+Requires-Dist: langdetect<2.0.0,>=1.0.9
+Requires-Dist: loguru<1.0.0,>=0.7.3
+Requires-Dist: python-docx<2.0.0,>=1.1.2
+Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+Requires-Dist: pymupdf<2.0.0,>=1.26.0
+Requires-Dist: pypdf<6.0.0,>=5.5.0
+Requires-Dist: openpyxl<4.0.0,>=3.1.5
+Requires-Dist: pandas<3.0.0,>=2.2.3
+Requires-Dist: numpy<3.0.0,>=2.2.6
+Requires-Dist: requests<3.0.0,>=2.32.3
+Requires-Dist: tqdm<5.0.0,>=4.67.1
+Requires-Dist: pydantic<3.0.0,>=2.11.5
+Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+Requires-Dist: python-magic<1.0.0,>=0.4.27
+Requires-Dist: PyYAML<7.0.0,>=6.0.2
+Requires-Dist: Pillow<12.0.0,>=11.2.1
+Requires-Dist: packaging<25.0,>=24.2
+Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+Requires-Dist: minio<8.0.0,>=7.2.15
+Requires-Dist: openai<2.0.0,>=1.82.0
+Requires-Dist: jionlp<2.0.0,>=1.5.23
+Requires-Dist: chardet<6.0.0,>=5.2.0
+Requires-Dist: python-pptx<2.0.0,>=1.0.2
+Requires-Dist: tiktoken<1.0.0,>=0.9.0
+Requires-Dist: markitdown<1.0.0,>=0.1.1
+Requires-Dist: xlrd<3.0.0,>=2.0.1
+Requires-Dist: tabulate<1.0.0,>=0.9.0
+Requires-Dist: unstructured<1.0.0,>=0.17.2
+Requires-Dist: markdown<4.0.0,>=3.8
+Requires-Dist: langchain<1.0.0,>=0.3.0
+Requires-Dist: langchain-community<1.0.0,>=0.3.0
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
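Where the 0.1.11 metadata left the dependency entries unpinned (the old values are truncated in this view), every entry now carries an explicit version range. Assuming the 0.1.13 wheel is installed, the pins can be read back with the standard library alone:

from importlib.metadata import metadata, version

print(version("pydatamax"))  # expected: 0.1.13
for req in metadata("pydatamax").get_all("Requires-Dist"):
    print(req)  # e.g. pandas<3.0.0,>=2.2.3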
{pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/RECORD
CHANGED
@@ -7,8 +7,8 @@ datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,11
 datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
 datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
 datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
-datamax/parser/doc_parser.py,sha256=…
-datamax/parser/docx_parser.py,sha256=…
+datamax/parser/doc_parser.py,sha256=WIWZqvWT4bbquMn1t5Y4P3rEFG6YZ6z3b-f-5yCEtwU,8266
+datamax/parser/docx_parser.py,sha256=Ipk9ea281N8Edj74tnqUpc_MGZgD4qn780MX_QA9SiU,9111
 datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
 datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
 datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
@@ -19,7 +19,7 @@ datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,
 datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
 datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
 datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
-datamax/parser/xlsx_parser.py,sha256=…
+datamax/parser/xlsx_parser.py,sha256=tyLU6wa3F31p7JaoCpML6TJyzYd2Lpeuhzs4036en2U,9274
 datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
 datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
 datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
@@ -30,10 +30,10 @@ datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-
 datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
 datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
 datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
-pydatamax-0.1.11.dist-info/licenses/LICENSE,…
+pydatamax-0.1.13.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
-pydatamax-0.1.11.dist-info/METADATA,…
-pydatamax-0.1.11.dist-info/WHEEL,…
-pydatamax-0.1.11.dist-info/top_level.txt,…
-pydatamax-0.1.11.dist-info/RECORD,…
+pydatamax-0.1.13.dist-info/METADATA,sha256=knte2YZ9jdSGxmO0fzBVtMFAcq1exCKyEdfBde4aCjA,9731
+pydatamax-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydatamax-0.1.13.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
+pydatamax-0.1.13.dist-info/RECORD,,
{pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/licenses/LICENSE
File without changes
{pydatamax-0.1.11.dist-info → pydatamax-0.1.13.dist-info}/top_level.txt
File without changes