pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/parser/docx_parser.py
CHANGED
@@ -1,60 +1,112 @@
+from loguru import logger
 import os
 import shutil
 import subprocess
 import tempfile
-import chardet
-import logging
 from pathlib import Path
-from typing import Union
-
-
+from typing import Union, Optional
+
+import chardet
+from loguru import logger
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+import zipfile
+import re
+import html
 
+# Try to import the UNO handler
+try:
+    from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
+except ImportError:
+    HAS_UNO = False
+    logger.warning("⚠️ UNO is unavailable, falling back to the traditional command-line approach")
 
-# Configure logging
-logger = logging.getLogger(__name__)
 
 
 class DocxParser(BaseLife):
-    def __init__(self, file_path: Union[str, list], to_markdown: bool = False):
+    def __init__(
+        self,
+        file_path: Union[str, list],
+        to_markdown: bool = False,
+        use_uno: bool = True,
+    ):
         super().__init__()
         self.file_path = file_path
         self.to_markdown = to_markdown
-
+
+        # Prefer UNO (unless explicitly disabled)
+        if use_uno and HAS_UNO:
+            self.use_uno = True
+            logger.info(f"🚀 DocxParser initialized - using the UNO API for efficient single-threaded processing")
+        else:
+            self.use_uno = False
+            if use_uno and not HAS_UNO:
+                logger.warning(f"⚠️ UNO is unavailable, falling back to the traditional command-line approach")
+            else:
+                logger.info(f"🚀 DocxParser initialized - using the traditional command-line approach")
+
+        logger.info(f"📄 File path: {file_path}, convert to markdown: {to_markdown}")
 
     def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
         """Convert a .docx file to a .txt file"""
         logger.info(f"🔄 Starting DOCX-to-TXT conversion - source: {docx_path}, output dir: {dir_path}")
-
+
+        if self.use_uno:
+            # Convert via the UNO API
+            try:
+                logger.info("🎯 Converting the document via the UNO API...")
+                txt_path = convert_with_uno(docx_path, "txt", dir_path)
+
+                if not os.path.exists(txt_path):
+                    logger.error(f"❌ The converted TXT file does not exist: {txt_path}")
+                    raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
+                else:
+                    logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                    return txt_path
+
+            except Exception as e:
+                logger.error(f"💥 UNO conversion failed: {str(e)}")
+                logger.warning("⚠️ Automatically falling back to the traditional command-line approach...")
+                return self._docx_to_txt_subprocess(docx_path, dir_path)
+        else:
+            # Use the traditional subprocess approach
+            return self._docx_to_txt_subprocess(docx_path, dir_path)
+
+    def _docx_to_txt_subprocess(self, docx_path: str, dir_path: str) -> str:
+        """Convert a .docx file to a .txt file via subprocess (the traditional approach)"""
        try:
            cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
            logger.debug(f"⚡ Running conversion command: {cmd}")
-
-            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            process = subprocess.Popen(
+                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
            stdout, stderr = process.communicate()
            exit_code = process.returncode
-
+
            if exit_code == 0:
                logger.info(f"✅ DOCX-to-TXT conversion succeeded - exit code: {exit_code}")
                if stdout:
                    logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
            else:
-                encoding = chardet.detect(stderr)['encoding']
+                encoding = chardet.detect(stderr)["encoding"]
                if encoding is None:
-                    encoding = 'utf-8'
-                error_msg = stderr.decode(encoding, errors='replace')
+                    encoding = "utf-8"
+                error_msg = stderr.decode(encoding, errors="replace")
                logger.error(f"❌ DOCX-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
-                raise Exception(
-                    f'Error Output (detected encoding: {encoding}): {error_msg}')
+                raise Exception(
+                    f"Error Output (detected encoding: {encoding}): {error_msg}"
+                )
+
            fname = str(Path(docx_path).stem)
-            txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+            txt_path = os.path.join(dir_path, f"{fname}.txt")
+
            if not os.path.exists(txt_path):
                logger.error(f"❌ The converted TXT file does not exist: {txt_path}")
                raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
            else:
                logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
                return txt_path
-
+
        except subprocess.SubprocessError as e:
            logger.error(f"💥 subprocess execution failed: {str(e)}")
            raise Exception(f"Error while running the conversion command: {str(e)}")
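The rewritten conversion path prefers the LibreOffice UNO API and falls back to shelling out to soffice when UNO is missing or a conversion raises. A minimal sketch of the same two-tier strategy outside the parser, assuming a LibreOffice install on PATH; convert_with_uno is the helper the diff imports from datamax.utils.uno_handler, and its (input, format, outdir) call shape follows the hunk above:

    import subprocess
    from pathlib import Path

    def docx_to_txt(docx_path: str, out_dir: str) -> str:
        # Prefer the UNO route when the optional handler is importable
        try:
            from datamax.utils.uno_handler import convert_with_uno
            return convert_with_uno(docx_path, "txt", out_dir)
        except ImportError:
            # Fall back to the soffice CLI, mirroring _docx_to_txt_subprocess
            subprocess.run(
                ["soffice", "--headless", "--convert-to", "txt",
                 docx_path, "--outdir", out_dir],
                check=True, capture_output=True,
            )
            return str(Path(out_dir) / f"{Path(docx_path).stem}.txt")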
@@ -65,25 +117,25 @@ class DocxParser(BaseLife):
     def read_txt_file(self, txt_path: str) -> str:
         """Read the contents of a txt file"""
         logger.info(f"📖 Starting to read the TXT file: {txt_path}")
-
+
         try:
             # Detect the file encoding
-            with open(txt_path, 'rb') as f:
+            with open(txt_path, "rb") as f:
                 raw_data = f.read()
-                encoding = chardet.detect(raw_data)['encoding']
+                encoding = chardet.detect(raw_data)["encoding"]
                 if encoding is None:
-                    encoding = 'utf-8'
+                    encoding = "utf-8"
                 logger.debug(f"🔍 Detected file encoding: {encoding}")
-
+
             # Read the file contents
-            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                 content = f.read()
-
+
             logger.info(f"📄 Finished reading the TXT file - content length: {len(content)} chars")
             logger.debug(f"👀 Preview of the first 100 chars: {content[:100]}...")
-
+
             return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 TXT file not found: {str(e)}")
             raise Exception(f"File not found: {txt_path}")
@@ -91,20 +143,412 @@ class DocxParser(BaseLife):
             logger.error(f"💥 Error while reading the TXT file: {str(e)}")
             raise
 
+    def extract_all_content(self, docx_path: str) -> str:
+        """
+        Comprehensively extract all of the content in a DOCX file.
+        Supports multiple DOCX internal formats and storage approaches.
+        """
+        logger.info(f"🔍 Starting comprehensive content extraction: {docx_path}")
+
+        all_content = []
+
+        try:
+            with zipfile.ZipFile(docx_path, 'r') as docx:
+                # 1. Check for and extract altChunk content (embedded HTML/MHT)
+                altchunk_content = self._extract_altchunk_content_internal(docx)
+                if altchunk_content:
+                    all_content.append(("altChunk", altchunk_content))
+
+                # 2. Extract the standard document.xml content
+                standard_content = self._extract_standard_document_content(docx)
+                if standard_content:
+                    all_content.append(("standard", standard_content))
+
+                # 3. Extract embedded object content (embeddings)
+                embedded_content = self._extract_embedded_objects(docx)
+                if embedded_content:
+                    all_content.append(("embedded", embedded_content))
+
+                # 4. Extract header and footer content
+                header_footer_content = self._extract_headers_footers(docx)
+                if header_footer_content:
+                    all_content.append(("header_footer", header_footer_content))
+
+                # 5. Extract comments and annotations
+                comments_content = self._extract_comments(docx)
+                if comments_content:
+                    all_content.append(("comments", comments_content))
+
+                # 6. Extract text from text boxes and drawing objects
+                textbox_content = self._extract_textbox_content(docx)
+                if textbox_content:
+                    all_content.append(("textboxes", textbox_content))
+
+        except Exception as e:
+            logger.error(f"💥 Comprehensive content extraction failed: {str(e)}")
+            return ""
+
+        # Merge all of the content
+        if all_content:
+            combined_content = self._combine_extracted_content(all_content)
+            logger.info(f"✅ Comprehensive extraction finished, total content length: {len(combined_content)} chars")
+            logger.debug(f"📊 Extracted content types: {[item[0] for item in all_content]}")
+            return combined_content
+
+        return ""
+
+    def _extract_altchunk_content_internal(self, docx_zip: zipfile.ZipFile) -> str:
+        """Internal method: extract altChunk content, preferring the MHT route"""
+        try:
+            # Check document.xml for altChunk references
+            if 'word/document.xml' in docx_zip.namelist():
+                doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
+                if 'altChunk' in doc_xml:
+                    logger.info("🔍 Detected the altChunk format")
+
+                    # Look for MHT files first (the simpler processing route)
+                    mht_files = [f for f in docx_zip.namelist() if f.endswith('.mht') and 'word/' in f]
+                    html_files = [f for f in docx_zip.namelist() if f.endswith('.html') and 'word/' in f]
+
+                    # Process MHT files first
+                    for filename in mht_files:
+                        logger.info(f"📄 Processing MHT file first: {filename}")
+                        content = docx_zip.read(filename).decode('utf-8', errors='replace')
+                        return self._extract_html_from_mht(content)
+
+                    # If there are no MHT files, process the HTML files
+                    for filename in html_files:
+                        logger.info(f"📄 Processing HTML file: {filename}")
+                        content = docx_zip.read(filename).decode('utf-8', errors='replace')
+                        return self._html_to_clean_text(content)
+
+            return ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract altChunk content: {str(e)}")
+            return ""
+
+    def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
+        """Extract the standard document.xml content"""
+        try:
+            if 'word/document.xml' in docx_zip.namelist():
+                doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
+
+                # Extract the text content with regular expressions
+                import xml.etree.ElementTree as ET
+
+                # Strip namespace prefixes to simplify processing
+                doc_xml_clean = re.sub(r'xmlns[^=]*="[^"]*"', '', doc_xml)
+                doc_xml_clean = re.sub(r'w:', '', doc_xml_clean)
+                doc_xml_clean = re.sub(r'[a-zA-Z0-9]+:', '', doc_xml_clean)
+
+                # Extract the text inside all <t> tags
+                text_matches = re.findall(r'<t[^>]*>(.*?)</t>', doc_xml_clean, re.DOTALL)
+                if text_matches:
+                    content = ' '.join(text_matches)
+                    content = html.unescape(content)
+                    logger.info(f"📝 Extracted text from document.xml: {len(content)} chars")
+                    return content.strip()
+            return ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract the standard document content: {str(e)}")
+            return ""
+
+    def _extract_embedded_objects(self, docx_zip: zipfile.ZipFile) -> str:
+        """Extract embedded object content"""
+        try:
+            embedded_content = []
+
+            # Look for embedded document objects
+            for filename in docx_zip.namelist():
+                if 'word/embeddings/' in filename:
+                    logger.info(f"📎 Found embedded object: {filename}")
+                    # Further processing could be added here based on the file type,
+                    # e.g. .docx, .xlsx, .txt, and so on
+
+            return ' '.join(embedded_content) if embedded_content else ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract embedded objects: {str(e)}")
+            return ""
+
+    def _extract_headers_footers(self, docx_zip: zipfile.ZipFile) -> str:
+        """Extract header and footer content"""
+        try:
+            header_footer_content = []
+
+            for filename in docx_zip.namelist():
+                if ('word/header' in filename or 'word/footer' in filename) and filename.endswith('.xml'):
+                    logger.debug(f"📄 Processing header/footer: {filename}")
+                    content = docx_zip.read(filename).decode('utf-8', errors='replace')
+
+                    # Extract the text content
+                    text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', content, re.DOTALL)
+                    if text_matches:
+                        header_footer_text = ' '.join(text_matches)
+                        header_footer_text = html.unescape(header_footer_text)
+                        if header_footer_text.strip():
+                            header_footer_content.append(header_footer_text.strip())
+
+            if header_footer_content:
+                logger.info(f"📑 Extracted header/footer content: {len(header_footer_content)} items")
+
+            return ' '.join(header_footer_content) if header_footer_content else ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract headers/footers: {str(e)}")
+            return ""
+
+    def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
+        """Extract comment and annotation content"""
+        try:
+            if 'word/comments.xml' in docx_zip.namelist():
+                comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
+
+                # Extract the comment text
+                text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', comments_xml, re.DOTALL)
+                if text_matches:
+                    comments_text = ' '.join(text_matches)
+                    comments_text = html.unescape(comments_text)
+                    logger.info(f"💬 Extracted comment content: {len(comments_text)} chars")
+                    return comments_text.strip()
+
+            return ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract comments: {str(e)}")
+            return ""
+
+    def _extract_textbox_content(self, docx_zip: zipfile.ZipFile) -> str:
+        """Extract text from text boxes and drawing objects"""
+        try:
+            textbox_content = []
+
+            # Look through the files that may contain text boxes
+            for filename in docx_zip.namelist():
+                if 'word/' in filename and filename.endswith('.xml'):
+                    content = docx_zip.read(filename).decode('utf-8', errors='replace')
+
+                    # Find text box content (w:txbxContent)
+                    textbox_matches = re.findall(r'<w:txbxContent[^>]*>(.*?)</w:txbxContent>', content, re.DOTALL)
+                    for match in textbox_matches:
+                        text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', match, re.DOTALL)
+                        if text_matches:
+                            textbox_text = ' '.join(text_matches)
+                            textbox_text = html.unescape(textbox_text)
+                            if textbox_text.strip():
+                                textbox_content.append(textbox_text.strip())
+
+            if textbox_content:
+                logger.info(f"📦 Extracted text box content: {len(textbox_content)} items")
+
+            return ' '.join(textbox_content) if textbox_content else ""
+        except Exception as e:
+            logger.error(f"💥 Failed to extract text box content: {str(e)}")
+            return ""
+
+    def _combine_extracted_content(self, content_list: list) -> str:
+        """Merge the various extracted pieces of content"""
+        combined = []
+
+        # Order the content by importance
+        priority_order = ["altChunk", "standard", "header_footer", "textboxes", "comments", "embedded"]
+
+        for content_type in priority_order:
+            for item_type, content in content_list:
+                if item_type == content_type and content.strip():
+                    combined.append(content.strip())
+
+        # Append any content that was not categorized
+        for item_type, content in content_list:
+            if item_type not in priority_order and content.strip():
+                combined.append(content.strip())
+
+        return '\n\n'.join(combined) if combined else ""
+
+    def _extract_html_from_mht(self, mht_content: str) -> str:
+        """Extract the HTML part from MHT content and convert it to clean text"""
+        try:
+            # MHT files use the MIME format; look for the HTML part
+            lines = mht_content.split('\n')
+            in_html_section = False
+            html_lines = []
+            skip_headers = True
+
+            for line in lines:
+                # Detect the start of the HTML part
+                if 'Content-Type: text/html' in line:
+                    in_html_section = True
+                    skip_headers = True
+                    continue
+
+                # Inside the HTML part
+                if in_html_section:
+                    # Skip the Content-* headers
+                    if skip_headers and line.strip() and not line.startswith('Content-'):
+                        skip_headers = False
+
+                    # A blank line marks the end of the headers and the start of the content
+                    if skip_headers and not line.strip():
+                        skip_headers = False
+                        continue
+
+                    # Check whether the next MIME part has been reached
+                    if line.startswith('------=') and len(html_lines) > 0:
+                        # End of the HTML part
+                        break
+
+                    # Collect the HTML content
+                    if not skip_headers:
+                        html_lines.append(line)
+
+            # Join all of the HTML lines
+            html_content = '\n'.join(html_lines)
+
+            # Decode quoted-printable encoding
+            if '=3D' in html_content or '=\n' in html_content:
+                try:
+                    import quopri
+                    html_content = quopri.decodestring(html_content.encode()).decode('utf-8', errors='replace')
+                    logger.info("📧 Decoded the quoted-printable encoding")
+                except Exception as e:
+                    logger.warning(f"⚠️ quoted-printable decoding failed: {str(e)}")
+
+            logger.debug(f"📄 Extracted HTML content length: {len(html_content)} chars")
+
+            # Convert to clean text
+            return self._html_to_clean_text(html_content)
+
+        except Exception as e:
+            logger.error(f"💥 Failed to extract HTML from MHT: {str(e)}")
+            return ""
+
+    def _html_to_clean_text(self, html_content: str) -> str:
+        """Convert HTML content to clean plain text, tuned for MHT content"""
+        try:
+            # First decode the HTML entities
+            text = html.unescape(html_content)
+
+            # Try to extract everything inside the <body> tag first
+            body_match = re.search(r'<body[^>]*>(.*?)</body>', text, re.DOTALL | re.IGNORECASE)
+            if body_match:
+                main_content = body_match.group(1)
+                logger.info("📄 Extracted the <body> tag content")
+            else:
+                main_content = text
+                logger.info("📄 Using the full content (no body tag found)")
+
+            # Special-case <pre><code> tags to keep their internal formatting
+            pre_code_blocks = []
+            def preserve_pre_code(match):
+                idx = len(pre_code_blocks)
+                pre_code_blocks.append(match.group(1))
+                return f"__PRE_CODE_{idx}__"
+
+            main_content = re.sub(r'<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
+                                  preserve_pre_code, main_content, flags=re.DOTALL | re.IGNORECASE)
+
+            # Handle the other HTML structures
+            # 1. First convert the tags whose line breaks should be preserved
+            main_content = re.sub(r'<br\s*/?>', '\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</p>', '\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'<p[^>]*>', '', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</div>', '\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'<div[^>]*>', '', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</h[1-6]>', '\n\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'<h[1-6][^>]*>', '', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</li>', '\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'<li[^>]*>', '• ', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</tr>', '\n', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</td>', ' | ', main_content, flags=re.IGNORECASE)
+            main_content = re.sub(r'</th>', ' | ', main_content, flags=re.IGNORECASE)
+
+            # 2. Remove style and script tags together with their content
+            main_content = re.sub(r'<style[^>]*>.*?</style>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
+            main_content = re.sub(r'<script[^>]*>.*?</script>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
+
+            # 3. Remove all remaining HTML tags
+            main_content = re.sub(r'<[^>]+>', '', main_content)
+
+            # 4. Decode HTML entities (a second pass, to make sure everything is decoded)
+            main_content = html.unescape(main_content)
+
+            # 5. Restore the <pre><code> block content
+            for idx, pre_code_content in enumerate(pre_code_blocks):
+                # Clean the pre/code content
+                cleaned_pre_code = html.unescape(pre_code_content)
+                main_content = main_content.replace(f"__PRE_CODE_{idx}__", cleaned_pre_code)
+
+            # 6. Clean up redundant whitespace while keeping the paragraph structure
+            lines = main_content.split('\n')
+            cleaned_lines = []
+
+            for line in lines:
+                # Strip leading and trailing spaces on each line
+                line = line.strip()
+                # Keep non-empty lines
+                if line:
+                    # Collapse extra spaces inside the line
+                    line = re.sub(r'[ \t]+', ' ', line)
+                    # Clean up extra spaces around table separators
+                    line = re.sub(r'\s*\|\s*', ' | ', line)
+                    cleaned_lines.append(line)
+                else:
+                    # Keep blank lines as paragraph separators
+                    if cleaned_lines and cleaned_lines[-1] != '':
+                        cleaned_lines.append('')
+
+            # 7. Join the cleaned lines
+            main_content = '\n'.join(cleaned_lines)
+
+            # 8. Final cleanup: remove excess blank lines
+            main_content = re.sub(r'\n{3,}', '\n\n', main_content)
+            main_content = main_content.strip()
+
+            logger.info(f"📝 Converted the HTML content to clean text: {len(main_content)} chars")
+
+            return main_content
+
+        except Exception as e:
+            logger.error(f"💥 HTML-to-clean-text conversion failed: {str(e)}")
+            # If the conversion fails, return a minimally cleaned version of the original text
+            return re.sub(r'<[^>]+>', '', html_content)
+
+    def _html_to_text(self, html_content: str) -> str:
+        """Convert HTML content to plain text (kept for other HTML content)"""
+        # For non-MHT HTML content, use this more general method
+        return self._html_to_clean_text(html_content)
+
+    def extract_altchunk_content(self, docx_path: str) -> Optional[str]:
+        """
+        Extract the content of a DOCX file that contains altChunk (kept for backward compatibility)
+        """
+        try:
+            with zipfile.ZipFile(docx_path, 'r') as docx:
+                return self._extract_altchunk_content_internal(docx)
+        except Exception as e:
+            logger.error(f"💥 Failed to extract altChunk content: {str(e)}")
+            return None
+
     def read_docx_file(self, docx_path: str) -> str:
         """Read a docx file and convert it to text"""
         logger.info(f"📖 Starting to read the DOCX file - file: {docx_path}")
-
+
         try:
+            # First try comprehensive extraction of all content
+            comprehensive_content = self.extract_all_content(docx_path)
+            if comprehensive_content and comprehensive_content.strip():
+                logger.info(f"✨ Comprehensive extraction succeeded, content length: {len(comprehensive_content)} chars")
+                return comprehensive_content
+
+            # If comprehensive extraction fails, use the traditional conversion route
+            logger.info("🔄 Comprehensive extraction failed or returned empty content, using the traditional conversion route")
+
             with tempfile.TemporaryDirectory() as temp_path:
                 logger.debug(f"📁 Created temporary directory: {temp_path}")
-
+
                 temp_dir = Path(temp_path)
-
+
                 file_path = temp_dir / "tmp.docx"
                 shutil.copy(docx_path, file_path)
                 logger.debug(f"📋 Copied the file into the temporary directory: {docx_path} -> {file_path}")
-
+
                 # Convert DOCX to TXT
                 txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
                 logger.info(f"🎯 DOCX-to-TXT conversion finished: {txt_file_path}")
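The comprehensive extractor added in this hunk is regex-driven rather than a full OOXML parse: each source (document.xml, headers and footers, comments.xml, w:txbxContent blocks) is reduced to its <w:t> runs, joined, and HTML-unescaped. A toy illustration of that harvesting step; the sample XML string is invented:

    import html
    import re

    sample = (
        '<w:p><w:r><w:t>Hello</w:t></w:r>'
        '<w:r><w:t xml:space="preserve"> world &amp; more</w:t></w:r></w:p>'
    )
    texts = re.findall(r"<w:t[^>]*>(.*?)</w:t>", sample, re.DOTALL)
    print(html.unescape(" ".join(texts)))  # -> "Hello  world & more"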
@@ -112,9 +556,9 @@ class DocxParser(BaseLife):
                 # Read the TXT file content
                 content = self.read_txt_file(txt_file_path)
                 logger.info(f"✨ Finished reading the TXT content, length: {len(content)} chars")
-
+
                 return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 File not found: {str(e)}")
             raise Exception(f"File not found: {docx_path}")
@@ -128,31 +572,31 @@ class DocxParser(BaseLife):
     def parse(self, file_path: str):
         """Parse the DOCX file"""
         logger.info(f"🎬 Starting to parse the DOCX file: {file_path}")
-
+
         try:
             # Verify that the file exists
             if not os.path.exists(file_path):
                 logger.error(f"🚫 File does not exist: {file_path}")
                 raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
             # Verify the file extension
-            if not file_path.lower().endswith('.docx'):
+            if not file_path.lower().endswith(".docx"):
                 logger.warning(f"⚠️ The file extension is not .docx: {file_path}")
-
+
             # Verify the file size
             file_size = os.path.getsize(file_path)
             logger.info(f"📏 File size: {file_size} bytes")
-
+
             if file_size == 0:
                 logger.warning(f"⚠️ The file size is 0 bytes: {file_path}")
-
-            title =
+
+            title = os.path.splitext(os.path.basename(file_path))[0]
             logger.debug(f"🏷️ Extracted file title: {title}")
-
+
             # Convert to txt with soffice, then read the content
             logger.info("📝 Converting DOCX to TXT with soffice and reading the content")
             content = self.read_docx_file(docx_path=file_path)
-
+
             # Decide, based on to_markdown, whether to keep the original format or render it as markdown
             if self.to_markdown:
                 # Simple text-to-markdown conversion (keeps the paragraph structure)
@@ -161,26 +605,30 @@ class DocxParser(BaseLife):
             else:
                 mk_content = content
                 logger.info("📝 Keeping the original text format")
-
+
             logger.info(f"🎊 Finished parsing the file content, final content length: {len(mk_content)} chars")
-
+
             # Check whether the content is empty
             if not mk_content.strip():
                 logger.warning(f"⚠️ The parsed content is empty: {file_path}")
-
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
             logger.debug("⚙️ Finished generating the lifecycle information")
-
+
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
-
+
             result = output_vo.to_dict()
             logger.info(f"🏆 Finished parsing the DOCX file: {file_path}")
             logger.debug(f"🔑 Returned result keys: {list(result.keys())}")
-
+
             return result
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 File-not-found error: {str(e)}")
             raise
@@ -188,25 +636,27 @@ class DocxParser(BaseLife):
             logger.error(f"🔒 File permission error: {str(e)}")
             raise Exception(f"No permission to access the file: {file_path}")
         except Exception as e:
-            logger.error(f"💀 Failed to parse the DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}")
+            logger.error(
+                f"💀 Failed to parse the DOCX file: {file_path}, error type: {type(e).__name__}, error: {str(e)}"
+            )
             raise
 
     def format_as_markdown(self, content: str) -> str:
         """Format plain text as simple markdown"""
         if not content.strip():
             return content
-
-        lines = content.split('\n')
+
+        lines = content.split("\n")
         formatted_lines = []
-
+
         for line in lines:
             line = line.strip()
             if not line:
-                formatted_lines.append('')
+                formatted_lines.append("")
                 continue
-
+
             # Simple markdown formatting rules
             # More rules can be added as needed
             formatted_lines.append(line)
-
-        return '\n'.join(formatted_lines)
+
+        return "\n".join(formatted_lines)
|