pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff shows the content of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/parser/doc_parser.py
CHANGED
```diff
@@ -1,60 +1,403 @@
+from loguru import logger
 import os
 import shutil
 import subprocess
 import tempfile
-import chardet
-import logging
 from pathlib import Path
-from typing import Union
-
-
+from typing import Union, Optional
+import struct
+import re
+import html
+
+import chardet
 
+from datamax.parser.base import BaseLife, MarkdownOutputVo
 
-#
-
+# Try to import OLE-related libraries (for reading the internal structure of DOC files)
+try:
+    import olefile
+    HAS_OLEFILE = True
+except ImportError:
+    HAS_OLEFILE = False
+    logger.warning("⚠️ The olefile library is not installed; advanced DOC parsing is unavailable")
+
+# Try to import the UNO handler
+try:
+    from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
+except ImportError:
+    HAS_UNO = False
 
 
 class DocParser(BaseLife):
-    def __init__(
+    def __init__(
+        self,
+        file_path: Union[str, list],
+        to_markdown: bool = False,
+        use_uno: bool = True,
+    ):
         super().__init__()
         self.file_path = file_path
         self.to_markdown = to_markdown
-
+
+        # Prefer UNO (unless explicitly disabled)
+        if use_uno and HAS_UNO:
+            self.use_uno = True
+            logger.info(f"🚀 DocParser initialized - using the UNO API for efficient single-threaded processing")
+        else:
+            self.use_uno = False
+            if use_uno and not HAS_UNO:
+                logger.warning(f"⚠️ UNO is unavailable; falling back to the traditional command-line approach")
+            else:
+                logger.info(f"🚀 DocParser initialized - using the traditional command-line approach")
+
+        logger.info(f"📄 File path: {file_path}, convert to markdown: {to_markdown}")
+
+    def extract_all_content(self, doc_path: str) -> str:
+        """
+        Comprehensively extract all content from a DOC file.
+        Supports multiple DOC internal formats and storage schemes.
+        """
+        logger.info(f"🔍 Starting comprehensive content extraction: {doc_path}")
+
+        all_content = []
+
+        try:
+            # 1. Try OLE parsing to extract content (if available)
+            if HAS_OLEFILE:
+                ole_content = self._extract_ole_content(doc_path)
+                if ole_content:
+                    all_content.append(("ole", ole_content))
+
+            # 2. Try to extract embedded objects
+            embedded_content = self._extract_embedded_objects(doc_path)
+            if embedded_content:
+                all_content.append(("embedded", embedded_content))
+
+            # 3. If the methods above extracted nothing, use traditional conversion
+            if not all_content:
+                logger.info("🔄 Extracting content via traditional conversion")
+                return ""  # Return empty so the caller falls back to the traditional approach
+
+            # Check the content quality, especially for WPS files
+            for content_type, content in all_content:
+                if content and self._check_content_quality(content):
+                    logger.info(f"✅ Extraction via {content_type} content succeeded")
+                    return content
+
+            # If all of the content is of poor quality, return empty
+            logger.warning("⚠️ Every extraction method produced poor-quality content")
+            return ""
+
+        except Exception as e:
+            logger.error(f"💥 Comprehensive content extraction failed: {str(e)}")
+            return ""
+
+    def _extract_ole_content(self, doc_path: str) -> str:
+        """Extract DOC content via OLE parsing"""
+        try:
+            ole = olefile.OleFileIO(doc_path)
+            logger.info(f"📂 Successfully opened OLE file: {doc_path}")
+
+            # List all streams
+            streams = ole.listdir()
+            logger.debug(f"📋 Available OLE streams: {streams}")
+
+            # Check whether the file was generated by WPS
+            is_wps = any('WpsCustomData' in str(stream) for stream in streams)
+            if is_wps:
+                logger.info("📝 WPS DOC file detected; the traditional conversion approach is recommended")
+                # OLE parsing may be unreliable for WPS files; return empty to force the traditional approach
+                ole.close()
+                return ""
+
+            all_texts = []
+
+            # Try to extract the WordDocument stream
+            if ole.exists('WordDocument'):
+                try:
+                    word_stream = ole.openstream('WordDocument').read()
+                    logger.info(f"📄 WordDocument stream size: {len(word_stream)} bytes")
+                    text = self._parse_word_stream(word_stream)
+                    if text:
+                        all_texts.append(text)
+                except Exception as e:
+                    logger.error(f"💥 Failed to parse the WordDocument stream: {str(e)}")
+
+            # Try reading other streams that may contain text
+            text_content = []
+            for entry in ole.listdir():
+                if any(name in str(entry) for name in ['Text', 'Content', 'Body']):
+                    try:
+                        stream = ole.openstream(entry)
+                        data = stream.read()
+                        # Try to decode
+                        decoded = self._try_decode_bytes(data)
+                        if decoded and len(decoded.strip()) > 10:
+                            text_content.append(decoded)
+                    except:
+                        continue
+
+            if text_content:
+                combined = '\n'.join(text_content)
+                logger.info(f"📄 Extracted text from OLE streams: {len(combined)} characters")
+                return self._clean_extracted_text(combined)
+
+            ole.close()
+
+            return ""
+
+        except Exception as e:
+            logger.warning(f"⚠️ OLE parsing failed: {str(e)}")
+
+        return ""
+
+    def _parse_word_stream(self, data: bytes) -> str:
+        """Parse text from a WordDocument stream"""
+        try:
+            # The DOC format is complex; this provides basic text extraction
+            # Look for text fragments
+            text_parts = []
+
+            # Try multiple encodings, paying particular attention to Chinese ones
+            for encoding in ['utf-16-le', 'utf-8', 'gbk', 'gb18030', 'gb2312', 'big5', 'cp936', 'cp1252']:
+                try:
+                    decoded = data.decode(encoding, errors='ignore')
+                    # Check whether it contains a reasonable number of Chinese characters
+                    chinese_chars = len([c for c in decoded if '\u4e00' <= c <= '\u9fff'])
+                    if chinese_chars > 10 or (decoded and len(decoded.strip()) > 50):
+                        # Filter to printable characters while keeping Chinese
+                        cleaned = self._filter_printable_text(decoded)
+                        if cleaned and len(cleaned.strip()) > 20:
+                            text_parts.append(cleaned)
+                            logger.debug(f"📝 Decoded successfully with encoding {encoding}, containing {chinese_chars} Chinese characters")
+                            break
+                except:
+                    continue
+
+            return '\n'.join(text_parts) if text_parts else ""
+
+        except Exception as e:
+            logger.error(f"💥 Failed to parse the Word stream: {str(e)}")
+            return ""
+
+    def _filter_printable_text(self, text: str) -> str:
+        """Filter text, keeping printable characters and Chinese"""
+        result = []
+        for char in text:
+            # Keep Chinese characters
+            if '\u4e00' <= char <= '\u9fff':
+                result.append(char)
+            # Keep Japanese characters
+            elif '\u3040' <= char <= '\u30ff':
+                result.append(char)
+            # Keep Korean characters
+            elif '\uac00' <= char <= '\ud7af':
+                result.append(char)
+            # Keep printable ASCII and whitespace characters
+            elif char.isprintable() or char.isspace():
+                result.append(char)
+            # Keep common punctuation
+            elif char in ',。!?;:""''()【】《》、·…—':
+                result.append(char)
+
+        return ''.join(result)
+
+    def _try_decode_bytes(self, data: bytes) -> str:
+        """Try to decode byte data using multiple encodings"""
+        # Try Chinese encodings first
+        encodings = ['utf-8', 'gbk', 'gb18030', 'gb2312', 'big5', 'utf-16-le', 'utf-16-be', 'cp936', 'cp1252', 'latin-1']
+
+        # First try to detect the encoding with chardet
+        try:
+            import chardet
+            detected = chardet.detect(data)
+            if detected['encoding'] and detected['confidence'] > 0.7:
+                encodings.insert(0, detected['encoding'])
+                logger.debug(f"🔍 Detected encoding: {detected['encoding']} (confidence: {detected['confidence']})")
+        except:
+            pass
+
+        for encoding in encodings:
+            try:
+                decoded = data.decode(encoding, errors='ignore')
+                # Check for meaningful text (including Chinese)
+                if decoded and (any(c.isalnum() for c in decoded) or any('\u4e00' <= c <= '\u9fff' for c in decoded)):
+                    # Clean the text further
+                    cleaned = self._filter_printable_text(decoded)
+                    if cleaned and len(cleaned.strip()) > 10:
+                        return cleaned
+            except:
+                continue
+
+        return ""
+
+    def _extract_embedded_objects(self, doc_path: str) -> str:
+        """Extract embedded objects from a DOC file"""
+        try:
+            if not HAS_OLEFILE:
+                return ""
+
+            embedded_content = []
+
+            with olefile.OleFileIO(doc_path) as ole:
+                # Look for embedded objects
+                for entry in ole.listdir():
+                    entry_name = '/'.join(entry)
+
+                    # Check whether the entry is an embedded object
+                    if any(pattern in entry_name.lower() for pattern in ['object', 'embed', 'package']):
+                        logger.info(f"📎 Found embedded object: {entry_name}")
+                        try:
+                            stream = ole.openstream(entry)
+                            data = stream.read()
+
+                            # Try to extract text content
+                            text = self._try_decode_bytes(data)
+                            if text and len(text.strip()) > 20:
+                                embedded_content.append(text.strip())
+                        except:
+                            continue
+
+            return '\n\n'.join(embedded_content) if embedded_content else ""
+
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to extract embedded objects: {str(e)}")
+            return ""
+
+    def _clean_extracted_text(self, text: str) -> str:
+        """Clean extracted text, removing control characters and formatting while keeping Chinese"""
+        try:
+            # Remove NULL and other control characters (but keep newlines etc.)
+            text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
+
+            # Remove runs of special characters (but keep Chinese and common punctuation)
+            # The regex is written so that Chinese is not removed by mistake
+            text = re.sub(r'[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—.,!?;:()-]+', ' ', text)
+
+            # Remove overly long meaningless character sequences (usually mojibake)
+            text = re.sub(r'[\x80-\xff]{10,}', ' ', text)
+
+            # Collapse repeated whitespace
+            text = re.sub(r'\s+', ' ', text)
+            text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
+
+            # Ensure paragraph separation
+            lines = text.split('\n')
+            cleaned_lines = []
+
+            for line in lines:
+                line = line.strip()
+                if line:
+                    # Check whether the line is mostly garbage
+                    printable_chars = sum(1 for c in line if c.isprintable() or '\u4e00' <= c <= '\u9fff')
+                    total_chars = len(line)
+
+                    # Keep the line if printable characters (including Chinese) make up more than 60%
+                    if total_chars > 0 and printable_chars / total_chars > 0.6:
+                        cleaned_lines.append(line)
+                elif cleaned_lines and cleaned_lines[-1]:
+                    cleaned_lines.append('')  # Preserve paragraph separation
+
+            result = '\n'.join(cleaned_lines).strip()
+
+            # Final check: return empty if the result is too short or too garbled
+            if len(result) < 20:
+                return ""
+
+            # Check the proportion of garbled characters
+            weird_chars = sum(1 for c in result if ord(c) > 127 and not ('\u4e00' <= c <= '\u9fff' or c in ',。!?;:""''()【】《》、·…—'))
+            if len(result) > 0 and weird_chars / len(result) > 0.3:
+                logger.warning(f"⚠️ The text contains too many garbled characters ({weird_chars}/{len(result)})")
+                # Try to keep only the ASCII and Chinese parts
+                result = re.sub(r'[^\x00-\x7f\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—\s]+', ' ', result)
+                result = re.sub(r'\s+', ' ', result).strip()
+
+            return result
+
+        except Exception as e:
+            logger.error(f"💥 Failed to clean text: {str(e)}")
+            return text
+
+    def _combine_extracted_content(self, content_list: list) -> str:
+        """Merge the various pieces of extracted content"""
+        combined = []
+
+        # Order content by priority
+        priority_order = ["ole", "embedded", "converted", "fallback"]
+
+        for content_type in priority_order:
+            for item_type, content in content_list:
+                if item_type == content_type and content.strip():
+                    combined.append(content.strip())
+
+        # Append any remaining uncategorized content
+        for item_type, content in content_list:
+            if item_type not in priority_order and content.strip():
+                combined.append(content.strip())
+
+        return '\n\n'.join(combined) if combined else ""
 
     def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
         """Convert a .doc file to a .txt file"""
         logger.info(f"🔄 Starting DOC-to-TXT conversion - source file: {doc_path}, output directory: {dir_path}")
-
+
+        if self.use_uno:
+            # Convert via the UNO API
+            try:
+                logger.info("🎯 Converting the document via the UNO API...")
+                txt_path = convert_with_uno(doc_path, "txt", dir_path)
+
+                if not os.path.exists(txt_path):
+                    logger.error(f"❌ The converted TXT file does not exist: {txt_path}")
+                    raise Exception(f"File conversion failed {doc_path} ==> {txt_path}")
+                else:
+                    logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                    return txt_path
+
+            except Exception as e:
+                logger.error(f"💥 UNO conversion failed: {str(e)}")
+                logger.warning("⚠️ Automatically falling back to the traditional command-line approach...")
+                return self._doc_to_txt_subprocess(doc_path, dir_path)
+        else:
+            # Use the traditional subprocess approach
+            return self._doc_to_txt_subprocess(doc_path, dir_path)
+
+    def _doc_to_txt_subprocess(self, doc_path: str, dir_path: str) -> str:
+        """Convert a .doc file to a .txt file via subprocess (traditional approach)"""
         try:
             cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
             logger.debug(f"⚡ Running conversion command: {cmd}")
-
-            process = subprocess.Popen(
+
+            process = subprocess.Popen(
+                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
             stdout, stderr = process.communicate()
             exit_code = process.returncode
-
+
             if exit_code == 0:
                 logger.info(f"✅ DOC-to-TXT conversion succeeded - exit code: {exit_code}")
                 if stdout:
                     logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
             else:
-                encoding = chardet.detect(stderr)[
+                encoding = chardet.detect(stderr)["encoding"]
                 if encoding is None:
-                    encoding =
-                error_msg = stderr.decode(encoding, errors=
+                    encoding = "utf-8"
+                error_msg = stderr.decode(encoding, errors="replace")
                 logger.error(f"❌ DOC-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
-                raise Exception(
-
+                raise Exception(
+                    f"Error Output (detected encoding: {encoding}): {error_msg}"
+                )
+
             fname = str(Path(doc_path).stem)
-            txt_path = os.path.join(dir_path, f
-
+            txt_path = os.path.join(dir_path, f"{fname}.txt")
+
             if not os.path.exists(txt_path):
                 logger.error(f"❌ The converted TXT file does not exist: {txt_path}")
                 raise Exception(f"File conversion failed {doc_path} ==> {txt_path}")
             else:
                 logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
                 return txt_path
-
+
         except subprocess.SubprocessError as e:
             logger.error(f"💥 subprocess execution failed: {str(e)}")
             raise Exception(f"An error occurred while running the conversion command: {str(e)}")
@@ -65,25 +408,25 @@ class DocParser(BaseLife):
     def read_txt_file(self, txt_path: str) -> str:
         """Read the contents of a txt file"""
         logger.info(f"📖 Starting to read TXT file: {txt_path}")
-
+
         try:
             # Detect the file encoding
-            with open(txt_path,
+            with open(txt_path, "rb") as f:
                 raw_data = f.read()
-            encoding = chardet.detect(raw_data)[
+            encoding = chardet.detect(raw_data)["encoding"]
             if encoding is None:
-                encoding =
+                encoding = "utf-8"
             logger.debug(f"🔍 Detected file encoding: {encoding}")
-
+
             # Read the file contents
-            with open(txt_path,
+            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                 content = f.read()
-
+
             logger.info(f"📄 Finished reading TXT file - content length: {len(content)} characters")
             logger.debug(f"👀 Preview of the first 100 characters: {content[:100]}...")
-
+
             return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 TXT file not found: {str(e)}")
             raise Exception(f"File not found: {txt_path}")
@@ -94,27 +437,41 @@ class DocParser(BaseLife):
     def read_doc_file(self, doc_path: str) -> str:
         """Read a doc file and convert it to text"""
         logger.info(f"📖 Starting to read DOC file - file: {doc_path}")
-
+
         try:
+            # First try comprehensive extraction (if advanced parsing is available)
+            if HAS_OLEFILE:
+                comprehensive_content = self.extract_all_content(doc_path)
+                if comprehensive_content and comprehensive_content.strip():
+                    # Check the content quality
+                    if self._check_content_quality(comprehensive_content):
+                        logger.info(f"✨ Comprehensive extraction succeeded, content length: {len(comprehensive_content)} characters")
+                        return comprehensive_content
+                    else:
+                        logger.warning("⚠️ The comprehensively extracted content is of poor quality; trying other approaches")
+
+            # Fall back to the traditional conversion approach
+            logger.info("🔄 Using the traditional conversion approach")
+
             with tempfile.TemporaryDirectory() as temp_path:
                 logger.debug(f"📁 Created temporary directory: {temp_path}")
-
+
                 temp_dir = Path(temp_path)
-
+
                 file_path = temp_dir / "tmp.doc"
                 shutil.copy(doc_path, file_path)
                 logger.debug(f"📋 Copied file to temporary directory: {doc_path} -> {file_path}")
-
+
                 # Convert DOC to TXT
                 txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
                 logger.info(f"🎯 DOC-to-TXT conversion finished: {txt_file_path}")
-
+
                 # Read the TXT file contents
                 content = self.read_txt_file(txt_file_path)
                 logger.info(f"✨ Finished reading TXT content, content length: {len(content)} characters")
-
+
                 return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 File not found: {str(e)}")
             raise Exception(f"File not found: {doc_path}")
@@ -125,27 +482,57 @@ class DocParser(BaseLife):
             logger.error(f"💥 An error occurred while reading the DOC file: {str(e)}")
             raise
 
+    def _check_content_quality(self, content: str) -> bool:
+        """Check the quality of the extracted content"""
+        if not content or len(content) < 50:
+            return False
+
+        # Compute the proportion of garbled characters
+        total_chars = len(content)
+        # Recognizable characters: ASCII, Chinese, Japanese, Korean, common punctuation
+        recognizable = sum(1 for c in content if (
+            c.isascii() or
+            '\u4e00' <= c <= '\u9fff' or  # Chinese
+            '\u3040' <= c <= '\u30ff' or  # Japanese
+            '\uac00' <= c <= '\ud7af' or  # Korean
+            c in ',。!?;:""''()【】《》、·…—\n\r\t '
+        ))
+
+        # If fewer than 70% of the characters are recognizable, treat the quality as poor
+        if recognizable / total_chars < 0.7:
+            logger.warning(f"⚠️ Content quality check failed: recognizable character ratio {recognizable}/{total_chars} = {recognizable/total_chars:.2%}")
+            return False
+
+        return True
+
     def parse(self, file_path: str):
         """Parse a DOC file"""
         logger.info(f"🎬 Starting to parse DOC file: {file_path}")
-
+
         try:
             # Verify that the file exists
             if not os.path.exists(file_path):
                 logger.error(f"🚫 File does not exist: {file_path}")
                 raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
+            # Verify the file extension
+            if not file_path.lower().endswith(".doc"):
+                logger.warning(f"⚠️ The file extension is not .doc: {file_path}")
+
             # Verify the file size
             file_size = os.path.getsize(file_path)
             logger.info(f"📏 File size: {file_size} bytes")
-
-
+
+            if file_size == 0:
+                logger.warning(f"⚠️ The file size is 0 bytes: {file_path}")
+
+            title = os.path.splitext(os.path.basename(file_path))[0]
             logger.debug(f"🏷️ Extracted file title: {title}")
-
-            #
-            logger.info("📝
+
+            # Read the file contents
+            logger.info("📝 Reading DOC file contents")
             content = self.read_doc_file(doc_path=file_path)
-
+
             # Depending on to_markdown, keep the original format or process into markdown
             if self.to_markdown:
                 # Simple text-to-markdown conversion (preserving paragraph structure)
@@ -154,42 +541,119 @@ class DocParser(BaseLife):
             else:
                 mk_content = content
                 logger.info("📝 Keeping the original text format")
-
+
             logger.info(f"🎊 Finished parsing the file content, final content length: {len(mk_content)} characters")
-
-
-
+
+            # Check whether the content is empty
+            if not mk_content.strip():
+                logger.warning(f"⚠️ The parsed content is empty: {file_path}")
+
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
             logger.debug("⚙️ Finished generating lifecycle info")
-
+
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
-
+
             result = output_vo.to_dict()
             logger.info(f"🏆 Finished parsing DOC file: {file_path}")
             logger.debug(f"🔑 Keys of the returned result: {list(result.keys())}")
-
+
             return result
-
+
+        except FileNotFoundError as e:
+            logger.error(f"🚫 File-not-found error: {str(e)}")
+            raise
+        except PermissionError as e:
+            logger.error(f"🔒 File permission error: {str(e)}")
+            raise Exception(f"No permission to access the file: {file_path}")
         except Exception as e:
-            logger.error(f"💀 Failed to parse DOC file: {file_path},
+            logger.error(f"💀 Failed to parse DOC file: {file_path}, error type: {type(e).__name__}, error message: {str(e)}")
             raise
 
     def format_as_markdown(self, content: str) -> str:
         """Format plain text as simple markdown"""
         if not content.strip():
             return content
-
-        lines = content.split(
+
+        lines = content.split("\n")
         formatted_lines = []
-
+
         for line in lines:
             line = line.strip()
             if not line:
-                formatted_lines.append(
+                formatted_lines.append("")
                 continue
-
+
             # Simple markdown formatting rules
             # More rules can be added as needed
             formatted_lines.append(line)
-
-        return
+
+        return "\n".join(formatted_lines)
+
+    def _extract_text_from_wps_stream(self, data: bytes) -> str:
+        """Extract text from a WPS WordDocument stream (using a more lenient strategy)"""
+        try:
+            text_parts = []
+
+            # WPS files may use different encodings and structures
+            # Try several strategies to extract text
+
+            # Strategy 1: try to find contiguous text blocks
+            # Look for byte sequences that resemble text
+            i = 0
+            while i < len(data):
+                # Look for a possible start of text
+                if i + 2 < len(data):
+                    # Check for Unicode text (little-endian)
+                    if data[i+1] == 0 and 32 <= data[i] <= 126:
+                        # Possibly an ASCII character in Unicode encoding
+                        text_block = bytearray()
+                        j = i
+                        while j + 1 < len(data) and data[j+1] == 0 and 32 <= data[j] <= 126:
+                            text_block.append(data[j])
+                            j += 2
+                        if len(text_block) > 10:
+                            text_parts.append(text_block.decode('ascii', errors='ignore'))
+                        i = j
+                    # Check for UTF-8 or GBK Chinese
+                    elif 0xe0 <= data[i] <= 0xef or 0x81 <= data[i] <= 0xfe:
+                        # Possibly a multi-byte character
+                        text_block = bytearray()
+                        j = i
+                        while j < len(data):
+                            if data[j] < 32 and data[j] not in [9, 10, 13]:
+                                break
+                            text_block.append(data[j])
+                            j += 1
+                        if len(text_block) > 20:
+                            # Try to decode
+                            for encoding in ['utf-8', 'gbk', 'gb18030', 'gb2312']:
+                                try:
+                                    decoded = text_block.decode(encoding, errors='ignore')
+                                    if decoded and len(decoded.strip()) > 10:
+                                        text_parts.append(decoded)
+                                        break
+                                except:
+                                    continue
+                        i = j
+                    else:
+                        i += 1
+                else:
+                    i += 1
+
+            # Merge the text parts
+            if text_parts:
+                combined = '\n'.join(text_parts)
+                return self._clean_extracted_text(combined)
+
+            # If the above fails, fall back to the original method
+            return self._parse_word_stream(data)
+
+        except Exception as e:
+            logger.error(f"💥 Failed to parse the WPS stream: {str(e)}")
+            return ""
```