pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff covers publicly available package versions that were released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/doc_parser.py
CHANGED
@@ -1,21 +1,22 @@
-
+import html
 import os
+import re
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Union
-import struct
-import re
-import html
+from typing import Union
 
 import chardet
+from loguru import logger
 
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 # 尝试导入OLE相关库(用于读取DOC内部结构)
 try:
     import olefile
+
     HAS_OLEFILE = True
 except ImportError:
     HAS_OLEFILE = False
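
Note on the new import: `datamax.utils.lifecycle_types` comes from the `lifecycle_types.py` module added in this release (listed above as +18 -0). Its contents are not part of this file's diff; a minimal sketch of what the enum plausibly provides, inferred only from the three members referenced later in `parse()`, might look like this:

# Hypothetical sketch of datamax/utils/lifecycle_types.py; only the three
# members used in doc_parser.py are known from this diff, the rest is assumed.
from enum import Enum

class LifeType(Enum):
    DATA_PROCESSING = "DATA_PROCESSING"          # parsing started
    DATA_PROCESSED = "DATA_PROCESSED"            # parsing produced content
    DATA_PROCESS_FAILED = "DATA_PROCESS_FAILED"  # parsing produced no content
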
@@ -77,36 +78,36 @@ class DocParser(BaseLife):
         支持多种DOC内部格式和存储方式
         """
         logger.info(f"🔍 开始综合内容提取: {doc_path}")
-
+
         all_content = []
-
+
         try:
             # 1. 尝试使用OLE解析提取内容(如果可用)
             if HAS_OLEFILE:
                 ole_content = self._extract_ole_content(doc_path)
                 if ole_content:
                     all_content.append(("ole", ole_content))
-
+
             # 2. 尝试提取嵌入对象
             embedded_content = self._extract_embedded_objects(doc_path)
             if embedded_content:
                 all_content.append(("embedded", embedded_content))
-
+
             # 3. 如果上述方法都没有提取到内容,使用传统转换
             if not all_content:
                 logger.info("🔄 使用传统转换方式提取内容")
                 return ""  # 返回空,让调用者使用传统方式
-
+
             # 检查内容质量,特别是对于WPS文件
             for content_type, content in all_content:
                 if content and self._check_content_quality(content):
                     logger.info(f"✅ 使用 {content_type} 内容提取成功")
                     return content
-
+
             # 如果所有内容质量都不佳,返回空
             logger.warning("⚠️ 所有提取方式的内容质量都不佳")
             return ""
-
+
         except Exception as e:
             logger.error(f"💥 综合内容提取失败: {str(e)}")
             return ""
@@ -116,36 +117,36 @@ class DocParser(BaseLife):
         try:
             ole = olefile.OleFileIO(doc_path)
             logger.info(f"📂 成功打开OLE文件: {doc_path}")
-
+
             # 列出所有流
             streams = ole.listdir()
             logger.debug(f"📋 可用的OLE流: {streams}")
-
+
             # 检查是否是WPS生成的文件
-            is_wps = any(
+            is_wps = any("WpsCustomData" in str(stream) for stream in streams)
             if is_wps:
                 logger.info("📝 检测到WPS DOC文件,建议使用传统转换方式")
                 # 对于WPS文件,OLE解析可能不可靠,返回空让其使用传统方式
                 ole.close()
                 return ""
-
+
             all_texts = []
-
+
             # 尝试提取WordDocument流
-            if ole.exists(
+            if ole.exists("WordDocument"):
                 try:
-                    word_stream = ole.openstream(
+                    word_stream = ole.openstream("WordDocument").read()
                     logger.info(f"📄 WordDocument流大小: {len(word_stream)} 字节")
                     text = self._parse_word_stream(word_stream)
                     if text:
                         all_texts.append(text)
                 except Exception as e:
                     logger.error(f"💥 解析WordDocument流失败: {str(e)}")
-
+
             # 尝试读取其他可能包含文本的流
             text_content = []
             for entry in ole.listdir():
-                if any(name in str(entry) for name in [
+                if any(name in str(entry) for name in ["Text", "Content", "Body"]):
                     try:
                         stream = ole.openstream(entry)
                         data = stream.read()
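
For context, the OLE access above uses the standard `olefile` API. A minimal, self-contained sketch of enumerating streams and reading the `WordDocument` stream (the file path is a placeholder):

# Minimal sketch of the olefile usage relied on in _extract_ole_content.
import olefile

def dump_word_stream(path: str) -> bytes:
    with olefile.OleFileIO(path) as ole:
        print(ole.listdir())                    # all streams in the compound file
        if ole.exists("WordDocument"):          # the binary Word text stream
            return ole.openstream("WordDocument").read()
    return b""  # no WordDocument stream found
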
@@ -155,19 +156,19 @@ class DocParser(BaseLife):
                             text_content.append(decoded)
                     except:
                         continue
-
+
             if text_content:
-                combined =
+                combined = "\n".join(text_content)
                 logger.info(f"📄 从OLE流中提取文本: {len(combined)} 字符")
                 return self._clean_extracted_text(combined)
-
+
             ole.close()
-
+
             return ""
-
+
         except Exception as e:
             logger.warning(f"⚠️ OLE解析失败: {str(e)}")
-
+
             return ""
 
     def _parse_word_stream(self, data: bytes) -> str:
@@ -176,25 +177,38 @@ class DocParser(BaseLife):
             # DOC文件格式复杂,这里提供基础的文本提取
             # 查找文本片段
             text_parts = []
-
+
             # 尝试多种编码,特别注意中文编码
-            for encoding in [
+            for encoding in [
+                "utf-16-le",
+                "utf-8",
+                "gbk",
+                "gb18030",
+                "gb2312",
+                "big5",
+                "cp936",
+                "cp1252",
+            ]:
                 try:
-                    decoded = data.decode(encoding, errors=
+                    decoded = data.decode(encoding, errors="ignore")
                     # 检查是否包含合理的中文字符
-                    chinese_chars = len(
+                    chinese_chars = len(
+                        [c for c in decoded if "\u4e00" <= c <= "\u9fff"]
+                    )
                     if chinese_chars > 10 or (decoded and len(decoded.strip()) > 50):
                         # 过滤出可打印字符,但保留中文
                         cleaned = self._filter_printable_text(decoded)
                         if cleaned and len(cleaned.strip()) > 20:
                             text_parts.append(cleaned)
-                            logger.debug(
+                            logger.debug(
+                                f"📝 使用编码 {encoding} 成功解码,包含 {chinese_chars} 个中文字符"
+                            )
                             break
                 except:
                     continue
-
-            return
-
+
+            return "\n".join(text_parts) if text_parts else ""
+
         except Exception as e:
             logger.error(f"💥 解析Word流失败: {str(e)}")
             return ""
@@ -204,50 +218,67 @@ class DocParser(BaseLife):
         result = []
         for char in text:
             # 保留中文字符
-            if
+            if "\u4e00" <= char <= "\u9fff":
                 result.append(char)
             # 保留日文字符
-            elif
+            elif "\u3040" <= char <= "\u30ff":
                 result.append(char)
             # 保留韩文字符
-            elif
+            elif "\uac00" <= char <= "\ud7af":
                 result.append(char)
             # 保留ASCII可打印字符和空白字符
             elif char.isprintable() or char.isspace():
                 result.append(char)
             # 保留常用标点符号
-            elif char in ',。!?;:""'
+            elif char in ',。!?;:""' "()【】《》、·…—":
                 result.append(char)
-
-        return
+
+        return "".join(result)
 
     def _try_decode_bytes(self, data: bytes) -> str:
         """尝试使用多种编码解码字节数据"""
         # 优先尝试中文编码
-        encodings = [
-
+        encodings = [
+            "utf-8",
+            "gbk",
+            "gb18030",
+            "gb2312",
+            "big5",
+            "utf-16-le",
+            "utf-16-be",
+            "cp936",
+            "cp1252",
+            "latin-1",
+        ]
+
         # 首先尝试使用chardet检测编码
         try:
             import chardet
+
             detected = chardet.detect(data)
-            if detected[
-            encodings.insert(0, detected[
-            logger.debug(
+            if detected["encoding"] and detected["confidence"] > 0.7:
+                encodings.insert(0, detected["encoding"])
+                logger.debug(
+                    f"🔍 检测到编码: {detected['encoding']} (置信度: {detected['confidence']})"
+                )
         except:
             pass
-
+
         for encoding in encodings:
             try:
-                decoded = data.decode(encoding, errors=
+                decoded = data.decode(encoding, errors="ignore")
                 # 检查是否包含有意义的文本(包括中文)
-                if decoded and (
+                if decoded and (
+                    any(c.isalnum() for c in decoded)
+                    or any("\u4e00" <= c <= "\u9fff" for c in decoded)
+                ):
                     # 进一步清理文本
                     cleaned = self._filter_printable_text(decoded)
                     if cleaned and len(cleaned.strip()) > 10:
                         return cleaned
             except:
                 continue
-
+
         return ""
 
     def _extract_embedded_objects(self, doc_path: str) -> str:
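
The reworked `_try_decode_bytes` takes a `chardet` guess first and then falls back to a fixed, Chinese-oriented encoding list. A stripped-down, standalone sketch of that strategy (the fallback list mirrors the one above):

# Standalone sketch of the chardet-first / fallback-list decoding strategy.
import chardet

FALLBACKS = ["utf-8", "gbk", "gb18030", "gb2312", "big5",
             "utf-16-le", "utf-16-be", "cp936", "cp1252", "latin-1"]

def try_decode(data: bytes) -> str:
    encodings = list(FALLBACKS)
    guess = chardet.detect(data)                # {'encoding': ..., 'confidence': ...}
    if guess["encoding"] and guess["confidence"] > 0.7:
        encodings.insert(0, guess["encoding"])  # trust a confident detection first
    for enc in encodings:
        try:
            text = data.decode(enc, errors="ignore")
        except LookupError:                     # unknown codec name from chardet
            continue
        if text.strip():
            return text
    return ""
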
@@ -255,30 +286,33 @@ class DocParser(BaseLife):
         try:
             if not HAS_OLEFILE:
                 return ""
-
+
             embedded_content = []
-
+
             with olefile.OleFileIO(doc_path) as ole:
                 # 查找嵌入的对象
                 for entry in ole.listdir():
-                    entry_name =
-
+                    entry_name = "/".join(entry)
+
                     # 检查是否是嵌入对象
-                    if any(
+                    if any(
+                        pattern in entry_name.lower()
+                        for pattern in ["object", "embed", "package"]
+                    ):
                         logger.info(f"📎 找到嵌入对象: {entry_name}")
                         try:
                             stream = ole.openstream(entry)
                             data = stream.read()
-
+
                             # 尝试提取文本内容
                             text = self._try_decode_bytes(data)
                             if text and len(text.strip()) > 20:
                                 embedded_content.append(text.strip())
                         except:
                             continue
-
-            return
-
+
+            return "\n\n".join(embedded_content) if embedded_content else ""
+
         except Exception as e:
             logger.warning(f"⚠️ 提取嵌入对象失败: {str(e)}")
             return ""
@@ -288,77 +322,87 @@ class DocParser(BaseLife):
         try:
             # 1. 解码HTML/XML实体
             text = html.unescape(text)
-
+
             # 2. 移除所有XML/HTML标签
-            text = re.sub(r
-
+            text = re.sub(r"<[^>]+>", "", text)
+
             # 3. 移除XML命名空间前缀
-            text = re.sub(r
-
+            text = re.sub(r"\b\w+:", "", text)
+
             # 4. 移除NULL字符和其他控制字符
-            text = re.sub(r
-
+            text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", text)
+
             # 5. 移除特殊的XML字符序列
-            text = re.sub(r
-            text = re.sub(r
-            text = re.sub(r
-
+            text = re.sub(r"&[a-zA-Z]+;", "", text)
+            text = re.sub(r"&#\d+;", "", text)
+            text = re.sub(r"&#x[0-9a-fA-F]+;", "", text)
+
             # 6. 保留有意义的字符,移除其他特殊字符
             # 保留:中文、日文、韩文、英文、数字、常用标点和空白
             allowed_chars = (
-                r
-                r
-                r
-                r
-                r',。!?;:""'
+                r"\w\s"  # 字母数字和空白
+                r"\u4e00-\u9fff"  # 中文
+                r"\u3040-\u30ff"  # 日文
+                r"\uac00-\ud7af"  # 韩文
+                r',。!?;:""'
+                "()【】《》、·…—"  # 中文标点
                 r'.,!?;:()[\]{}"\'`~@#$%^&*+=\-_/\\'  # 英文标点和常用符号
             )
-
+
             # 使用更严格的过滤,但保留所有有意义的字符
-            cleaned_text =
-
+            cleaned_text = "".join(
+                char for char in text if re.match(f"[{allowed_chars}]", char)
+            )
+
             # 7. 移除过长的无意义字符序列(通常是二进制垃圾)
-            cleaned_text = re.sub(r
-
+            cleaned_text = re.sub(r"([^\s\u4e00-\u9fff])\1{5,}", r"\1", cleaned_text)
+
             # 8. 清理多余的空白,但保留段落结构
-            cleaned_text = re.sub(
-
-
-
+            cleaned_text = re.sub(
+                r"[ \t]+", " ", cleaned_text
+            )  # 多个空格/制表符变为单个空格
+            cleaned_text = re.sub(
+                r"\n\s*\n\s*\n+", "\n\n", cleaned_text
+            )  # 多个空行变为双空行
+            cleaned_text = re.sub(
+                r"^\s+|\s+$", "", cleaned_text, flags=re.MULTILINE
+            )  # 移除行首行尾空白
+
             # 9. 进一步清理:移除独立的标点符号行
-            lines = cleaned_text.split(
+            lines = cleaned_text.split("\n")
             cleaned_lines = []
-
+
             for line in lines:
                 line = line.strip()
                 if line:
                     # 检查行是否主要是有意义的内容
                     # 计算中文、英文字母和数字的比例
-                    meaningful_chars = sum(
-                        c.isalnum() or
-                    )
-
+                    meaningful_chars = sum(
+                        1 for c in line if (c.isalnum() or "\u4e00" <= c <= "\u9fff")
+                    )
+
                     # 如果有意义字符占比超过30%,或者行长度小于5(可能是标题),则保留
-                    if
-
+                    if len(line) < 5 or (
+                        meaningful_chars > 0 and meaningful_chars / len(line) > 0.3
+                    ):
                         cleaned_lines.append(line)
                 elif cleaned_lines and cleaned_lines[-1]:  # 保留段落分隔
-                    cleaned_lines.append(
-
-            result =
-
+                    cleaned_lines.append("")
+
+            result = "\n".join(cleaned_lines).strip()
+
             # 10. 最终检查
             if len(result) < 10:
                 logger.warning("⚠️ 清理后的文本过短,可能存在问题")
                 return ""
-
+
             # 检查是否还包含XML标签
-            if re.search(r
+            if re.search(r"<[^>]+>", result):
                 logger.warning("⚠️ 清理后仍包含XML标签,进行二次清理")
-                result = re.sub(r
-
+                result = re.sub(r"<[^>]+>", "", result)
+
             return result
-
+
         except Exception as e:
             logger.error(f"💥 清理文本失败: {str(e)}")
             return text
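
`_clean_extracted_text` is essentially a chain of `html.unescape` and `re.sub` passes. A condensed sketch of the same passes on a made-up sample string:

# Condensed sketch of the cleanup passes in _clean_extracted_text;
# the input string is invented for illustration.
import html
import re

def clean(text: str) -> str:
    text = html.unescape(text)                                     # decode &amp; etc.
    text = re.sub(r"<[^>]+>", "", text)                            # strip XML/HTML tags
    text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", text)  # drop control chars
    text = re.sub(r"[ \t]+", " ", text)                            # collapse spaces/tabs
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)                  # collapse blank lines
    return text.strip()

print(clean("<w:p>你好 &amp; 世界</w:p>\n\n\n\n<w:p>第二段</w:p>"))
# -> "你好 & 世界" followed by a blank line and "第二段"
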
@@ -366,25 +410,27 @@ class DocParser(BaseLife):
     def _combine_extracted_content(self, content_list: list) -> str:
         """合并提取到的各种内容"""
         combined = []
-
+
         # 按优先级排序内容
         priority_order = ["ole", "embedded", "converted", "fallback"]
-
+
         for content_type in priority_order:
             for item_type, content in content_list:
                 if item_type == content_type and content.strip():
                     combined.append(content.strip())
-
+
         # 添加其他未分类的内容
         for item_type, content in content_list:
             if item_type not in priority_order and content.strip():
                 combined.append(content.strip())
-
-        return
+
+        return "\n\n".join(combined) if combined else ""
 
     def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
         """将.doc文件转换为.txt文件"""
-        logger.info(
+        logger.info(
+            f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}"
+        )
 
         if self.use_uno:
             # 使用UNO API进行转换
@@ -412,7 +458,7 @@ class DocParser(BaseLife):
                     f" 1. 确保LibreOffice正确安装\n"
                     f" 2. 关闭所有LibreOffice进程\n"
                     f" 3. 检查文件权限和路径\n"
-                    f
+                    f' 4. 尝试手动运行: soffice --headless --convert-to txt "{doc_path}"'
                 )
                 logger.warning("⚠️ 自动回退到传统命令行方式...")
                 return self._doc_to_txt_subprocess(doc_path, dir_path)
@@ -435,13 +481,17 @@ class DocParser(BaseLife):
             if exit_code == 0:
                 logger.info(f"✅ DOC到TXT转换成功 - 退出码: {exit_code}")
                 if stdout:
-                    logger.debug(
+                    logger.debug(
+                        f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}"
+                    )
             else:
                 encoding = chardet.detect(stderr)["encoding"]
                 if encoding is None:
                     encoding = "utf-8"
                 error_msg = stderr.decode(encoding, errors="replace")
-                logger.error(
+                logger.error(
+                    f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}"
+                )
                 raise Exception(
                     f"Error Output (detected encoding: {encoding}): {error_msg}"
                 )
@@ -503,14 +553,16 @@ class DocParser(BaseLife):
             if comprehensive_content and comprehensive_content.strip():
                 # 检查内容质量
                 if self._check_content_quality(comprehensive_content):
-                    logger.info(
+                    logger.info(
+                        f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符"
+                    )
                     return comprehensive_content
                 else:
                     logger.warning("⚠️ 综合提取的内容质量不佳,尝试其他方式")
-
+
             # 降级到传统转换方式
             logger.info("🔄 使用传统转换方式")
-
+
             with tempfile.TemporaryDirectory() as temp_path:
                 logger.debug(f"📁 创建临时目录: {temp_path}")
 
@@ -544,23 +596,29 @@ class DocParser(BaseLife):
         """检查提取内容的质量"""
         if not content or len(content) < 50:
             return False
-
+
         # 计算乱码字符比例
         total_chars = len(content)
         # 可识别字符:ASCII、中文、日文、韩文、常用标点
-        recognizable = sum(
-
-
-
-
-
-
-
+        recognizable = sum(
+            1
+            for c in content
+            if (
+                c.isascii()
+                or "\u4e00" <= c <= "\u9fff"  # 中文
+                or "\u3040" <= c <= "\u30ff"  # 日文
+                or "\uac00" <= c <= "\ud7af"  # 韩文
+                or c in ',。!?;:""' "()【】《》、·…—\n\r\t "
+            )
+        )
+
         # 如果可识别字符占比低于70%,认为质量不佳
         if recognizable / total_chars < 0.7:
-            logger.warning(
+            logger.warning(
+                f"⚠️ 内容质量检查失败:可识别字符比例 {recognizable}/{total_chars} = {recognizable/total_chars:.2%}"
+            )
             return False
-
+
         return True
 
     def parse(self, file_path: str):
@@ -583,9 +641,17 @@ class DocParser(BaseLife):
 
         if file_size == 0:
             logger.warning(f"⚠️ 文件大小为0字节: {file_path}")
+        # 生命周期:Data Processing 开始
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            life_type=LifeType.DATA_PROCESSING,
+            usage_purpose="Documentation",
+        )
 
-
-
+        # 🏷️ 提取文件扩展名
+        extension = self.get_file_extension(file_path)
+        logger.debug(f"🏷️ 提取文件扩展名: {extension}")
 
         # 读取文件内容
         logger.info("📝 读取DOC文件内容")
@@ -599,6 +665,17 @@ class DocParser(BaseLife):
             else:
                 mk_content = content
                 logger.info("📝 保持原始文本格式")
+            # 3) 生命周期:Data Processed or Failed
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if mk_content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Documentation",
+            )
 
             logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
 
@@ -614,8 +691,10 @@ class DocParser(BaseLife):
             )
             logger.debug("⚙️ 生成lifecycle信息完成")
 
-            output_vo = MarkdownOutputVo(
-            output_vo.add_lifecycle(
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
+            # output_vo.add_lifecycle(lc_origin)
 
             result = output_vo.to_dict()
             logger.info(f"🏆 DOC文件解析完成: {file_path}")
@@ -630,7 +709,9 @@ class DocParser(BaseLife):
             logger.error(f"🔒 文件权限错误: {str(e)}")
             raise Exception(f"无权限访问文件: {file_path}")
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"💀 解析DOC文件失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}"
+            )
             raise
 
     def format_as_markdown(self, content: str) -> str:
@@ -657,10 +738,10 @@ class DocParser(BaseLife):
         """从WPS的WordDocument流中提取文本(使用更宽松的策略)"""
         try:
             text_parts = []
-
+
             # WPS文件可能使用不同的编码和结构
             # 尝试多种策略提取文本
-
+
             # 策略1:尝试找到连续的文本块
             # 查找看起来像文本的字节序列
             i = 0
@@ -668,18 +749,24 @@ class DocParser(BaseLife):
                 # 查找可能的文本开始位置
                 if i + 2 < len(data):
                     # 检查是否是Unicode文本(小端序)
-                    if data[i+1] == 0 and 32 <= data[i] <= 126:
+                    if data[i + 1] == 0 and 32 <= data[i] <= 126:
                         # 可能是ASCII字符的Unicode编码
                         text_block = bytearray()
                         j = i
-                        while
+                        while (
+                            j + 1 < len(data)
+                            and data[j + 1] == 0
+                            and 32 <= data[j] <= 126
+                        ):
                             text_block.append(data[j])
                             j += 2
                         if len(text_block) > 10:
-                            text_parts.append(
+                            text_parts.append(
+                                text_block.decode("ascii", errors="ignore")
+                            )
                         i = j
                     # 检查是否是UTF-8或GBK中文
-                    elif
+                    elif 0xE0 <= data[i] <= 0xEF or 0x81 <= data[i] <= 0xFE:
                         # 可能是多字节字符
                         text_block = bytearray()
                         j = i
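
The reflowed `while (...)` condition above scans for ASCII text stored as UTF-16-LE byte pairs (printable low byte, zero high byte), which is how Word/WPS binary streams often store Latin text. A tiny standalone illustration of that scan:

# Standalone illustration of the UTF-16-LE ASCII scan used on WPS streams:
# a printable low byte followed by a zero high byte is treated as text.
def extract_ascii_utf16le(data: bytes) -> str:
    parts, i = [], 0
    while i + 1 < len(data):
        if data[i + 1] == 0 and 32 <= data[i] <= 126:
            block = bytearray()
            while i + 1 < len(data) and data[i + 1] == 0 and 32 <= data[i] <= 126:
                block.append(data[i])
                i += 2
            if len(block) > 10:                 # ignore short accidental runs
                parts.append(block.decode("ascii", errors="ignore"))
        else:
            i += 1
    return "\n".join(parts)

print(extract_ascii_utf16le("Hello from a WPS WordDocument stream".encode("utf-16-le")))
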
@@ -690,9 +777,11 @@ class DocParser(BaseLife):
                             j += 1
                         if len(text_block) > 20:
                             # 尝试解码
-                            for encoding in [
+                            for encoding in ["utf-8", "gbk", "gb18030", "gb2312"]:
                                 try:
-                                    decoded = text_block.decode(
+                                    decoded = text_block.decode(
+                                        encoding, errors="ignore"
+                                    )
                                     if decoded and len(decoded.strip()) > 10:
                                         text_parts.append(decoded)
                                         break
@@ -703,15 +792,15 @@ class DocParser(BaseLife):
                         i += 1
             else:
                 i += 1
-
+
             # 合并文本部分
             if text_parts:
-                combined =
+                combined = "\n".join(text_parts)
                 return self._clean_extracted_text(combined)
-
+
             # 如果上述方法失败,回退到原始方法
             return self._parse_word_stream(data)
-
+
         except Exception as e:
             logger.error(f"💥 解析WPS流失败: {str(e)}")
             return ""