pydatamax 0.1.15.post2.tar.gz → 0.1.16.post1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/PKG-INFO +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/core.py +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/doc_parser.py +88 -30
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/docx_parser.py +178 -45
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/uno_handler.py +17 -6
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/PKG-INFO +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/setup.py +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/LICENSE +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/README.md +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/core.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/minio_handler.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/oss_handler.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/base.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/csv_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/epub_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/html_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/image_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/json_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/md_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/pdf_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/ppt_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/pptx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/txt_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/xls_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/xlsx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/constants.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/data_cleaner.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/env_setup.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/gotocr_pdf.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/mineru_operator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/paddleocr_pdf_operator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/ppt_extract.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/qa_generator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/tokenizer.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/SOURCES.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/dependency_links.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/requires.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/top_level.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/setup.cfg +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_doc_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_docx_format_analysis.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_docx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_wps_doc.py +0 -0
datamax/parser/core.py:

```diff
@@ -87,7 +87,7 @@ class ParserFactory:
             )
         elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
             return parser_class(
-                file_path=file_path, to_markdown=to_markdown
+                file_path=file_path, to_markdown=to_markdown, use_uno=True
             )
         elif parser_class_name == 'XlsxParser':
             return parser_class(
```
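This one-line change makes UNO-based conversion the default whenever the factory builds a Word-family parser. A hedged sketch of what the factory now constructs for a `.doc` input; the keyword arguments come from the hunk above, everything else about the constructor is an assumption:

```python
# Sketch only: DocParser's constructor is assumed to accept exactly the
# keyword arguments shown in the hunk above.
from datamax.parser.doc_parser import DocParser

# As of 0.1.16.post1 the factory passes use_uno=True, so conversion is routed
# through a persistent LibreOffice UNO service; if the uno module cannot be
# imported, the parser logs a warning and falls back to the command line.
parser = DocParser(file_path="report.doc", to_markdown=False, use_uno=True)
```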
datamax/parser/doc_parser.py:

```diff
@@ -26,6 +26,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If problems persist, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 class DocParser(BaseLife):
```
```diff
@@ -46,7 +61,11 @@ class DocParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(
+                logger.warning(
+                    f"⚠️ UNO is not available; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocParser initialized - using the legacy command-line approach")
 
```
```diff
@@ -265,52 +284,78 @@ class DocParser(BaseLife):
             return ""
 
     def _clean_extracted_text(self, text: str) -> str:
-        """
+        """Clean the extracted text: strip all XML tags and control characters, keeping plain text only"""
         try:
-            #
+            # 1. Decode HTML/XML entities
+            text = html.unescape(text)
+
+            # 2. Remove all XML/HTML tags
+            text = re.sub(r'<[^>]+>', '', text)
+
+            # 3. Remove XML namespace prefixes
+            text = re.sub(r'\b\w+:', '', text)
+
+            # 4. Remove NULL and other control characters
             text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
 
-            #
-
-            text = re.sub(r'
+            # 5. Remove special XML character sequences
+            text = re.sub(r'&[a-zA-Z]+;', '', text)
+            text = re.sub(r'&#\d+;', '', text)
+            text = re.sub(r'&#x[0-9a-fA-F]+;', '', text)
+
+            # 6. Keep meaningful characters and drop the rest
+            # Kept: Chinese, Japanese, Korean, Latin letters, digits, common punctuation and whitespace
+            allowed_chars = (
+                r'\w\s'  # letters, digits and whitespace
+                r'\u4e00-\u9fff'  # Chinese
+                r'\u3040-\u30ff'  # Japanese
+                r'\uac00-\ud7af'  # Korean
+                r',。!?;:""''()【】《》、·…—'  # Chinese punctuation
+                r'.,!?;:()[\]{}"\'`~@#$%^&*+=\-_/\\'  # Latin punctuation and common symbols
+            )
+
+            # Filter strictly, but keep every meaningful character
+            cleaned_text = ''.join(char for char in text if re.match(f'[{allowed_chars}]', char))
 
-            #
-
+            # 7. Collapse overly long runs of meaningless characters (usually binary junk)
+            cleaned_text = re.sub(r'([^\s\u4e00-\u9fff])\1{5,}', r'\1', cleaned_text)
 
-            #
-
-
+            # 8. Clean up redundant whitespace while preserving paragraph structure
+            cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)  # collapse runs of spaces/tabs into a single space
+            cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)  # collapse runs of blank lines into one blank line
+            cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)  # strip leading/trailing whitespace per line
 
-            #
-            lines =
+            # 9. Further cleanup: drop lines that are nothing but punctuation
+            lines = cleaned_text.split('\n')
             cleaned_lines = []
 
             for line in lines:
                 line = line.strip()
                 if line:
-                    #
-
-
+                    # Check whether the line is mostly meaningful content by
+                    # counting CJK characters, Latin letters and digits
+                    meaningful_chars = sum(1 for c in line if (
+                        c.isalnum() or '\u4e00' <= c <= '\u9fff'
+                    ))
 
-                    #
-                    if
+                    # Keep the line if meaningful characters exceed 30%, or if it is shorter than 5 characters (possibly a heading)
+                    if (len(line) < 5 or
+                            (meaningful_chars > 0 and meaningful_chars / len(line) > 0.3)):
                         cleaned_lines.append(line)
-                    elif cleaned_lines and cleaned_lines[-1]:
-                        cleaned_lines.append('')
+                    elif cleaned_lines and cleaned_lines[-1]:  # preserve paragraph separation
+                        cleaned_lines.append('')
 
             result = '\n'.join(cleaned_lines).strip()
 
-            #
-            if len(result) <
+            # 10. Final check
+            if len(result) < 10:
+                logger.warning("⚠️ Cleaned text is very short; something may be wrong")
                 return ""
 
-            #
-
-
-
-            # Try to keep only the ASCII and CJK parts
-            result = re.sub(r'[^\x00-\x7f\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—\s]+', ' ', result)
-            result = re.sub(r'\s+', ' ', result).strip()
+            # Check whether any XML tags remain
+            if re.search(r'<[^>]+>', result):
+                logger.warning("⚠️ XML tags remain after cleaning; running a second pass")
+                result = re.sub(r'<[^>]+>', '', result)
 
             return result
 
```
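The effect of the rewritten pipeline is easiest to see on a small sample. A self-contained sketch of steps 1, 2 and 4, using exactly the patterns from the hunk (the sample string is made up):

```python
import html
import re

sample = '<w:p><w:t>Hello &amp; welcome</w:t></w:p>\x00 &#169; 2024'

text = html.unescape(sample)                                    # step 1: '&amp;' -> '&', '&#169;' -> '©'
text = re.sub(r'<[^>]+>', '', text)                             # step 2: strip every XML/HTML tag
text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)   # step 4: drop control characters

print(text)  # 'Hello & welcome © 2024'
```

Note the ordering: entities are decoded first, so an escaped tag such as `&lt;w:t&gt;` becomes a literal tag in step 1 and is removed in step 2, and step 5 then sweeps up any entities that survive.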
```diff
@@ -355,7 +400,20 @@ class DocParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close all LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{doc_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._doc_to_txt_subprocess(doc_path, dir_path)
         else:
```
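`_doc_to_txt_subprocess`, the fallback target, lies outside this diff. A minimal sketch of what such a fallback plausibly looks like, built around the same `soffice --headless --convert-to txt` invocation the error message recommends (the function name and the timeout are assumptions):

```python
import subprocess
from pathlib import Path

def doc_to_txt_subprocess(doc_path: str, dir_path: str) -> str:
    """Hedged stand-in for the real _doc_to_txt_subprocess (not shown in this diff)."""
    subprocess.run(
        ["soffice", "--headless", "--convert-to", "txt", "--outdir", dir_path, doc_path],
        check=True,   # raise CalledProcessError on a non-zero exit
        timeout=60,   # assumed guard against a hung soffice process
    )
    # LibreOffice writes <input stem>.txt into the --outdir directory
    return str(Path(dir_path) / (Path(doc_path).stem + ".txt"))
```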
datamax/parser/docx_parser.py:

```diff
@@ -18,7 +18,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
-    logger.
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If problems persist, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 
```
```diff
@@ -40,7 +54,11 @@ class DocxParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(
+                logger.warning(
+                    f"⚠️ UNO is not available; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocxParser initialized - using the legacy command-line approach")
 
```
```diff
@@ -64,7 +82,20 @@ class DocxParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close all LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{docx_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._docx_to_txt_subprocess(docx_path, dir_path)
         else:
```
```diff
@@ -228,26 +259,54 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract the standard document.xml content"""
+        """Extract the standard document.xml content - plain text only"""
         try:
             if 'word/document.xml' in docx_zip.namelist():
                 doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
 
-                #
-
+                # Decode XML entities
+                doc_xml = html.unescape(doc_xml)
+
+                # Extract the text of every <w:t> tag (any namespace prefix)
+                # Use a looser regular expression that matches any namespace prefix
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, doc_xml)
 
-                #
-
-                doc_xml_clean = re.sub(r'w:', '', doc_xml_clean)
-                doc_xml_clean = re.sub(r'[a-zA-Z0-9]+:', '', doc_xml_clean)
+                # Also pick up <t> tags that carry no namespace prefix
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))
 
-                # Extract the text of every <t> tag
-                text_matches = re.findall(r'<t[^>]*>(.*?)</t>', doc_xml_clean, re.DOTALL)
                 if text_matches:
-
-
-
-
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        # Decode XML entities
+                        text = html.unescape(text)
+                        # Collapse extra whitespace but keep single spaces
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    # Join the text fragments intelligently
+                    content = ''
+                    for i, text in enumerate(cleaned_texts):
+                        if i == 0:
+                            content = text
+                        else:
+                            # Skip the space when the previous fragment does not end with punctuation and the current one does not start with an uppercase letter
+                            prev_char = content[-1] if content else ''
+                            curr_char = text[0] if text else ''
+
+                            if prev_char in '.!?。!?\n' or curr_char.isupper() or curr_char in ',。!?;:':
+                                content += ' ' + text
+                            else:
+                                content += text
+
+                    # Final cleanup
+                    content = re.sub(r'\s+', ' ', content)
+                    content = content.strip()
+
+                    logger.info(f"📝 Extracted plain text from document.xml: {len(content)} characters")
+                    return content
             return ""
         except Exception as e:
             logger.error(f"💥 Failed to extract standard document content: {str(e)}")
```
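The looser pattern is the substantive fix: the old code rewrote prefixes away with `re.sub(r'w:', '', ...)`, which also corrupted body text containing `w:`, whereas the new pattern matches whatever namespace prefix appears in the tag itself. A quick check of both patterns from the hunk against a contrived fragment:

```python
import re

doc_xml = (
    '<w:p><w:t>standard prefix</w:t></w:p>'
    '<w14:p><w14:t xml:space="preserve">other prefix</w14:t></w14:p>'
    '<t>no prefix</t>'
)

# Prefixed <ns:t> tags, any prefix
text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
text_matches = re.findall(text_pattern, doc_xml)
# Plus bare <t> tags without a namespace
text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))

print(text_matches)  # ['standard prefix', 'other prefix', 'no prefix']
```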
```diff
@@ -271,7 +330,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_headers_footers(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract header and footer content"""
+        """Extract header and footer content - plain text only"""
         try:
             header_footer_content = []
 
```
```diff
@@ -280,35 +339,66 @@ class DocxParser(BaseLife):
                     logger.debug(f"📄 Processing header/footer: {filename}")
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
-                    #
-
+                    # Decode XML entities
+                    content = html.unescape(content)
+
+                    # Extract the text content - use the looser pattern
+                    text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                    text_matches = re.findall(text_pattern, content)
+                    text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', content))
+
                     if text_matches:
-
-
-
-
+                        # Clean and combine the text
+                        cleaned_texts = []
+                        for text in text_matches:
+                            text = html.unescape(text)
+                            text = re.sub(r'\s+', ' ', text.strip())
+                            if text:
+                                cleaned_texts.append(text)
+
+                        if cleaned_texts:
+                            # Merge the text fragments
+                            header_footer_text = ' '.join(cleaned_texts)
+                            header_footer_text = re.sub(r'\s+', ' ', header_footer_text.strip())
+                            if header_footer_text:
+                                header_footer_content.append(header_footer_text)
 
             if header_footer_content:
-                logger.info(f"📑
+                logger.info(f"📑 Extracted plain text from headers/footers: {len(header_footer_content)} item(s)")
 
-            return '
+            return '\n'.join(header_footer_content) if header_footer_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract headers/footers: {str(e)}")
             return ""
 
     def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract comments and annotations"""
+        """Extract comments and annotations - plain text only"""
         try:
             if 'word/comments.xml' in docx_zip.namelist():
                 comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
 
-                #
-
+                # Decode XML entities
+                comments_xml = html.unescape(comments_xml)
+
+                # Extract the comment text - use the looser pattern
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, comments_xml)
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', comments_xml))
+
                 if text_matches:
-
-
-
-
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        text = html.unescape(text)
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    if cleaned_texts:
+                        comments_text = ' '.join(cleaned_texts)
+                        comments_text = re.sub(r'\s+', ' ', comments_text.strip())
+                        logger.info(f"💬 Extracted plain text from comments: {len(comments_text)} characters")
+                        return comments_text
 
             return ""
         except Exception as e:
```
```diff
@@ -316,7 +406,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_textbox_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract text from text boxes and drawing objects"""
+        """Extract text from text boxes and drawing objects - plain text only"""
         try:
             textbox_content = []
 
```
```diff
@@ -325,26 +415,43 @@ class DocxParser(BaseLife):
                 if 'word/' in filename and filename.endswith('.xml'):
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
+                    # Decode XML entities
+                    content = html.unescape(content)
+
                     # Look for text-box content (w:txbxContent)
-                    textbox_matches = re.findall(r'<
+                    textbox_matches = re.findall(r'<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>', content, re.DOTALL)
+
                     for match in textbox_matches:
-
+                        # Extract the text inside the text box
+                        text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                        text_matches = re.findall(text_pattern, match)
+                        text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', match))
+
                         if text_matches:
-
-
-
-
+                            # Clean and combine the text
+                            cleaned_texts = []
+                            for text in text_matches:
+                                text = html.unescape(text)
+                                text = re.sub(r'\s+', ' ', text.strip())
+                                if text:
+                                    cleaned_texts.append(text)
+
+                            if cleaned_texts:
+                                textbox_text = ' '.join(cleaned_texts)
+                                textbox_text = re.sub(r'\s+', ' ', textbox_text.strip())
+                                if textbox_text:
+                                    textbox_content.append(textbox_text)
 
             if textbox_content:
-                logger.info(f"📦
+                logger.info(f"📦 Extracted plain text from text boxes: {len(textbox_content)} item(s)")
 
-            return '
+            return '\n'.join(textbox_content) if textbox_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract text-box content: {str(e)}")
             return ""
 
     def _combine_extracted_content(self, content_list: list) -> str:
-        """Combine all extracted content"""
+        """Combine all extracted content - output clean plain text"""
         combined = []
 
         # Order the content by importance
```
```diff
@@ -353,14 +460,40 @@ class DocxParser(BaseLife):
         for content_type in priority_order:
             for item_type, content in content_list:
                 if item_type == content_type and content.strip():
-
+                    # Strip redundant whitespace from the content
+                    cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                    cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+
+                    if cleaned_content:
+                        # Add a simple label per content type (only when several types are present)
+                        if len([1 for t, c in content_list if c.strip()]) > 1:
+                            if item_type == "header_footer":
+                                combined.append(f"[Header/Footer]\n{cleaned_content}")
+                            elif item_type == "comments":
+                                combined.append(f"[Comments]\n{cleaned_content}")
+                            elif item_type == "textboxes":
+                                combined.append(f"[Text Box]\n{cleaned_content}")
+                            else:
+                                combined.append(cleaned_content)
+                        else:
+                            combined.append(cleaned_content)
 
         # Append any remaining uncategorized content
         for item_type, content in content_list:
             if item_type not in priority_order and content.strip():
-
+                cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+                if cleaned_content:
+                    combined.append(cleaned_content)
+
+        # Join everything, separating the sections with blank lines
+        final_content = '\n\n'.join(combined) if combined else ""
+
+        # Final cleanup: make sure there are no runs of excess blank lines
+        final_content = re.sub(r'\n{3,}', '\n\n', final_content)
+        final_content = final_content.strip()
 
-        return
+        return final_content
 
     def _extract_html_from_mht(self, mht_content: str) -> str:
         """Extract the HTML part from MHT content and convert it to clean text"""
```
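`priority_order` itself is defined outside this hunk, so only the item-type strings visible in the branches above are certain. A trimmed sketch of the labelling behaviour (it drops the priority sort and the only-label-when-multiple-types check, and the `"main_content"` type name is an assumption):

```python
import re

content_list = [
    ("main_content", "Body text of the  document."),   # assumed type name
    ("header_footer", "Confidential\tdraft"),
    ("comments", "Reviewer: re-check section 2."),
]

combined = []
for item_type, content in content_list:
    cleaned_content = re.sub(r'\s+', ' ', content.strip())
    if item_type == "header_footer":
        combined.append(f"[Header/Footer]\n{cleaned_content}")
    elif item_type == "comments":
        combined.append(f"[Comments]\n{cleaned_content}")
    else:
        combined.append(cleaned_content)

print('\n\n'.join(combined))
# Body text of the document.
#
# [Header/Footer]
# Confidential draft
#
# [Comments]
# Reviewer: re-check section 2.
```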
datamax/utils/uno_handler.py:

```diff
@@ -125,12 +125,23 @@ class UnoManager:
                 cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
             )
             logger.info(f"⏳ Waiting for the LibreOffice service to start...")
-
-
-
-
-
-
+
+            # Smart wait: poll the service status, giving machines of different speeds elastic time
+            start_time = time.time()
+            check_interval = 1  # check once per second
+            max_wait_time = 30  # wait at most 30 seconds
+
+            while time.time() - start_time < max_wait_time:
+                if self._check_soffice_running():
+                    elapsed = time.time() - start_time
+                    logger.info(f"✅ LibreOffice service started (took {elapsed:.1f}s)")
+                    return
+
+                logger.debug(f"🔄 Service not ready yet, still waiting... ({time.time() - start_time:.1f}s elapsed)")
+                time.sleep(check_interval)
+
+            # Still not up after the timeout
+            raise Exception(f"LibreOffice service startup timed out (waited {max_wait_time}s)")
 
         except Exception as e:
             logger.error(f"❌ Failed to start the LibreOffice service: {str(e)}")
```