pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +84 -72
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/docx_parser.py
CHANGED
@@ -1,17 +1,18 @@
-
+ import html
  import os
+ import re
  import shutil
  import subprocess
  import tempfile
+ import zipfile
  from pathlib import Path
- from typing import
+ from typing import Optional, Union

  import chardet
  from loguru import logger
+
  from datamax.parser.base import BaseLife, MarkdownOutputVo
- import
- import re
- import html
+ from datamax.utils.lifecycle_types import LifeType

  # 尝试导入UNO处理器
  try:
@@ -35,7 +36,6 @@ except ImportError:
  )


-
  class DocxParser(BaseLife):
  def __init__(
  self,
@@ -66,7 +66,9 @@ class DocxParser(BaseLife):

  def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
  """将.docx文件转换为.txt文件"""
- logger.info(
+ logger.info(
+ f"🔄 开始转换DOCX文件为TXT - 源文件: {docx_path}, 输出目录: {dir_path}"
+ )

  if self.use_uno:
  # 使用UNO API进行转换
@@ -94,7 +96,7 @@ class DocxParser(BaseLife):
  f" 1. 确保LibreOffice正确安装\n"
  f" 2. 关闭所有LibreOffice进程\n"
  f" 3. 检查文件权限和路径\n"
- f
+ f' 4. 尝试手动运行: soffice --headless --convert-to txt "{docx_path}"'
  )
  logger.warning("⚠️ 自动回退到传统命令行方式...")
  return self._docx_to_txt_subprocess(docx_path, dir_path)
@@ -117,13 +119,17 @@ class DocxParser(BaseLife):
  if exit_code == 0:
  logger.info(f"✅ DOCX到TXT转换成功 - 退出码: {exit_code}")
  if stdout:
- logger.debug(
+ logger.debug(
+ f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}"
+ )
  else:
  encoding = chardet.detect(stderr)["encoding"]
  if encoding is None:
  encoding = "utf-8"
  error_msg = stderr.decode(encoding, errors="replace")
- logger.error(
+ logger.error(
+ f"❌ DOCX到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}"
+ )
  raise Exception(
  f"Error Output (detected encoding: {encoding}): {error_msg}"
  )
@@ -180,79 +186,93 @@ class DocxParser(BaseLife):
  支持多种DOCX内部格式和存储方式
  """
  logger.info(f"🔍 开始综合内容提取: {docx_path}")
-
+
  all_content = []
-
+
  try:
- with zipfile.ZipFile(docx_path,
+ with zipfile.ZipFile(docx_path, "r") as docx:
  # 1. 检查并提取altChunk内容 (HTML/MHT嵌入)
  altchunk_content = self._extract_altchunk_content_internal(docx)
  if altchunk_content:
  all_content.append(("altChunk", altchunk_content))
-
+
  # 2. 提取标准document.xml内容
  standard_content = self._extract_standard_document_content(docx)
  if standard_content:
  all_content.append(("standard", standard_content))
-
+
  # 3. 提取嵌入对象内容 (embeddings)
  embedded_content = self._extract_embedded_objects(docx)
  if embedded_content:
  all_content.append(("embedded", embedded_content))
-
+
  # 4. 提取头部和脚部内容
  header_footer_content = self._extract_headers_footers(docx)
  if header_footer_content:
  all_content.append(("header_footer", header_footer_content))
-
+
  # 5. 提取注释和批注
  comments_content = self._extract_comments(docx)
  if comments_content:
  all_content.append(("comments", comments_content))
-
+
  # 6. 提取文本框和图形对象中的文本
  textbox_content = self._extract_textbox_content(docx)
  if textbox_content:
  all_content.append(("textboxes", textbox_content))
-
+
  except Exception as e:
  logger.error(f"💥 综合内容提取失败: {str(e)}")
  return ""
-
+
  # 合并所有内容
  if all_content:
  combined_content = self._combine_extracted_content(all_content)
  logger.info(f"✅ 综合提取完成,总内容长度: {len(combined_content)} 字符")
  logger.debug(f"📊 提取到的内容类型: {[item[0] for item in all_content]}")
  return combined_content
-
+
  return ""

  def _extract_altchunk_content_internal(self, docx_zip: zipfile.ZipFile) -> str:
  """内部方法:提取altChunk内容,优先使用MHT方式"""
  try:
  # 检查document.xml中的altChunk引用
- if
- doc_xml = docx_zip.read(
-
+ if "word/document.xml" in docx_zip.namelist():
+ doc_xml = docx_zip.read("word/document.xml").decode(
+ "utf-8", errors="replace"
+ )
+ if "altChunk" in doc_xml:
  logger.info("🔍 检测到altChunk格式")
-
+
  # 优先查找MHT文件(更简洁的处理方式)
- mht_files = [
-
-
+ mht_files = [
+ f
+ for f in docx_zip.namelist()
+ if f.endswith(".mht") and "word/" in f
+ ]
+ html_files = [
+ f
+ for f in docx_zip.namelist()
+ if f.endswith(".html") and "word/" in f
+ ]
+
  # 优先处理MHT文件
  for filename in mht_files:
  logger.info(f"📄 优先处理MHT文件: {filename}")
- content = docx_zip.read(filename).decode(
+ content = docx_zip.read(filename).decode(
+ "utf-8", errors="replace"
+ )
  return self._extract_html_from_mht(content)
-
+
  # 如果没有MHT文件,再处理HTML文件
  for filename in html_files:
  logger.info(f"📄 处理HTML文件: {filename}")
- content = docx_zip.read(filename).decode(
+ content = docx_zip.read(filename).decode(
+ "utf-8", errors="replace"
+ )
  return self._html_to_clean_text(content)
-
+
  return ""
  except Exception as e:
  logger.error(f"💥 提取altChunk内容失败: {str(e)}")
@@ -261,20 +281,22 @@ class DocxParser(BaseLife):
  def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
  """提取标准document.xml内容 - 只提取纯文本"""
  try:
- if
- doc_xml = docx_zip.read(
-
+ if "word/document.xml" in docx_zip.namelist():
+ doc_xml = docx_zip.read("word/document.xml").decode(
+ "utf-8", errors="replace"
+ )
+
  # 解码XML实体
  doc_xml = html.unescape(doc_xml)
-
+
  # 提取所有<w:t>标签中的文本(包括各种命名空间前缀)
  # 使用更宽松的正则表达式来匹配任何命名空间前缀
- text_pattern = r
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, doc_xml)
-
+
  # 额外提取可能存在的无命名空间的<t>标签
- text_matches.extend(re.findall(r
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", doc_xml))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
@@ -282,29 +304,33 @@ class DocxParser(BaseLife):
  # 解码XML实体
  text = html.unescape(text)
  # 移除多余的空白字符,但保留单个空格
- text = re.sub(r
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  # 智能连接文本片段
- content =
+ content = ""
  for i, text in enumerate(cleaned_texts):
  if i == 0:
  content = text
  else:
  # 如果前一个文本片段不是以标点结束,且当前文本不是以大写开头,则不加空格
- prev_char = content[-1] if content else
- curr_char = text[0] if text else
-
- if
-
+ prev_char = content[-1] if content else ""
+ curr_char = text[0] if text else ""
+
+ if (
+ prev_char in ".!?。!?\n"
+ or curr_char.isupper()
+ or curr_char in ",。!?;:"
+ ):
+ content += " " + text
  else:
  content += text
-
+
  # 最终清理
- content = re.sub(r
+ content = re.sub(r"\s+", " ", content)
  content = content.strip()
-
+
  logger.info(f"📝 从document.xml提取纯文本: {len(content)} 字符")
  return content
  return ""
@@ -316,15 +342,15 @@ class DocxParser(BaseLife):
  """提取嵌入对象内容"""
  try:
  embedded_content = []
-
+
  # 查找嵌入的文档对象
  for filename in docx_zip.namelist():
- if
+ if "word/embeddings/" in filename:
  logger.info(f"📎 找到嵌入对象: {filename}")
  # 这里可以根据文件类型进一步处理
  # 例如:.docx, .xlsx, .txt等
-
- return
+
+ return " ".join(embedded_content) if embedded_content else ""
  except Exception as e:
  logger.error(f"💥 提取嵌入对象失败: {str(e)}")
  return ""
@@ -333,40 +359,44 @@ class DocxParser(BaseLife):
  """提取页眉页脚内容 - 只提取纯文本"""
  try:
  header_footer_content = []
-
+
  for filename in docx_zip.namelist():
- if (
+ if (
+ "word/header" in filename or "word/footer" in filename
+ ) and filename.endswith(".xml"):
  logger.debug(f"📄 处理页眉页脚: {filename}")
- content = docx_zip.read(filename).decode(
-
+ content = docx_zip.read(filename).decode("utf-8", errors="replace")
+
  # 解码XML实体
  content = html.unescape(content)
-
+
  # 提取文本内容 - 使用更宽松的模式
- text_pattern = r
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, content)
- text_matches.extend(re.findall(r
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", content))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
  # 合并文本片段
- header_footer_text =
- header_footer_text = re.sub(
+ header_footer_text = " ".join(cleaned_texts)
+ header_footer_text = re.sub(
+ r"\s+", " ", header_footer_text.strip()
+ )
  if header_footer_text:
  header_footer_content.append(header_footer_text)
-
+
  if header_footer_content:
  logger.info(f"📑 提取页眉页脚纯文本: {len(header_footer_content)} 个")
-
- return
+
+ return "\n".join(header_footer_content) if header_footer_content else ""
  except Exception as e:
  logger.error(f"💥 提取页眉页脚失败: {str(e)}")
  return ""
@@ -374,32 +404,34 @@ class DocxParser(BaseLife):
  def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
  """提取注释和批注内容 - 只提取纯文本"""
  try:
- if
- comments_xml = docx_zip.read(
-
+ if "word/comments.xml" in docx_zip.namelist():
+ comments_xml = docx_zip.read("word/comments.xml").decode(
+ "utf-8", errors="replace"
+ )
+
  # 解码XML实体
  comments_xml = html.unescape(comments_xml)
-
+
  # 提取注释文本 - 使用更宽松的模式
- text_pattern = r
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, comments_xml)
- text_matches.extend(re.findall(r
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", comments_xml))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
- comments_text =
- comments_text = re.sub(r
+ comments_text = " ".join(cleaned_texts)
+ comments_text = re.sub(r"\s+", " ", comments_text.strip())
  logger.info(f"💬 提取注释纯文本: {len(comments_text)} 字符")
  return comments_text
-
+
  return ""
  except Exception as e:
  logger.error(f"💥 提取注释失败: {str(e)}")
@@ -409,43 +441,47 @@ class DocxParser(BaseLife):
  """提取文本框和图形对象中的文本 - 只提取纯文本"""
  try:
  textbox_content = []
-
+
  # 查找可能包含文本框的文件
  for filename in docx_zip.namelist():
- if
- content = docx_zip.read(filename).decode(
-
+ if "word/" in filename and filename.endswith(".xml"):
+ content = docx_zip.read(filename).decode("utf-8", errors="replace")
+
  # 解码XML实体
  content = html.unescape(content)
-
+
  # 查找文本框内容 (w:txbxContent)
- textbox_matches = re.findall(
-
+ textbox_matches = re.findall(
+ r"<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>",
+ content,
+ re.DOTALL,
+ )
+
  for match in textbox_matches:
  # 从文本框内容中提取文本
- text_pattern = r
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, match)
- text_matches.extend(re.findall(r
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", match))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
- textbox_text =
- textbox_text = re.sub(r
+ textbox_text = " ".join(cleaned_texts)
+ textbox_text = re.sub(r"\s+", " ", textbox_text.strip())
  if textbox_text:
  textbox_content.append(textbox_text)
-
+
  if textbox_content:
  logger.info(f"📦 提取文本框纯文本: {len(textbox_content)} 个")
-
- return
+
+ return "\n".join(textbox_content) if textbox_content else ""
  except Exception as e:
  logger.error(f"💥 提取文本框内容失败: {str(e)}")
  return ""
@@ -453,17 +489,24 @@ class DocxParser(BaseLife):
  def _combine_extracted_content(self, content_list: list) -> str:
  """合并提取到的各种内容 - 输出清晰的纯文本"""
  combined = []
-
+
  # 按重要性排序内容
- priority_order = [
-
+ priority_order = [
+ "altChunk",
+ "standard",
+ "header_footer",
+ "textboxes",
+ "comments",
+ "embedded",
+ ]
+
  for content_type in priority_order:
  for item_type, content in content_list:
  if item_type == content_type and content.strip():
  # 清理内容中的多余空白
- cleaned_content = re.sub(r
- cleaned_content = re.sub(r
-
+ cleaned_content = re.sub(r"\s+", " ", content.strip())
+ cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
+
  if cleaned_content:
  # 根据内容类型添加简单的标记(仅在有多种内容类型时)
  if len([1 for t, c in content_list if c.strip()]) > 1:
@@ -477,77 +520,84 @@ class DocxParser(BaseLife):
  combined.append(cleaned_content)
  else:
  combined.append(cleaned_content)
-
+
  # 添加其他未分类的内容
  for item_type, content in content_list:
  if item_type not in priority_order and content.strip():
- cleaned_content = re.sub(r
- cleaned_content = re.sub(r
+ cleaned_content = re.sub(r"\s+", " ", content.strip())
+ cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
  if cleaned_content:
  combined.append(cleaned_content)
-
+
  # 合并所有内容,使用双换行分隔不同部分
- final_content =
-
+ final_content = "\n\n".join(combined) if combined else ""
+
  # 最终清理:确保没有过多的空行
- final_content = re.sub(r
+ final_content = re.sub(r"\n{3,}", "\n\n", final_content)
  final_content = final_content.strip()
-
+
  return final_content

  def _extract_html_from_mht(self, mht_content: str) -> str:
  """从MHT内容中提取HTML部分并转换为简洁文本"""
  try:
  # MHT文件使用MIME格式,寻找HTML部分
- lines = mht_content.split(
+ lines = mht_content.split("\n")
  in_html_section = False
  html_lines = []
  skip_headers = True
-
+
  for line in lines:
  # 检测HTML部分开始
- if
+ if "Content-Type: text/html" in line:
  in_html_section = True
  skip_headers = True
  continue
-
+
  # 在HTML部分中
  if in_html_section:
  # 跳过Content-*头部
- if
+ if (
+ skip_headers
+ and line.strip()
+ and not line.startswith("Content-")
+ ):
  skip_headers = False
-
+
  # 空行表示头部结束,内容开始
  if skip_headers and not line.strip():
  skip_headers = False
  continue
-
+
  # 检查是否到达下一个MIME部分
- if line.startswith(
+ if line.startswith("------=") and len(html_lines) > 0:
  # HTML部分结束
  break
-
+
  # 收集HTML内容
  if not skip_headers:
  html_lines.append(line)
-
+
  # 合并所有HTML行
- html_content =
-
+ html_content = "\n".join(html_lines)
+
  # 解码quoted-printable编码
- if
+ if "=3D" in html_content or "=\n" in html_content:
  try:
  import quopri
-
+
+ html_content = quopri.decodestring(html_content.encode()).decode(
+ "utf-8", errors="replace"
+ )
  logger.info("📧 解码quoted-printable编码")
  except Exception as e:
  logger.warning(f"⚠️ quoted-printable解码失败: {str(e)}")
-
+
  logger.debug(f"📄 提取的HTML内容长度: {len(html_content)} 字符")
-
+
  # 转换为简洁文本
  return self._html_to_clean_text(html_content)
-
+
  except Exception as e:
  logger.error(f"💥 从MHT提取HTML失败: {str(e)}")
  return ""
@@ -557,91 +607,114 @@ class DocxParser(BaseLife):
  try:
  # 首先解码HTML实体
  text = html.unescape(html_content)
-
+
  # 先尝试提取<body>标签内的所有内容
- body_match = re.search(
+ body_match = re.search(
+ r"<body[^>]*>(.*?)</body>", text, re.DOTALL | re.IGNORECASE
+ )
  if body_match:
  main_content = body_match.group(1)
  logger.info("📄 提取<body>标签内容")
  else:
  main_content = text
  logger.info("📄 使用全部内容(未找到body标签)")
-
+
  # 特殊处理<pre><code>标签,保持其内部的格式
  pre_code_blocks = []
+
  def preserve_pre_code(match):
  idx = len(pre_code_blocks)
  pre_code_blocks.append(match.group(1))
  return f"__PRE_CODE_{idx}__"
-
- main_content = re.sub(
-
-
+
+ main_content = re.sub(
+ r"<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>",
+ preserve_pre_code,
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+
  # 处理其他HTML结构
  # 1. 先转换需要保留换行的标签
- main_content = re.sub(r
- main_content = re.sub(r
- main_content = re.sub(r
- main_content = re.sub(r
- main_content = re.sub(r
- main_content = re.sub(
-
-
- main_content = re.sub(
-
-
- main_content = re.sub(r
-
+ main_content = re.sub(r"<br\s*/?>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</p>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<p[^>]*>", "", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</div>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<div[^>]*>", "", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(
+ r"</h[1-6]>", "\n\n", main_content, flags=re.IGNORECASE
+ )
+ main_content = re.sub(
+ r"<h[1-6][^>]*>", "", main_content, flags=re.IGNORECASE
+ )
+ main_content = re.sub(r"</li>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<li[^>]*>", "• ", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</tr>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</td>", " | ", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</th>", " | ", main_content, flags=re.IGNORECASE)
+
  # 2. 移除style和script标签及其内容
- main_content = re.sub(
-
-
+ main_content = re.sub(
+ r"<style[^>]*>.*?</style>",
+ "",
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+ main_content = re.sub(
+ r"<script[^>]*>.*?</script>",
+ "",
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+
  # 3. 移除所有剩余的HTML标签
- main_content = re.sub(r
-
+ main_content = re.sub(r"<[^>]+>", "", main_content)
+
  # 4. 解码HTML实体(第二次,确保完全解码)
  main_content = html.unescape(main_content)
-
+
  # 5. 恢复<pre><code>块的内容
  for idx, pre_code_content in enumerate(pre_code_blocks):
  # 清理pre_code内容
  cleaned_pre_code = html.unescape(pre_code_content)
- main_content = main_content.replace(
-
+ main_content = main_content.replace(
+ f"__PRE_CODE_{idx}__", cleaned_pre_code
+ )
+
  # 6. 清理多余的空白字符,但保持段落结构
- lines = main_content.split(
+ lines = main_content.split("\n")
  cleaned_lines = []
-
+
  for line in lines:
  # 清理每行的首尾空格
  line = line.strip()
  # 保留非空行
  if line:
  # 清理行内多余空格
- line = re.sub(r
+ line = re.sub(r"[ \t]+", " ", line)
  # 清理表格分隔符多余的空格
- line = re.sub(r
+ line = re.sub(r"\s*\|\s*", " | ", line)
  cleaned_lines.append(line)
  else:
  # 保留空行作为段落分隔
- if cleaned_lines and cleaned_lines[-1] !=
- cleaned_lines.append(
-
+ if cleaned_lines and cleaned_lines[-1] != "":
+ cleaned_lines.append("")
+
  # 7. 合并清理后的行
- main_content =
-
+ main_content = "\n".join(cleaned_lines)
+
  # 8. 最终清理:移除多余的空行
- main_content = re.sub(r
+ main_content = re.sub(r"\n{3,}", "\n\n", main_content)
  main_content = main_content.strip()
-
+
  logger.info(f"📝 HTML内容转换为简洁文本: {len(main_content)} 字符")
-
+
  return main_content
-
+
  except Exception as e:
  logger.error(f"💥 HTML转简洁文本失败: {str(e)}")
  # 如果转换失败,返回原始文本的基础清理版本
- return re.sub(r
+ return re.sub(r"<[^>]+>", "", html_content)

  def _html_to_text(self, html_content: str) -> str:
  """将HTML内容转换为纯文本(保留此方法用于其他HTML内容)"""
@@ -653,7 +726,7 @@ class DocxParser(BaseLife):
  提取包含altChunk的DOCX文件内容 (保持向后兼容)
  """
  try:
- with zipfile.ZipFile(docx_path,
+ with zipfile.ZipFile(docx_path, "r") as docx:
  return self._extract_altchunk_content_internal(docx)
  except Exception as e:
  logger.error(f"💥 提取altChunk内容失败: {str(e)}")
@@ -667,12 +740,14 @@ class DocxParser(BaseLife):
  # 首先尝试综合提取所有内容
  comprehensive_content = self.extract_all_content(docx_path)
  if comprehensive_content and comprehensive_content.strip():
- logger.info(
+ logger.info(
+ f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符"
+ )
  return comprehensive_content
-
+
  # 如果综合提取失败,使用传统转换方式
  logger.info("🔄 综合提取失败或内容为空,使用传统转换方式")
-
+
  with tempfile.TemporaryDirectory() as temp_path:
  logger.debug(f"📁 创建临时目录: {temp_path}")

@@ -685,7 +760,7 @@ class DocxParser(BaseLife):
  # 转换DOCX为TXT
  txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
  logger.info(f"🎯 DOCX转TXT完成: {txt_file_path}")
-
+
  # 读取TXT文件内容
  content = self.read_txt_file(txt_file_path)
  logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")
@@ -723,9 +798,16 @@ class DocxParser(BaseLife):
  if file_size == 0:
  logger.warning(f"⚠️ 文件大小为0字节: {file_path}")

-
-
-
+ # 🏷️ 提取文件扩展名
+ extension = self.get_file_extension(file_path)
+ logger.debug(f"🏷️ 提取文件扩展名: {extension}")
+ # 1) 处理开始:生成 DATA_PROCESSING 事件
+ lc_start = self.generate_lifecycle(
+ source_file=file_path,
+ domain="Technology",
+ life_type=LifeType.DATA_PROCESSING,
+ usage_purpose="Parsing",
+ )
  # 使用soffice转换为txt后读取内容
  logger.info("📝 使用soffice转换DOCX为TXT并读取内容")
  content = self.read_docx_file(docx_path=file_path)
@@ -745,16 +827,23 @@ class DocxParser(BaseLife):
  if not mk_content.strip():
  logger.warning(f"⚠️ 解析出的内容为空: {file_path}")

-
+ # 2) 处理结束:根据内容是否非空生成 DATA_PROCESSED 或 DATA_PROCESS_FAILED 事件
+ lc_end = self.generate_lifecycle(
  source_file=file_path,
  domain="Technology",
-
-
+ life_type=(
+ LifeType.DATA_PROCESSED
+ if mk_content.strip()
+ else LifeType.DATA_PROCESS_FAILED
+ ),
+ usage_purpose="Parsing",
  )
- logger.debug("⚙️
+ logger.debug("⚙️ 生成生命周期事件完成")

-
- output_vo
+ # 3) 封装输出并添加生命周期
+ output_vo = MarkdownOutputVo(extension, mk_content)
+ output_vo.add_lifecycle(lc_start)
+ output_vo.add_lifecycle(lc_end)

  result = output_vo.to_dict()
  logger.info(f"🏆 DOCX文件解析完成: {file_path}")