pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +84 -72
  31. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/docx_parser.py

@@ -1,17 +1,18 @@
- from loguru import logger
+ import html
  import os
+ import re
  import shutil
  import subprocess
  import tempfile
+ import zipfile
  from pathlib import Path
- from typing import Union, Optional
+ from typing import Optional, Union

  import chardet
  from loguru import logger
+
  from datamax.parser.base import BaseLife, MarkdownOutputVo
- import zipfile
- import re
- import html
+ from datamax.utils.lifecycle_types import LifeType

  # 尝试导入UNO处理器
  try:
@@ -35,7 +36,6 @@ except ImportError:
  )


-
  class DocxParser(BaseLife):
  def __init__(
  self,
@@ -66,7 +66,9 @@ class DocxParser(BaseLife):

  def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
  """将.docx文件转换为.txt文件"""
- logger.info(f"🔄 开始转换DOCX文件为TXT - 源文件: {docx_path}, 输出目录: {dir_path}")
+ logger.info(
+ f"🔄 开始转换DOCX文件为TXT - 源文件: {docx_path}, 输出目录: {dir_path}"
+ )

  if self.use_uno:
  # 使用UNO API进行转换
@@ -94,7 +96,7 @@ class DocxParser(BaseLife):
  f" 1. 确保LibreOffice正确安装\n"
  f" 2. 关闭所有LibreOffice进程\n"
  f" 3. 检查文件权限和路径\n"
- f" 4. 尝试手动运行: soffice --headless --convert-to txt \"{docx_path}\""
+ f' 4. 尝试手动运行: soffice --headless --convert-to txt "{docx_path}"'
  )
  logger.warning("⚠️ 自动回退到传统命令行方式...")
  return self._docx_to_txt_subprocess(docx_path, dir_path)
@@ -117,13 +119,17 @@ class DocxParser(BaseLife):
  if exit_code == 0:
  logger.info(f"✅ DOCX到TXT转换成功 - 退出码: {exit_code}")
  if stdout:
- logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
+ logger.debug(
+ f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}"
+ )
  else:
  encoding = chardet.detect(stderr)["encoding"]
  if encoding is None:
  encoding = "utf-8"
  error_msg = stderr.decode(encoding, errors="replace")
- logger.error(f"❌ DOCX到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
+ logger.error(
+ f"❌ DOCX到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}"
+ )
  raise Exception(
  f"Error Output (detected encoding: {encoding}): {error_msg}"
  )
@@ -180,79 +186,93 @@ class DocxParser(BaseLife):
  支持多种DOCX内部格式和存储方式
  """
  logger.info(f"🔍 开始综合内容提取: {docx_path}")
-
+
  all_content = []
-
+
  try:
- with zipfile.ZipFile(docx_path, 'r') as docx:
+ with zipfile.ZipFile(docx_path, "r") as docx:
  # 1. 检查并提取altChunk内容 (HTML/MHT嵌入)
  altchunk_content = self._extract_altchunk_content_internal(docx)
  if altchunk_content:
  all_content.append(("altChunk", altchunk_content))
-
+
  # 2. 提取标准document.xml内容
  standard_content = self._extract_standard_document_content(docx)
  if standard_content:
  all_content.append(("standard", standard_content))
-
+
  # 3. 提取嵌入对象内容 (embeddings)
  embedded_content = self._extract_embedded_objects(docx)
  if embedded_content:
  all_content.append(("embedded", embedded_content))
-
+
  # 4. 提取头部和脚部内容
  header_footer_content = self._extract_headers_footers(docx)
  if header_footer_content:
  all_content.append(("header_footer", header_footer_content))
-
+
  # 5. 提取注释和批注
  comments_content = self._extract_comments(docx)
  if comments_content:
  all_content.append(("comments", comments_content))
-
+
  # 6. 提取文本框和图形对象中的文本
  textbox_content = self._extract_textbox_content(docx)
  if textbox_content:
  all_content.append(("textboxes", textbox_content))
-
+
  except Exception as e:
  logger.error(f"💥 综合内容提取失败: {str(e)}")
  return ""
-
+
  # 合并所有内容
  if all_content:
  combined_content = self._combine_extracted_content(all_content)
  logger.info(f"✅ 综合提取完成,总内容长度: {len(combined_content)} 字符")
  logger.debug(f"📊 提取到的内容类型: {[item[0] for item in all_content]}")
  return combined_content
-
+
  return ""

  def _extract_altchunk_content_internal(self, docx_zip: zipfile.ZipFile) -> str:
  """内部方法:提取altChunk内容,优先使用MHT方式"""
  try:
  # 检查document.xml中的altChunk引用
- if 'word/document.xml' in docx_zip.namelist():
- doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
- if 'altChunk' in doc_xml:
+ if "word/document.xml" in docx_zip.namelist():
+ doc_xml = docx_zip.read("word/document.xml").decode(
+ "utf-8", errors="replace"
+ )
+ if "altChunk" in doc_xml:
  logger.info("🔍 检测到altChunk格式")
-
+
  # 优先查找MHT文件(更简洁的处理方式)
- mht_files = [f for f in docx_zip.namelist() if f.endswith('.mht') and 'word/' in f]
- html_files = [f for f in docx_zip.namelist() if f.endswith('.html') and 'word/' in f]
-
+ mht_files = [
+ f
+ for f in docx_zip.namelist()
+ if f.endswith(".mht") and "word/" in f
+ ]
+ html_files = [
+ f
+ for f in docx_zip.namelist()
+ if f.endswith(".html") and "word/" in f
+ ]
+
  # 优先处理MHT文件
  for filename in mht_files:
  logger.info(f"📄 优先处理MHT文件: {filename}")
- content = docx_zip.read(filename).decode('utf-8', errors='replace')
+ content = docx_zip.read(filename).decode(
+ "utf-8", errors="replace"
+ )
  return self._extract_html_from_mht(content)
-
+
  # 如果没有MHT文件,再处理HTML文件
  for filename in html_files:
  logger.info(f"📄 处理HTML文件: {filename}")
- content = docx_zip.read(filename).decode('utf-8', errors='replace')
+ content = docx_zip.read(filename).decode(
+ "utf-8", errors="replace"
+ )
  return self._html_to_clean_text(content)
-
+
  return ""
  except Exception as e:
  logger.error(f"💥 提取altChunk内容失败: {str(e)}")
@@ -261,20 +281,22 @@ class DocxParser(BaseLife):
  def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
  """提取标准document.xml内容 - 只提取纯文本"""
  try:
- if 'word/document.xml' in docx_zip.namelist():
- doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
-
+ if "word/document.xml" in docx_zip.namelist():
+ doc_xml = docx_zip.read("word/document.xml").decode(
+ "utf-8", errors="replace"
+ )
+
  # 解码XML实体
  doc_xml = html.unescape(doc_xml)
-
+
  # 提取所有<w:t>标签中的文本(包括各种命名空间前缀)
  # 使用更宽松的正则表达式来匹配任何命名空间前缀
- text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, doc_xml)
-
+
  # 额外提取可能存在的无命名空间的<t>标签
- text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", doc_xml))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
@@ -282,29 +304,33 @@ class DocxParser(BaseLife):
  # 解码XML实体
  text = html.unescape(text)
  # 移除多余的空白字符,但保留单个空格
- text = re.sub(r'\s+', ' ', text.strip())
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  # 智能连接文本片段
- content = ''
+ content = ""
  for i, text in enumerate(cleaned_texts):
  if i == 0:
  content = text
  else:
  # 如果前一个文本片段不是以标点结束,且当前文本不是以大写开头,则不加空格
- prev_char = content[-1] if content else ''
- curr_char = text[0] if text else ''
-
- if prev_char in '.!?。!?\n' or curr_char.isupper() or curr_char in ',。!?;:':
- content += ' ' + text
+ prev_char = content[-1] if content else ""
+ curr_char = text[0] if text else ""
+
+ if (
+ prev_char in ".!?。!?\n"
+ or curr_char.isupper()
+ or curr_char in ",。!?;:"
+ ):
+ content += " " + text
  else:
  content += text
-
+
  # 最终清理
- content = re.sub(r'\s+', ' ', content)
+ content = re.sub(r"\s+", " ", content)
  content = content.strip()
-
+
  logger.info(f"📝 从document.xml提取纯文本: {len(content)} 字符")
  return content
  return ""
@@ -316,15 +342,15 @@ class DocxParser(BaseLife):
  """提取嵌入对象内容"""
  try:
  embedded_content = []
-
+
  # 查找嵌入的文档对象
  for filename in docx_zip.namelist():
- if 'word/embeddings/' in filename:
+ if "word/embeddings/" in filename:
  logger.info(f"📎 找到嵌入对象: {filename}")
  # 这里可以根据文件类型进一步处理
  # 例如:.docx, .xlsx, .txt等
-
- return ' '.join(embedded_content) if embedded_content else ""
+
+ return " ".join(embedded_content) if embedded_content else ""
  except Exception as e:
  logger.error(f"💥 提取嵌入对象失败: {str(e)}")
  return ""
@@ -333,40 +359,44 @@ class DocxParser(BaseLife):
  """提取页眉页脚内容 - 只提取纯文本"""
  try:
  header_footer_content = []
-
+
  for filename in docx_zip.namelist():
- if ('word/header' in filename or 'word/footer' in filename) and filename.endswith('.xml'):
+ if (
+ "word/header" in filename or "word/footer" in filename
+ ) and filename.endswith(".xml"):
  logger.debug(f"📄 处理页眉页脚: {filename}")
- content = docx_zip.read(filename).decode('utf-8', errors='replace')
-
+ content = docx_zip.read(filename).decode("utf-8", errors="replace")
+
  # 解码XML实体
  content = html.unescape(content)
-
+
  # 提取文本内容 - 使用更宽松的模式
- text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, content)
- text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', content))
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", content))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r'\s+', ' ', text.strip())
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
  # 合并文本片段
- header_footer_text = ' '.join(cleaned_texts)
- header_footer_text = re.sub(r'\s+', ' ', header_footer_text.strip())
+ header_footer_text = " ".join(cleaned_texts)
+ header_footer_text = re.sub(
+ r"\s+", " ", header_footer_text.strip()
+ )
  if header_footer_text:
  header_footer_content.append(header_footer_text)
-
+
  if header_footer_content:
  logger.info(f"📑 提取页眉页脚纯文本: {len(header_footer_content)} 个")
-
- return '\n'.join(header_footer_content) if header_footer_content else ""
+
+ return "\n".join(header_footer_content) if header_footer_content else ""
  except Exception as e:
  logger.error(f"💥 提取页眉页脚失败: {str(e)}")
  return ""
@@ -374,32 +404,34 @@ class DocxParser(BaseLife):
  def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
  """提取注释和批注内容 - 只提取纯文本"""
  try:
- if 'word/comments.xml' in docx_zip.namelist():
- comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
-
+ if "word/comments.xml" in docx_zip.namelist():
+ comments_xml = docx_zip.read("word/comments.xml").decode(
+ "utf-8", errors="replace"
+ )
+
  # 解码XML实体
  comments_xml = html.unescape(comments_xml)
-
+
  # 提取注释文本 - 使用更宽松的模式
- text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, comments_xml)
- text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', comments_xml))
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", comments_xml))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r'\s+', ' ', text.strip())
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
- comments_text = ' '.join(cleaned_texts)
- comments_text = re.sub(r'\s+', ' ', comments_text.strip())
+ comments_text = " ".join(cleaned_texts)
+ comments_text = re.sub(r"\s+", " ", comments_text.strip())
  logger.info(f"💬 提取注释纯文本: {len(comments_text)} 字符")
  return comments_text
-
+
  return ""
  except Exception as e:
  logger.error(f"💥 提取注释失败: {str(e)}")
@@ -409,43 +441,47 @@ class DocxParser(BaseLife):
  """提取文本框和图形对象中的文本 - 只提取纯文本"""
  try:
  textbox_content = []
-
+
  # 查找可能包含文本框的文件
  for filename in docx_zip.namelist():
- if 'word/' in filename and filename.endswith('.xml'):
- content = docx_zip.read(filename).decode('utf-8', errors='replace')
-
+ if "word/" in filename and filename.endswith(".xml"):
+ content = docx_zip.read(filename).decode("utf-8", errors="replace")
+
  # 解码XML实体
  content = html.unescape(content)
-
+
  # 查找文本框内容 (w:txbxContent)
- textbox_matches = re.findall(r'<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>', content, re.DOTALL)
-
+ textbox_matches = re.findall(
+ r"<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>",
+ content,
+ re.DOTALL,
+ )
+
  for match in textbox_matches:
  # 从文本框内容中提取文本
- text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+ text_pattern = r"<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>"
  text_matches = re.findall(text_pattern, match)
- text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', match))
-
+ text_matches.extend(re.findall(r"<t[^>]*>([^<]*)</t>", match))
+
  if text_matches:
  # 清理和组合文本
  cleaned_texts = []
  for text in text_matches:
  text = html.unescape(text)
- text = re.sub(r'\s+', ' ', text.strip())
+ text = re.sub(r"\s+", " ", text.strip())
  if text:
  cleaned_texts.append(text)
-
+
  if cleaned_texts:
- textbox_text = ' '.join(cleaned_texts)
- textbox_text = re.sub(r'\s+', ' ', textbox_text.strip())
+ textbox_text = " ".join(cleaned_texts)
+ textbox_text = re.sub(r"\s+", " ", textbox_text.strip())
  if textbox_text:
  textbox_content.append(textbox_text)
-
+
  if textbox_content:
  logger.info(f"📦 提取文本框纯文本: {len(textbox_content)} 个")
-
- return '\n'.join(textbox_content) if textbox_content else ""
+
+ return "\n".join(textbox_content) if textbox_content else ""
  except Exception as e:
  logger.error(f"💥 提取文本框内容失败: {str(e)}")
  return ""
@@ -453,17 +489,24 @@ class DocxParser(BaseLife):
  def _combine_extracted_content(self, content_list: list) -> str:
  """合并提取到的各种内容 - 输出清晰的纯文本"""
  combined = []
-
+
  # 按重要性排序内容
- priority_order = ["altChunk", "standard", "header_footer", "textboxes", "comments", "embedded"]
-
+ priority_order = [
+ "altChunk",
+ "standard",
+ "header_footer",
+ "textboxes",
+ "comments",
+ "embedded",
+ ]
+
  for content_type in priority_order:
  for item_type, content in content_list:
  if item_type == content_type and content.strip():
  # 清理内容中的多余空白
- cleaned_content = re.sub(r'\s+', ' ', content.strip())
- cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
-
+ cleaned_content = re.sub(r"\s+", " ", content.strip())
+ cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
+
  if cleaned_content:
  # 根据内容类型添加简单的标记(仅在有多种内容类型时)
  if len([1 for t, c in content_list if c.strip()]) > 1:
@@ -477,77 +520,84 @@ class DocxParser(BaseLife):
  combined.append(cleaned_content)
  else:
  combined.append(cleaned_content)
-
+
  # 添加其他未分类的内容
  for item_type, content in content_list:
  if item_type not in priority_order and content.strip():
- cleaned_content = re.sub(r'\s+', ' ', content.strip())
- cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+ cleaned_content = re.sub(r"\s+", " ", content.strip())
+ cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content)
  if cleaned_content:
  combined.append(cleaned_content)
-
+
  # 合并所有内容,使用双换行分隔不同部分
- final_content = '\n\n'.join(combined) if combined else ""
-
+ final_content = "\n\n".join(combined) if combined else ""
+
  # 最终清理:确保没有过多的空行
- final_content = re.sub(r'\n{3,}', '\n\n', final_content)
+ final_content = re.sub(r"\n{3,}", "\n\n", final_content)
  final_content = final_content.strip()
-
+
  return final_content

  def _extract_html_from_mht(self, mht_content: str) -> str:
  """从MHT内容中提取HTML部分并转换为简洁文本"""
  try:
  # MHT文件使用MIME格式,寻找HTML部分
- lines = mht_content.split('\n')
+ lines = mht_content.split("\n")
  in_html_section = False
  html_lines = []
  skip_headers = True
-
+
  for line in lines:
  # 检测HTML部分开始
- if 'Content-Type: text/html' in line:
+ if "Content-Type: text/html" in line:
  in_html_section = True
  skip_headers = True
  continue
-
+
  # 在HTML部分中
  if in_html_section:
  # 跳过Content-*头部
- if skip_headers and line.strip() and not line.startswith('Content-'):
+ if (
+ skip_headers
+ and line.strip()
+ and not line.startswith("Content-")
+ ):
  skip_headers = False
-
+
  # 空行表示头部结束,内容开始
  if skip_headers and not line.strip():
  skip_headers = False
  continue
-
+
  # 检查是否到达下一个MIME部分
- if line.startswith('------=') and len(html_lines) > 0:
+ if line.startswith("------=") and len(html_lines) > 0:
  # HTML部分结束
  break
-
+
  # 收集HTML内容
  if not skip_headers:
  html_lines.append(line)
-
+
  # 合并所有HTML行
- html_content = '\n'.join(html_lines)
-
+ html_content = "\n".join(html_lines)
+
  # 解码quoted-printable编码
- if '=3D' in html_content or '=\n' in html_content:
+ if "=3D" in html_content or "=\n" in html_content:
  try:
  import quopri
- html_content = quopri.decodestring(html_content.encode()).decode('utf-8', errors='replace')
+
+ html_content = quopri.decodestring(html_content.encode()).decode(
+ "utf-8", errors="replace"
+ )
  logger.info("📧 解码quoted-printable编码")
  except Exception as e:
  logger.warning(f"⚠️ quoted-printable解码失败: {str(e)}")
-
+
  logger.debug(f"📄 提取的HTML内容长度: {len(html_content)} 字符")
-
+
  # 转换为简洁文本
  return self._html_to_clean_text(html_content)
-
+
  except Exception as e:
  logger.error(f"💥 从MHT提取HTML失败: {str(e)}")
  return ""
@@ -557,91 +607,114 @@ class DocxParser(BaseLife):
  try:
  # 首先解码HTML实体
  text = html.unescape(html_content)
-
+
  # 先尝试提取<body>标签内的所有内容
- body_match = re.search(r'<body[^>]*>(.*?)</body>', text, re.DOTALL | re.IGNORECASE)
+ body_match = re.search(
+ r"<body[^>]*>(.*?)</body>", text, re.DOTALL | re.IGNORECASE
+ )
  if body_match:
  main_content = body_match.group(1)
  logger.info("📄 提取<body>标签内容")
  else:
  main_content = text
  logger.info("📄 使用全部内容(未找到body标签)")
-
+
  # 特殊处理<pre><code>标签,保持其内部的格式
  pre_code_blocks = []
+
  def preserve_pre_code(match):
  idx = len(pre_code_blocks)
  pre_code_blocks.append(match.group(1))
  return f"__PRE_CODE_{idx}__"
-
- main_content = re.sub(r'<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
- preserve_pre_code, main_content, flags=re.DOTALL | re.IGNORECASE)
-
+
+ main_content = re.sub(
+ r"<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>",
+ preserve_pre_code,
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+
  # 处理其他HTML结构
  # 1. 先转换需要保留换行的标签
- main_content = re.sub(r'<br\s*/?>', '\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</p>', '\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'<p[^>]*>', '', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</div>', '\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'<div[^>]*>', '', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</h[1-6]>', '\n\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'<h[1-6][^>]*>', '', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</li>', '\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'<li[^>]*>', '• ', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</tr>', '\n', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</td>', ' | ', main_content, flags=re.IGNORECASE)
- main_content = re.sub(r'</th>', ' | ', main_content, flags=re.IGNORECASE)
-
+ main_content = re.sub(r"<br\s*/?>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</p>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<p[^>]*>", "", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</div>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<div[^>]*>", "", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(
+ r"</h[1-6]>", "\n\n", main_content, flags=re.IGNORECASE
+ )
+ main_content = re.sub(
+ r"<h[1-6][^>]*>", "", main_content, flags=re.IGNORECASE
+ )
+ main_content = re.sub(r"</li>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"<li[^>]*>", "• ", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</tr>", "\n", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</td>", " | ", main_content, flags=re.IGNORECASE)
+ main_content = re.sub(r"</th>", " | ", main_content, flags=re.IGNORECASE)
+
  # 2. 移除style和script标签及其内容
- main_content = re.sub(r'<style[^>]*>.*?</style>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
- main_content = re.sub(r'<script[^>]*>.*?</script>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
-
+ main_content = re.sub(
+ r"<style[^>]*>.*?</style>",
+ "",
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+ main_content = re.sub(
+ r"<script[^>]*>.*?</script>",
+ "",
+ main_content,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+
  # 3. 移除所有剩余的HTML标签
- main_content = re.sub(r'<[^>]+>', '', main_content)
-
+ main_content = re.sub(r"<[^>]+>", "", main_content)
+
  # 4. 解码HTML实体(第二次,确保完全解码)
  main_content = html.unescape(main_content)
-
+
  # 5. 恢复<pre><code>块的内容
  for idx, pre_code_content in enumerate(pre_code_blocks):
  # 清理pre_code内容
  cleaned_pre_code = html.unescape(pre_code_content)
- main_content = main_content.replace(f"__PRE_CODE_{idx}__", cleaned_pre_code)
-
+ main_content = main_content.replace(
+ f"__PRE_CODE_{idx}__", cleaned_pre_code
+ )
+
  # 6. 清理多余的空白字符,但保持段落结构
- lines = main_content.split('\n')
+ lines = main_content.split("\n")
  cleaned_lines = []
-
+
  for line in lines:
  # 清理每行的首尾空格
  line = line.strip()
  # 保留非空行
  if line:
  # 清理行内多余空格
- line = re.sub(r'[ \t]+', ' ', line)
+ line = re.sub(r"[ \t]+", " ", line)
  # 清理表格分隔符多余的空格
- line = re.sub(r'\s*\|\s*', ' | ', line)
+ line = re.sub(r"\s*\|\s*", " | ", line)
  cleaned_lines.append(line)
  else:
  # 保留空行作为段落分隔
- if cleaned_lines and cleaned_lines[-1] != '':
- cleaned_lines.append('')
-
+ if cleaned_lines and cleaned_lines[-1] != "":
+ cleaned_lines.append("")
+
  # 7. 合并清理后的行
- main_content = '\n'.join(cleaned_lines)
-
+ main_content = "\n".join(cleaned_lines)
+
  # 8. 最终清理:移除多余的空行
- main_content = re.sub(r'\n{3,}', '\n\n', main_content)
+ main_content = re.sub(r"\n{3,}", "\n\n", main_content)
  main_content = main_content.strip()
-
+
  logger.info(f"📝 HTML内容转换为简洁文本: {len(main_content)} 字符")
-
+
  return main_content
-
+
  except Exception as e:
  logger.error(f"💥 HTML转简洁文本失败: {str(e)}")
  # 如果转换失败,返回原始文本的基础清理版本
- return re.sub(r'<[^>]+>', '', html_content)
+ return re.sub(r"<[^>]+>", "", html_content)

  def _html_to_text(self, html_content: str) -> str:
  """将HTML内容转换为纯文本(保留此方法用于其他HTML内容)"""
@@ -653,7 +726,7 @@ class DocxParser(BaseLife):
  提取包含altChunk的DOCX文件内容 (保持向后兼容)
  """
  try:
- with zipfile.ZipFile(docx_path, 'r') as docx:
+ with zipfile.ZipFile(docx_path, "r") as docx:
  return self._extract_altchunk_content_internal(docx)
  except Exception as e:
  logger.error(f"💥 提取altChunk内容失败: {str(e)}")
@@ -667,12 +740,14 @@ class DocxParser(BaseLife):
  # 首先尝试综合提取所有内容
  comprehensive_content = self.extract_all_content(docx_path)
  if comprehensive_content and comprehensive_content.strip():
- logger.info(f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符")
+ logger.info(
+ f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符"
+ )
  return comprehensive_content
-
+
  # 如果综合提取失败,使用传统转换方式
  logger.info("🔄 综合提取失败或内容为空,使用传统转换方式")
-
+
  with tempfile.TemporaryDirectory() as temp_path:
  logger.debug(f"📁 创建临时目录: {temp_path}")

@@ -685,7 +760,7 @@ class DocxParser(BaseLife):
  # 转换DOCX为TXT
  txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
  logger.info(f"🎯 DOCX转TXT完成: {txt_file_path}")
-
+
  # 读取TXT文件内容
  content = self.read_txt_file(txt_file_path)
  logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")
@@ -723,9 +798,16 @@ class DocxParser(BaseLife):
  if file_size == 0:
  logger.warning(f"⚠️ 文件大小为0字节: {file_path}")

- title = os.path.splitext(os.path.basename(file_path))[0]
- logger.debug(f"🏷️ 提取文件标题: {title}")
-
+ # 🏷️ 提取文件扩展名
+ extension = self.get_file_extension(file_path)
+ logger.debug(f"🏷️ 提取文件扩展名: {extension}")
+ # 1) 处理开始:生成 DATA_PROCESSING 事件
+ lc_start = self.generate_lifecycle(
+ source_file=file_path,
+ domain="Technology",
+ life_type=LifeType.DATA_PROCESSING,
+ usage_purpose="Parsing",
+ )
  # 使用soffice转换为txt后读取内容
  logger.info("📝 使用soffice转换DOCX为TXT并读取内容")
  content = self.read_docx_file(docx_path=file_path)
@@ -745,16 +827,23 @@ class DocxParser(BaseLife):
  if not mk_content.strip():
  logger.warning(f"⚠️ 解析出的内容为空: {file_path}")

- lifecycle = self.generate_lifecycle(
+ # 2) 处理结束:根据内容是否非空生成 DATA_PROCESSED 或 DATA_PROCESS_FAILED 事件
+ lc_end = self.generate_lifecycle(
  source_file=file_path,
  domain="Technology",
- usage_purpose="Documentation",
- life_type="LLM_ORIGIN",
+ life_type=(
+ LifeType.DATA_PROCESSED
+ if mk_content.strip()
+ else LifeType.DATA_PROCESS_FAILED
+ ),
+ usage_purpose="Parsing",
  )
- logger.debug("⚙️ 生成lifecycle信息完成")
+ logger.debug("⚙️ 生成生命周期事件完成")

- output_vo = MarkdownOutputVo(title, mk_content)
- output_vo.add_lifecycle(lifecycle)
+ # 3) 封装输出并添加生命周期
+ output_vo = MarkdownOutputVo(extension, mk_content)
+ output_vo.add_lifecycle(lc_start)
+ output_vo.add_lifecycle(lc_end)

  result = output_vo.to_dict()
  logger.info(f"🏆 DOCX文件解析完成: {file_path}")