pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,22 @@
-from loguru import logger
+import html
 import os
+import re
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Union, Optional
-import struct
-import re
-import html
+from typing import Union
 
 import chardet
+from loguru import logger
 
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 
 # 尝试导入OLE相关库(用于读取DOC内部结构)
 try:
     import olefile
+
     HAS_OLEFILE = True
 except ImportError:
     HAS_OLEFILE = False
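The new import above pulls in LifeType from datamax/utils/lifecycle_types.py, a module added in this release (file 24 in the list, +18 lines). Its body is not shown in this diff; a minimal sketch consistent with the three members referenced later in doc_parser.py might look like the following (member values, and whether additional states exist, are assumptions rather than package contents):

    # Hypothetical sketch of datamax/utils/lifecycle_types.py; only the members
    # used by doc_parser.py in this diff are shown, and the values are guesses.
    from enum import Enum

    class LifeType(str, Enum):
        DATA_PROCESSING = "DATA_PROCESSING"            # parsing has started
        DATA_PROCESSED = "DATA_PROCESSED"              # parsing produced content
        DATA_PROCESS_FAILED = "DATA_PROCESS_FAILED"    # parsing produced no content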
@@ -77,36 +78,36 @@ class DocParser(BaseLife):
         支持多种DOC内部格式和存储方式
         """
         logger.info(f"🔍 开始综合内容提取: {doc_path}")
-
+
         all_content = []
-
+
         try:
             # 1. 尝试使用OLE解析提取内容(如果可用)
             if HAS_OLEFILE:
                 ole_content = self._extract_ole_content(doc_path)
                 if ole_content:
                     all_content.append(("ole", ole_content))
-
+
             # 2. 尝试提取嵌入对象
             embedded_content = self._extract_embedded_objects(doc_path)
             if embedded_content:
                 all_content.append(("embedded", embedded_content))
-
+
             # 3. 如果上述方法都没有提取到内容,使用传统转换
             if not all_content:
                 logger.info("🔄 使用传统转换方式提取内容")
                 return "" # 返回空,让调用者使用传统方式
-
+
             # 检查内容质量,特别是对于WPS文件
             for content_type, content in all_content:
                 if content and self._check_content_quality(content):
                     logger.info(f"✅ 使用 {content_type} 内容提取成功")
                     return content
-
+
             # 如果所有内容质量都不佳,返回空
             logger.warning("⚠️ 所有提取方式的内容质量都不佳")
             return ""
-
+
         except Exception as e:
             logger.error(f"💥 综合内容提取失败: {str(e)}")
             return ""
@@ -116,36 +117,36 @@ class DocParser(BaseLife):
         try:
             ole = olefile.OleFileIO(doc_path)
             logger.info(f"📂 成功打开OLE文件: {doc_path}")
-
+
             # 列出所有流
             streams = ole.listdir()
             logger.debug(f"📋 可用的OLE流: {streams}")
-
+
             # 检查是否是WPS生成的文件
-            is_wps = any('WpsCustomData' in str(stream) for stream in streams)
+            is_wps = any("WpsCustomData" in str(stream) for stream in streams)
             if is_wps:
                 logger.info("📝 检测到WPS DOC文件,建议使用传统转换方式")
                 # 对于WPS文件,OLE解析可能不可靠,返回空让其使用传统方式
                 ole.close()
                 return ""
-
+
             all_texts = []
-
+
             # 尝试提取WordDocument流
-            if ole.exists('WordDocument'):
+            if ole.exists("WordDocument"):
                 try:
-                    word_stream = ole.openstream('WordDocument').read()
+                    word_stream = ole.openstream("WordDocument").read()
                     logger.info(f"📄 WordDocument流大小: {len(word_stream)} 字节")
                     text = self._parse_word_stream(word_stream)
                     if text:
                         all_texts.append(text)
                 except Exception as e:
                     logger.error(f"💥 解析WordDocument流失败: {str(e)}")
-
+
             # 尝试读取其他可能包含文本的流
             text_content = []
             for entry in ole.listdir():
-                if any(name in str(entry) for name in ['Text', 'Content', 'Body']):
+                if any(name in str(entry) for name in ["Text", "Content", "Body"]):
                     try:
                         stream = ole.openstream(entry)
                         data = stream.read()
@@ -155,19 +156,19 @@ class DocParser(BaseLife):
                             text_content.append(decoded)
                    except:
                        continue
-
+
            if text_content:
-                combined = '\n'.join(text_content)
+                combined = "\n".join(text_content)
                logger.info(f"📄 从OLE流中提取文本: {len(combined)} 字符")
                return self._clean_extracted_text(combined)
-
+
            ole.close()
-
+
            return ""
-
+
        except Exception as e:
            logger.warning(f"⚠️ OLE解析失败: {str(e)}")
-
+
        return ""
 
     def _parse_word_stream(self, data: bytes) -> str:
@@ -176,25 +177,38 @@ class DocParser(BaseLife):
            # DOC文件格式复杂,这里提供基础的文本提取
            # 查找文本片段
            text_parts = []
-
+
            # 尝试多种编码,特别注意中文编码
-            for encoding in ['utf-16-le', 'utf-8', 'gbk', 'gb18030', 'gb2312', 'big5', 'cp936', 'cp1252']:
+            for encoding in [
+                "utf-16-le",
+                "utf-8",
+                "gbk",
+                "gb18030",
+                "gb2312",
+                "big5",
+                "cp936",
+                "cp1252",
+            ]:
                try:
-                    decoded = data.decode(encoding, errors='ignore')
+                    decoded = data.decode(encoding, errors="ignore")
                    # 检查是否包含合理的中文字符
-                    chinese_chars = len([c for c in decoded if '\u4e00' <= c <= '\u9fff'])
+                    chinese_chars = len(
+                        [c for c in decoded if "\u4e00" <= c <= "\u9fff"]
+                    )
                    if chinese_chars > 10 or (decoded and len(decoded.strip()) > 50):
                        # 过滤出可打印字符,但保留中文
                        cleaned = self._filter_printable_text(decoded)
                        if cleaned and len(cleaned.strip()) > 20:
                            text_parts.append(cleaned)
-                            logger.debug(f"📝 使用编码 {encoding} 成功解码,包含 {chinese_chars} 个中文字符")
+                            logger.debug(
+                                f"📝 使用编码 {encoding} 成功解码,包含 {chinese_chars} 个中文字符"
+                            )
                        break
                except:
                    continue
-
-            return '\n'.join(text_parts) if text_parts else ""
-
+
+            return "\n".join(text_parts) if text_parts else ""
+
        except Exception as e:
            logger.error(f"💥 解析Word流失败: {str(e)}")
            return ""
@@ -204,50 +218,67 @@ class DocParser(BaseLife):
        result = []
        for char in text:
            # 保留中文字符
-            if '\u4e00' <= char <= '\u9fff':
+            if "\u4e00" <= char <= "\u9fff":
                result.append(char)
            # 保留日文字符
-            elif '\u3040' <= char <= '\u30ff':
+            elif "\u3040" <= char <= "\u30ff":
                result.append(char)
            # 保留韩文字符
-            elif '\uac00' <= char <= '\ud7af':
+            elif "\uac00" <= char <= "\ud7af":
                result.append(char)
            # 保留ASCII可打印字符和空白字符
            elif char.isprintable() or char.isspace():
                result.append(char)
            # 保留常用标点符号
-            elif char in ',。!?;:""''()【】《》、·…—':
+            elif char in ',。!?;:""' "()【】《》、·…—":
                result.append(char)
-
-        return ''.join(result)
+
+        return "".join(result)
 
     def _try_decode_bytes(self, data: bytes) -> str:
        """尝试使用多种编码解码字节数据"""
        # 优先尝试中文编码
-        encodings = ['utf-8', 'gbk', 'gb18030', 'gb2312', 'big5', 'utf-16-le', 'utf-16-be', 'cp936', 'cp1252', 'latin-1']
-
+        encodings = [
+            "utf-8",
+            "gbk",
+            "gb18030",
+            "gb2312",
+            "big5",
+            "utf-16-le",
+            "utf-16-be",
+            "cp936",
+            "cp1252",
+            "latin-1",
+        ]
+
        # 首先尝试使用chardet检测编码
        try:
            import chardet
+
            detected = chardet.detect(data)
-            if detected['encoding'] and detected['confidence'] > 0.7:
-                encodings.insert(0, detected['encoding'])
-                logger.debug(f"🔍 检测到编码: {detected['encoding']} (置信度: {detected['confidence']})")
+            if detected["encoding"] and detected["confidence"] > 0.7:
+                encodings.insert(0, detected["encoding"])
+                logger.debug(
+                    f"🔍 检测到编码: {detected['encoding']} (置信度: {detected['confidence']})"
+                )
        except:
            pass
-
+
        for encoding in encodings:
            try:
-                decoded = data.decode(encoding, errors='ignore')
+                decoded = data.decode(encoding, errors="ignore")
                # 检查是否包含有意义的文本(包括中文)
-                if decoded and (any(c.isalnum() for c in decoded) or any('\u4e00' <= c <= '\u9fff' for c in decoded)):
+                if decoded and (
+                    any(c.isalnum() for c in decoded)
+                    or any("\u4e00" <= c <= "\u9fff" for c in decoded)
+                ):
                    # 进一步清理文本
                    cleaned = self._filter_printable_text(decoded)
                    if cleaned and len(cleaned.strip()) > 10:
                        return cleaned
            except:
                continue
-
+
        return ""
 
     def _extract_embedded_objects(self, doc_path: str) -> str:
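The reformatted _try_decode_bytes above keeps the same strategy: ask chardet for a guess, promote it to the front of a fixed candidate list when its confidence exceeds 0.7, then fall back through the remaining encodings with errors="ignore". A standalone sketch of that pattern (the function name and the abbreviated candidate list here are illustrative, not part of the package):

    # Illustrative sketch of the chardet-first decode strategy used above.
    import chardet

    def decode_with_fallback(data: bytes) -> str:
        candidates = ["utf-8", "gbk", "gb18030", "latin-1"]  # abbreviated list
        guess = chardet.detect(data)
        if guess["encoding"] and guess["confidence"] > 0.7:
            candidates.insert(0, guess["encoding"])
        for encoding in candidates:
            try:
                text = data.decode(encoding, errors="ignore")
                if text.strip():
                    return text
            except (LookupError, UnicodeDecodeError):
                # unknown codec name from chardet, or undecodable input
                continue
        return ""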
@@ -255,30 +286,33 @@ class DocParser(BaseLife):
        try:
            if not HAS_OLEFILE:
                return ""
-
+
            embedded_content = []
-
+
            with olefile.OleFileIO(doc_path) as ole:
                # 查找嵌入的对象
                for entry in ole.listdir():
-                    entry_name = '/'.join(entry)
-
+                    entry_name = "/".join(entry)
+
                    # 检查是否是嵌入对象
-                    if any(pattern in entry_name.lower() for pattern in ['object', 'embed', 'package']):
+                    if any(
+                        pattern in entry_name.lower()
+                        for pattern in ["object", "embed", "package"]
+                    ):
                        logger.info(f"📎 找到嵌入对象: {entry_name}")
                        try:
                            stream = ole.openstream(entry)
                            data = stream.read()
-
+
                            # 尝试提取文本内容
                            text = self._try_decode_bytes(data)
                            if text and len(text.strip()) > 20:
                                embedded_content.append(text.strip())
                        except:
                            continue
-
-            return '\n\n'.join(embedded_content) if embedded_content else ""
-
+
+            return "\n\n".join(embedded_content) if embedded_content else ""
+
        except Exception as e:
            logger.warning(f"⚠️ 提取嵌入对象失败: {str(e)}")
            return ""
@@ -288,77 +322,87 @@ class DocParser(BaseLife):
        try:
            # 1. 解码HTML/XML实体
            text = html.unescape(text)
-
+
            # 2. 移除所有XML/HTML标签
-            text = re.sub(r'<[^>]+>', '', text)
-
+            text = re.sub(r"<[^>]+>", "", text)
+
            # 3. 移除XML命名空间前缀
-            text = re.sub(r'\b\w+:', '', text)
-
+            text = re.sub(r"\b\w+:", "", text)
+
            # 4. 移除NULL字符和其他控制字符
-            text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
-
+            text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", text)
+
            # 5. 移除特殊的XML字符序列
-            text = re.sub(r'&[a-zA-Z]+;', '', text)
-            text = re.sub(r'&#\d+;', '', text)
-            text = re.sub(r'&#x[0-9a-fA-F]+;', '', text)
-
+            text = re.sub(r"&[a-zA-Z]+;", "", text)
+            text = re.sub(r"&#\d+;", "", text)
+            text = re.sub(r"&#x[0-9a-fA-F]+;", "", text)
+
            # 6. 保留有意义的字符,移除其他特殊字符
            # 保留:中文、日文、韩文、英文、数字、常用标点和空白
            allowed_chars = (
-                r'\w\s' # 字母数字和空白
-                r'\u4e00-\u9fff' # 中文
-                r'\u3040-\u30ff' # 日文
-                r'\uac00-\ud7af' # 韩文
-                r',。!?;:""''()【】《》、·…—' # 中文标点
+                r"\w\s" # 字母数字和空白
+                r"\u4e00-\u9fff" # 中文
+                r"\u3040-\u30ff" # 日文
+                r"\uac00-\ud7af" # 韩文
+                r',。!?;:""'
+                "()【】《》、·…—" # 中文标点
                r'.,!?;:()[\]{}"\'`~@#$%^&*+=\-_/\\' # 英文标点和常用符号
            )
-
+
            # 使用更严格的过滤,但保留所有有意义的字符
-            cleaned_text = ''.join(char for char in text if re.match(f'[{allowed_chars}]', char))
-
+            cleaned_text = "".join(
+                char for char in text if re.match(f"[{allowed_chars}]", char)
+            )
+
            # 7. 移除过长的无意义字符序列(通常是二进制垃圾)
-            cleaned_text = re.sub(r'([^\s\u4e00-\u9fff])\1{5,}', r'\1', cleaned_text)
-
+            cleaned_text = re.sub(r"([^\s\u4e00-\u9fff])\1{5,}", r"\1", cleaned_text)
+
            # 8. 清理多余的空白,但保留段落结构
-            cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text) # 多个空格/制表符变为单个空格
-            cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text) # 多个空行变为双空行
-            cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE) # 移除行首行尾空白
-
+            cleaned_text = re.sub(
+                r"[ \t]+", " ", cleaned_text
+            ) # 多个空格/制表符变为单个空格
+            cleaned_text = re.sub(
+                r"\n\s*\n\s*\n+", "\n\n", cleaned_text
+            ) # 多个空行变为双空行
+            cleaned_text = re.sub(
+                r"^\s+|\s+$", "", cleaned_text, flags=re.MULTILINE
+            ) # 移除行首行尾空白
+
            # 9. 进一步清理:移除独立的标点符号行
-            lines = cleaned_text.split('\n')
+            lines = cleaned_text.split("\n")
            cleaned_lines = []
-
+
            for line in lines:
                line = line.strip()
                if line:
                    # 检查行是否主要是有意义的内容
                    # 计算中文、英文字母和数字的比例
-                    meaningful_chars = sum(1 for c in line if (
-                        c.isalnum() or '\u4e00' <= c <= '\u9fff'
-                    ))
-
+                    meaningful_chars = sum(
+                        1 for c in line if (c.isalnum() or "\u4e00" <= c <= "\u9fff")
+                    )
+
                    # 如果有意义字符占比超过30%,或者行长度小于5(可能是标题),则保留
-                    if (len(line) < 5 or
-                        (meaningful_chars > 0 and meaningful_chars / len(line) > 0.3)):
+                    if len(line) < 5 or (
+                        meaningful_chars > 0 and meaningful_chars / len(line) > 0.3
+                    ):
                        cleaned_lines.append(line)
                elif cleaned_lines and cleaned_lines[-1]: # 保留段落分隔
-                    cleaned_lines.append('')
-
-            result = '\n'.join(cleaned_lines).strip()
-
+                    cleaned_lines.append("")
+
+            result = "\n".join(cleaned_lines).strip()
+
            # 10. 最终检查
            if len(result) < 10:
                logger.warning("⚠️ 清理后的文本过短,可能存在问题")
                return ""
-
+
            # 检查是否还包含XML标签
-            if re.search(r'<[^>]+>', result):
+            if re.search(r"<[^>]+>", result):
                logger.warning("⚠️ 清理后仍包含XML标签,进行二次清理")
-                result = re.sub(r'<[^>]+>', '', result)
-
+                result = re.sub(r"<[^>]+>", "", result)
+
            return result
-
+
        except Exception as e:
            logger.error(f"💥 清理文本失败: {str(e)}")
            return text
@@ -366,25 +410,27 @@ class DocParser(BaseLife):
     def _combine_extracted_content(self, content_list: list) -> str:
        """合并提取到的各种内容"""
        combined = []
-
+
        # 按优先级排序内容
        priority_order = ["ole", "embedded", "converted", "fallback"]
-
+
        for content_type in priority_order:
            for item_type, content in content_list:
                if item_type == content_type and content.strip():
                    combined.append(content.strip())
-
+
        # 添加其他未分类的内容
        for item_type, content in content_list:
            if item_type not in priority_order and content.strip():
                combined.append(content.strip())
-
-        return '\n\n'.join(combined) if combined else ""
+
+        return "\n\n".join(combined) if combined else ""
 
     def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
        """将.doc文件转换为.txt文件"""
-        logger.info(f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}")
+        logger.info(
+            f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}"
+        )
 
        if self.use_uno:
            # 使用UNO API进行转换
@@ -412,7 +458,7 @@ class DocParser(BaseLife):
                    f" 1. 确保LibreOffice正确安装\n"
                    f" 2. 关闭所有LibreOffice进程\n"
                    f" 3. 检查文件权限和路径\n"
-                    f" 4. 尝试手动运行: soffice --headless --convert-to txt \"{doc_path}\""
+                    f' 4. 尝试手动运行: soffice --headless --convert-to txt "{doc_path}"'
                )
                logger.warning("⚠️ 自动回退到传统命令行方式...")
                return self._doc_to_txt_subprocess(doc_path, dir_path)
@@ -435,13 +481,17 @@ class DocParser(BaseLife):
        if exit_code == 0:
            logger.info(f"✅ DOC到TXT转换成功 - 退出码: {exit_code}")
            if stdout:
-                logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
+                logger.debug(
+                    f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}"
+                )
        else:
            encoding = chardet.detect(stderr)["encoding"]
            if encoding is None:
                encoding = "utf-8"
            error_msg = stderr.decode(encoding, errors="replace")
-            logger.error(f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
+            logger.error(
+                f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}"
+            )
            raise Exception(
                f"Error Output (detected encoding: {encoding}): {error_msg}"
            )
@@ -503,14 +553,16 @@ class DocParser(BaseLife):
            if comprehensive_content and comprehensive_content.strip():
                # 检查内容质量
                if self._check_content_quality(comprehensive_content):
-                    logger.info(f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符")
+                    logger.info(
+                        f"✨ 使用综合提取方式成功,内容长度: {len(comprehensive_content)} 字符"
+                    )
                    return comprehensive_content
                else:
                    logger.warning("⚠️ 综合提取的内容质量不佳,尝试其他方式")
-
+
            # 降级到传统转换方式
            logger.info("🔄 使用传统转换方式")
-
+
            with tempfile.TemporaryDirectory() as temp_path:
                logger.debug(f"📁 创建临时目录: {temp_path}")
 
@@ -544,23 +596,29 @@ class DocParser(BaseLife):
        """检查提取内容的质量"""
        if not content or len(content) < 50:
            return False
-
+
        # 计算乱码字符比例
        total_chars = len(content)
        # 可识别字符:ASCII、中文、日文、韩文、常用标点
-        recognizable = sum(1 for c in content if (
-            c.isascii() or
-            '\u4e00' <= c <= '\u9fff' or # 中文
-            '\u3040' <= c <= '\u30ff' or # 日文
-            '\uac00' <= c <= '\ud7af' or # 韩文
-            c in ',。!?;:""''()【】《》、·…—\n\r\t '
-        ))
-
+        recognizable = sum(
+            1
+            for c in content
+            if (
+                c.isascii()
+                or "\u4e00" <= c <= "\u9fff" # 中文
+                or "\u3040" <= c <= "\u30ff" # 日文
+                or "\uac00" <= c <= "\ud7af" # 韩文
+                or c in ',。!?;:""' "()【】《》、·…—\n\r\t "
+            )
+        )
+
        # 如果可识别字符占比低于70%,认为质量不佳
        if recognizable / total_chars < 0.7:
-            logger.warning(f"⚠️ 内容质量检查失败:可识别字符比例 {recognizable}/{total_chars} = {recognizable/total_chars:.2%}")
+            logger.warning(
+                f"⚠️ 内容质量检查失败:可识别字符比例 {recognizable}/{total_chars} = {recognizable/total_chars:.2%}"
+            )
            return False
-
+
        return True
 
     def parse(self, file_path: str):
@@ -583,9 +641,17 @@ class DocParser(BaseLife):
 
            if file_size == 0:
                logger.warning(f"⚠️ 文件大小为0字节: {file_path}")
+            # 生命周期:Data Processing 开始
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Documentation",
+            )
 
-            title = os.path.splitext(os.path.basename(file_path))[0]
-            logger.debug(f"🏷️ 提取文件标题: {title}")
+            # 🏷️ 提取文件扩展名
+            extension = self.get_file_extension(file_path)
+            logger.debug(f"🏷️ 提取文件扩展名: {extension}")
 
            # 读取文件内容
            logger.info("📝 读取DOC文件内容")
@@ -599,6 +665,17 @@ class DocParser(BaseLife):
            else:
                mk_content = content
                logger.info("📝 保持原始文本格式")
+            # 3) 生命周期:Data Processed or Failed
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if mk_content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Documentation",
+            )
 
            logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
 
@@ -614,8 +691,10 @@ class DocParser(BaseLife):
            )
            logger.debug("⚙️ 生成lifecycle信息完成")
 
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
+            # output_vo.add_lifecycle(lc_origin)
 
            result = output_vo.to_dict()
            logger.info(f"🏆 DOC文件解析完成: {file_path}")
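Taken together, the hunks above change parse() to record a DATA_PROCESSING event before extraction, a DATA_PROCESSED or DATA_PROCESS_FAILED event after it, and to build the MarkdownOutputVo from the file extension rather than the file title. A hypothetical reading of the resulting output_vo.to_dict() payload (field names are assumptions; the real shape is defined by MarkdownOutputVo in datamax/parser/base.py, which is not shown in this diff):

    # Hypothetical result shape only; key names are not taken from this diff.
    result = {
        "extension": ".doc",
        "content": "...extracted text or markdown...",
        "lifecycle": [
            {"life_type": "DATA_PROCESSING", "domain": "Technology",
             "usage_purpose": "Documentation", "source_file": "example.doc"},
            {"life_type": "DATA_PROCESSED", "domain": "Technology",
             "usage_purpose": "Documentation", "source_file": "example.doc"},
        ],
    }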
@@ -630,7 +709,9 @@ class DocParser(BaseLife):
            logger.error(f"🔒 文件权限错误: {str(e)}")
            raise Exception(f"无权限访问文件: {file_path}")
        except Exception as e:
-            logger.error(f"💀 解析DOC文件失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}")
+            logger.error(
+                f"💀 解析DOC文件失败: {file_path}, 错误类型: {type(e).__name__}, 错误信息: {str(e)}"
+            )
            raise
 
     def format_as_markdown(self, content: str) -> str:
@@ -657,10 +738,10 @@ class DocParser(BaseLife):
        """从WPS的WordDocument流中提取文本(使用更宽松的策略)"""
        try:
            text_parts = []
-
+
            # WPS文件可能使用不同的编码和结构
            # 尝试多种策略提取文本
-
+
            # 策略1:尝试找到连续的文本块
            # 查找看起来像文本的字节序列
            i = 0
@@ -668,18 +749,24 @@ class DocParser(BaseLife):
                # 查找可能的文本开始位置
                if i + 2 < len(data):
                    # 检查是否是Unicode文本(小端序)
-                    if data[i+1] == 0 and 32 <= data[i] <= 126:
+                    if data[i + 1] == 0 and 32 <= data[i] <= 126:
                        # 可能是ASCII字符的Unicode编码
                        text_block = bytearray()
                        j = i
-                        while j + 1 < len(data) and data[j+1] == 0 and 32 <= data[j] <= 126:
+                        while (
+                            j + 1 < len(data)
+                            and data[j + 1] == 0
+                            and 32 <= data[j] <= 126
+                        ):
                            text_block.append(data[j])
                            j += 2
                        if len(text_block) > 10:
-                            text_parts.append(text_block.decode('ascii', errors='ignore'))
+                            text_parts.append(
+                                text_block.decode("ascii", errors="ignore")
+                            )
                        i = j
                    # 检查是否是UTF-8或GBK中文
-                    elif 0xe0 <= data[i] <= 0xef or 0x81 <= data[i] <= 0xfe:
+                    elif 0xE0 <= data[i] <= 0xEF or 0x81 <= data[i] <= 0xFE:
                        # 可能是多字节字符
                        text_block = bytearray()
                        j = i
@@ -690,9 +777,11 @@ class DocParser(BaseLife):
                            j += 1
                        if len(text_block) > 20:
                            # 尝试解码
-                            for encoding in ['utf-8', 'gbk', 'gb18030', 'gb2312']:
+                            for encoding in ["utf-8", "gbk", "gb18030", "gb2312"]:
                                try:
-                                    decoded = text_block.decode(encoding, errors='ignore')
+                                    decoded = text_block.decode(
+                                        encoding, errors="ignore"
+                                    )
                                    if decoded and len(decoded.strip()) > 10:
                                        text_parts.append(decoded)
                                        break
@@ -703,15 +792,15 @@ class DocParser(BaseLife):
                        i += 1
                else:
                    i += 1
-
+
            # 合并文本部分
            if text_parts:
-                combined = '\n'.join(text_parts)
+                combined = "\n".join(text_parts)
                return self._clean_extracted_text(combined)
-
+
            # 如果上述方法失败,回退到原始方法
            return self._parse_word_stream(data)
-
+
        except Exception as e:
            logger.error(f"💥 解析WPS流失败: {str(e)}")
            return ""