pydatamax 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamax/parser/core.py CHANGED
@@ -87,7 +87,7 @@ class ParserFactory:
             )
         elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
             return parser_class(
-                file_path=file_path, to_markdown=to_markdown
+                file_path=file_path, to_markdown=to_markdown, use_uno=True
             )
         elif parser_class_name == 'XlsxParser':
             return parser_class(
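
With this change, ParserFactory always asks the Word parsers to attempt UNO conversion first. A hypothetical call site, assuming a DataMax-style entry point (the import path and method name here are assumptions, not shown in this diff):

    # Illustrative usage sketch - .doc/.docx files routed through ParserFactory
    # now reach DocParser/DocxParser with use_uno=True.
    from datamax import DataMax  # assumed entry point

    dm = DataMax(file_path="report.docx", to_markdown=False)
    data = dm.get_data()  # the factory-created DocxParser will try UNO first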
datamax/parser/doc_parser.py CHANGED
@@ -26,6 +26,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If the problem persists, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 class DocParser(BaseLife):
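
For reference, a minimal standalone check (not part of the package) that mirrors the verification step recommended by the new error message:

    # Illustrative snippet: confirm that LibreOffice's UNO bindings are importable.
    # On Linux you may first need: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH
    try:
        import uno  # provided by LibreOffice's python3-uno package
        print("UNO is available:", uno.__file__)
    except ImportError as exc:
        print("UNO is not available:", exc)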
@@ -46,7 +61,11 @@ class DocParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(f"⚠️ UNO is unavailable; falling back to the legacy command-line approach")
+                logger.warning(
+                    f"⚠️ UNO is unavailable; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocParser initialized - using the legacy command-line approach")
 
@@ -265,52 +284,78 @@ class DocParser(BaseLife):
             return ""
 
     def _clean_extracted_text(self, text: str) -> str:
-        """Clean the extracted text: remove control characters and formatting, but keep Chinese text"""
+        """Clean the extracted text: strip all XML tags and control characters, keeping plain text only"""
         try:
-            # Remove NULL and other control characters (but keep newlines and similar)
+            # 1. Decode HTML/XML entities
+            text = html.unescape(text)
+
+            # 2. Remove all XML/HTML tags
+            text = re.sub(r'<[^>]+>', '', text)
+
+            # 3. Remove XML namespace prefixes
+            text = re.sub(r'\b\w+:', '', text)
+
+            # 4. Remove NULL and other control characters
             text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
 
-            # Remove runs of special characters (but keep Chinese and common punctuation)
-            # Regex adjusted so that Chinese characters are not removed by mistake
-            text = re.sub(r'[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—.,!?;:()-]+', ' ', text)
+            # 5. Remove special XML character sequences
+            text = re.sub(r'&[a-zA-Z]+;', '', text)
+            text = re.sub(r'&#\d+;', '', text)
+            text = re.sub(r'&#x[0-9a-fA-F]+;', '', text)
+
+            # 6. Keep meaningful characters, drop the remaining special characters
+            # Keep: Chinese, Japanese, Korean, Latin letters, digits, common punctuation, and whitespace
+            allowed_chars = (
+                r'\w\s'  # letters, digits, and whitespace
+                r'\u4e00-\u9fff'  # Chinese
+                r'\u3040-\u30ff'  # Japanese
+                r'\uac00-\ud7af'  # Korean
+                r',。!?;:""''()【】《》、·…—'  # Chinese punctuation
+                r'.,!?;:()[\]{}"\'`~@#$%^&*+=\-_/\\'  # Latin punctuation and common symbols
+            )
+
+            # Stricter filtering that still keeps every meaningful character
+            cleaned_text = ''.join(char for char in text if re.match(f'[{allowed_chars}]', char))
 
-            # Remove overly long runs of meaningless characters (usually mojibake)
-            text = re.sub(r'[\x80-\xff]{10,}', ' ', text)
+            # 7. Remove overly long runs of meaningless characters (usually binary garbage)
+            cleaned_text = re.sub(r'([^\s\u4e00-\u9fff])\1{5,}', r'\1', cleaned_text)
 
-            # Collapse repeated whitespace
-            text = re.sub(r'\s+', ' ', text)
-            text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
+            # 8. Clean up extra whitespace while preserving paragraph structure
+            cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)  # collapse runs of spaces/tabs into one space
+            cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)  # collapse multiple blank lines into one
+            cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)  # strip leading/trailing whitespace per line
 
-            # Ensure paragraph separation
-            lines = text.split('\n')
+            # 9. Further cleanup: drop lines that are nothing but stray punctuation
+            lines = cleaned_text.split('\n')
             cleaned_lines = []
 
             for line in lines:
                 line = line.strip()
                 if line:
-                    # Check whether the line is mostly mojibake
-                    printable_chars = sum(1 for c in line if c.isprintable() or '\u4e00' <= c <= '\u9fff')
-                    total_chars = len(line)
+                    # Check whether the line is mostly meaningful content
+                    # (ratio of Chinese characters, Latin letters, and digits)
+                    meaningful_chars = sum(1 for c in line if (
+                        c.isalnum() or '\u4e00' <= c <= '\u9fff'
+                    ))
 
-                    # Keep the line if printable characters (including Chinese) make up more than 60%
-                    if total_chars > 0 and printable_chars / total_chars > 0.6:
+                    # Keep the line if meaningful characters exceed 30%, or if it is shorter than 5 characters (possibly a heading)
+                    if (len(line) < 5 or
+                        (meaningful_chars > 0 and meaningful_chars / len(line) > 0.3)):
                         cleaned_lines.append(line)
-                elif cleaned_lines and cleaned_lines[-1]:
-                    cleaned_lines.append('')  # keep paragraph separation
+                elif cleaned_lines and cleaned_lines[-1]:  # keep paragraph separation
+                    cleaned_lines.append('')
 
             result = '\n'.join(cleaned_lines).strip()
 
-            # Final check: return empty if the result is too short or contains too much garbage
-            if len(result) < 20:
+            # 10. Final check
+            if len(result) < 10:
+                logger.warning("⚠️ Cleaned text is very short; something may have gone wrong")
                 return ""
 
-            # Check the ratio of garbled characters
-            weird_chars = sum(1 for c in result if ord(c) > 127 and not ('\u4e00' <= c <= '\u9fff' or c in ',。!?;:""''()【】《》、·…—'))
-            if len(result) > 0 and weird_chars / len(result) > 0.3:
-                logger.warning(f"⚠️ Text contains too many garbled characters ({weird_chars}/{len(result)})")
-                # Try to keep only the ASCII and Chinese portions
-                result = re.sub(r'[^\x00-\x7f\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—\s]+', ' ', result)
-                result = re.sub(r'\s+', ' ', result).strip()
+            # Check whether any XML tags survived
+            if re.search(r'<[^>]+>', result):
+                logger.warning("⚠️ XML tags remain after cleaning; running a second pass")
+                result = re.sub(r'<[^>]+>', '', result)
 
             return result
 
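
A condensed, self-contained sketch of the new cleaning strategy (illustrative only; the function name is made up, but the tag stripping and the 30% meaningful-character threshold follow the hunk above):

    import html
    import re

    def clean_plain_text(text: str) -> str:
        # Decode entities, then strip tags and control characters
        text = html.unescape(text)
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
        # Keep lines that are short (possible headings) or at least 30% letters/digits/CJK
        kept = []
        for line in (l.strip() for l in text.split('\n')):
            if not line:
                continue
            meaningful = sum(1 for c in line if c.isalnum() or '\u4e00' <= c <= '\u9fff')
            if len(line) < 5 or meaningful / len(line) > 0.3:
                kept.append(line)
        return '\n'.join(kept)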
@@ -355,7 +400,20 @@ class DocParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(f"💥 UNO conversion failed: {str(e)}")
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try running: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close any running LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{doc_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._doc_to_txt_subprocess(doc_path, dir_path)
         else:
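
The fallback path shells out to LibreOffice's command-line converter. A minimal sketch of that kind of call (assuming soffice is on PATH; this is not the package's _doc_to_txt_subprocess implementation):

    import subprocess

    def soffice_to_txt(doc_path: str, out_dir: str) -> None:
        # Equivalent to the manual command suggested in the error message:
        #   soffice --headless --convert-to txt "<doc_path>"
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "txt", "--outdir", out_dir, doc_path],
            check=True,
        )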
datamax/parser/docx_parser.py CHANGED
@@ -18,7 +18,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
-    logger.warning("⚠️ UNO is unavailable; falling back to the legacy command-line approach")
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If the problem persists, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 
@@ -40,7 +54,11 @@ class DocxParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(f"⚠️ UNO is unavailable; falling back to the legacy command-line approach")
+                logger.warning(
+                    f"⚠️ UNO is unavailable; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocxParser initialized - using the legacy command-line approach")
 
@@ -64,7 +82,20 @@ class DocxParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(f"💥 UNO conversion failed: {str(e)}")
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try running: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close any running LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{docx_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._docx_to_txt_subprocess(docx_path, dir_path)
         else:
@@ -228,26 +259,54 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract the standard document.xml content"""
+        """Extract the standard document.xml content - plain text only"""
         try:
             if 'word/document.xml' in docx_zip.namelist():
                 doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
 
-                # Extract the text content with regular expressions
-                import xml.etree.ElementTree as ET
+                # Decode XML entities
+                doc_xml = html.unescape(doc_xml)
+
+                # Extract the text inside all <w:t> tags (covering any namespace prefix)
+                # A looser regular expression matches any namespace prefix
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, doc_xml)
 
-                # Remove namespace prefixes to simplify processing
-                doc_xml_clean = re.sub(r'xmlns[^=]*="[^"]*"', '', doc_xml)
-                doc_xml_clean = re.sub(r'w:', '', doc_xml_clean)
-                doc_xml_clean = re.sub(r'[a-zA-Z0-9]+:', '', doc_xml_clean)
+                # Also pick up any <t> tags without a namespace prefix
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))
 
-                # Extract the text inside all <t> tags
-                text_matches = re.findall(r'<t[^>]*>(.*?)</t>', doc_xml_clean, re.DOTALL)
                 if text_matches:
-                    content = ' '.join(text_matches)
-                    content = html.unescape(content)
-                    logger.info(f"📝 Extracted text from document.xml: {len(content)} characters")
-                    return content.strip()
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        # Decode XML entities
+                        text = html.unescape(text)
+                        # Remove extra whitespace but keep single spaces
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    # Join the fragments intelligently
+                    content = ''
+                    for i, text in enumerate(cleaned_texts):
+                        if i == 0:
+                            content = text
+                        else:
+                            # If the previous fragment does not end with punctuation and the current one
+                            # does not start with an uppercase letter, join without a space
+                            prev_char = content[-1] if content else ''
+                            curr_char = text[0] if text else ''
+
+                            if prev_char in '.!?。!?\n' or curr_char.isupper() or curr_char in ',。!?;:':
+                                content += ' ' + text
+                            else:
+                                content += text
+
+                    # Final cleanup
+                    content = re.sub(r'\s+', ' ', content)
+                    content = content.strip()
+
+                    logger.info(f"📝 Extracted plain text from document.xml: {len(content)} characters")
+                    return content
             return ""
         except Exception as e:
             logger.error(f"💥 Failed to extract standard document content: {str(e)}")
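
A self-contained sketch of the extraction technique introduced above: read word/document.xml from the .docx archive and pull text runs out of <w:t> elements with a namespace-agnostic regex (illustrative only; the helper name is made up):

    import html
    import re
    import zipfile

    def docx_text_runs(path: str) -> list:
        with zipfile.ZipFile(path) as zf:
            xml = zf.read('word/document.xml').decode('utf-8', errors='replace')
        # Match <w:t>, <ns0:t>, or un-prefixed <t> elements
        runs = re.findall(r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>', xml)
        runs.extend(re.findall(r'<t[^>]*>([^<]*)</t>', xml))
        return [html.unescape(r).strip() for r in runs if r.strip()]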
@@ -271,7 +330,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_headers_footers(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract header and footer content"""
+        """Extract header and footer content - plain text only"""
         try:
             header_footer_content = []
 
@@ -280,35 +339,66 @@ class DocxParser(BaseLife):
                     logger.debug(f"📄 Processing header/footer: {filename}")
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
-                    # Extract the text content
-                    text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', content, re.DOTALL)
+                    # Decode XML entities
+                    content = html.unescape(content)
+
+                    # Extract the text content - use a looser pattern
+                    text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                    text_matches = re.findall(text_pattern, content)
+                    text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', content))
+
                     if text_matches:
-                        header_footer_text = ' '.join(text_matches)
-                        header_footer_text = html.unescape(header_footer_text)
-                        if header_footer_text.strip():
-                            header_footer_content.append(header_footer_text.strip())
+                        # Clean and combine the text
+                        cleaned_texts = []
+                        for text in text_matches:
+                            text = html.unescape(text)
+                            text = re.sub(r'\s+', ' ', text.strip())
+                            if text:
+                                cleaned_texts.append(text)
+
+                        if cleaned_texts:
+                            # Merge the text fragments
+                            header_footer_text = ' '.join(cleaned_texts)
+                            header_footer_text = re.sub(r'\s+', ' ', header_footer_text.strip())
+                            if header_footer_text:
+                                header_footer_content.append(header_footer_text)
 
             if header_footer_content:
-                logger.info(f"📑 Extracted header/footer content: {len(header_footer_content)} items")
+                logger.info(f"📑 Extracted header/footer plain text: {len(header_footer_content)} items")
 
-            return ' '.join(header_footer_content) if header_footer_content else ""
+            return '\n'.join(header_footer_content) if header_footer_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract headers/footers: {str(e)}")
             return ""
 
     def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract comment and annotation content"""
+        """Extract comment and annotation content - plain text only"""
         try:
             if 'word/comments.xml' in docx_zip.namelist():
                 comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
 
-                # Extract the comment text
-                text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', comments_xml, re.DOTALL)
+                # Decode XML entities
+                comments_xml = html.unescape(comments_xml)
+
+                # Extract the comment text - use a looser pattern
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, comments_xml)
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', comments_xml))
+
                 if text_matches:
-                    comments_text = ' '.join(text_matches)
-                    comments_text = html.unescape(comments_text)
-                    logger.info(f"💬 Extracted comment content: {len(comments_text)} characters")
-                    return comments_text.strip()
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        text = html.unescape(text)
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    if cleaned_texts:
+                        comments_text = ' '.join(cleaned_texts)
+                        comments_text = re.sub(r'\s+', ' ', comments_text.strip())
+                        logger.info(f"💬 Extracted comment plain text: {len(comments_text)} characters")
+                        return comments_text
 
             return ""
         except Exception as e:
@@ -316,7 +406,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_textbox_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract text from text boxes and drawing objects"""
+        """Extract text from text boxes and drawing objects - plain text only"""
         try:
             textbox_content = []
 
@@ -325,26 +415,43 @@ class DocxParser(BaseLife):
                 if 'word/' in filename and filename.endswith('.xml'):
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
+                    # Decode XML entities
+                    content = html.unescape(content)
+
                     # Find text-box content (w:txbxContent)
-                    textbox_matches = re.findall(r'<w:txbxContent[^>]*>(.*?)</w:txbxContent>', content, re.DOTALL)
+                    textbox_matches = re.findall(r'<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>', content, re.DOTALL)
+
                     for match in textbox_matches:
-                        text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', match, re.DOTALL)
+                        # Extract the text from the text-box content
+                        text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                        text_matches = re.findall(text_pattern, match)
+                        text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', match))
+
                         if text_matches:
-                            textbox_text = ' '.join(text_matches)
-                            textbox_text = html.unescape(textbox_text)
-                            if textbox_text.strip():
-                                textbox_content.append(textbox_text.strip())
+                            # Clean and combine the text
+                            cleaned_texts = []
+                            for text in text_matches:
+                                text = html.unescape(text)
+                                text = re.sub(r'\s+', ' ', text.strip())
+                                if text:
+                                    cleaned_texts.append(text)
+
+                            if cleaned_texts:
+                                textbox_text = ' '.join(cleaned_texts)
+                                textbox_text = re.sub(r'\s+', ' ', textbox_text.strip())
+                                if textbox_text:
+                                    textbox_content.append(textbox_text)
 
             if textbox_content:
-                logger.info(f"📦 Extracted text-box content: {len(textbox_content)} items")
+                logger.info(f"📦 Extracted text-box plain text: {len(textbox_content)} items")
 
-            return ' '.join(textbox_content) if textbox_content else ""
+            return '\n'.join(textbox_content) if textbox_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract text-box content: {str(e)}")
             return ""
 
     def _combine_extracted_content(self, content_list: list) -> str:
-        """Combine the various pieces of extracted content"""
+        """Combine the various pieces of extracted content - output clean plain text"""
         combined = []
 
         # Order the content by importance
@@ -353,14 +460,40 @@ class DocxParser(BaseLife):
         for content_type in priority_order:
             for item_type, content in content_list:
                 if item_type == content_type and content.strip():
-                    combined.append(content.strip())
+                    # Clean extra whitespace inside the content
+                    cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                    cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+
+                    if cleaned_content:
+                        # Add a simple label per content type (only when more than one content type is present)
+                        if len([1 for t, c in content_list if c.strip()]) > 1:
+                            if item_type == "header_footer":
+                                combined.append(f"[Headers/Footers]\n{cleaned_content}")
+                            elif item_type == "comments":
+                                combined.append(f"[Comments]\n{cleaned_content}")
+                            elif item_type == "textboxes":
+                                combined.append(f"[Text boxes]\n{cleaned_content}")
+                            else:
+                                combined.append(cleaned_content)
+                        else:
+                            combined.append(cleaned_content)
 
         # Append any remaining uncategorized content
         for item_type, content in content_list:
             if item_type not in priority_order and content.strip():
-                combined.append(content.strip())
+                cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+                if cleaned_content:
+                    combined.append(cleaned_content)
+
+        # Merge everything, separating sections with a blank line
+        final_content = '\n\n'.join(combined) if combined else ""
+
+        # Final cleanup: make sure there are no runs of blank lines
+        final_content = re.sub(r'\n{3,}', '\n\n', final_content)
+        final_content = final_content.strip()
 
-        return '\n\n'.join(combined) if combined else ""
+        return final_content
 
     def _extract_html_from_mht(self, mht_content: str) -> str:
         """Extract the HTML part from MHT content and convert it to concise text"""
{pydatamax-0.1.15.dist-info → pydatamax-0.1.16.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.15
+Version: 0.1.16
 Summary: A library for parsing and converting various file formats.
 Home-page: https://github.com/Hi-Dolphin/datamax
 Author: ccy
{pydatamax-0.1.15.dist-info → pydatamax-0.1.16.dist-info}/RECORD RENAMED
@@ -5,10 +5,10 @@ datamax/loader/minio_handler.py,sha256=e7ZUlwoStVe5iQfAVfNgEwRLxen4NbxwokpJZl6AR
 datamax/loader/oss_handler.py,sha256=ZO8ZbbA1oyuNN4Z7iVgSbMArYgJ1gvpqADkXDNDw4y0,7648
 datamax/parser/__init__.py,sha256=sIB1N4B_fVguSBN-Uw9tGxAO6s0oi4Tq4kRZ59YlUKo,52
 datamax/parser/base.py,sha256=FamDV6csc3aXVbobMR1lPNtVpvYMO19koRJW9poj_gE,2590
-datamax/parser/core.py,sha256=2Gkz08WrRPt9ga0DisXrV1Aa_Yu7jUMlECOXNaexpwQ,16831
+datamax/parser/core.py,sha256=pySissrF6kVVAzT5abIlQ-4cUliFu1HBWjcD6psNkYA,16845
 datamax/parser/csv_parser.py,sha256=lHQs1MHK9WM4Vl0p9nsE3fFhewF0EoXZUhtk8ixznRw,1028
-datamax/parser/doc_parser.py,sha256=g2vZEdwcA-5AM2y0IHBy0bx1cOflkvLBobQ6tljX1fo,27940
-datamax/parser/docx_parser.py,sha256=fehvMdOexWRRm1HTOCkVa_zDWI8A_LbGJdEFwW-MGss,29613
+datamax/parser/doc_parser.py,sha256=qPKpZy_p1veV2AodqEQU6LzqmT7y1PANlPtt0CYoHeg,30837
+datamax/parser/docx_parser.py,sha256=wdDGgeYIDg1Se493XZhlduxKjtYZ58Uqxltm2vt9Dy4,36691
 datamax/parser/epub_parser.py,sha256=K4eCS4wIXJzDicvtVAfQT8yt1gFHeibZN5-EdQZfJe8,1621
 datamax/parser/html_parser.py,sha256=5ACrVc03Q9pJqWI_b0EtRgOYy0eMYJq4podgHGD68Z8,1453
 datamax/parser/image_parser.py,sha256=UH3duPvB7Xu6CFlEeAukX5uJ8VlqnMR89hcLsW2O-aU,1281
@@ -31,8 +31,8 @@ datamax/utils/ppt_extract.py,sha256=jBVGYEsBGPjHqyq7drHTOM8MnFOwqKyHhbkKmEAryAk,
 datamax/utils/qa_generator.py,sha256=pXxdFm_EnWgMuilfmLKgy2c6NDexQZN8nWxT-bYBt74,12548
 datamax/utils/tokenizer.py,sha256=o78GPmeJ3vs3-SF0b2nMm35XtbrCKbrhDW0gI9gqGl4,880
 datamax/utils/uno_handler.py,sha256=gDm42OQQQoCiOP0SB7xZ9TRF6A_XBHNavwG5ycj6kEQ,14807
-pydatamax-0.1.15.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
-pydatamax-0.1.15.dist-info/METADATA,sha256=ySaiq1-bWbeW8W5ECuRTSPXzeNxTxaaJEAdqGnWvw0M,9795
-pydatamax-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-pydatamax-0.1.15.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
-pydatamax-0.1.15.dist-info/RECORD,,
+pydatamax-0.1.16.dist-info/licenses/LICENSE,sha256=RltoeMa9c1956S08ThvZE2yJSPbnem68Y8cmiIfOgco,1088
+pydatamax-0.1.16.dist-info/METADATA,sha256=nwHHLzsxwm_Za3aVbLMFfL-3JIZSZclp9KI0XL6rOHE,9795
+pydatamax-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pydatamax-0.1.16.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
+pydatamax-0.1.16.dist-info/RECORD,,