pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0
datamax/parser/docx_parser.py
@@ -1,32 +1,78 @@
- import logging
+ from loguru import logger
  import os
  import shutil
  import subprocess
  import tempfile
  from pathlib import Path
- from typing import Union
+ from typing import Union, Optional

  import chardet
-
+ from loguru import logger
  from datamax.parser.base import BaseLife, MarkdownOutputVo
+ import zipfile
+ import re
+ import html
+
+ # Try to import the UNO handler
+ try:
+     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
+ except ImportError:
+     HAS_UNO = False
+     logger.warning("⚠️ UNO is unavailable, falling back to the traditional command-line approach")

- # Configure logging
- logger = logging.getLogger(__name__)


  class DocxParser(BaseLife):
-     def __init__(self, file_path: Union[str, list], to_markdown: bool = False):
+     def __init__(
+         self,
+         file_path: Union[str, list],
+         to_markdown: bool = False,
+         use_uno: bool = True,
+     ):
          super().__init__()
          self.file_path = file_path
          self.to_markdown = to_markdown
-         logger.info(
-             f"🚀 DocxParser initialized - file path: {file_path}, to markdown: {to_markdown}"
-         )
+
+         # Prefer UNO (unless explicitly disabled)
+         if use_uno and HAS_UNO:
+             self.use_uno = True
+             logger.info(f"🚀 DocxParser initialized - using the UNO API for efficient single-threaded processing")
+         else:
+             self.use_uno = False
+             if use_uno and not HAS_UNO:
+                 logger.warning(f"⚠️ UNO is unavailable, falling back to the traditional command-line approach")
+             else:
+                 logger.info(f"🚀 DocxParser initialized - using the traditional command-line approach")
+
+         logger.info(f"📄 File path: {file_path}, to markdown: {to_markdown}")

      def docx_to_txt(self, docx_path: str, dir_path: str) -> str:
          """Convert a .docx file to a .txt file"""
          logger.info(f"🔄 Starting DOCX-to-TXT conversion - source: {docx_path}, output dir: {dir_path}")

+         if self.use_uno:
+             # Convert via the UNO API
+             try:
+                 logger.info("🎯 Converting the document via the UNO API...")
+                 txt_path = convert_with_uno(docx_path, "txt", dir_path)
+
+                 if not os.path.exists(txt_path):
+                     logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
+                     raise Exception(f"File conversion failed {docx_path} ==> {txt_path}")
+                 else:
+                     logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
+                     return txt_path
+
+             except Exception as e:
+                 logger.error(f"💥 UNO conversion failed: {str(e)}")
+                 logger.warning("⚠️ Automatically falling back to the traditional command-line approach...")
+                 return self._docx_to_txt_subprocess(docx_path, dir_path)
+         else:
+             # Use the traditional subprocess approach
+             return self._docx_to_txt_subprocess(docx_path, dir_path)
+
+     def _docx_to_txt_subprocess(self, docx_path: str, dir_path: str) -> str:
+         """Convert a .docx file to a .txt file via subprocess (traditional approach)"""
          try:
              cmd = f'soffice --headless --convert-to txt "{docx_path}" --outdir "{dir_path}"'
              logger.debug(f"⚡ Running conversion command: {cmd}")
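For orientation, a minimal usage sketch of the constructor introduced in this hunk. The use_uno and to_markdown keywords come from the diff above; the file name is a placeholder, and the parse(file_path) call assumes DocxParser exposes the same entry point as the other parsers in this package:

    from datamax.parser.docx_parser import DocxParser

    # use_uno=True (the default) prefers the LibreOffice UNO API and falls back
    # to the soffice command line when the UNO handler cannot be imported.
    parser = DocxParser("example.docx", to_markdown=False, use_uno=True)
    result = parser.parse("example.docx")  # assumed entry point, returns the MarkdownOutputVo dict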
@@ -97,11 +143,403 @@ class DocxParser(BaseLife):
              logger.error(f"💥 Error while reading the TXT file: {str(e)}")
              raise

+     def extract_all_content(self, docx_path: str) -> str:
+         """
+         Comprehensively extract all content from a DOCX file.
+         Supports multiple internal DOCX formats and storage mechanisms.
+         """
+         logger.info(f"🔍 Starting comprehensive content extraction: {docx_path}")
+
+         all_content = []
+
+         try:
+             with zipfile.ZipFile(docx_path, 'r') as docx:
+                 # 1. Check for and extract altChunk content (embedded HTML/MHT)
+                 altchunk_content = self._extract_altchunk_content_internal(docx)
+                 if altchunk_content:
+                     all_content.append(("altChunk", altchunk_content))
+
+                 # 2. Extract the standard document.xml content
+                 standard_content = self._extract_standard_document_content(docx)
+                 if standard_content:
+                     all_content.append(("standard", standard_content))
+
+                 # 3. Extract embedded object content (embeddings)
+                 embedded_content = self._extract_embedded_objects(docx)
+                 if embedded_content:
+                     all_content.append(("embedded", embedded_content))
+
+                 # 4. Extract header and footer content
+                 header_footer_content = self._extract_headers_footers(docx)
+                 if header_footer_content:
+                     all_content.append(("header_footer", header_footer_content))
+
+                 # 5. Extract comments and annotations
+                 comments_content = self._extract_comments(docx)
+                 if comments_content:
+                     all_content.append(("comments", comments_content))
+
+                 # 6. Extract text from text boxes and drawing objects
+                 textbox_content = self._extract_textbox_content(docx)
+                 if textbox_content:
+                     all_content.append(("textboxes", textbox_content))
+
+         except Exception as e:
+             logger.error(f"💥 Comprehensive content extraction failed: {str(e)}")
+             return ""
+
+         # Merge all extracted content
+         if all_content:
+             combined_content = self._combine_extracted_content(all_content)
+             logger.info(f"✅ Comprehensive extraction finished, total content length: {len(combined_content)} characters")
+             logger.debug(f"📊 Extracted content types: {[item[0] for item in all_content]}")
+             return combined_content
+
+         return ""
+
+     def _extract_altchunk_content_internal(self, docx_zip: zipfile.ZipFile) -> str:
+         """Internal helper: extract altChunk content, preferring the MHT route"""
+         try:
+             # Check document.xml for altChunk references
+             if 'word/document.xml' in docx_zip.namelist():
+                 doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
+                 if 'altChunk' in doc_xml:
+                     logger.info("🔍 Detected altChunk format")
+
+                     # Look for MHT files first (the simpler path)
+                     mht_files = [f for f in docx_zip.namelist() if f.endswith('.mht') and 'word/' in f]
+                     html_files = [f for f in docx_zip.namelist() if f.endswith('.html') and 'word/' in f]
+
+                     # Process MHT files first
+                     for filename in mht_files:
+                         logger.info(f"📄 Processing MHT file first: {filename}")
+                         content = docx_zip.read(filename).decode('utf-8', errors='replace')
+                         return self._extract_html_from_mht(content)
+
+                     # If there are no MHT files, process HTML files instead
+                     for filename in html_files:
+                         logger.info(f"📄 Processing HTML file: {filename}")
+                         content = docx_zip.read(filename).decode('utf-8', errors='replace')
+                         return self._html_to_clean_text(content)
+
+             return ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract altChunk content: {str(e)}")
+             return ""
+
+     def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
+         """Extract the standard document.xml content"""
+         try:
+             if 'word/document.xml' in docx_zip.namelist():
+                 doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
+
+                 # Extract text content with regular expressions
+                 import xml.etree.ElementTree as ET
+
+                 # Strip namespace prefixes to simplify processing
+                 doc_xml_clean = re.sub(r'xmlns[^=]*="[^"]*"', '', doc_xml)
+                 doc_xml_clean = re.sub(r'w:', '', doc_xml_clean)
+                 doc_xml_clean = re.sub(r'[a-zA-Z0-9]+:', '', doc_xml_clean)
+
+                 # Extract the text inside all <t> tags
+                 text_matches = re.findall(r'<t[^>]*>(.*?)</t>', doc_xml_clean, re.DOTALL)
+                 if text_matches:
+                     content = ' '.join(text_matches)
+                     content = html.unescape(content)
+                     logger.info(f"📝 Extracted text from document.xml: {len(content)} characters")
+                     return content.strip()
+             return ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract standard document content: {str(e)}")
+             return ""
+
+     def _extract_embedded_objects(self, docx_zip: zipfile.ZipFile) -> str:
+         """Extract embedded object content"""
+         try:
+             embedded_content = []
+
+             # Look for embedded document objects
+             for filename in docx_zip.namelist():
+                 if 'word/embeddings/' in filename:
+                     logger.info(f"📎 Found embedded object: {filename}")
+                     # Further handling could go here based on file type
+                     # e.g. .docx, .xlsx, .txt, etc.
+
+             return ' '.join(embedded_content) if embedded_content else ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract embedded objects: {str(e)}")
+             return ""
+
+     def _extract_headers_footers(self, docx_zip: zipfile.ZipFile) -> str:
+         """Extract header and footer content"""
+         try:
+             header_footer_content = []
+
+             for filename in docx_zip.namelist():
+                 if ('word/header' in filename or 'word/footer' in filename) and filename.endswith('.xml'):
+                     logger.debug(f"📄 Processing header/footer: {filename}")
+                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
+
+                     # Extract the text content
+                     text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', content, re.DOTALL)
+                     if text_matches:
+                         header_footer_text = ' '.join(text_matches)
+                         header_footer_text = html.unescape(header_footer_text)
+                         if header_footer_text.strip():
+                             header_footer_content.append(header_footer_text.strip())
+
+             if header_footer_content:
+                 logger.info(f"📑 Extracted header/footer content: {len(header_footer_content)} item(s)")
+
+             return ' '.join(header_footer_content) if header_footer_content else ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract headers/footers: {str(e)}")
+             return ""
+
+     def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
+         """Extract comment and annotation content"""
+         try:
+             if 'word/comments.xml' in docx_zip.namelist():
+                 comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
+
+                 # Extract comment text
+                 text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', comments_xml, re.DOTALL)
+                 if text_matches:
+                     comments_text = ' '.join(text_matches)
+                     comments_text = html.unescape(comments_text)
+                     logger.info(f"💬 Extracted comment content: {len(comments_text)} characters")
+                     return comments_text.strip()
+
+             return ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract comments: {str(e)}")
+             return ""
+
+     def _extract_textbox_content(self, docx_zip: zipfile.ZipFile) -> str:
+         """Extract text from text boxes and drawing objects"""
+         try:
+             textbox_content = []
+
+             # Look for files that may contain text boxes
+             for filename in docx_zip.namelist():
+                 if 'word/' in filename and filename.endswith('.xml'):
+                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
+
+                     # Find text-box content (w:txbxContent)
+                     textbox_matches = re.findall(r'<w:txbxContent[^>]*>(.*?)</w:txbxContent>', content, re.DOTALL)
+                     for match in textbox_matches:
+                         text_matches = re.findall(r'<w:t[^>]*>(.*?)</w:t>', match, re.DOTALL)
+                         if text_matches:
+                             textbox_text = ' '.join(text_matches)
+                             textbox_text = html.unescape(textbox_text)
+                             if textbox_text.strip():
+                                 textbox_content.append(textbox_text.strip())
+
+             if textbox_content:
+                 logger.info(f"📦 Extracted text-box content: {len(textbox_content)} item(s)")
+
+             return ' '.join(textbox_content) if textbox_content else ""
+         except Exception as e:
+             logger.error(f"💥 Failed to extract text-box content: {str(e)}")
+             return ""
+
+     def _combine_extracted_content(self, content_list: list) -> str:
+         """Merge the various pieces of extracted content"""
+         combined = []
+
+         # Order content by importance
+         priority_order = ["altChunk", "standard", "header_footer", "textboxes", "comments", "embedded"]
+
+         for content_type in priority_order:
+             for item_type, content in content_list:
+                 if item_type == content_type and content.strip():
+                     combined.append(content.strip())
+
+         # Append any remaining, uncategorized content
+         for item_type, content in content_list:
+             if item_type not in priority_order and content.strip():
+                 combined.append(content.strip())
+
+         return '\n\n'.join(combined) if combined else ""
+
+     def _extract_html_from_mht(self, mht_content: str) -> str:
+         """Extract the HTML part from MHT content and convert it to clean text"""
+         try:
+             # MHT files use the MIME format; look for the HTML part
+             lines = mht_content.split('\n')
+             in_html_section = False
+             html_lines = []
+             skip_headers = True
+
+             for line in lines:
+                 # Detect the start of the HTML part
+                 if 'Content-Type: text/html' in line:
+                     in_html_section = True
+                     skip_headers = True
+                     continue
+
+                 # Inside the HTML part
+                 if in_html_section:
+                     # Skip Content-* headers
+                     if skip_headers and line.strip() and not line.startswith('Content-'):
+                         skip_headers = False
+
+                     # A blank line marks the end of the headers and the start of the content
+                     if skip_headers and not line.strip():
+                         skip_headers = False
+                         continue
+
+                     # Check whether the next MIME part has been reached
+                     if line.startswith('------=') and len(html_lines) > 0:
+                         # End of the HTML part
+                         break
+
+                     # Collect HTML content
+                     if not skip_headers:
+                         html_lines.append(line)
+
+             # Join all HTML lines
+             html_content = '\n'.join(html_lines)
+
+             # Decode quoted-printable encoding
+             if '=3D' in html_content or '=\n' in html_content:
+                 try:
+                     import quopri
+                     html_content = quopri.decodestring(html_content.encode()).decode('utf-8', errors='replace')
+                     logger.info("📧 Decoded quoted-printable encoding")
+                 except Exception as e:
+                     logger.warning(f"⚠️ quoted-printable decoding failed: {str(e)}")
+
+             logger.debug(f"📄 Extracted HTML content length: {len(html_content)} characters")
+
+             # Convert to clean text
+             return self._html_to_clean_text(html_content)
+
+         except Exception as e:
+             logger.error(f"💥 Failed to extract HTML from MHT: {str(e)}")
+             return ""
+
+     def _html_to_clean_text(self, html_content: str) -> str:
+         """Convert HTML content to clean plain text, tuned for MHT content"""
+         try:
+             # Decode HTML entities first
+             text = html.unescape(html_content)
+
+             # First try to extract everything inside the <body> tag
+             body_match = re.search(r'<body[^>]*>(.*?)</body>', text, re.DOTALL | re.IGNORECASE)
+             if body_match:
+                 main_content = body_match.group(1)
+                 logger.info("📄 Extracted the <body> tag content")
+             else:
+                 main_content = text
+                 logger.info("📄 Using the full content (no body tag found)")
+
+             # Special-case <pre><code> tags to preserve their inner formatting
+             pre_code_blocks = []
+             def preserve_pre_code(match):
+                 idx = len(pre_code_blocks)
+                 pre_code_blocks.append(match.group(1))
+                 return f"__PRE_CODE_{idx}__"
+
+             main_content = re.sub(r'<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
+                                   preserve_pre_code, main_content, flags=re.DOTALL | re.IGNORECASE)
+
+             # Handle the remaining HTML structure
+             # 1. Convert tags whose line breaks should be preserved
+             main_content = re.sub(r'<br\s*/?>', '\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</p>', '\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'<p[^>]*>', '', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</div>', '\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'<div[^>]*>', '', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</h[1-6]>', '\n\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'<h[1-6][^>]*>', '', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</li>', '\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'<li[^>]*>', '• ', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</tr>', '\n', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</td>', ' | ', main_content, flags=re.IGNORECASE)
+             main_content = re.sub(r'</th>', ' | ', main_content, flags=re.IGNORECASE)
+
+             # 2. Remove style and script tags along with their content
+             main_content = re.sub(r'<style[^>]*>.*?</style>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
+             main_content = re.sub(r'<script[^>]*>.*?</script>', '', main_content, flags=re.DOTALL | re.IGNORECASE)
+
+             # 3. Remove all remaining HTML tags
+             main_content = re.sub(r'<[^>]+>', '', main_content)
+
+             # 4. Decode HTML entities (a second pass to ensure full decoding)
+             main_content = html.unescape(main_content)
+
+             # 5. Restore the <pre><code> block content
+             for idx, pre_code_content in enumerate(pre_code_blocks):
+                 # Clean up the pre_code content
+                 cleaned_pre_code = html.unescape(pre_code_content)
+                 main_content = main_content.replace(f"__PRE_CODE_{idx}__", cleaned_pre_code)
+
+             # 6. Clean up extra whitespace while keeping the paragraph structure
+             lines = main_content.split('\n')
+             cleaned_lines = []
+
+             for line in lines:
+                 # Strip leading and trailing spaces on each line
+                 line = line.strip()
+                 # Keep non-empty lines
+                 if line:
+                     # Collapse extra spaces within the line
+                     line = re.sub(r'[ \t]+', ' ', line)
+                     # Normalize spaces around table separators
+                     line = re.sub(r'\s*\|\s*', ' | ', line)
+                     cleaned_lines.append(line)
+                 else:
+                     # Keep a blank line as a paragraph separator
+                     if cleaned_lines and cleaned_lines[-1] != '':
+                         cleaned_lines.append('')
+
+             # 7. Join the cleaned lines
+             main_content = '\n'.join(cleaned_lines)
+
+             # 8. Final cleanup: remove excess blank lines
+             main_content = re.sub(r'\n{3,}', '\n\n', main_content)
+             main_content = main_content.strip()
+
+             logger.info(f"📝 Converted HTML content to clean text: {len(main_content)} characters")
+
+             return main_content
+
+         except Exception as e:
+             logger.error(f"💥 HTML-to-clean-text conversion failed: {str(e)}")
+             # If conversion fails, return a minimally cleaned version of the original text
+             return re.sub(r'<[^>]+>', '', html_content)
+
+     def _html_to_text(self, html_content: str) -> str:
+         """Convert HTML content to plain text (kept for other HTML content)"""
+         # For non-MHT HTML content, use this more general method
+         return self._html_to_clean_text(html_content)
+
+     def extract_altchunk_content(self, docx_path: str) -> Optional[str]:
+         """
+         Extract content from a DOCX file that contains altChunk (kept for backward compatibility)
+         """
+         try:
+             with zipfile.ZipFile(docx_path, 'r') as docx:
+                 return self._extract_altchunk_content_internal(docx)
+         except Exception as e:
+             logger.error(f"💥 Failed to extract altChunk content: {str(e)}")
+             return None
+
      def read_docx_file(self, docx_path: str) -> str:
          """Read a docx file and convert it to text"""
          logger.info(f"📖 Starting to read DOCX file - file: {docx_path}")

          try:
+             # First try comprehensive extraction of all content
+             comprehensive_content = self.extract_all_content(docx_path)
+             if comprehensive_content and comprehensive_content.strip():
+                 logger.info(f"✨ Comprehensive extraction succeeded, content length: {len(comprehensive_content)} characters")
+                 return comprehensive_content
+
+             # If comprehensive extraction fails, use the traditional conversion path
+             logger.info("🔄 Comprehensive extraction failed or returned empty content; using the traditional conversion path")
+
              with tempfile.TemporaryDirectory() as temp_path:
                  logger.debug(f"📁 Created temporary directory: {temp_path}")

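The helpers added in this hunk read the .docx ZIP container directly with zipfile, pull text out of the WordprocessingML XML with regular expressions, and unescape HTML entities. A self-contained sketch of that core pattern (mirroring the header/comment helpers above, not part of the package), with a placeholder file name:

    import html
    import re
    import zipfile

    def sketch_docx_text(path: str) -> str:
        # Read word/document.xml straight from the ZIP container and collect the
        # text of every <w:t> run, as the new extraction helpers do.
        with zipfile.ZipFile(path, "r") as z:
            xml = z.read("word/document.xml").decode("utf-8", errors="replace")
        runs = re.findall(r"<w:t[^>]*>(.*?)</w:t>", xml, re.DOTALL)
        return html.unescape(" ".join(runs)).strip()

    print(sketch_docx_text("example.docx"))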
@@ -114,7 +552,7 @@ class DocxParser(BaseLife):
                  # Convert DOCX to TXT
                  txt_file_path = self.docx_to_txt(str(file_path), str(temp_path))
                  logger.info(f"🎯 DOCX-to-TXT conversion finished: {txt_file_path}")
- 
+
                  # Read the TXT file content
                  content = self.read_txt_file(txt_file_path)
                  logger.info(f"✨ Finished reading TXT content, length: {len(content)} characters")
@@ -152,7 +590,7 @@ class DocxParser(BaseLife):
              if file_size == 0:
                  logger.warning(f"⚠️ File size is 0 bytes: {file_path}")

-             title = self.get_file_extension(file_path)
+             title = os.path.splitext(os.path.basename(file_path))[0]
              logger.debug(f"🏷️ Extracted file title: {title}")

              # Convert to txt with soffice, then read the content
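The second hunk above replaces the old get_file_extension(file_path) title with the file's base name minus its extension; the epub and html parsers below receive the same fix. A quick illustration with a hypothetical path:

    import os

    file_path = "/data/reports/annual_report.docx"  # hypothetical example
    title = os.path.splitext(os.path.basename(file_path))[0]
    print(title)  # annual_report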
datamax/parser/epub_parser.py
@@ -1,41 +1,41 @@
- import ebooklib
- from typing import Union
- from bs4 import BeautifulSoup
- from ebooklib import epub
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
-
-
- class EpubParser(BaseLife):
-     def __init__(self, file_path: Union[str, list]):
-         super().__init__()
-         self.file_path = file_path
-
-     @staticmethod
-     def read_epub_file(file_path: str) -> str:
-         try:
-             book = epub.read_epub(file_path)
-             content = ""
-             for item in book.get_items():
-                 if item.get_type() == ebooklib.ITEM_DOCUMENT:
-                     chapter_content = item.get_content().decode('utf-8')
-                     soup = BeautifulSoup(chapter_content, 'html.parser')
-                     text = soup.get_text()
-                     text = text.replace('\u3000', ' ')
-                     content += text
-             return content
-         except Exception as e:
-             raise e
-
-     def parse(self, file_path: str) -> MarkdownOutputVo:
-         try:
-             title = self.get_file_extension(file_path)
-             content = self.read_epub_file(file_path=file_path)
-             mk_content = content
-             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
-             output_vo = MarkdownOutputVo(title, mk_content)
-             output_vo.add_lifecycle(lifecycle)
-             return output_vo.to_dict()
-         except Exception as e:
-             raise e
+ import ebooklib
+ from typing import Union
+ from bs4 import BeautifulSoup
+ from ebooklib import epub
+ from datamax.parser.base import BaseLife
+ from datamax.parser.base import MarkdownOutputVo
+ import os
+
+ class EpubParser(BaseLife):
+     def __init__(self, file_path: Union[str, list]):
+         super().__init__()
+         self.file_path = file_path
+
+     @staticmethod
+     def read_epub_file(file_path: str) -> str:
+         try:
+             book = epub.read_epub(file_path)
+             content = ""
+             for item in book.get_items():
+                 if item.get_type() == ebooklib.ITEM_DOCUMENT:
+                     chapter_content = item.get_content().decode('utf-8')
+                     soup = BeautifulSoup(chapter_content, 'html.parser')
+                     text = soup.get_text()
+                     text = text.replace('\u3000', ' ')
+                     content += text
+             return content
+         except Exception as e:
+             raise e
+
+     def parse(self, file_path: str) -> MarkdownOutputVo:
+         try:
+             title = os.path.splitext(os.path.basename(file_path))[0]
+             content = self.read_epub_file(file_path=file_path)
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+             output_vo = MarkdownOutputVo(title, mk_content)
+             output_vo.add_lifecycle(lifecycle)
+             return output_vo.to_dict()
+         except Exception as e:
+             raise e
datamax/parser/html_parser.py
@@ -1,38 +1,38 @@
- from typing import Union
- import pathlib
- import sys
-
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
- sys.path.insert(0, str(ROOT_DIR))
- from datamax.parser.base import BaseLife
- from datamax.parser.base import MarkdownOutputVo
- from bs4 import BeautifulSoup
-
-
- class HtmlParser(BaseLife):
-     def __init__(self, file_path: Union[str, list]):
-         super().__init__()
-         self.file_path = file_path
-
-     @staticmethod
-     def read_html_file(file_path: str) -> str:
-         try:
-             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                 data = f.read()
-             soup = BeautifulSoup(data, 'html.parser')
-             return soup.get_text(separator='\n', strip=True)
-         except Exception:
-             raise
-
-     def parse(self, file_path: str) -> MarkdownOutputVo:
-         try:
-             title = self.get_file_extension(file_path)
-             content = self.read_html_file(file_path=file_path)
-             mk_content = content
-             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
-             output_vo = MarkdownOutputVo(title, mk_content)
-             output_vo.add_lifecycle(lifecycle)
-             return output_vo.to_dict()
-         except Exception:
+ from typing import Union
+ import pathlib
+ import sys
+
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+ sys.path.insert(0, str(ROOT_DIR))
+ from datamax.parser.base import BaseLife
+ from datamax.parser.base import MarkdownOutputVo
+ from bs4 import BeautifulSoup
+ import os
+
+ class HtmlParser(BaseLife):
+     def __init__(self, file_path: Union[str, list]):
+         super().__init__()
+         self.file_path = file_path
+
+     @staticmethod
+     def read_html_file(file_path: str) -> str:
+         try:
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 data = f.read()
+             soup = BeautifulSoup(data, 'html.parser')
+             return soup.get_text(separator='\n', strip=True)
+         except Exception:
+             raise
+
+     def parse(self, file_path: str) -> MarkdownOutputVo:
+         try:
+             title = os.path.splitext(os.path.basename(file_path))[0]
+             content = self.read_html_file(file_path=file_path)
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+             output_vo = MarkdownOutputVo(title, mk_content)
+             output_vo.add_lifecycle(lifecycle)
+             return output_vo.to_dict()
+         except Exception:
              raise
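A minimal sketch of calling the two rewritten parsers, based on the parse() signatures shown above; the file names are placeholders, and the exact keys of the returned dict depend on MarkdownOutputVo.to_dict():

    from datamax.parser.epub_parser import EpubParser
    from datamax.parser.html_parser import HtmlParser

    # Each parse() call extracts plain text and attaches a lifecycle entry
    # (domain="Technology", usage_purpose="Documentation", life_type="LLM_ORIGIN").
    epub_result = EpubParser("book.epub").parse("book.epub")
    html_result = HtmlParser("page.html").parse("page.html")
    print(epub_result)
    print(html_result)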