pydatamax 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pydatamax-0.1.14/PKG-INFO +228 -0
  2. pydatamax-0.1.14/README.md +167 -0
  3. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/loader/core.py +2 -2
  4. pydatamax-0.1.14/datamax/parser/doc_parser.py +203 -0
  5. pydatamax-0.1.14/datamax/parser/docx_parser.py +224 -0
  6. pydatamax-0.1.14/datamax/parser/xlsx_parser.py +215 -0
  7. pydatamax-0.1.14/pydatamax.egg-info/PKG-INFO +228 -0
  8. {pydatamax-0.1.12 → pydatamax-0.1.14}/pydatamax.egg-info/SOURCES.txt +2 -2
  9. {pydatamax-0.1.12 → pydatamax-0.1.14}/pydatamax.egg-info/requires.txt +2 -1
  10. {pydatamax-0.1.12 → pydatamax-0.1.14}/setup.py +16 -15
  11. pydatamax-0.1.12/PKG-INFO +0 -281
  12. pydatamax-0.1.12/README.md +0 -221
  13. pydatamax-0.1.12/datamax/parser/doc_parser.py +0 -77
  14. pydatamax-0.1.12/datamax/parser/docx_parser.py +0 -43
  15. pydatamax-0.1.12/datamax/parser/xlsx_parser.py +0 -71
  16. pydatamax-0.1.12/pydatamax.egg-info/PKG-INFO +0 -281
  17. {pydatamax-0.1.12 → pydatamax-0.1.14}/LICENSE +0 -0
  18. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/__init__.py +0 -0
  19. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/loader/__init__.py +0 -0
  20. /pydatamax-0.1.12/datamax/loader/MinioHandler.py → /pydatamax-0.1.14/datamax/loader/minio_handler.py +0 -0
  21. /pydatamax-0.1.12/datamax/loader/OssHandler.py → /pydatamax-0.1.14/datamax/loader/oss_handler.py +0 -0
  22. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/__init__.py +0 -0
  23. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/base.py +0 -0
  24. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/core.py +0 -0
  25. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/csv_parser.py +0 -0
  26. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/epub_parser.py +0 -0
  27. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/html_parser.py +0 -0
  28. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/image_parser.py +0 -0
  29. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/json_parser.py +0 -0
  30. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/md_parser.py +0 -0
  31. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/pdf_parser.py +0 -0
  32. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/ppt_parser.py +0 -0
  33. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/pptx_parser.py +0 -0
  34. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/txt_parser.py +0 -0
  35. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/parser/xls_parser.py +0 -0
  36. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/__init__.py +0 -0
  37. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/constants.py +0 -0
  38. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/data_cleaner.py +0 -0
  39. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/env_setup.py +0 -0
  40. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/gotocr_pdf.py +0 -0
  41. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/mineru_operator.py +0 -0
  42. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/paddleocr_pdf_operator.py +0 -0
  43. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/ppt_extract.py +0 -0
  44. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/qa_generator.py +0 -0
  45. {pydatamax-0.1.12 → pydatamax-0.1.14}/datamax/utils/tokenizer.py +0 -0
  46. {pydatamax-0.1.12 → pydatamax-0.1.14}/pydatamax.egg-info/dependency_links.txt +0 -0
  47. {pydatamax-0.1.12 → pydatamax-0.1.14}/pydatamax.egg-info/top_level.txt +0 -0
  48. {pydatamax-0.1.12 → pydatamax-0.1.14}/setup.cfg +0 -0
  49. {pydatamax-0.1.12 → pydatamax-0.1.14}/tests/__init__.py +0 -0
  50. {pydatamax-0.1.12 → pydatamax-0.1.14}/tests/test_basic.py +0 -0
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.14
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/Hi-Dolphin/datamax
6
+ Author: ccy
7
+ Author-email: cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: oss2<3.0.0,>=2.19.1
15
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
16
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
17
+ Requires-Dist: crcmod<2.0.0,>=1.7
18
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
19
+ Requires-Dist: loguru<1.0.0,>=0.7.3
20
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
21
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
22
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
23
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
24
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
25
+ Requires-Dist: pandas<3.0.0,>=2.2.3
26
+ Requires-Dist: numpy<3.0.0,>=2.2.6
27
+ Requires-Dist: requests<3.0.0,>=2.32.3
28
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
29
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
30
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
31
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
32
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
33
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
34
+ Requires-Dist: packaging<25.0,>=24.2
35
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
36
+ Requires-Dist: minio<8.0.0,>=7.2.15
37
+ Requires-Dist: openai<2.0.0,>=1.82.0
38
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
39
+ Requires-Dist: chardet<6.0.0,>=5.2.0
40
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
41
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
42
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
43
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
44
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
45
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
46
+ Requires-Dist: markdown<4.0.0,>=3.8
47
+ Requires-Dist: langchain<1.0.0,>=0.3.0
48
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
49
+ Requires-Dist: ebooklib==0.19
50
+ Requires-Dist: setuptools
51
+ Dynamic: author
52
+ Dynamic: author-email
53
+ Dynamic: classifier
54
+ Dynamic: description
55
+ Dynamic: description-content-type
56
+ Dynamic: home-page
57
+ Dynamic: license-file
58
+ Dynamic: requires-dist
59
+ Dynamic: requires-python
60
+ Dynamic: summary
61
+
62
+ # DataMax
63
+
64
+ <div align="center">
65
+
66
+ [中文](README_zh.md) | **English**
67
+
68
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
69
+
70
+ </div>
71
+
72
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
73
+
74
+ ## ✨ Core Features
75
+
76
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
77
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
78
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
79
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
80
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
81
+
82
+ ## 🚀 Quick Start
83
+
84
+ ### Installation
85
+
86
+ ```bash
87
+ pip install pydatamax
88
+ ```
89
+
90
+ ### Basic Usage
91
+
92
+ ```python
93
+ from datamax import DataMax
94
+
95
+ # Parse a single file
96
+ dm = DataMax(file_path="document.pdf")
97
+ data = dm.get_data()
98
+
99
+ # Batch processing
100
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
101
+ data = dm.get_data()
102
+
103
+ # Data cleaning
104
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
105
+
106
+ # AI annotation
107
+ qa_data = dm.get_pre_label(
108
+ api_key="your-api-key",
109
+ base_url="https://api.openai.com/v1",
110
+ model_name="gpt-3.5-turbo"
111
+ )
112
+ ```
113
+
114
+ ## 📖 Detailed Documentation
115
+
116
+ ### File Parsing
117
+
118
+ #### Supported Formats
119
+
120
+ | Format | Extensions | Special Features |
121
+ |--------|------------|------------------|
122
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
123
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
124
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
125
+ | Web | `.html`, `.epub` | Tag parsing |
126
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
127
+ | Text | `.txt` | Automatic encoding detection |
128
+
129
+ #### Advanced Features
130
+
131
+ ```python
132
+ # Advanced PDF parsing (requires MinerU)
133
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
134
+
135
+ # Word to Markdown conversion
136
+ dm = DataMax(file_path="document.docx", to_markdown=True)
137
+
138
+ # Image OCR
139
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
140
+ ```
141
+
142
+ ### Data Cleaning
143
+
144
+ ```python
145
+ # Three cleaning modes
146
+ dm.clean_data(method_list=[
147
+ "abnormal", # Anomaly data processing
148
+ "private", # Privacy information masking
149
+ "filter" # Text filtering and normalization
150
+ ])
151
+ ```
152
+
153
+ ### AI Annotation
154
+
155
+ ```python
156
+ # Custom annotation tasks
157
+ qa_data = dm.get_pre_label(
158
+ api_key="sk-xxx",
159
+ base_url="https://api.provider.com/v1",
160
+ model_name="model-name",
161
+ chunk_size=500, # Text chunk size
162
+ chunk_overlap=100, # Overlap length
163
+ question_number=5, # Questions per chunk
164
+ max_workers=5 # Concurrency
165
+ )
166
+ ```
167
+
168
+ ## ⚙️ Environment Setup
169
+
170
+ ### Optional Dependencies
171
+
172
+ #### LibreOffice (DOC file support)
173
+
174
+ **Ubuntu/Debian:**
175
+ ```bash
176
+ sudo apt-get install libreoffice
177
+ ```
178
+
179
+ **Windows:**
180
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
181
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
182
+
183
+ #### MinerU (Advanced PDF parsing)
184
+
185
+ ```bash
186
+ # Create virtual environment
187
+ conda create -n mineru python=3.10
188
+ conda activate mineru
189
+
190
+ # Install MinerU
191
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
192
+ ```
193
+
194
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
195
+
196
+ ## 🛠️ Development
197
+
198
+ ### Local Installation
199
+
200
+ ```bash
201
+ git clone https://github.com/Hi-Dolphin/datamax.git
202
+ cd datamax
203
+ pip install -r requirements.txt
204
+ python setup.py install
205
+ ```
206
+
207
+ ## 📋 System Requirements
208
+
209
+ - Python >= 3.10
210
+ - Supports Windows, macOS, Linux
211
+
212
+ ## 🤝 Contributing
213
+
214
+ Issues and Pull Requests are welcome!
215
+
216
+ ## 📄 License
217
+
218
+ This project is licensed under the [MIT License](LICENSE).
219
+
220
+ ## 📞 Contact Us
221
+
222
+ - 📧 Email: cy.kron@foxmail.com
223
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
224
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
225
+
226
+ ---
227
+
228
+ ⭐ If this project helps you, please give us a star!
@@ -0,0 +1,167 @@
1
+ # DataMax
2
+
3
+ <div align="center">
4
+
5
+ [中文](README_zh.md) | **English**
6
+
7
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ </div>
10
+
11
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
12
+
13
+ ## ✨ Core Features
14
+
15
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
16
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
17
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
18
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
19
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
20
+
21
+ ## 🚀 Quick Start
22
+
23
+ ### Installation
24
+
25
+ ```bash
26
+ pip install pydatamax
27
+ ```
28
+
29
+ ### Basic Usage
30
+
31
+ ```python
32
+ from datamax import DataMax
33
+
34
+ # Parse a single file
35
+ dm = DataMax(file_path="document.pdf")
36
+ data = dm.get_data()
37
+
38
+ # Batch processing
39
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
40
+ data = dm.get_data()
41
+
42
+ # Data cleaning
43
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
44
+
45
+ # AI annotation
46
+ qa_data = dm.get_pre_label(
47
+ api_key="your-api-key",
48
+ base_url="https://api.openai.com/v1",
49
+ model_name="gpt-3.5-turbo"
50
+ )
51
+ ```
52
+
53
+ ## 📖 Detailed Documentation
54
+
55
+ ### File Parsing
56
+
57
+ #### Supported Formats
58
+
59
+ | Format | Extensions | Special Features |
60
+ |--------|------------|------------------|
61
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
62
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
63
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
64
+ | Web | `.html`, `.epub` | Tag parsing |
65
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
66
+ | Text | `.txt` | Automatic encoding detection |
67
+
68
+ #### Advanced Features
69
+
70
+ ```python
71
+ # Advanced PDF parsing (requires MinerU)
72
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
73
+
74
+ # Word to Markdown conversion
75
+ dm = DataMax(file_path="document.docx", to_markdown=True)
76
+
77
+ # Image OCR
78
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
79
+ ```
80
+
81
+ ### Data Cleaning
82
+
83
+ ```python
84
+ # Three cleaning modes
85
+ dm.clean_data(method_list=[
86
+ "abnormal", # Anomaly data processing
87
+ "private", # Privacy information masking
88
+ "filter" # Text filtering and normalization
89
+ ])
90
+ ```
91
+
92
+ ### AI Annotation
93
+
94
+ ```python
95
+ # Custom annotation tasks
96
+ qa_data = dm.get_pre_label(
97
+ api_key="sk-xxx",
98
+ base_url="https://api.provider.com/v1",
99
+ model_name="model-name",
100
+ chunk_size=500, # Text chunk size
101
+ chunk_overlap=100, # Overlap length
102
+ question_number=5, # Questions per chunk
103
+ max_workers=5 # Concurrency
104
+ )
105
+ ```
106
+
107
+ ## ⚙️ Environment Setup
108
+
109
+ ### Optional Dependencies
110
+
111
+ #### LibreOffice (DOC file support)
112
+
113
+ **Ubuntu/Debian:**
114
+ ```bash
115
+ sudo apt-get install libreoffice
116
+ ```
117
+
118
+ **Windows:**
119
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
120
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
121
+
122
+ #### MinerU (Advanced PDF parsing)
123
+
124
+ ```bash
125
+ # Create virtual environment
126
+ conda create -n mineru python=3.10
127
+ conda activate mineru
128
+
129
+ # Install MinerU
130
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
131
+ ```
132
+
133
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
134
+
135
+ ## 🛠️ Development
136
+
137
+ ### Local Installation
138
+
139
+ ```bash
140
+ git clone https://github.com/Hi-Dolphin/datamax.git
141
+ cd datamax
142
+ pip install -r requirements.txt
143
+ python setup.py install
144
+ ```
145
+
146
+ ## 📋 System Requirements
147
+
148
+ - Python >= 3.10
149
+ - Supports Windows, macOS, Linux
150
+
151
+ ## 🤝 Contributing
152
+
153
+ Issues and Pull Requests are welcome!
154
+
155
+ ## 📄 License
156
+
157
+ This project is licensed under the [MIT License](LICENSE).
158
+
159
+ ## 📞 Contact Us
160
+
161
+ - 📧 Email: cy.kron@foxmail.com
162
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
163
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
164
+
165
+ ---
166
+
167
+ ⭐ If this project helps you, please give us a star!
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  from typing import List
3
- from datamax.loader.MinioHandler import MinIOClient
4
- from datamax.loader.OssHandler import OssClient
3
+ from datamax.loader.minio_handler import MinIOClient
4
+ from datamax.loader.oss_handler import OssClient
5
5
 
6
6
 
7
7
  class DataLoader:
@@ -0,0 +1,203 @@
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Union
8
+
9
+ import chardet
10
+
11
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
12
+
13
+ # Configure module-level logging
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class DocParser(BaseLife):
    """Parser for legacy ``.doc`` files.

    Converts the document to plain text by shelling out to LibreOffice
    (``soffice --headless --convert-to txt``), reads the produced text file
    with charset detection, and optionally reflows the content into a simple
    markdown form.
    """

    def __init__(self, file_path: Union[str, list], to_markdown: bool = False):
        super().__init__()
        # Path (or list of paths) of the .doc file(s) to parse.
        self.file_path = file_path
        # When True, parse() post-processes the text via format_as_markdown().
        self.to_markdown = to_markdown
        logger.info(f"🚀 DocParser初始化完成 - 文件路径: {file_path}, 转换为markdown: {to_markdown}")

    def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
        """Convert a .doc file to .txt via LibreOffice and return the .txt path.

        Raises:
            Exception: if the conversion command fails or the expected output
                file is not produced.
        """
        logger.info(f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}")

        try:
            # Security fix: pass an argument list with shell=False. The previous
            # implementation interpolated the paths into a shell string
            # (shell=True), which allowed command injection through file names
            # containing quotes or shell metacharacters.
            cmd = [
                "soffice",
                "--headless",
                "--convert-to",
                "txt",
                doc_path,
                "--outdir",
                dir_path,
            ]
            logger.debug(f"⚡ 执行转换命令: {cmd}")

            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = process.communicate()
            exit_code = process.returncode

            if exit_code == 0:
                logger.info(f"✅ DOC到TXT转换成功 - 退出码: {exit_code}")
                if stdout:
                    logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
            else:
                # stderr encoding is unknown (depends on locale); detect before decoding.
                encoding = chardet.detect(stderr)["encoding"]
                if encoding is None:
                    encoding = "utf-8"
                error_msg = stderr.decode(encoding, errors="replace")
                logger.error(f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
                raise Exception(
                    f"Error Output (detected encoding: {encoding}): {error_msg}"
                )

            # soffice writes "<stem>.txt" into dir_path; verify it exists.
            fname = str(Path(doc_path).stem)
            txt_path = os.path.join(dir_path, f"{fname}.txt")

            if not os.path.exists(txt_path):
                logger.error(f"❌ 转换后的TXT文件不存在: {txt_path}")
                raise Exception(f"文件转换失败 {doc_path} ==> {txt_path}")
            else:
                logger.info(f"🎉 TXT文件转换成功,文件路径: {txt_path}")
                return txt_path

        except subprocess.SubprocessError as e:
            logger.error(f"💥 subprocess执行失败: {str(e)}")
            raise Exception(f"执行转换命令时发生错误: {str(e)}")
        except Exception as e:
            logger.error(f"💥 DOC到TXT转换过程中发生未知错误: {str(e)}")
            raise

    def read_txt_file(self, txt_path: str) -> str:
        """Read a text file, detecting its encoding with chardet first.

        Raises:
            Exception: if the file does not exist or cannot be read.
        """
        logger.info(f"📖 开始读取TXT文件: {txt_path}")

        try:
            # Detect the file encoding from the raw bytes.
            with open(txt_path, "rb") as f:
                raw_data = f.read()
                encoding = chardet.detect(raw_data)["encoding"]
                if encoding is None:
                    encoding = "utf-8"
                logger.debug(f"🔍 检测到文件编码: {encoding}")

            # Re-read as text using the detected encoding; replace undecodable bytes.
            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                content = f.read()

            logger.info(f"📄 TXT文件读取完成 - 内容长度: {len(content)} 字符")
            logger.debug(f"👀 前100字符预览: {content[:100]}...")

            return content

        except FileNotFoundError as e:
            logger.error(f"🚫 TXT文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {txt_path}")
        except Exception as e:
            logger.error(f"💥 读取TXT文件时发生错误: {str(e)}")
            raise

    def read_doc_file(self, doc_path: str) -> str:
        """Convert a .doc file to text (inside a temp dir) and return the content.

        The source file is copied into a temporary directory first so that
        LibreOffice never writes next to the original document.

        Raises:
            Exception: on missing file, permission error, or conversion failure.
        """
        logger.info(f"📖 开始读取DOC文件 - 文件: {doc_path}")

        try:
            with tempfile.TemporaryDirectory() as temp_path:
                logger.debug(f"📁 创建临时目录: {temp_path}")

                temp_dir = Path(temp_path)

                file_path = temp_dir / "tmp.doc"
                shutil.copy(doc_path, file_path)
                logger.debug(f"📋 复制文件到临时目录: {doc_path} -> {file_path}")

                # Convert the copy to TXT inside the temp dir.
                txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
                logger.info(f"🎯 DOC转TXT完成: {txt_file_path}")

                # Read the converted text before the temp dir is cleaned up.
                content = self.read_txt_file(txt_file_path)
                logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")

                return content

        except FileNotFoundError as e:
            logger.error(f"🚫 文件未找到: {str(e)}")
            raise Exception(f"文件未找到: {doc_path}")
        except PermissionError as e:
            logger.error(f"🔒 文件权限错误: {str(e)}")
            raise Exception(f"无权限访问文件: {doc_path}")
        except Exception as e:
            logger.error(f"💥 读取DOC文件时发生错误: {str(e)}")
            raise

    def parse(self, file_path: str):
        """Parse a DOC file and return a MarkdownOutputVo dict.

        Raises:
            FileNotFoundError: if *file_path* does not exist.
            Exception: propagated from the conversion/reading helpers.
        """
        logger.info(f"🎬 开始解析DOC文件: {file_path}")

        try:
            # Fail fast if the input file is missing.
            if not os.path.exists(file_path):
                logger.error(f"🚫 文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")

            file_size = os.path.getsize(file_path)
            logger.info(f"📏 文件大小: {file_size} 字节")

            # NOTE(review): the "title" is actually the file extension as
            # returned by BaseLife.get_file_extension — kept as-is for
            # compatibility with the other parsers.
            title = self.get_file_extension(file_path)
            logger.debug(f"🏷️ 提取文件标题: {title}")

            # Convert with soffice and read the resulting text.
            logger.info("📝 使用soffice转换DOC为TXT并读取内容")
            content = self.read_doc_file(doc_path=file_path)

            # Either keep the raw text or lightly normalize it as markdown.
            if self.to_markdown:
                mk_content = self.format_as_markdown(content)
                logger.info("🎨 内容已格式化为markdown格式")
            else:
                mk_content = content
                logger.info("📝 保持原始文本格式")

            logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")

            lifecycle = self.generate_lifecycle(
                source_file=file_path,
                domain="Technology",
                usage_purpose="Documentation",
                life_type="LLM_ORIGIN",
            )
            logger.debug("⚙️ 生成lifecycle信息完成")

            output_vo = MarkdownOutputVo(title, mk_content)
            output_vo.add_lifecycle(lifecycle)

            result = output_vo.to_dict()
            logger.info(f"🏆 DOC文件解析完成: {file_path}")
            logger.debug(f"🔑 返回结果键: {list(result.keys())}")

            return result

        except Exception as e:
            logger.error(f"💀 解析DOC文件失败: {file_path}, 错误: {str(e)}")
            raise

    def format_as_markdown(self, content: str) -> str:
        """Normalize plain text into a minimal markdown form.

        Currently this only strips per-line whitespace while preserving blank
        lines (paragraph structure); extend here with richer rules if needed.
        """
        if not content.strip():
            return content

        lines = content.split("\n")
        formatted_lines = []

        for line in lines:
            line = line.strip()
            if not line:
                formatted_lines.append("")
                continue

            formatted_lines.append(line)

        return "\n".join(formatted_lines)