pydatamax 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. pydatamax-0.1.14/PKG-INFO +228 -0
  2. pydatamax-0.1.14/README.md +167 -0
  3. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/loader/core.py +2 -2
  4. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/doc_parser.py +60 -52
  5. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/docx_parser.py +70 -58
  6. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/xlsx_parser.py +53 -46
  7. pydatamax-0.1.14/pydatamax.egg-info/PKG-INFO +228 -0
  8. {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/SOURCES.txt +2 -2
  9. {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/requires.txt +2 -0
  10. {pydatamax-0.1.13 → pydatamax-0.1.14}/setup.py +16 -14
  11. pydatamax-0.1.13/PKG-INFO +0 -280
  12. pydatamax-0.1.13/README.md +0 -221
  13. pydatamax-0.1.13/pydatamax.egg-info/PKG-INFO +0 -280
  14. {pydatamax-0.1.13 → pydatamax-0.1.14}/LICENSE +0 -0
  15. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/__init__.py +0 -0
  16. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/loader/__init__.py +0 -0
  17. /pydatamax-0.1.13/datamax/loader/MinioHandler.py → /pydatamax-0.1.14/datamax/loader/minio_handler.py +0 -0
  18. /pydatamax-0.1.13/datamax/loader/OssHandler.py → /pydatamax-0.1.14/datamax/loader/oss_handler.py +0 -0
  19. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/__init__.py +0 -0
  20. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/base.py +0 -0
  21. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/core.py +0 -0
  22. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/csv_parser.py +0 -0
  23. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/epub_parser.py +0 -0
  24. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/html_parser.py +0 -0
  25. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/image_parser.py +0 -0
  26. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/json_parser.py +0 -0
  27. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/md_parser.py +0 -0
  28. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/pdf_parser.py +0 -0
  29. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/ppt_parser.py +0 -0
  30. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/pptx_parser.py +0 -0
  31. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/txt_parser.py +0 -0
  32. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/xls_parser.py +0 -0
  33. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/__init__.py +0 -0
  34. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/constants.py +0 -0
  35. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/data_cleaner.py +0 -0
  36. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/env_setup.py +0 -0
  37. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/gotocr_pdf.py +0 -0
  38. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/mineru_operator.py +0 -0
  39. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/paddleocr_pdf_operator.py +0 -0
  40. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/ppt_extract.py +0 -0
  41. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/qa_generator.py +0 -0
  42. {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/tokenizer.py +0 -0
  43. {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/dependency_links.txt +0 -0
  44. {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/top_level.txt +0 -0
  45. {pydatamax-0.1.13 → pydatamax-0.1.14}/setup.cfg +0 -0
  46. {pydatamax-0.1.13 → pydatamax-0.1.14}/tests/__init__.py +0 -0
  47. {pydatamax-0.1.13 → pydatamax-0.1.14}/tests/test_basic.py +0 -0
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.14
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/Hi-Dolphin/datamax
6
+ Author: ccy
7
+ Author-email: cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: oss2<3.0.0,>=2.19.1
15
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
16
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
17
+ Requires-Dist: crcmod<2.0.0,>=1.7
18
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
19
+ Requires-Dist: loguru<1.0.0,>=0.7.3
20
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
21
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
22
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
23
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
24
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
25
+ Requires-Dist: pandas<3.0.0,>=2.2.3
26
+ Requires-Dist: numpy<3.0.0,>=2.2.6
27
+ Requires-Dist: requests<3.0.0,>=2.32.3
28
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
29
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
30
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
31
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
32
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
33
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
34
+ Requires-Dist: packaging<25.0,>=24.2
35
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
36
+ Requires-Dist: minio<8.0.0,>=7.2.15
37
+ Requires-Dist: openai<2.0.0,>=1.82.0
38
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
39
+ Requires-Dist: chardet<6.0.0,>=5.2.0
40
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
41
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
42
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
43
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
44
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
45
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
46
+ Requires-Dist: markdown<4.0.0,>=3.8
47
+ Requires-Dist: langchain<1.0.0,>=0.3.0
48
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
49
+ Requires-Dist: ebooklib==0.19
50
+ Requires-Dist: setuptools
51
+ Dynamic: author
52
+ Dynamic: author-email
53
+ Dynamic: classifier
54
+ Dynamic: description
55
+ Dynamic: description-content-type
56
+ Dynamic: home-page
57
+ Dynamic: license-file
58
+ Dynamic: requires-dist
59
+ Dynamic: requires-python
60
+ Dynamic: summary
61
+
62
+ # DataMax
63
+
64
+ <div align="center">
65
+
66
+ [中文](README_zh.md) | **English**
67
+
68
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
69
+
70
+ </div>
71
+
72
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
73
+
74
+ ## ✨ Core Features
75
+
76
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
77
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
78
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
79
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
80
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
81
+
82
+ ## 🚀 Quick Start
83
+
84
+ ### Installation
85
+
86
+ ```bash
87
+ pip install pydatamax
88
+ ```
89
+
90
+ ### Basic Usage
91
+
92
+ ```python
93
+ from datamax import DataMax
94
+
95
+ # Parse a single file
96
+ dm = DataMax(file_path="document.pdf")
97
+ data = dm.get_data()
98
+
99
+ # Batch processing
100
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
101
+ data = dm.get_data()
102
+
103
+ # Data cleaning
104
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
105
+
106
+ # AI annotation
107
+ qa_data = dm.get_pre_label(
108
+ api_key="your-api-key",
109
+ base_url="https://api.openai.com/v1",
110
+ model_name="gpt-3.5-turbo"
111
+ )
112
+ ```
113
+
114
+ ## 📖 Detailed Documentation
115
+
116
+ ### File Parsing
117
+
118
+ #### Supported Formats
119
+
120
+ | Format | Extensions | Special Features |
121
+ |--------|------------|------------------|
122
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
123
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
124
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
125
+ | Web | `.html`, `.epub` | Tag parsing |
126
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
127
+ | Text | `.txt` | Automatic encoding detection |
128
+
129
+ #### Advanced Features
130
+
131
+ ```python
132
+ # Advanced PDF parsing (requires MinerU)
133
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
134
+
135
+ # Word to Markdown conversion
136
+ dm = DataMax(file_path="document.docx", to_markdown=True)
137
+
138
+ # Image OCR
139
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
140
+ ```
141
+
142
+ ### Data Cleaning
143
+
144
+ ```python
145
+ # Three cleaning modes
146
+ dm.clean_data(method_list=[
147
+ "abnormal", # Anomaly data processing
148
+ "private", # Privacy information masking
149
+ "filter" # Text filtering and normalization
150
+ ])
151
+ ```
152
+
153
+ ### AI Annotation
154
+
155
+ ```python
156
+ # Custom annotation tasks
157
+ qa_data = dm.get_pre_label(
158
+ api_key="sk-xxx",
159
+ base_url="https://api.provider.com/v1",
160
+ model_name="model-name",
161
+ chunk_size=500, # Text chunk size
162
+ chunk_overlap=100, # Overlap length
163
+ question_number=5, # Questions per chunk
164
+ max_workers=5 # Concurrency
165
+ )
166
+ ```
167
+
168
+ ## ⚙️ Environment Setup
169
+
170
+ ### Optional Dependencies
171
+
172
+ #### LibreOffice (DOC file support)
173
+
174
+ **Ubuntu/Debian:**
175
+ ```bash
176
+ sudo apt-get install libreoffice
177
+ ```
178
+
179
+ **Windows:**
180
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
181
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
182
+
183
+ #### MinerU (Advanced PDF parsing)
184
+
185
+ ```bash
186
+ # Create virtual environment
187
+ conda create -n mineru python=3.10
188
+ conda activate mineru
189
+
190
+ # Install MinerU
191
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
192
+ ```
193
+
194
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
195
+
196
+ ## 🛠️ Development
197
+
198
+ ### Local Installation
199
+
200
+ ```bash
201
+ git clone https://github.com/Hi-Dolphin/datamax.git
202
+ cd datamax
203
+ pip install -r requirements.txt
204
+ python setup.py install
205
+ ```
206
+
207
+ ## 📋 System Requirements
208
+
209
+ - Python >= 3.10
210
+ - Supports Windows, macOS, Linux
211
+
212
+ ## 🤝 Contributing
213
+
214
+ Issues and Pull Requests are welcome!
215
+
216
+ ## 📄 License
217
+
218
+ This project is licensed under the [MIT License](LICENSE).
219
+
220
+ ## 📞 Contact Us
221
+
222
+ - 📧 Email: cy.kron@foxmail.com
223
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
224
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
225
+
226
+ ---
227
+
228
+ ⭐ If this project helps you, please give us a star!
@@ -0,0 +1,167 @@
1
+ # DataMax
2
+
3
+ <div align="center">
4
+
5
+ [中文](README_zh.md) | **English**
6
+
7
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ </div>
10
+
11
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
12
+
13
+ ## ✨ Core Features
14
+
15
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
16
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
17
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
18
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
19
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
20
+
21
+ ## 🚀 Quick Start
22
+
23
+ ### Installation
24
+
25
+ ```bash
26
+ pip install pydatamax
27
+ ```
28
+
29
+ ### Basic Usage
30
+
31
+ ```python
32
+ from datamax import DataMax
33
+
34
+ # Parse a single file
35
+ dm = DataMax(file_path="document.pdf")
36
+ data = dm.get_data()
37
+
38
+ # Batch processing
39
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
40
+ data = dm.get_data()
41
+
42
+ # Data cleaning
43
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
44
+
45
+ # AI annotation
46
+ qa_data = dm.get_pre_label(
47
+ api_key="your-api-key",
48
+ base_url="https://api.openai.com/v1",
49
+ model_name="gpt-3.5-turbo"
50
+ )
51
+ ```
52
+
53
+ ## 📖 Detailed Documentation
54
+
55
+ ### File Parsing
56
+
57
+ #### Supported Formats
58
+
59
+ | Format | Extensions | Special Features |
60
+ |--------|------------|------------------|
61
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
62
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
63
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
64
+ | Web | `.html`, `.epub` | Tag parsing |
65
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
66
+ | Text | `.txt` | Automatic encoding detection |
67
+
68
+ #### Advanced Features
69
+
70
+ ```python
71
+ # Advanced PDF parsing (requires MinerU)
72
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
73
+
74
+ # Word to Markdown conversion
75
+ dm = DataMax(file_path="document.docx", to_markdown=True)
76
+
77
+ # Image OCR
78
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
79
+ ```
80
+
81
+ ### Data Cleaning
82
+
83
+ ```python
84
+ # Three cleaning modes
85
+ dm.clean_data(method_list=[
86
+ "abnormal", # Anomaly data processing
87
+ "private", # Privacy information masking
88
+ "filter" # Text filtering and normalization
89
+ ])
90
+ ```
91
+
92
+ ### AI Annotation
93
+
94
+ ```python
95
+ # Custom annotation tasks
96
+ qa_data = dm.get_pre_label(
97
+ api_key="sk-xxx",
98
+ base_url="https://api.provider.com/v1",
99
+ model_name="model-name",
100
+ chunk_size=500, # Text chunk size
101
+ chunk_overlap=100, # Overlap length
102
+ question_number=5, # Questions per chunk
103
+ max_workers=5 # Concurrency
104
+ )
105
+ ```
106
+
107
+ ## ⚙️ Environment Setup
108
+
109
+ ### Optional Dependencies
110
+
111
+ #### LibreOffice (DOC file support)
112
+
113
+ **Ubuntu/Debian:**
114
+ ```bash
115
+ sudo apt-get install libreoffice
116
+ ```
117
+
118
+ **Windows:**
119
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
120
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
121
+
122
+ #### MinerU (Advanced PDF parsing)
123
+
124
+ ```bash
125
+ # Create virtual environment
126
+ conda create -n mineru python=3.10
127
+ conda activate mineru
128
+
129
+ # Install MinerU
130
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
131
+ ```
132
+
133
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
134
+
135
+ ## 🛠️ Development
136
+
137
+ ### Local Installation
138
+
139
+ ```bash
140
+ git clone https://github.com/Hi-Dolphin/datamax.git
141
+ cd datamax
142
+ pip install -r requirements.txt
143
+ python setup.py install
144
+ ```
145
+
146
+ ## 📋 System Requirements
147
+
148
+ - Python >= 3.10
149
+ - Supports Windows, macOS, Linux
150
+
151
+ ## 🤝 Contributing
152
+
153
+ Issues and Pull Requests are welcome!
154
+
155
+ ## 📄 License
156
+
157
+ This project is licensed under the [MIT License](LICENSE).
158
+
159
+ ## 📞 Contact Us
160
+
161
+ - 📧 Email: cy.kron@foxmail.com
162
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
163
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
164
+
165
+ ---
166
+
167
+ ⭐ If this project helps you, please give us a star!
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  from typing import List
3
- from datamax.loader.MinioHandler import MinIOClient
4
- from datamax.loader.OssHandler import OssClient
3
+ from datamax.loader.minio_handler import MinIOClient
4
+ from datamax.loader.oss_handler import OssClient
5
5
 
6
6
 
7
7
  class DataLoader:
@@ -1,14 +1,14 @@
1
+ import logging
1
2
  import os
2
3
  import shutil
3
4
  import subprocess
4
5
  import tempfile
5
- import chardet
6
- import logging
7
6
  from pathlib import Path
8
7
  from typing import Union
9
- from datamax.parser.base import BaseLife
10
- from datamax.parser.base import MarkdownOutputVo
11
8
 
9
+ import chardet
10
+
11
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
12
12
 
13
13
  # 配置日志
14
14
  logger = logging.getLogger(__name__)
@@ -24,37 +24,41 @@ class DocParser(BaseLife):
24
24
  def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
25
25
  """将.doc文件转换为.txt文件"""
26
26
  logger.info(f"🔄 开始转换DOC文件为TXT - 源文件: {doc_path}, 输出目录: {dir_path}")
27
-
27
+
28
28
  try:
29
29
  cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
30
30
  logger.debug(f"⚡ 执行转换命令: {cmd}")
31
-
32
- process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
31
+
32
+ process = subprocess.Popen(
33
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
34
+ )
33
35
  stdout, stderr = process.communicate()
34
36
  exit_code = process.returncode
35
-
37
+
36
38
  if exit_code == 0:
37
39
  logger.info(f"✅ DOC到TXT转换成功 - 退出码: {exit_code}")
38
40
  if stdout:
39
41
  logger.debug(f"📄 转换输出: {stdout.decode('utf-8', errors='replace')}")
40
42
  else:
41
- encoding = chardet.detect(stderr)['encoding']
43
+ encoding = chardet.detect(stderr)["encoding"]
42
44
  if encoding is None:
43
- encoding = 'utf-8'
44
- error_msg = stderr.decode(encoding, errors='replace')
45
+ encoding = "utf-8"
46
+ error_msg = stderr.decode(encoding, errors="replace")
45
47
  logger.error(f"❌ DOC到TXT转换失败 - 退出码: {exit_code}, 错误信息: {error_msg}")
46
- raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
47
-
48
+ raise Exception(
49
+ f"Error Output (detected encoding: {encoding}): {error_msg}"
50
+ )
51
+
48
52
  fname = str(Path(doc_path).stem)
49
- txt_path = os.path.join(dir_path, f'{fname}.txt')
50
-
53
+ txt_path = os.path.join(dir_path, f"{fname}.txt")
54
+
51
55
  if not os.path.exists(txt_path):
52
56
  logger.error(f"❌ 转换后的TXT文件不存在: {txt_path}")
53
57
  raise Exception(f"文件转换失败 {doc_path} ==> {txt_path}")
54
58
  else:
55
59
  logger.info(f"🎉 TXT文件转换成功,文件路径: {txt_path}")
56
60
  return txt_path
57
-
61
+
58
62
  except subprocess.SubprocessError as e:
59
63
  logger.error(f"💥 subprocess执行失败: {str(e)}")
60
64
  raise Exception(f"执行转换命令时发生错误: {str(e)}")
@@ -65,25 +69,25 @@ class DocParser(BaseLife):
65
69
  def read_txt_file(self, txt_path: str) -> str:
66
70
  """读取txt文件内容"""
67
71
  logger.info(f"📖 开始读取TXT文件: {txt_path}")
68
-
72
+
69
73
  try:
70
74
  # 检测文件编码
71
- with open(txt_path, 'rb') as f:
75
+ with open(txt_path, "rb") as f:
72
76
  raw_data = f.read()
73
- encoding = chardet.detect(raw_data)['encoding']
77
+ encoding = chardet.detect(raw_data)["encoding"]
74
78
  if encoding is None:
75
- encoding = 'utf-8'
79
+ encoding = "utf-8"
76
80
  logger.debug(f"🔍 检测到文件编码: {encoding}")
77
-
81
+
78
82
  # 读取文件内容
79
- with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
83
+ with open(txt_path, "r", encoding=encoding, errors="replace") as f:
80
84
  content = f.read()
81
-
85
+
82
86
  logger.info(f"📄 TXT文件读取完成 - 内容长度: {len(content)} 字符")
83
87
  logger.debug(f"👀 前100字符预览: {content[:100]}...")
84
-
88
+
85
89
  return content
86
-
90
+
87
91
  except FileNotFoundError as e:
88
92
  logger.error(f"🚫 TXT文件未找到: {str(e)}")
89
93
  raise Exception(f"文件未找到: {txt_path}")
@@ -94,27 +98,27 @@ class DocParser(BaseLife):
94
98
  def read_doc_file(self, doc_path: str) -> str:
95
99
  """读取doc文件并转换为文本"""
96
100
  logger.info(f"📖 开始读取DOC文件 - 文件: {doc_path}")
97
-
101
+
98
102
  try:
99
103
  with tempfile.TemporaryDirectory() as temp_path:
100
104
  logger.debug(f"📁 创建临时目录: {temp_path}")
101
-
105
+
102
106
  temp_dir = Path(temp_path)
103
-
107
+
104
108
  file_path = temp_dir / "tmp.doc"
105
109
  shutil.copy(doc_path, file_path)
106
110
  logger.debug(f"📋 复制文件到临时目录: {doc_path} -> {file_path}")
107
-
111
+
108
112
  # 转换DOC为TXT
109
113
  txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
110
114
  logger.info(f"🎯 DOC转TXT完成: {txt_file_path}")
111
-
115
+
112
116
  # 读取TXT文件内容
113
117
  content = self.read_txt_file(txt_file_path)
114
118
  logger.info(f"✨ TXT文件内容读取完成,内容长度: {len(content)} 字符")
115
-
119
+
116
120
  return content
117
-
121
+
118
122
  except FileNotFoundError as e:
119
123
  logger.error(f"🚫 文件未找到: {str(e)}")
120
124
  raise Exception(f"文件未找到: {doc_path}")
@@ -128,24 +132,24 @@ class DocParser(BaseLife):
128
132
  def parse(self, file_path: str):
129
133
  """解析DOC文件"""
130
134
  logger.info(f"🎬 开始解析DOC文件: {file_path}")
131
-
135
+
132
136
  try:
133
137
  # 验证文件存在
134
138
  if not os.path.exists(file_path):
135
139
  logger.error(f"🚫 文件不存在: {file_path}")
136
140
  raise FileNotFoundError(f"文件不存在: {file_path}")
137
-
141
+
138
142
  # 验证文件大小
139
143
  file_size = os.path.getsize(file_path)
140
144
  logger.info(f"📏 文件大小: {file_size} 字节")
141
-
145
+
142
146
  title = self.get_file_extension(file_path)
143
147
  logger.debug(f"🏷️ 提取文件标题: {title}")
144
-
148
+
145
149
  # 使用soffice转换为txt后读取内容
146
150
  logger.info("📝 使用soffice转换DOC为TXT并读取内容")
147
151
  content = self.read_doc_file(doc_path=file_path)
148
-
152
+
149
153
  # 根据to_markdown参数决定是否保持原格式还是处理为markdown格式
150
154
  if self.to_markdown:
151
155
  # 简单的文本到markdown转换(保持段落结构)
@@ -154,22 +158,26 @@ class DocParser(BaseLife):
154
158
  else:
155
159
  mk_content = content
156
160
  logger.info("📝 保持原始文本格式")
157
-
161
+
158
162
  logger.info(f"🎊 文件内容解析完成,最终内容长度: {len(mk_content)} 字符")
159
-
160
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
161
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
163
+
164
+ lifecycle = self.generate_lifecycle(
165
+ source_file=file_path,
166
+ domain="Technology",
167
+ usage_purpose="Documentation",
168
+ life_type="LLM_ORIGIN",
169
+ )
162
170
  logger.debug("⚙️ 生成lifecycle信息完成")
163
-
171
+
164
172
  output_vo = MarkdownOutputVo(title, mk_content)
165
173
  output_vo.add_lifecycle(lifecycle)
166
-
174
+
167
175
  result = output_vo.to_dict()
168
176
  logger.info(f"🏆 DOC文件解析完成: {file_path}")
169
177
  logger.debug(f"🔑 返回结果键: {list(result.keys())}")
170
-
178
+
171
179
  return result
172
-
180
+
173
181
  except Exception as e:
174
182
  logger.error(f"💀 解析DOC文件失败: {file_path}, 错误: {str(e)}")
175
183
  raise
@@ -178,18 +186,18 @@ class DocParser(BaseLife):
178
186
  """将纯文本格式化为简单的markdown格式"""
179
187
  if not content.strip():
180
188
  return content
181
-
182
- lines = content.split('\n')
189
+
190
+ lines = content.split("\n")
183
191
  formatted_lines = []
184
-
192
+
185
193
  for line in lines:
186
194
  line = line.strip()
187
195
  if not line:
188
- formatted_lines.append('')
196
+ formatted_lines.append("")
189
197
  continue
190
-
198
+
191
199
  # 简单的markdown格式化规则
192
200
  # 可以根据需要扩展更多规则
193
201
  formatted_lines.append(line)
194
-
195
- return '\n'.join(formatted_lines)
202
+
203
+ return "\n".join(formatted_lines)