pydatamax 0.1.13.tar.gz → 0.1.14.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydatamax-0.1.14/PKG-INFO +228 -0
- pydatamax-0.1.14/README.md +167 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/loader/core.py +2 -2
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/doc_parser.py +60 -52
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/docx_parser.py +70 -58
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/xlsx_parser.py +53 -46
- pydatamax-0.1.14/pydatamax.egg-info/PKG-INFO +228 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/SOURCES.txt +2 -2
- {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/requires.txt +2 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/setup.py +16 -14
- pydatamax-0.1.13/PKG-INFO +0 -280
- pydatamax-0.1.13/README.md +0 -221
- pydatamax-0.1.13/pydatamax.egg-info/PKG-INFO +0 -280
- {pydatamax-0.1.13 → pydatamax-0.1.14}/LICENSE +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/__init__.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/loader/__init__.py +0 -0
- /pydatamax-0.1.13/datamax/loader/MinioHandler.py → /pydatamax-0.1.14/datamax/loader/minio_handler.py +0 -0
- /pydatamax-0.1.13/datamax/loader/OssHandler.py → /pydatamax-0.1.14/datamax/loader/oss_handler.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/__init__.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/base.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/core.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/csv_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/epub_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/html_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/image_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/json_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/md_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/pdf_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/ppt_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/pptx_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/txt_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/parser/xls_parser.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/__init__.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/constants.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/data_cleaner.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/env_setup.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/gotocr_pdf.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/mineru_operator.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/paddleocr_pdf_operator.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/ppt_extract.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/qa_generator.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/datamax/utils/tokenizer.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/dependency_links.txt +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/pydatamax.egg-info/top_level.txt +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/setup.cfg +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/tests/__init__.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.14}/tests/test_basic.py +0 -0
```diff
--- /dev/null
+++ pydatamax-0.1.14/PKG-INFO
@@ -0,0 +1,228 @@
+Metadata-Version: 2.4
+Name: pydatamax
+Version: 0.1.14
+Summary: A library for parsing and converting various file formats.
+Home-page: https://github.com/Hi-Dolphin/datamax
+Author: ccy
+Author-email: cy.kron@foxmail.com
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: oss2<3.0.0,>=2.19.1
+Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+Requires-Dist: crcmod<2.0.0,>=1.7
+Requires-Dist: langdetect<2.0.0,>=1.0.9
+Requires-Dist: loguru<1.0.0,>=0.7.3
+Requires-Dist: python-docx<2.0.0,>=1.1.2
+Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+Requires-Dist: pymupdf<2.0.0,>=1.26.0
+Requires-Dist: pypdf<6.0.0,>=5.5.0
+Requires-Dist: openpyxl<4.0.0,>=3.1.5
+Requires-Dist: pandas<3.0.0,>=2.2.3
+Requires-Dist: numpy<3.0.0,>=2.2.6
+Requires-Dist: requests<3.0.0,>=2.32.3
+Requires-Dist: tqdm<5.0.0,>=4.67.1
+Requires-Dist: pydantic<3.0.0,>=2.11.5
+Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+Requires-Dist: python-magic<1.0.0,>=0.4.27
+Requires-Dist: PyYAML<7.0.0,>=6.0.2
+Requires-Dist: Pillow<12.0.0,>=11.2.1
+Requires-Dist: packaging<25.0,>=24.2
+Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+Requires-Dist: minio<8.0.0,>=7.2.15
+Requires-Dist: openai<2.0.0,>=1.82.0
+Requires-Dist: jionlp<2.0.0,>=1.5.23
+Requires-Dist: chardet<6.0.0,>=5.2.0
+Requires-Dist: python-pptx<2.0.0,>=1.0.2
+Requires-Dist: tiktoken<1.0.0,>=0.9.0
+Requires-Dist: markitdown<1.0.0,>=0.1.1
+Requires-Dist: xlrd<3.0.0,>=2.0.1
+Requires-Dist: tabulate<1.0.0,>=0.9.0
+Requires-Dist: unstructured<1.0.0,>=0.17.2
+Requires-Dist: markdown<4.0.0,>=3.8
+Requires-Dist: langchain<1.0.0,>=0.3.0
+Requires-Dist: langchain-community<1.0.0,>=0.3.0
+Requires-Dist: ebooklib==0.19
+Requires-Dist: setuptools
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# DataMax
+
+<div align="center">
+
+[中文](README_zh.md) | **English**
+
+[PyPI version](https://badge.fury.io/py/pydatamax) [Python](https://www.python.org/downloads/) [License: MIT](https://opensource.org/licenses/MIT)
+
+</div>
+
+A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
+
+## ✨ Core Features
+
+- 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
+- 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
+- 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
+- ⚡ **Batch Processing**: Efficient multi-file parallel processing
+- 🎯 **Easy Integration**: Clean API design, ready to use out of the box
+
+## 🚀 Quick Start
+
+### Installation
+
+```bash
+pip install pydatamax
+```
+
+### Basic Usage
+
+```python
+from datamax import DataMax
+
+# Parse a single file
+dm = DataMax(file_path="document.pdf")
+data = dm.get_data()
+
+# Batch processing
+dm = DataMax(file_path=["file1.docx", "file2.pdf"])
+data = dm.get_data()
+
+# Data cleaning
+cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
+
+# AI annotation
+qa_data = dm.get_pre_label(
+    api_key="your-api-key",
+    base_url="https://api.openai.com/v1",
+    model_name="gpt-3.5-turbo"
+)
+```
+
+## 📖 Detailed Documentation
+
+### File Parsing
+
+#### Supported Formats
+
+| Format | Extensions | Special Features |
+|--------|------------|------------------|
+| Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
+| Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
+| Presentations | `.pptx`, `.ppt` | Slide content extraction |
+| Web | `.html`, `.epub` | Tag parsing |
+| Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
+| Text | `.txt` | Automatic encoding detection |
+
+#### Advanced Features
+
+```python
+# Advanced PDF parsing (requires MinerU)
+dm = DataMax(file_path="complex.pdf", use_mineru=True)
+
+# Word to Markdown conversion
+dm = DataMax(file_path="document.docx", to_markdown=True)
+
+# Image OCR
+dm = DataMax(file_path="image.jpg", use_ocr=True)
+```
+
+### Data Cleaning
+
+```python
+# Three cleaning modes
+dm.clean_data(method_list=[
+    "abnormal",  # Anomaly data processing
+    "private",   # Privacy information masking
+    "filter"     # Text filtering and normalization
+])
+```
+
+### AI Annotation
+
+```python
+# Custom annotation tasks
+qa_data = dm.get_pre_label(
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,       # Text chunk size
+    chunk_overlap=100,    # Overlap length
+    question_number=5,    # Questions per chunk
+    max_workers=5         # Concurrency
+)
+```
+
+## ⚙️ Environment Setup
+
+### Optional Dependencies
+
+#### LibreOffice (DOC file support)
+
+**Ubuntu/Debian:**
+```bash
+sudo apt-get install libreoffice
+```
+
+**Windows:**
+1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
+2. Add to environment variables: `C:\Program Files\LibreOffice\program`
+
+#### MinerU (Advanced PDF parsing)
+
+```bash
+# Create virtual environment
+conda create -n mineru python=3.10
+conda activate mineru
+
+# Install MinerU
+pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
+```
+
+For detailed configuration, please refer to the [MinerU documentation](https://github.com/opendatalab/MinerU).
+
+## 🛠️ Development
+
+### Local Installation
+
+```bash
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+pip install -r requirements.txt
+python setup.py install
+```
+
+## 📋 System Requirements
+
+- Python >= 3.10
+- Supports Windows, macOS, Linux
+
+## 🤝 Contributing
+
+Issues and Pull Requests are welcome!
+
+## 📄 License
+
+This project is licensed under the [MIT License](LICENSE).
+
+## 📞 Contact Us
+
+- 📧 Email: cy.kron@foxmail.com
+- 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
+- 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
+
+---
+
+⭐ If this project helps you, please give us a star!
```
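Reading the new metadata against the `requires.txt` change in the file list (+2 -0), the `langchain` and `langchain-community` pins are most likely the two runtime dependencies added in 0.1.14. A quick way to confirm what an installed copy actually declares (a sketch using only the standard library; assumes 0.1.14 is installed):

```python
# Inspect the installed package's declared dependencies (stdlib only).
from importlib.metadata import requires, version

print(version("pydatamax"))  # expect: 0.1.14
deps = requires("pydatamax") or []
print([d for d in deps if d.startswith("langchain")])
```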
```diff
--- /dev/null
+++ pydatamax-0.1.14/README.md
@@ -0,0 +1,167 @@
+# DataMax
+
+<div align="center">
+
+[中文](README_zh.md) | **English**
+
+[PyPI version](https://badge.fury.io/py/pydatamax) [Python](https://www.python.org/downloads/) [License: MIT](https://opensource.org/licenses/MIT)
+
+</div>
+
+A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
+
+## ✨ Core Features
+
+- 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
+- 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
+- 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
+- ⚡ **Batch Processing**: Efficient multi-file parallel processing
+- 🎯 **Easy Integration**: Clean API design, ready to use out of the box
+
+## 🚀 Quick Start
+
+### Installation
+
+```bash
+pip install pydatamax
+```
+
+### Basic Usage
+
+```python
+from datamax import DataMax
+
+# Parse a single file
+dm = DataMax(file_path="document.pdf")
+data = dm.get_data()
+
+# Batch processing
+dm = DataMax(file_path=["file1.docx", "file2.pdf"])
+data = dm.get_data()
+
+# Data cleaning
+cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
+
+# AI annotation
+qa_data = dm.get_pre_label(
+    api_key="your-api-key",
+    base_url="https://api.openai.com/v1",
+    model_name="gpt-3.5-turbo"
+)
+```
+
+## 📖 Detailed Documentation
+
+### File Parsing
+
+#### Supported Formats
+
+| Format | Extensions | Special Features |
+|--------|------------|------------------|
+| Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
+| Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
+| Presentations | `.pptx`, `.ppt` | Slide content extraction |
+| Web | `.html`, `.epub` | Tag parsing |
+| Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
+| Text | `.txt` | Automatic encoding detection |
+
+#### Advanced Features
+
+```python
+# Advanced PDF parsing (requires MinerU)
+dm = DataMax(file_path="complex.pdf", use_mineru=True)
+
+# Word to Markdown conversion
+dm = DataMax(file_path="document.docx", to_markdown=True)
+
+# Image OCR
+dm = DataMax(file_path="image.jpg", use_ocr=True)
+```
+
+### Data Cleaning
+
+```python
+# Three cleaning modes
+dm.clean_data(method_list=[
+    "abnormal",  # Anomaly data processing
+    "private",   # Privacy information masking
+    "filter"     # Text filtering and normalization
+])
+```
+
+### AI Annotation
+
+```python
+# Custom annotation tasks
+qa_data = dm.get_pre_label(
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,       # Text chunk size
+    chunk_overlap=100,    # Overlap length
+    question_number=5,    # Questions per chunk
+    max_workers=5         # Concurrency
+)
+```
+
+## ⚙️ Environment Setup
+
+### Optional Dependencies
+
+#### LibreOffice (DOC file support)
+
+**Ubuntu/Debian:**
+```bash
+sudo apt-get install libreoffice
+```
+
+**Windows:**
+1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
+2. Add to environment variables: `C:\Program Files\LibreOffice\program`
+
+#### MinerU (Advanced PDF parsing)
+
+```bash
+# Create virtual environment
+conda create -n mineru python=3.10
+conda activate mineru
+
+# Install MinerU
+pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
+```
+
+For detailed configuration, please refer to the [MinerU documentation](https://github.com/opendatalab/MinerU).
+
+## 🛠️ Development
+
+### Local Installation
+
+```bash
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+pip install -r requirements.txt
+python setup.py install
+```
+
+## 📋 System Requirements
+
+- Python >= 3.10
+- Supports Windows, macOS, Linux
+
+## 🤝 Contributing
+
+Issues and Pull Requests are welcome!
+
+## 📄 License
+
+This project is licensed under the [MIT License](LICENSE).
+
+## 📞 Contact Us
+
+- 📧 Email: cy.kron@foxmail.com
+- 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
+- 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
+
+---
+
+⭐ If this project helps you, please give us a star!
```
```diff
--- pydatamax-0.1.13/datamax/loader/core.py
+++ pydatamax-0.1.14/datamax/loader/core.py
@@ -1,7 +1,7 @@
 import os
 from typing import List
-from datamax.loader.MinioHandler import MinIOClient
-from datamax.loader.OssHandler import OssClient
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient
 
 
 class DataLoader:
```
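This is the whole change to `datamax/loader/core.py`: the import paths follow the `MinioHandler.py → minio_handler.py` and `OssHandler.py → oss_handler.py` renames in the file list, while the handler modules themselves are unchanged (+0 -0). Code that imported the handlers directly needs the same one-line updates; a sketch of the migration:

```python
# 0.1.13 (CamelCase module names) - no longer importable in 0.1.14:
# from datamax.loader.MinioHandler import MinIOClient
# from datamax.loader.OssHandler import OssClient

# 0.1.14 (snake_case module names, same classes):
from datamax.loader.minio_handler import MinIOClient
from datamax.loader.oss_handler import OssClient
```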
```diff
--- pydatamax-0.1.13/datamax/parser/doc_parser.py
+++ pydatamax-0.1.14/datamax/parser/doc_parser.py
@@ -1,14 +1,14 @@
+import logging
 import os
 import shutil
 import subprocess
 import tempfile
-import chardet
-import logging
 from pathlib import Path
 from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
 
+import chardet
+
+from datamax.parser.base import BaseLife, MarkdownOutputVo
 
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -24,37 +24,41 @@ class DocParser(BaseLife):
     def doc_to_txt(self, doc_path: str, dir_path: str) -> str:
         """Convert a .doc file to a .txt file."""
         logger.info(f"🔄 Converting DOC file to TXT - source: {doc_path}, output dir: {dir_path}")
-
+
         try:
             cmd = f'soffice --headless --convert-to txt "{doc_path}" --outdir "{dir_path}"'
             logger.debug(f"⚡ Running conversion command: {cmd}")
-
-            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            process = subprocess.Popen(
+                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
             stdout, stderr = process.communicate()
             exit_code = process.returncode
-
+
             if exit_code == 0:
                 logger.info(f"✅ DOC-to-TXT conversion succeeded - exit code: {exit_code}")
                 if stdout:
                     logger.debug(f"📄 Conversion output: {stdout.decode('utf-8', errors='replace')}")
             else:
-                encoding = chardet.detect(stderr)['encoding']
+                encoding = chardet.detect(stderr)["encoding"]
                 if encoding is None:
-                    encoding = 'utf-8'
-                error_msg = stderr.decode(encoding, errors='replace')
+                    encoding = "utf-8"
+                error_msg = stderr.decode(encoding, errors="replace")
                 logger.error(f"❌ DOC-to-TXT conversion failed - exit code: {exit_code}, error: {error_msg}")
-                raise Exception(f"Error Output (detected encoding: {encoding}): {error_msg}")
-
+                raise Exception(
+                    f"Error Output (detected encoding: {encoding}): {error_msg}"
+                )
+
             fname = str(Path(doc_path).stem)
-            txt_path = os.path.join(dir_path, f'{fname}.txt')
-
+            txt_path = os.path.join(dir_path, f"{fname}.txt")
+
             if not os.path.exists(txt_path):
                 logger.error(f"❌ Converted TXT file does not exist: {txt_path}")
                 raise Exception(f"File conversion failed: {doc_path} ==> {txt_path}")
             else:
                 logger.info(f"🎉 TXT conversion succeeded, file path: {txt_path}")
                 return txt_path
-
+
         except subprocess.SubprocessError as e:
             logger.error(f"💥 subprocess execution failed: {str(e)}")
             raise Exception(f"Error while running the conversion command: {str(e)}")
```
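The `doc_to_txt` hunk is quote normalization and line wrapping (black-style); the conversion itself still shells out to LibreOffice. For reference, a minimal standalone sketch of the same approach that passes the command as an argument list instead of `shell=True`, which sidesteps the quoting the original command string needs (assumes `soffice` is on PATH, per the README's LibreOffice setup):

```python
import subprocess
from pathlib import Path

def doc_to_txt(doc_path: str, out_dir: str) -> str:
    """Convert a .doc file to .txt with headless LibreOffice."""
    subprocess.run(
        ["soffice", "--headless", "--convert-to", "txt", doc_path, "--outdir", out_dir],
        check=True,           # raise CalledProcessError on a nonzero exit code
        capture_output=True,  # keep stdout/stderr for diagnostics
    )
    txt_path = Path(out_dir) / f"{Path(doc_path).stem}.txt"
    if not txt_path.exists():
        raise RuntimeError(f"File conversion failed: {doc_path} ==> {txt_path}")
    return str(txt_path)
```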
```diff
@@ -65,25 +69,25 @@ class DocParser(BaseLife):
     def read_txt_file(self, txt_path: str) -> str:
         """Read the content of a txt file."""
         logger.info(f"📖 Reading TXT file: {txt_path}")
-
+
         try:
             # Detect the file encoding
-            with open(txt_path, 'rb') as f:
+            with open(txt_path, "rb") as f:
                 raw_data = f.read()
-            encoding = chardet.detect(raw_data)['encoding']
+            encoding = chardet.detect(raw_data)["encoding"]
             if encoding is None:
-                encoding = 'utf-8'
+                encoding = "utf-8"
             logger.debug(f"🔍 Detected file encoding: {encoding}")
-
+
             # Read the file content
-            with open(txt_path, 'r', encoding=encoding, errors='replace') as f:
+            with open(txt_path, "r", encoding=encoding, errors="replace") as f:
                 content = f.read()
-
+
             logger.info(f"📄 TXT file read - content length: {len(content)} characters")
             logger.debug(f"👀 First 100 characters: {content[:100]}...")
-
+
             return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 TXT file not found: {str(e)}")
             raise Exception(f"File not found: {txt_path}")
```
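`read_txt_file` likewise only changes formatting: detect the encoding with chardet, fall back to UTF-8, decode with `errors="replace"`. The same pattern in a compact standalone form (decoding the bytes already read rather than reopening the file, which is equivalent here):

```python
import chardet

def read_text_auto(path: str) -> str:
    """Read a text file, auto-detecting its encoding (UTF-8 fallback)."""
    with open(path, "rb") as f:
        raw = f.read()
    # chardet returns {"encoding": None} when detection fails
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    return raw.decode(encoding, errors="replace")
```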
```diff
@@ -94,27 +98,27 @@ class DocParser(BaseLife):
     def read_doc_file(self, doc_path: str) -> str:
         """Read a doc file and convert it to text."""
         logger.info(f"📖 Reading DOC file - file: {doc_path}")
-
+
         try:
             with tempfile.TemporaryDirectory() as temp_path:
                 logger.debug(f"📁 Created temporary directory: {temp_path}")
-
+
                 temp_dir = Path(temp_path)
-
+
                 file_path = temp_dir / "tmp.doc"
                 shutil.copy(doc_path, file_path)
                 logger.debug(f"📋 Copied file to temporary directory: {doc_path} -> {file_path}")
-
+
                 # Convert DOC to TXT
                 txt_file_path = self.doc_to_txt(str(file_path), str(temp_path))
                 logger.info(f"🎯 DOC-to-TXT conversion done: {txt_file_path}")
-
+
                 # Read the TXT file content
                 content = self.read_txt_file(txt_file_path)
                 logger.info(f"✨ TXT content read, length: {len(content)} characters")
-
+
                 return content
-
+
         except FileNotFoundError as e:
             logger.error(f"🚫 File not found: {str(e)}")
             raise Exception(f"File not found: {doc_path}")
@@ -128,24 +132,24 @@ class DocParser(BaseLife):
     def parse(self, file_path: str):
         """Parse a DOC file."""
         logger.info(f"🎬 Parsing DOC file: {file_path}")
-
+
         try:
             # Verify the file exists
             if not os.path.exists(file_path):
                 logger.error(f"🚫 File does not exist: {file_path}")
                 raise FileNotFoundError(f"File does not exist: {file_path}")
-
+
             # Check the file size
             file_size = os.path.getsize(file_path)
             logger.info(f"📏 File size: {file_size} bytes")
-
+
             title = self.get_file_extension(file_path)
             logger.debug(f"🏷️ Extracted file title: {title}")
-
+
             # Convert to txt via soffice, then read the content
             logger.info("📝 Converting DOC to TXT via soffice and reading the content")
             content = self.read_doc_file(doc_path=file_path)
-
+
             # Decide, based on to_markdown, whether to keep the original format or produce markdown
             if self.to_markdown:
                 # Simple text-to-markdown conversion (keeps paragraph structure)
@@ -154,22 +158,26 @@
             else:
                 mk_content = content
                 logger.info("📝 Keeping the original text format")
-
+
             logger.info(f"🎊 Content parsing finished, final length: {len(mk_content)} characters")
-
-            lifecycle = self.generate_lifecycle(
-                source_file=file_path, domain='Technology', usage_purpose='Documentation', life_type='LLM_ORIGIN')
+
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
             logger.debug("⚙️ Lifecycle info generated")
-
+
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
-
+
             result = output_vo.to_dict()
             logger.info(f"🏆 DOC file parsed: {file_path}")
             logger.debug(f"🔑 Result keys: {list(result.keys())}")
-
+
             return result
-
+
         except Exception as e:
             logger.error(f"💀 Failed to parse DOC file: {file_path}, error: {str(e)}")
             raise
```
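`parse()` gains no new behavior either; the visible change is the `generate_lifecycle` call reflowed onto one keyword argument per line. Callers normally reach this parser through the `DataMax` facade shown in the README rather than instantiating `DocParser` directly; a usage sketch (the file name is a placeholder):

```python
from datamax import DataMax

# A .doc input goes through the soffice -> txt -> (optional) markdown pipeline.
dm = DataMax(file_path="report.doc", to_markdown=True)
data = dm.get_data()  # ultimately a dict built by MarkdownOutputVo.to_dict()
```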
```diff
@@ -178,18 +186,18 @@ class DocParser(BaseLife):
         """Format plain text as simple markdown."""
         if not content.strip():
             return content
-
-        lines = content.split('\n')
+
+        lines = content.split("\n")
         formatted_lines = []
-
+
         for line in lines:
             line = line.strip()
             if not line:
-                formatted_lines.append('')
+                formatted_lines.append("")
                 continue
-
+
             # Simple markdown formatting rules
             # More rules can be added as needed
             formatted_lines.append(line)
-
-        return '\n'.join(formatted_lines)
+
+        return "\n".join(formatted_lines)
```
|