pydatamax 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydatamax-0.1.13 → pydatamax-0.1.15}/LICENSE +21 -21
- pydatamax-0.1.15/PKG-INFO +340 -0
- pydatamax-0.1.15/README.md +279 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/__init__.py +1 -1
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/loader/__init__.py +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/loader/core.py +118 -118
- pydatamax-0.1.13/datamax/loader/MinioHandler.py → pydatamax-0.1.15/datamax/loader/minio_handler.py +171 -171
- pydatamax-0.1.13/datamax/loader/OssHandler.py → pydatamax-0.1.15/datamax/loader/oss_handler.py +191 -191
- pydatamax-0.1.15/datamax/parser/__init__.py +2 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/base.py +76 -76
- pydatamax-0.1.15/datamax/parser/core.py +406 -0
- pydatamax-0.1.15/datamax/parser/csv_parser.py +31 -0
- pydatamax-0.1.15/datamax/parser/doc_parser.py +659 -0
- pydatamax-0.1.15/datamax/parser/docx_parser.py +662 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/epub_parser.py +41 -41
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/html_parser.py +37 -37
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/image_parser.py +34 -34
- pydatamax-0.1.15/datamax/parser/json_parser.py +32 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/md_parser.py +72 -72
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/pdf_parser.py +101 -101
- pydatamax-0.1.15/datamax/parser/ppt_parser.py +124 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/pptx_parser.py +45 -45
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/txt_parser.py +45 -45
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/xls_parser.py +26 -26
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/xlsx_parser.py +212 -208
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/__init__.py +23 -2
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/constants.py +58 -58
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/data_cleaner.py +275 -237
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/env_setup.py +79 -79
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/gotocr_pdf.py +265 -265
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/mineru_operator.py +62 -62
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/paddleocr_pdf_operator.py +90 -90
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/ppt_extract.py +140 -140
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/qa_generator.py +369 -376
- {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/tokenizer.py +21 -21
- pydatamax-0.1.15/datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15/pydatamax.egg-info/PKG-INFO +340 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/SOURCES.txt +7 -4
- {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/dependency_links.txt +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/requires.txt +2 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/top_level.txt +0 -1
- {pydatamax-0.1.13 → pydatamax-0.1.15}/setup.cfg +0 -0
- {pydatamax-0.1.13 → pydatamax-0.1.15}/setup.py +58 -56
- pydatamax-0.1.15/tests/test_doc_parser.py +247 -0
- pydatamax-0.1.15/tests/test_docx_format_analysis.py +340 -0
- pydatamax-0.1.15/tests/test_docx_parser.py +310 -0
- pydatamax-0.1.15/tests/test_wps_doc.py +138 -0
- pydatamax-0.1.13/PKG-INFO +0 -280
- pydatamax-0.1.13/README.md +0 -221
- pydatamax-0.1.13/datamax/parser/__init__.py +0 -4
- pydatamax-0.1.13/datamax/parser/core.py +0 -288
- pydatamax-0.1.13/datamax/parser/csv_parser.py +0 -10
- pydatamax-0.1.13/datamax/parser/doc_parser.py +0 -195
- pydatamax-0.1.13/datamax/parser/docx_parser.py +0 -212
- pydatamax-0.1.13/datamax/parser/json_parser.py +0 -10
- pydatamax-0.1.13/datamax/parser/ppt_parser.py +0 -74
- pydatamax-0.1.13/pydatamax.egg-info/PKG-INFO +0 -280
- pydatamax-0.1.13/tests/__init__.py +0 -0
- pydatamax-0.1.13/tests/test_basic.py +0 -20
@@ -1,21 +1,21 @@
|
|
1
|
-
MIT License
|
2
|
-
|
3
|
-
Copyright (c) 2024 Hi-Dolphin
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
13
|
-
copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Hi-Dolphin
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,340 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: pydatamax
|
3
|
+
Version: 0.1.15
|
4
|
+
Summary: A library for parsing and converting various file formats.
|
5
|
+
Home-page: https://github.com/Hi-Dolphin/datamax
|
6
|
+
Author: ccy
|
7
|
+
Author-email: cy.kron@foxmail.com
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.10
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
License-File: LICENSE
|
14
|
+
Requires-Dist: oss2<3.0.0,>=2.19.1
|
15
|
+
Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
|
16
|
+
Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
|
17
|
+
Requires-Dist: crcmod<2.0.0,>=1.7
|
18
|
+
Requires-Dist: langdetect<2.0.0,>=1.0.9
|
19
|
+
Requires-Dist: loguru<1.0.0,>=0.7.3
|
20
|
+
Requires-Dist: python-docx<2.0.0,>=1.1.2
|
21
|
+
Requires-Dist: python-dotenv<2.0.0,>=1.1.0
|
22
|
+
Requires-Dist: pymupdf<2.0.0,>=1.26.0
|
23
|
+
Requires-Dist: pypdf<6.0.0,>=5.5.0
|
24
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5
|
25
|
+
Requires-Dist: pandas<3.0.0,>=2.2.3
|
26
|
+
Requires-Dist: numpy<3.0.0,>=2.2.6
|
27
|
+
Requires-Dist: requests<3.0.0,>=2.32.3
|
28
|
+
Requires-Dist: tqdm<5.0.0,>=4.67.1
|
29
|
+
Requires-Dist: pydantic<3.0.0,>=2.11.5
|
30
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
|
31
|
+
Requires-Dist: python-magic<1.0.0,>=0.4.27
|
32
|
+
Requires-Dist: PyYAML<7.0.0,>=6.0.2
|
33
|
+
Requires-Dist: Pillow<12.0.0,>=11.2.1
|
34
|
+
Requires-Dist: packaging<25.0,>=24.2
|
35
|
+
Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
|
36
|
+
Requires-Dist: minio<8.0.0,>=7.2.15
|
37
|
+
Requires-Dist: openai<2.0.0,>=1.82.0
|
38
|
+
Requires-Dist: jionlp<2.0.0,>=1.5.23
|
39
|
+
Requires-Dist: chardet<6.0.0,>=5.2.0
|
40
|
+
Requires-Dist: python-pptx<2.0.0,>=1.0.2
|
41
|
+
Requires-Dist: tiktoken<1.0.0,>=0.9.0
|
42
|
+
Requires-Dist: markitdown<1.0.0,>=0.1.1
|
43
|
+
Requires-Dist: xlrd<3.0.0,>=2.0.1
|
44
|
+
Requires-Dist: tabulate<1.0.0,>=0.9.0
|
45
|
+
Requires-Dist: unstructured<1.0.0,>=0.17.2
|
46
|
+
Requires-Dist: markdown<4.0.0,>=3.8
|
47
|
+
Requires-Dist: langchain<1.0.0,>=0.3.0
|
48
|
+
Requires-Dist: langchain-community<1.0.0,>=0.3.0
|
49
|
+
Requires-Dist: ebooklib==0.19
|
50
|
+
Requires-Dist: setuptools
|
51
|
+
Dynamic: author
|
52
|
+
Dynamic: author-email
|
53
|
+
Dynamic: classifier
|
54
|
+
Dynamic: description
|
55
|
+
Dynamic: description-content-type
|
56
|
+
Dynamic: home-page
|
57
|
+
Dynamic: license-file
|
58
|
+
Dynamic: requires-dist
|
59
|
+
Dynamic: requires-python
|
60
|
+
Dynamic: summary
|
61
|
+
|
62
|
+
# DataMax
|
63
|
+
|
64
|
+
<div align="center">
|
65
|
+
|
66
|
+
[中文](README_zh.md) | **English**
|
67
|
+
|
68
|
+
[](https://badge.fury.io/py/pydatamax) [](https://www.python.org/downloads/) [](https://opensource.org/licenses/MIT)
|
69
|
+
|
70
|
+
</div>
|
71
|
+
|
72
|
+
A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
|
73
|
+
|
74
|
+
## ✨ Core Features
|
75
|
+
|
76
|
+
- 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
|
77
|
+
- 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
|
78
|
+
- 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
|
79
|
+
- ⚡ **Batch Processing**: Efficient multi-file parallel processing
|
80
|
+
- 🎯 **Easy Integration**: Clean API design, ready to use out of the box
|
81
|
+
|
82
|
+
## 🚀 Quick Start
|
83
|
+
|
84
|
+
### Installation
|
85
|
+
|
86
|
+
```bash
|
87
|
+
pip install pydatamax
|
88
|
+
```
|
89
|
+
|
90
|
+
### Basic Usage
|
91
|
+
|
92
|
+
```python
|
93
|
+
from datamax import DataMax
|
94
|
+
|
95
|
+
# Parse a single file
|
96
|
+
dm = DataMax(file_path="document.pdf")
|
97
|
+
data = dm.get_data()
|
98
|
+
|
99
|
+
# Batch processing
|
100
|
+
dm = DataMax(file_path=["file1.docx", "file2.pdf"])
|
101
|
+
data = dm.get_data()
|
102
|
+
|
103
|
+
# Data cleaning
|
104
|
+
cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
|
105
|
+
|
106
|
+
# AI annotation
|
107
|
+
qa_data = dm.get_pre_label(
|
108
|
+
api_key="sk-xxx",
|
109
|
+
base_url="https://api.provider.com/v1",
|
110
|
+
model_name="model-name",
|
111
|
+
chunk_size=500, # Text chunk size
|
112
|
+
chunk_overlap=100, # Overlap length
|
113
|
+
question_number=5, # Questions per chunk
|
114
|
+
max_workers=5 # Concurrency
|
115
|
+
)
|
116
|
+
dm.save_label_data(qa_data)
|
117
|
+
```
|
118
|
+
|
119
|
+
## 📖 Detailed Documentation
|
120
|
+
|
121
|
+
### File Parsing
|
122
|
+
|
123
|
+
#### Supported Formats
|
124
|
+
|
125
|
+
| Format | Extensions | Special Features |
|
126
|
+
|--------|------------|------------------|
|
127
|
+
| Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
|
128
|
+
| Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
|
129
|
+
| Presentations | `.pptx`, `.ppt` | Slide content extraction |
|
130
|
+
| Web | `.html`, `.epub` | Tag parsing |
|
131
|
+
| Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
|
132
|
+
| Text | `.txt` | Automatic encoding detection |
|
133
|
+
|
134
|
+
#### Advanced Features
|
135
|
+
|
136
|
+
```python
|
137
|
+
# Advanced PDF parsing (requires MinerU)
|
138
|
+
dm = DataMax(file_path="complex.pdf", use_mineru=True)
|
139
|
+
|
140
|
+
# Word to Markdown conversion
|
141
|
+
dm = DataMax(file_path="document.docx", to_markdown=True)
|
142
|
+
|
143
|
+
# Image OCR
|
144
|
+
dm = DataMax(file_path="image.jpg", use_ocr=True)
|
145
|
+
```
|
146
|
+
### Batch Processing
|
147
|
+
```python
|
148
|
+
# Parse multiple files in batch
|
149
|
+
dm = DataMax(
|
150
|
+
file_path=["file1.pdf", "file2.docx"],
|
151
|
+
use_mineru=True
|
152
|
+
)
|
153
|
+
data = dm.get_data()
|
154
|
+
```
|
155
|
+
|
156
|
+
### Cache parsed results
|
157
|
+
```python
|
158
|
+
# Cache parsed results to avoid repeated parsing
|
159
|
+
dm = DataMax(
|
160
|
+
file_path=["file1.pdf", "file2.docx"],
|
161
|
+
ttl=3600 # Cache duration in seconds, default 3600s, 0 means no caching
|
162
|
+
)
|
163
|
+
data = dm.get_data()
|
164
|
+
```
|
165
|
+
|
166
|
+
### Data Cleaning
|
167
|
+
#### Exception Handling
|
168
|
+
|
169
|
+
- remove_abnormal_chars Remove abnormal characters from text
|
170
|
+
- remove_html_tags Remove HTML tags
|
171
|
+
- convert_newlines Convert \r to \n and merge multiple \n into single \n
|
172
|
+
- single_space Convert multiple spaces (more than 2) to single space
|
173
|
+
- tabs_to_spaces Convert tabs to 4 spaces
|
174
|
+
- remove_invisible_chars Remove invisible ASCII characters
|
175
|
+
- simplify_chinese Convert traditional Chinese to simplified Chinese
|
176
|
+
|
177
|
+
#### Text Filtering
|
178
|
+
|
179
|
+
- filter_by_word_repetition Filter by word repetition rate
|
180
|
+
- filter_by_char_count Filter by character count
|
181
|
+
- filter_by_numeric_content Filter by numeric content ratio
|
182
|
+
|
183
|
+
#### Privacy Desensitization
|
184
|
+
|
185
|
+
- replace_ip
|
186
|
+
- replace_email
|
187
|
+
- replace_customer_number Clean hotline numbers like 4008-123-123
|
188
|
+
- replace_bank_id
|
189
|
+
- replace_phone_number
|
190
|
+
- replace_qq
|
191
|
+
- replace_id_card
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
```python
|
196
|
+
# Three cleaning modes
|
197
|
+
dm.clean_data(method_list=[
|
198
|
+
"abnormal", # Anomaly data processing
|
199
|
+
"private", # Privacy information masking
|
200
|
+
"filter" # Text filtering and normalization
|
201
|
+
])
|
202
|
+
|
203
|
+
# Custom cleaning mode
|
204
|
+
from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
|
205
|
+
dm = DataMax(
|
206
|
+
file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
|
207
|
+
)
|
208
|
+
parsed_data = dm.get_data().get('content')
|
209
|
+
# 1. Text filtering
|
210
|
+
tf = TextFilter(parsed_data=parsed_data)
|
211
|
+
# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
|
212
|
+
tf_bool = tf.filter_by_word_repetition(threshold=0.6)
|
213
|
+
if tf_bool:
|
214
|
+
print("Text passed word repetition filtering")
|
215
|
+
else:
|
216
|
+
print("Text failed word repetition filtering")
|
217
|
+
|
218
|
+
# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
|
219
|
+
tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
|
220
|
+
if tf_bool:
|
221
|
+
print("Text passed character count filtering")
|
222
|
+
else:
|
223
|
+
print("Text failed character count filtering")
|
224
|
+
|
225
|
+
# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
|
226
|
+
tf_bool = tf.filter_by_numeric_content(threshold=0.6)
|
227
|
+
if tf_bool:
|
228
|
+
print("Text passed numeric ratio filtering")
|
229
|
+
else:
|
230
|
+
print("Text failed numeric ratio filtering")
|
231
|
+
|
232
|
+
# 2. Privacy desensitization
|
233
|
+
pd = PrivacyDesensitization(parsed_data=parsed_data)
|
234
|
+
res = pd.replace_ip(
|
235
|
+
token="MyIP"
|
236
|
+
)
|
237
|
+
print(res)
|
238
|
+
|
239
|
+
# 3. Abnormal character cleaning
|
240
|
+
ac = AbnormalCleaner(parsed_data=parsed_data)
|
241
|
+
res = ac.remove_abnormal_chars()
|
242
|
+
res = ac.remove_html_tags()
|
243
|
+
res = ac.convert_newlines()
|
244
|
+
res = ac.single_space()
|
245
|
+
res = ac.tabs_to_spaces()
|
246
|
+
res = ac.remove_invisible_chars()
|
247
|
+
res = ac.simplify_chinese()
|
248
|
+
print(res)
|
249
|
+
```
|
250
|
+
# Text Segmentation
|
251
|
+
```python
|
252
|
+
dm.split_data(
|
253
|
+
chunk_size=500, # Chunk size
|
254
|
+
chunk_overlap=100, # Overlap length
|
255
|
+
use_langchain=True # Use LangChain for text segmentation
|
256
|
+
)
|
257
|
+
|
258
|
+
# When use_langchain is False, use custom segmentation method
|
259
|
+
# Using 。!? as separators, consecutive separators will be merged
|
260
|
+
# chunk_size strictly limits the string length
|
261
|
+
for chunk in dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
|
262
|
+
print(chunk)
|
263
|
+
```
|
264
|
+
|
265
|
+
### AI Annotation
|
266
|
+
|
267
|
+
```python
|
268
|
+
# Custom annotation tasks
|
269
|
+
qa_data = dm.get_pre_label(
|
270
|
+
api_key="sk-xxx",
|
271
|
+
base_url="https://api.provider.com/v1",
|
272
|
+
model_name="model-name",
|
273
|
+
chunk_size=500, # Text chunk size
|
274
|
+
chunk_overlap=100, # Overlap length
|
275
|
+
question_number=5, # Questions per chunk
|
276
|
+
max_workers=5 # Concurrency
|
277
|
+
)
|
278
|
+
```
|
279
|
+
|
280
|
+
## ⚙️ Environment Setup
|
281
|
+
|
282
|
+
### Optional Dependencies
|
283
|
+
|
284
|
+
#### LibreOffice (DOC file support)
|
285
|
+
|
286
|
+
**Ubuntu/Debian:**
|
287
|
+
```bash
|
288
|
+
sudo apt-get install libreoffice
|
289
|
+
```
|
290
|
+
|
291
|
+
**Windows:**
|
292
|
+
1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
|
293
|
+
2. Add to environment variables: `C:\Program Files\LibreOffice\program`
|
294
|
+
|
295
|
+
#### MinerU (Advanced PDF parsing)
|
296
|
+
|
297
|
+
```bash
|
298
|
+
# Create virtual environment
|
299
|
+
conda create -n mineru python=3.10
|
300
|
+
conda activate mineru
|
301
|
+
|
302
|
+
# Install MinerU
|
303
|
+
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
|
304
|
+
```
|
305
|
+
|
306
|
+
For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
|
307
|
+
|
308
|
+
## 🛠️ Development
|
309
|
+
|
310
|
+
### Local Installation
|
311
|
+
|
312
|
+
```bash
|
313
|
+
git clone https://github.com/Hi-Dolphin/datamax.git
|
314
|
+
cd datamax
|
315
|
+
pip install -r requirements.txt
|
316
|
+
python setup.py install
|
317
|
+
```
|
318
|
+
|
319
|
+
## 📋 System Requirements
|
320
|
+
|
321
|
+
- Python >= 3.10
|
322
|
+
- Supports Windows, macOS, Linux
|
323
|
+
|
324
|
+
## 🤝 Contributing
|
325
|
+
|
326
|
+
Issues and Pull Requests are welcome!
|
327
|
+
|
328
|
+
## 📄 License
|
329
|
+
|
330
|
+
This project is licensed under the [MIT License](LICENSE).
|
331
|
+
|
332
|
+
## 📞 Contact Us
|
333
|
+
|
334
|
+
- 📧 Email: cy.kron@foxmail.com
|
335
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
|
336
|
+
- 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
|
337
|
+
|
338
|
+
---
|
339
|
+
|
340
|
+
⭐ If this project helps you, please give us a star!
|
@@ -0,0 +1,279 @@
|
|
1
|
+
# DataMax
|
2
|
+
|
3
|
+
<div align="center">
|
4
|
+
|
5
|
+
[中文](README_zh.md) | **English**
|
6
|
+
|
7
|
+
[](https://badge.fury.io/py/pydatamax) [](https://www.python.org/downloads/) [](https://opensource.org/licenses/MIT)
|
8
|
+
|
9
|
+
</div>
|
10
|
+
|
11
|
+
A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
|
12
|
+
|
13
|
+
## ✨ Core Features
|
14
|
+
|
15
|
+
- 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
|
16
|
+
- 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
|
17
|
+
- 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
|
18
|
+
- ⚡ **Batch Processing**: Efficient multi-file parallel processing
|
19
|
+
- 🎯 **Easy Integration**: Clean API design, ready to use out of the box
|
20
|
+
|
21
|
+
## 🚀 Quick Start
|
22
|
+
|
23
|
+
### Installation
|
24
|
+
|
25
|
+
```bash
|
26
|
+
pip install pydatamax
|
27
|
+
```
|
28
|
+
|
29
|
+
### Basic Usage
|
30
|
+
|
31
|
+
```python
|
32
|
+
from datamax import DataMax
|
33
|
+
|
34
|
+
# Parse a single file
|
35
|
+
dm = DataMax(file_path="document.pdf")
|
36
|
+
data = dm.get_data()
|
37
|
+
|
38
|
+
# Batch processing
|
39
|
+
dm = DataMax(file_path=["file1.docx", "file2.pdf"])
|
40
|
+
data = dm.get_data()
|
41
|
+
|
42
|
+
# Data cleaning
|
43
|
+
cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
|
44
|
+
|
45
|
+
# AI annotation
|
46
|
+
qa_data = dm.get_pre_label(
|
47
|
+
api_key="sk-xxx",
|
48
|
+
base_url="https://api.provider.com/v1",
|
49
|
+
model_name="model-name",
|
50
|
+
chunk_size=500, # Text chunk size
|
51
|
+
chunk_overlap=100, # Overlap length
|
52
|
+
question_number=5, # Questions per chunk
|
53
|
+
max_workers=5 # Concurrency
|
54
|
+
)
|
55
|
+
dm.save_label_data(qa_data)
|
56
|
+
```
|
57
|
+
|
58
|
+
## 📖 Detailed Documentation
|
59
|
+
|
60
|
+
### File Parsing
|
61
|
+
|
62
|
+
#### Supported Formats
|
63
|
+
|
64
|
+
| Format | Extensions | Special Features |
|
65
|
+
|--------|------------|------------------|
|
66
|
+
| Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
|
67
|
+
| Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
|
68
|
+
| Presentations | `.pptx`, `.ppt` | Slide content extraction |
|
69
|
+
| Web | `.html`, `.epub` | Tag parsing |
|
70
|
+
| Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
|
71
|
+
| Text | `.txt` | Automatic encoding detection |
|
72
|
+
|
73
|
+
#### Advanced Features
|
74
|
+
|
75
|
+
```python
|
76
|
+
# Advanced PDF parsing (requires MinerU)
|
77
|
+
dm = DataMax(file_path="complex.pdf", use_mineru=True)
|
78
|
+
|
79
|
+
# Word to Markdown conversion
|
80
|
+
dm = DataMax(file_path="document.docx", to_markdown=True)
|
81
|
+
|
82
|
+
# Image OCR
|
83
|
+
dm = DataMax(file_path="image.jpg", use_ocr=True)
|
84
|
+
```
|
85
|
+
### Batch Processing
|
86
|
+
```python
|
87
|
+
# Parse multiple files in batch
|
88
|
+
dm = DataMax(
|
89
|
+
file_path=["file1.pdf", "file2.docx"],
|
90
|
+
use_mineru=True
|
91
|
+
)
|
92
|
+
data = dm.get_data()
|
93
|
+
```
|
94
|
+
|
95
|
+
### Cache parsed results
|
96
|
+
```python
|
97
|
+
# Cache parsed results to avoid repeated parsing
|
98
|
+
dm = DataMax(
|
99
|
+
file_path=["file1.pdf", "file2.docx"],
|
100
|
+
ttl=3600 # Cache duration in seconds, default 3600s, 0 means no caching
|
101
|
+
)
|
102
|
+
data = dm.get_data()
|
103
|
+
```
|
104
|
+
|
105
|
+
### Data Cleaning
|
106
|
+
#### Exception Handling
|
107
|
+
|
108
|
+
- remove_abnormal_chars Remove abnormal characters from text
|
109
|
+
- remove_html_tags Remove HTML tags
|
110
|
+
- convert_newlines Convert \r to \n and merge multiple \n into single \n
|
111
|
+
- single_space Convert multiple spaces (more than 2) to single space
|
112
|
+
- tabs_to_spaces Convert tabs to 4 spaces
|
113
|
+
- remove_invisible_chars Remove invisible ASCII characters
|
114
|
+
- simplify_chinese Convert traditional Chinese to simplified Chinese
|
115
|
+
|
116
|
+
#### Text Filtering
|
117
|
+
|
118
|
+
- filter_by_word_repetition Filter by word repetition rate
|
119
|
+
- filter_by_char_count Filter by character count
|
120
|
+
- filter_by_numeric_content Filter by numeric content ratio
|
121
|
+
|
122
|
+
#### Privacy Desensitization
|
123
|
+
|
124
|
+
- replace_ip
|
125
|
+
- replace_email
|
126
|
+
- replace_customer_number Clean hotline numbers like 4008-123-123
|
127
|
+
- replace_bank_id
|
128
|
+
- replace_phone_number
|
129
|
+
- replace_qq
|
130
|
+
- replace_id_card
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
```python
|
135
|
+
# Three cleaning modes
|
136
|
+
dm.clean_data(method_list=[
|
137
|
+
"abnormal", # Anomaly data processing
|
138
|
+
"private", # Privacy information masking
|
139
|
+
"filter" # Text filtering and normalization
|
140
|
+
])
|
141
|
+
|
142
|
+
# Custom cleaning mode
|
143
|
+
from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
|
144
|
+
dm = DataMax(
|
145
|
+
file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
|
146
|
+
)
|
147
|
+
parsed_data = dm.get_data().get('content')
|
148
|
+
# 1. Text filtering
|
149
|
+
tf = TextFilter(parsed_data=parsed_data)
|
150
|
+
# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
|
151
|
+
tf_bool = tf.filter_by_word_repetition(threshold=0.6)
|
152
|
+
if tf_bool:
|
153
|
+
print("Text passed word repetition filtering")
|
154
|
+
else:
|
155
|
+
print("Text failed word repetition filtering")
|
156
|
+
|
157
|
+
# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
|
158
|
+
tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
|
159
|
+
if tf_bool:
|
160
|
+
print("Text passed character count filtering")
|
161
|
+
else:
|
162
|
+
print("Text failed character count filtering")
|
163
|
+
|
164
|
+
# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
|
165
|
+
tf_bool = tf.filter_by_numeric_content(threshold=0.6)
|
166
|
+
if tf_bool:
|
167
|
+
print("Text passed numeric ratio filtering")
|
168
|
+
else:
|
169
|
+
print("Text failed numeric ratio filtering")
|
170
|
+
|
171
|
+
# 2. Privacy desensitization
|
172
|
+
pd = PrivacyDesensitization(parsed_data=parsed_data)
|
173
|
+
res = pd.replace_ip(
|
174
|
+
token="MyIP"
|
175
|
+
)
|
176
|
+
print(res)
|
177
|
+
|
178
|
+
# 3. Abnormal character cleaning
|
179
|
+
ac = AbnormalCleaner(parsed_data=parsed_data)
|
180
|
+
res = ac.remove_abnormal_chars()
|
181
|
+
res = ac.remove_html_tags()
|
182
|
+
res = ac.convert_newlines()
|
183
|
+
res = ac.single_space()
|
184
|
+
res = ac.tabs_to_spaces()
|
185
|
+
res = ac.remove_invisible_chars()
|
186
|
+
res = ac.simplify_chinese()
|
187
|
+
print(res)
|
188
|
+
```
|
189
|
+
# Text Segmentation
|
190
|
+
```python
|
191
|
+
dm.split_data(
|
192
|
+
chunk_size=500, # Chunk size
|
193
|
+
chunk_overlap=100, # Overlap length
|
194
|
+
use_langchain=True # Use LangChain for text segmentation
|
195
|
+
)
|
196
|
+
|
197
|
+
# When use_langchain is False, use custom segmentation method
|
198
|
+
# Using 。!? as separators, consecutive separators will be merged
|
199
|
+
# chunk_size strictly limits the string length
|
200
|
+
for chunk in dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
|
201
|
+
print(chunk)
|
202
|
+
```
|
203
|
+
|
204
|
+
### AI Annotation
|
205
|
+
|
206
|
+
```python
|
207
|
+
# Custom annotation tasks
|
208
|
+
qa_data = dm.get_pre_label(
|
209
|
+
api_key="sk-xxx",
|
210
|
+
base_url="https://api.provider.com/v1",
|
211
|
+
model_name="model-name",
|
212
|
+
chunk_size=500, # Text chunk size
|
213
|
+
chunk_overlap=100, # Overlap length
|
214
|
+
question_number=5, # Questions per chunk
|
215
|
+
max_workers=5 # Concurrency
|
216
|
+
)
|
217
|
+
```
|
218
|
+
|
219
|
+
## ⚙️ Environment Setup
|
220
|
+
|
221
|
+
### Optional Dependencies
|
222
|
+
|
223
|
+
#### LibreOffice (DOC file support)
|
224
|
+
|
225
|
+
**Ubuntu/Debian:**
|
226
|
+
```bash
|
227
|
+
sudo apt-get install libreoffice
|
228
|
+
```
|
229
|
+
|
230
|
+
**Windows:**
|
231
|
+
1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
|
232
|
+
2. Add to environment variables: `C:\Program Files\LibreOffice\program`
|
233
|
+
|
234
|
+
#### MinerU (Advanced PDF parsing)
|
235
|
+
|
236
|
+
```bash
|
237
|
+
# Create virtual environment
|
238
|
+
conda create -n mineru python=3.10
|
239
|
+
conda activate mineru
|
240
|
+
|
241
|
+
# Install MinerU
|
242
|
+
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
|
243
|
+
```
|
244
|
+
|
245
|
+
For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
|
246
|
+
|
247
|
+
## 🛠️ Development
|
248
|
+
|
249
|
+
### Local Installation
|
250
|
+
|
251
|
+
```bash
|
252
|
+
git clone https://github.com/Hi-Dolphin/datamax.git
|
253
|
+
cd datamax
|
254
|
+
pip install -r requirements.txt
|
255
|
+
python setup.py install
|
256
|
+
```
|
257
|
+
|
258
|
+
## 📋 System Requirements
|
259
|
+
|
260
|
+
- Python >= 3.10
|
261
|
+
- Supports Windows, macOS, Linux
|
262
|
+
|
263
|
+
## 🤝 Contributing
|
264
|
+
|
265
|
+
Issues and Pull Requests are welcome!
|
266
|
+
|
267
|
+
## 📄 License
|
268
|
+
|
269
|
+
This project is licensed under the [MIT License](LICENSE).
|
270
|
+
|
271
|
+
## 📞 Contact Us
|
272
|
+
|
273
|
+
- 📧 Email: cy.kron@foxmail.com
|
274
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
|
275
|
+
- 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
|
276
|
+
|
277
|
+
---
|
278
|
+
|
279
|
+
⭐ If this project helps you, please give us a star!
|
@@ -1 +1 @@
|
|
1
|
-
from .parser import DataMax
|
1
|
+
from .parser import DataMax
|
File without changes
|