pydatamax 0.1.13__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {pydatamax-0.1.13 → pydatamax-0.1.15}/LICENSE +21 -21
  2. pydatamax-0.1.15/PKG-INFO +340 -0
  3. pydatamax-0.1.15/README.md +279 -0
  4. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/__init__.py +1 -1
  5. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/loader/__init__.py +0 -0
  6. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/loader/core.py +118 -118
  7. pydatamax-0.1.13/datamax/loader/MinioHandler.py → pydatamax-0.1.15/datamax/loader/minio_handler.py +171 -171
  8. pydatamax-0.1.13/datamax/loader/OssHandler.py → pydatamax-0.1.15/datamax/loader/oss_handler.py +191 -191
  9. pydatamax-0.1.15/datamax/parser/__init__.py +2 -0
  10. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/base.py +76 -76
  11. pydatamax-0.1.15/datamax/parser/core.py +406 -0
  12. pydatamax-0.1.15/datamax/parser/csv_parser.py +31 -0
  13. pydatamax-0.1.15/datamax/parser/doc_parser.py +659 -0
  14. pydatamax-0.1.15/datamax/parser/docx_parser.py +662 -0
  15. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/epub_parser.py +41 -41
  16. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/html_parser.py +37 -37
  17. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/image_parser.py +34 -34
  18. pydatamax-0.1.15/datamax/parser/json_parser.py +32 -0
  19. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/md_parser.py +72 -72
  20. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/pdf_parser.py +101 -101
  21. pydatamax-0.1.15/datamax/parser/ppt_parser.py +124 -0
  22. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/pptx_parser.py +45 -45
  23. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/txt_parser.py +45 -45
  24. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/xls_parser.py +26 -26
  25. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/parser/xlsx_parser.py +212 -208
  26. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/__init__.py +23 -2
  27. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/constants.py +58 -58
  28. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/data_cleaner.py +275 -237
  29. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/env_setup.py +79 -79
  30. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/gotocr_pdf.py +265 -265
  31. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/mineru_operator.py +62 -62
  32. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/paddleocr_pdf_operator.py +90 -90
  33. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/ppt_extract.py +140 -140
  34. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/qa_generator.py +369 -376
  35. {pydatamax-0.1.13 → pydatamax-0.1.15}/datamax/utils/tokenizer.py +21 -21
  36. pydatamax-0.1.15/datamax/utils/uno_handler.py +426 -0
  37. pydatamax-0.1.15/pydatamax.egg-info/PKG-INFO +340 -0
  38. {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/SOURCES.txt +7 -4
  39. {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/dependency_links.txt +0 -0
  40. {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/requires.txt +2 -0
  41. {pydatamax-0.1.13 → pydatamax-0.1.15}/pydatamax.egg-info/top_level.txt +0 -1
  42. {pydatamax-0.1.13 → pydatamax-0.1.15}/setup.cfg +0 -0
  43. {pydatamax-0.1.13 → pydatamax-0.1.15}/setup.py +58 -56
  44. pydatamax-0.1.15/tests/test_doc_parser.py +247 -0
  45. pydatamax-0.1.15/tests/test_docx_format_analysis.py +340 -0
  46. pydatamax-0.1.15/tests/test_docx_parser.py +310 -0
  47. pydatamax-0.1.15/tests/test_wps_doc.py +138 -0
  48. pydatamax-0.1.13/PKG-INFO +0 -280
  49. pydatamax-0.1.13/README.md +0 -221
  50. pydatamax-0.1.13/datamax/parser/__init__.py +0 -4
  51. pydatamax-0.1.13/datamax/parser/core.py +0 -288
  52. pydatamax-0.1.13/datamax/parser/csv_parser.py +0 -10
  53. pydatamax-0.1.13/datamax/parser/doc_parser.py +0 -195
  54. pydatamax-0.1.13/datamax/parser/docx_parser.py +0 -212
  55. pydatamax-0.1.13/datamax/parser/json_parser.py +0 -10
  56. pydatamax-0.1.13/datamax/parser/ppt_parser.py +0 -74
  57. pydatamax-0.1.13/pydatamax.egg-info/PKG-INFO +0 -280
  58. pydatamax-0.1.13/tests/__init__.py +0 -0
  59. pydatamax-0.1.13/tests/test_basic.py +0 -20
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2024 Hi-Dolphin
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Hi-Dolphin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.15
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/Hi-Dolphin/datamax
6
+ Author: ccy
7
+ Author-email: cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: oss2<3.0.0,>=2.19.1
15
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
16
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
17
+ Requires-Dist: crcmod<2.0.0,>=1.7
18
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
19
+ Requires-Dist: loguru<1.0.0,>=0.7.3
20
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
21
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
22
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
23
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
24
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
25
+ Requires-Dist: pandas<3.0.0,>=2.2.3
26
+ Requires-Dist: numpy<3.0.0,>=2.2.6
27
+ Requires-Dist: requests<3.0.0,>=2.32.3
28
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
29
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
30
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
31
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
32
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
33
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
34
+ Requires-Dist: packaging<25.0,>=24.2
35
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
36
+ Requires-Dist: minio<8.0.0,>=7.2.15
37
+ Requires-Dist: openai<2.0.0,>=1.82.0
38
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
39
+ Requires-Dist: chardet<6.0.0,>=5.2.0
40
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
41
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
42
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
43
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
44
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
45
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
46
+ Requires-Dist: markdown<4.0.0,>=3.8
47
+ Requires-Dist: langchain<1.0.0,>=0.3.0
48
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
49
+ Requires-Dist: ebooklib==0.19
50
+ Requires-Dist: setuptools
51
+ Dynamic: author
52
+ Dynamic: author-email
53
+ Dynamic: classifier
54
+ Dynamic: description
55
+ Dynamic: description-content-type
56
+ Dynamic: home-page
57
+ Dynamic: license-file
58
+ Dynamic: requires-dist
59
+ Dynamic: requires-python
60
+ Dynamic: summary
61
+
62
+ # DataMax
63
+
64
+ <div align="center">
65
+
66
+ [中文](README_zh.md) | **English**
67
+
68
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
69
+
70
+ </div>
71
+
72
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
73
+
74
+ ## ✨ Core Features
75
+
76
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
77
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
78
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
79
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
80
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
81
+
82
+ ## 🚀 Quick Start
83
+
84
+ ### Installation
85
+
86
+ ```bash
87
+ pip install pydatamax
88
+ ```
89
+
90
+ ### Basic Usage
91
+
92
+ ```python
93
+ from datamax import DataMax
94
+
95
+ # Parse a single file
96
+ dm = DataMax(file_path="document.pdf")
97
+ data = dm.get_data()
98
+
99
+ # Batch processing
100
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
101
+ data = dm.get_data()
102
+
103
+ # Data cleaning
104
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
105
+
106
+ # AI annotation
107
+ qa_data = dm.get_pre_label(
108
+ api_key="sk-xxx",
109
+ base_url="https://api.provider.com/v1",
110
+ model_name="model-name",
111
+ chunk_size=500, # Text chunk size
112
+ chunk_overlap=100, # Overlap length
113
+ question_number=5, # Questions per chunk
114
+ max_workers=5 # Concurrency
115
+ )
116
+ dm.save_label_data(qa_data)
117
+ ```
118
+
119
+ ## 📖 Detailed Documentation
120
+
121
+ ### File Parsing
122
+
123
+ #### Supported Formats
124
+
125
+ | Format | Extensions | Special Features |
126
+ |--------|------------|------------------|
127
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
128
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
129
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
130
+ | Web | `.html`, `.epub` | Tag parsing |
131
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
132
+ | Text | `.txt` | Automatic encoding detection |
133
+
134
+ #### Advanced Features
135
+
136
+ ```python
137
+ # Advanced PDF parsing (requires MinerU)
138
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
139
+
140
+ # Word to Markdown conversion
141
+ dm = DataMax(file_path="document.docx", to_markdown=True)
142
+
143
+ # Image OCR
144
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
145
+ ```
146
+ ### Batch Processing
147
+ ```python
148
+ # Parse multiple files in batch
149
+ dm = DataMax(
150
+ file_path=["file1.pdf", "file2.docx"],
151
+ use_mineru=True
152
+ )
153
+ data = dm.get_data()
154
+ ```
155
+
156
+ ### Cache parsed results
157
+ ```python
158
+ # Cache parsed results to avoid repeated parsing
159
+ dm = DataMax(
160
+ file_path=["file1.pdf", "file2.docx"],
161
+ ttl=3600 # Cache duration in seconds, default 3600s, 0 means no caching
162
+ )
163
+ data = dm.get_data()
164
+ ```
165
+
166
+ ### Data Cleaning
167
+ #### Exception Handling
168
+
169
+ - remove_abnormal_chars Remove abnormal characters from text
170
+ - remove_html_tags Remove HTML tags
171
+ - convert_newlines Convert \r to \n and merge multiple \n into single \n
172
+ - single_space Convert multiple spaces (more than 2) to single space
173
+ - tabs_to_spaces Convert tabs to 4 spaces
174
+ - remove_invisible_chars Remove invisible ASCII characters
175
+ - simplify_chinese Convert traditional Chinese to simplified Chinese
176
+
177
+ #### Text Filtering
178
+
179
+ - filter_by_word_repetition Filter by word repetition rate
180
+ - filter_by_char_count Filter by character count
181
+ - filter_by_numeric_content Filter by numeric content ratio
182
+
183
+ #### Privacy Desensitization
184
+
185
+ - replace_ip
186
+ - replace_email
187
+ - replace_customer_number Clean hotline numbers like 4008-123-123
188
+ - replace_bank_id
189
+ - replace_phone_number
190
+ - replace_qq
191
+ - replace_id_card
192
+
193
+
194
+
195
+ ```python
196
+ # Three cleaning modes
197
+ dm.clean_data(method_list=[
198
+ "abnormal", # Anomaly data processing
199
+ "private", # Privacy information masking
200
+ "filter" # Text filtering and normalization
201
+ ])
202
+
203
+ # Custom cleaning mode
204
+ from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
205
+ dm = DataMax(
206
+ file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
207
+ )
208
+ parsed_data = dm.get_data().get('content')
209
+ # 1. Text filtering
210
+ tf = TextFilter(parsed_data=parsed_data)
211
+ # Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
212
+ tf_bool = tf.filter_by_word_repetition(threshold=0.6)
213
+ if tf_bool:
214
+ print("Text passed word repetition filtering")
215
+ else:
216
+ print("Text failed word repetition filtering")
217
+
218
+ # Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
219
+ tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
220
+ if tf_bool:
221
+ print("Text passed character count filtering")
222
+ else:
223
+ print("Text failed character count filtering")
224
+
225
+ # Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
226
+ tf_bool = tf.filter_by_numeric_content(threshold=0.6)
227
+ if tf_bool:
228
+ print("Text passed numeric ratio filtering")
229
+ else:
230
+ print("Text failed numeric ratio filtering")
231
+
232
+ # 2. Privacy desensitization
233
+ pd = PrivacyDesensitization(parsed_data=parsed_data)
234
+ res = pd.replace_ip(
235
+ token="MyIP"
236
+ )
237
+ print(res)
238
+
239
+ # 3. Abnormal character cleaning
240
+ ac = AbnormalCleaner(parsed_data=parsed_data)
241
+ res = ac.remove_abnormal_chars()
242
+ res = ac.remove_html_tags()
243
+ res = ac.convert_newlines()
244
+ res = ac.single_space()
245
+ res = ac.tabs_to_spaces()
246
+ res = ac.remove_invisible_chars()
247
+ res = ac.simplify_chinese()
248
+ print(res)
249
+ ```
250
+ ### Text Segmentation
251
+ ```python
252
+ dm.split_data(
253
+ chunk_size=500, # Chunk size
254
+ chunk_overlap=100, # Overlap length
255
+ use_langchain=True # Use LangChain for text segmentation
256
+ )
257
+
258
+ # When use_langchain is False, use custom segmentation method
259
+ # Using 。!? as separators, consecutive separators will be merged
260
+ # chunk_size strictly limits the string length
261
+ for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
262
+ print(chunk)
263
+ ```
264
+
265
+ ### AI Annotation
266
+
267
+ ```python
268
+ # Custom annotation tasks
269
+ qa_data = dm.get_pre_label(
270
+ api_key="sk-xxx",
271
+ base_url="https://api.provider.com/v1",
272
+ model_name="model-name",
273
+ chunk_size=500, # Text chunk size
274
+ chunk_overlap=100, # Overlap length
275
+ question_number=5, # Questions per chunk
276
+ max_workers=5 # Concurrency
277
+ )
278
+ ```
279
+
280
+ ## ⚙️ Environment Setup
281
+
282
+ ### Optional Dependencies
283
+
284
+ #### LibreOffice (DOC file support)
285
+
286
+ **Ubuntu/Debian:**
287
+ ```bash
288
+ sudo apt-get install libreoffice
289
+ ```
290
+
291
+ **Windows:**
292
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
293
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
294
+
295
+ #### MinerU (Advanced PDF parsing)
296
+
297
+ ```bash
298
+ # Create virtual environment
299
+ conda create -n mineru python=3.10
300
+ conda activate mineru
301
+
302
+ # Install MinerU
303
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
304
+ ```
305
+
306
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
307
+
308
+ ## 🛠️ Development
309
+
310
+ ### Local Installation
311
+
312
+ ```bash
313
+ git clone https://github.com/Hi-Dolphin/datamax.git
314
+ cd datamax
315
+ pip install -r requirements.txt
316
+ python setup.py install
317
+ ```
318
+
319
+ ## 📋 System Requirements
320
+
321
+ - Python >= 3.10
322
+ - Supports Windows, macOS, Linux
323
+
324
+ ## 🤝 Contributing
325
+
326
+ Issues and Pull Requests are welcome!
327
+
328
+ ## 📄 License
329
+
330
+ This project is licensed under the [MIT License](LICENSE).
331
+
332
+ ## 📞 Contact Us
333
+
334
+ - 📧 Email: cy.kron@foxmail.com
335
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
336
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
337
+
338
+ ---
339
+
340
+ ⭐ If this project helps you, please give us a star!
@@ -0,0 +1,279 @@
1
+ # DataMax
2
+
3
+ <div align="center">
4
+
5
+ [中文](README_zh.md) | **English**
6
+
7
+ [![PyPI version](https://badge.fury.io/py/pydatamax.svg)](https://badge.fury.io/py/pydatamax) [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ </div>
10
+
11
+ A powerful multi-format file parsing, data cleaning, and AI annotation toolkit.
12
+
13
+ ## ✨ Core Features
14
+
15
+ - 🔄 **Multi-format Support**: PDF, DOCX/DOC, PPT/PPTX, XLS/XLSX, HTML, EPUB, TXT, images, and more
16
+ - 🧹 **Intelligent Cleaning**: Three-layer cleaning process with anomaly detection, privacy protection, and text filtering
17
+ - 🤖 **AI Annotation**: LLM-based automatic data annotation and pre-labeling
18
+ - ⚡ **Batch Processing**: Efficient multi-file parallel processing
19
+ - 🎯 **Easy Integration**: Clean API design, ready to use out of the box
20
+
21
+ ## 🚀 Quick Start
22
+
23
+ ### Installation
24
+
25
+ ```bash
26
+ pip install pydatamax
27
+ ```
28
+
29
+ ### Basic Usage
30
+
31
+ ```python
32
+ from datamax import DataMax
33
+
34
+ # Parse a single file
35
+ dm = DataMax(file_path="document.pdf")
36
+ data = dm.get_data()
37
+
38
+ # Batch processing
39
+ dm = DataMax(file_path=["file1.docx", "file2.pdf"])
40
+ data = dm.get_data()
41
+
42
+ # Data cleaning
43
+ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
44
+
45
+ # AI annotation
46
+ qa_data = dm.get_pre_label(
47
+ api_key="sk-xxx",
48
+ base_url="https://api.provider.com/v1",
49
+ model_name="model-name",
50
+ chunk_size=500, # 文本块大小
51
+ chunk_overlap=100, # 重叠长度
52
+ question_number=5, # 每块生成问题数
53
+ max_workers=5 # 并发数
54
+ )
55
+ dm.save_label_data(res)
56
+ ```
57
+
58
+ ## 📖 Detailed Documentation
59
+
60
+ ### File Parsing
61
+
62
+ #### Supported Formats
63
+
64
+ | Format | Extensions | Special Features |
65
+ |--------|------------|------------------|
66
+ | Documents | `.pdf`, `.docx`, `.doc` | OCR support, Markdown conversion |
67
+ | Spreadsheets | `.xlsx`, `.xls` | Structured data extraction |
68
+ | Presentations | `.pptx`, `.ppt` | Slide content extraction |
69
+ | Web | `.html`, `.epub` | Tag parsing |
70
+ | Images | `.jpg`, `.png`, `.jpeg` | OCR text recognition |
71
+ | Text | `.txt` | Automatic encoding detection |
72
+
73
+ #### Advanced Features
74
+
75
+ ```python
76
+ # Advanced PDF parsing (requires MinerU)
77
+ dm = DataMax(file_path="complex.pdf", use_mineru=True)
78
+
79
+ # Word to Markdown conversion
80
+ dm = DataMax(file_path="document.docx", to_markdown=True)
81
+
82
+ # Image OCR
83
+ dm = DataMax(file_path="image.jpg", use_ocr=True)
84
+ ```
85
+ ### Batch Processing
86
+ ```python
87
+ # Parse multiple files in batch
88
+ dm = DataMax(
89
+ file_path=["file1.pdf", "file2.docx"],
90
+ use_mineru=True
91
+ )
92
+ data = dm.get_data()
93
+ ```
94
+
95
+ ### Cache parsed results
96
+ ```python
97
+ # Cache parsed results to avoid repeated parsing
98
+ dm = DataMax(
99
+ file_path=["file1.pdf", "file2.docx"],
100
+ ttl=3600 # Cache duration in seconds, default 3600s, 0 means no caching
101
+ )
102
+ data = dm.get_data()
103
+ ```
104
+
105
+ ### Data Cleaning
106
+ #### Exception Handling
107
+
108
+ - remove_abnormal_chars Remove abnormal characters from text
109
+ - remove_html_tags Remove HTML tags
110
+ - convert_newlines Convert \r to \n and merge multiple \n into single \n
111
+ - single_space Convert multiple spaces (more than 2) to single space
112
+ - tabs_to_spaces Convert tabs to 4 spaces
113
+ - remove_invisible_chars Remove invisible ASCII characters
114
+ - simplify_chinese Convert traditional Chinese to simplified Chinese
115
+
116
+ #### Text Filtering
117
+
118
+ - filter_by_word_repetition Filter by word repetition rate
119
+ - filter_by_char_count Filter by character count
120
+ - filter_by_numeric_content Filter by numeric content ratio
121
+
122
+ #### Privacy Desensitization
123
+
124
+ - replace_ip
125
+ - replace_email
126
+ - replace_customer_number Clean hotline numbers like 4008-123-123
127
+ - replace_bank_id
128
+ - replace_phone_number
129
+ - replace_qq
130
+ - replace_id_card
131
+
132
+
133
+
134
+ ```python
135
+ # Three cleaning modes
136
+ dm.clean_data(method_list=[
137
+ "abnormal", # Anomaly data processing
138
+ "private", # Privacy information masking
139
+ "filter" # Text filtering and normalization
140
+ ])
141
+
142
+ # Custom cleaning mode
143
+ from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
144
+ dm = DataMax(
145
+ file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
146
+ )
147
+ parsed_data = dm.get_data().get('content')
148
+ # 1. Text filtering
149
+ tf = TextFilter(parsed_data=parsed_data)
150
+ # Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
151
+ tf_bool = tf.filter_by_word_repetition(threshold=0.6)
152
+ if tf_bool:
153
+ print("Text passed word repetition filtering")
154
+ else:
155
+ print("Text failed word repetition filtering")
156
+
157
+ # Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
158
+ tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
159
+ if tf_bool:
160
+ print("Text passed character count filtering")
161
+ else:
162
+ print("Text failed character count filtering")
163
+
164
+ # Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
165
+ tf_bool = tf.filter_by_numeric_content(threshold=0.6)
166
+ if tf_bool:
167
+ print("Text passed numeric ratio filtering")
168
+ else:
169
+ print("Text failed numeric ratio filtering")
170
+
171
+ # 2. Privacy desensitization
172
+ pd = PrivacyDesensitization(parsed_data=parsed_data)
173
+ res = pd.replace_ip(
174
+ token="MyIP"
175
+ )
176
+ print(res)
177
+
178
+ # 3. Abnormal character cleaning
179
+ ac = AbnormalCleaner(parsed_data=parsed_data)
180
+ res = ac.remove_abnormal_chars()
181
+ res = ac.remove_html_tags()
182
+ res = ac.convert_newlines()
183
+ res = ac.single_space()
184
+ res = ac.tabs_to_spaces()
185
+ res = ac.remove_invisible_chars()
186
+ res = ac.simplify_chinese()
187
+ print(res)
188
+ ```
189
+ ### Text Segmentation
190
+ ```python
191
+ dm.split_data(
192
+ chunk_size=500, # Chunk size
193
+ chunk_overlap=100, # Overlap length
194
+ use_langchain=True # Use LangChain for text segmentation
195
+ )
196
+
197
+ # When use_langchain is False, use custom segmentation method
198
+ # Using 。!? as separators, consecutive separators will be merged
199
+ # chunk_size strictly limits the string length
200
+ for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
201
+ print(chunk)
202
+ ```
203
+
204
+ ### AI Annotation
205
+
206
+ ```python
207
+ # Custom annotation tasks
208
+ qa_data = dm.get_pre_label(
209
+ api_key="sk-xxx",
210
+ base_url="https://api.provider.com/v1",
211
+ model_name="model-name",
212
+ chunk_size=500, # Text chunk size
213
+ chunk_overlap=100, # Overlap length
214
+ question_number=5, # Questions per chunk
215
+ max_workers=5 # Concurrency
216
+ )
217
+ ```
218
+
219
+ ## ⚙️ Environment Setup
220
+
221
+ ### Optional Dependencies
222
+
223
+ #### LibreOffice (DOC file support)
224
+
225
+ **Ubuntu/Debian:**
226
+ ```bash
227
+ sudo apt-get install libreoffice
228
+ ```
229
+
230
+ **Windows:**
231
+ 1. Download and install [LibreOffice](https://www.libreoffice.org/download/)
232
+ 2. Add to environment variables: `C:\Program Files\LibreOffice\program`
233
+
234
+ #### MinerU (Advanced PDF parsing)
235
+
236
+ ```bash
237
+ # Create virtual environment
238
+ conda create -n mineru python=3.10
239
+ conda activate mineru
240
+
241
+ # Install MinerU
242
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
243
+ ```
244
+
245
+ For detailed configuration, please refer to [MinerU Documentation](https://github.com/opendatalab/MinerU)
246
+
247
+ ## 🛠️ Development
248
+
249
+ ### Local Installation
250
+
251
+ ```bash
252
+ git clone https://github.com/Hi-Dolphin/datamax.git
253
+ cd datamax
254
+ pip install -r requirements.txt
255
+ python setup.py install
256
+ ```
257
+
258
+ ## 📋 System Requirements
259
+
260
+ - Python >= 3.10
261
+ - Supports Windows, macOS, Linux
262
+
263
+ ## 🤝 Contributing
264
+
265
+ Issues and Pull Requests are welcome!
266
+
267
+ ## 📄 License
268
+
269
+ This project is licensed under the [MIT License](LICENSE).
270
+
271
+ ## 📞 Contact Us
272
+
273
+ - 📧 Email: cy.kron@foxmail.com
274
+ - 🐛 Issues: [GitHub Issues](https://github.com/Hi-Dolphin/datamax/issues)
275
+ - 📚 Documentation: [Project Homepage](https://github.com/Hi-Dolphin/datamax)
276
+
277
+ ---
278
+
279
+ ⭐ If this project helps you, please give us a star!
@@ -1 +1 @@
1
- from .parser import DataMax
1
+ from .parser import DataMax