pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
  4. datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +525 -61
  10. datamax/parser/docx_parser.py +512 -62
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -208
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. pydatamax-0.1.15.dist-info/METADATA +340 -0
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.13.dist-info/METADATA +0 -280
  38. pydatamax-0.1.13.dist-info/RECORD +0 -39
  39. tests/__init__.py +0 -0
  40. tests/test_basic.py +0 -20
  41. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,280 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: pydatamax
3
- Version: 0.1.13
4
- Summary: A library for parsing and converting various file formats.
5
- Home-page: https://github.com/Hi-Dolphin/datamax
6
- Author: ccy
7
- Author-email: cy.kron@foxmail.com
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.10
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: oss2<3.0.0,>=2.19.1
15
- Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
16
- Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
17
- Requires-Dist: crcmod<2.0.0,>=1.7
18
- Requires-Dist: langdetect<2.0.0,>=1.0.9
19
- Requires-Dist: loguru<1.0.0,>=0.7.3
20
- Requires-Dist: python-docx<2.0.0,>=1.1.2
21
- Requires-Dist: python-dotenv<2.0.0,>=1.1.0
22
- Requires-Dist: pymupdf<2.0.0,>=1.26.0
23
- Requires-Dist: pypdf<6.0.0,>=5.5.0
24
- Requires-Dist: openpyxl<4.0.0,>=3.1.5
25
- Requires-Dist: pandas<3.0.0,>=2.2.3
26
- Requires-Dist: numpy<3.0.0,>=2.2.6
27
- Requires-Dist: requests<3.0.0,>=2.32.3
28
- Requires-Dist: tqdm<5.0.0,>=4.67.1
29
- Requires-Dist: pydantic<3.0.0,>=2.11.5
30
- Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
31
- Requires-Dist: python-magic<1.0.0,>=0.4.27
32
- Requires-Dist: PyYAML<7.0.0,>=6.0.2
33
- Requires-Dist: Pillow<12.0.0,>=11.2.1
34
- Requires-Dist: packaging<25.0,>=24.2
35
- Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
36
- Requires-Dist: minio<8.0.0,>=7.2.15
37
- Requires-Dist: openai<2.0.0,>=1.82.0
38
- Requires-Dist: jionlp<2.0.0,>=1.5.23
39
- Requires-Dist: chardet<6.0.0,>=5.2.0
40
- Requires-Dist: python-pptx<2.0.0,>=1.0.2
41
- Requires-Dist: tiktoken<1.0.0,>=0.9.0
42
- Requires-Dist: markitdown<1.0.0,>=0.1.1
43
- Requires-Dist: xlrd<3.0.0,>=2.0.1
44
- Requires-Dist: tabulate<1.0.0,>=0.9.0
45
- Requires-Dist: unstructured<1.0.0,>=0.17.2
46
- Requires-Dist: markdown<4.0.0,>=3.8
47
- Requires-Dist: langchain<1.0.0,>=0.3.0
48
- Requires-Dist: langchain-community<1.0.0,>=0.3.0
49
- Dynamic: author
50
- Dynamic: author-email
51
- Dynamic: classifier
52
- Dynamic: description
53
- Dynamic: description-content-type
54
- Dynamic: home-page
55
- Dynamic: license-file
56
- Dynamic: requires-dist
57
- Dynamic: requires-python
58
- Dynamic: summary
59
-
60
- # DataMax
61
-
62
- ## Overview
63
- DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
64
-
65
- ## Key Features
66
-
67
- ### File Processing Capabilities
68
- Currently supports reading, conversion, and extraction from:
69
- - PDF, HTML
70
- - DOCX/DOC, PPT/PPTX
71
- - EPUB
72
- - Images
73
- - XLS/XLSX spreadsheets
74
- - Plain text (TXT)
75
-
76
- ### Data Cleaning Pipeline
77
- Three-tiered cleaning process:
78
- 1. Anomaly detection and handling
79
- 2. Privacy protection processing
80
- 3. Text filtering and normalization
81
-
82
- ### AI-Powered Data Annotation
83
- Implements an LLM + prompt workflow to:
84
- - Continuously generate pre-labeled datasets
85
- - Provide optimized training data for model fine-tuning
86
-
87
-
88
- ## Installation Guide (Key Dependencies)
89
- Dependencies include libreoffice, datamax, and MinerU.
90
-
91
- ### 1. Installing libreoffice Dependency
92
- **Note:** Without LibreOffice, .doc files will not be supported.
93
-
94
- #### Linux (Debian/Ubuntu)
95
- ```bash
96
- sudo apt-get update
97
- sudo apt-get install libreoffice
98
- ```
99
- #### Windows
100
- ```text
101
- Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
102
- Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
103
- ```
104
- ### Checking LibreOffice Installation
105
- ```bash
106
- soffice --version
107
- ```
108
-
109
- ## 2. Installing MinerU Dependency
110
- Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
111
- ### Create a Virtual Environment and Install Basic Dependencies
112
- ```bash
113
- conda create -n mineru python=3.10
114
- conda activate mineru
115
- pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
116
- ```
117
- ### Installing Model Weight Files
118
- https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
119
- ```bash
120
- pip install modelscope
121
- wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
122
- python download_models.py
123
- ```
124
-
125
- ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
126
- ```json
127
- {
128
- "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
129
- "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
130
- "device-mode": "cpu",
131
- ...
132
- }
133
- ```
134
-
135
- ## 3. Installing Basic Dependencies for datamax
136
- 1. Clone the repository to your local machine:
137
- ```bash
138
- git clone <repository-url>
139
- ```
140
- 2. Install dependencies into conda:
141
- ```bash
142
- cd datamax
143
- pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
144
- ```
145
-
146
-
147
- ## Features
148
- - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
149
- - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
150
- - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
151
- - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
152
- - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
153
- - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
154
-
155
-
156
- ## Technology Stack
157
-
158
- - **Programming Language**: Python >= 3.10
159
- - **Dependency Libraries**:
160
- - PyMuPDF: For PDF file parsing.
161
- - BeautifulSoup: For HTML file parsing.
162
- - python-docx: For DOCX file parsing.
163
- - pandas: For data processing and conversion.
164
- - paddleocr: For parsing scanned PDFs, tables, and images.
165
- - **Development Environment**: Visual Studio Code or PyCharm
166
- - **Version Control**: Git
167
-
168
- ## Usage Instructions
169
- ### Installing the SDK
170
- - **Installation Commands**:
171
- ```bash
172
- ## Local Installation
173
- python setup.py sdist bdist_wheel
174
- pip install dist/datamax-0.1.3-py3-none-any.whl
175
-
176
- ## Pip Installation
177
- pip install pydatamax
178
- ```
179
-
180
-
181
- - **Importing the Code**:
182
- ```python
183
- # File Parsing
184
- from datamax import DataMax
185
-
186
- ## Handling a Single File in Two Ways
187
- # 1. Using a List of Length 1
188
- data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
189
- data = data.get_data()
190
-
191
- # 2. Using a String
192
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
193
- data = data.get_data()
194
-
195
- ## Handling Multiple Files
196
- # 1. Using a List of Length n
197
- data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
198
- data = data.get_data()
199
-
200
- # 2. Passing a Folder Path as a String
201
- data = DataMax(file_path=r"docx_files_example/")
202
- data = data.get_data()
203
-
204
- # Data Cleaning
205
- """
206
- Cleaning rules can be found in datamax/utils/data_cleaner.py
207
- abnormal: Abnormal cleaning
208
- private: Privacy processing
209
- filter: Text filtering
210
- """
211
- # Direct Use: Clean the text parameter directly and return a string
212
- dm = DataMax()
213
- data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
214
-
215
- # Process Use: Use after get_data() to return the complete data structure
216
- dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
217
- data2 = dm.get_data()
218
- cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
219
-
220
- # Large Model Pre-annotation: supports any model that can be called via the OpenAI SDK
221
- data = DataMax(file_path=r"path\to\xxx.docx")
222
- parsed_data = data.get_data()
223
- # If no custom messages are passed, the default messages in the SDK will be used
224
- messages = [
225
- {'role': 'system', 'content': 'You are a helpful assistant.'},
226
- {'role': 'user', 'content': 'Who are you?'}
227
- ]
228
- qa_datas = data.get_pre_label(
229
- api_key="sk-xxx",
230
- base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
231
- model_name="qwen-max",
232
- chunk_size=500,
233
- chunk_overlap=100,
234
- question_number=5,
235
- max_workers=5,
236
- # message=[]
237
- )
238
- print(f'Annotated result:{qa_datas}')
239
- ```
240
-
241
-
242
- ## Examples
243
- ```python
244
- ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
245
- from datamax import DataMax
246
- data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
247
- """
248
- Parameters:
249
- file_path: Relative file path / Absolute file path
250
- to_markdown: Whether to convert to markdown (default False, which returns plain text directly). This parameter only supports Word files (doc | docx).
251
- """
252
-
253
- ## jpg | jpeg | png | ...(image types)
254
- data = DataMax(file_path=r"image.jpg", use_mineru=True)
255
- """
256
- Parameters:
257
- file_path: Relative file path / Absolute file path
258
- use_mineru: Whether to use MinerU enhancement
259
- """
260
-
261
- ## pdf
262
- from datamax import DataMax
263
- data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
264
- """
265
- Parameters:
266
- file_path: Relative file path / Absolute file path
267
- use_mineru: Whether to use MinerU enhancement
268
- """
269
- ```
270
-
271
- ## Contribution Guide
272
- We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
273
- ## License
274
- This project is licensed under the MIT License. For more details, see the LICENSE file.
275
-
276
- ## Contact Information
277
- If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
278
- - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
279
- - Project Homepage: [GitHub Project Link](https://github.com/Hi-Dolphin/datamax)
280
-
@@ -1,39 +0,0 @@
1
- datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
2
- datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
3
- datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
4
- datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
6
- datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
7
- datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
8
- datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
9
- datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
10
- datamax/parser/doc_parser.py,sha256=WIWZqvWT4bbquMn1t5Y4P3rEFG6YZ6z3b-f-5yCEtwU,8266
11
- datamax/parser/docx_parser.py,sha256=Ipk9ea281N8Edj74tnqUpc_MGZgD4qn780MX_QA9SiU,9111
12
- datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
13
- datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
14
- datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
15
- datamax/parser/json_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
16
- datamax/parser/md_parser.py,sha256=lgRlcvtV_9gkB2BnygzcdqIfj94tWjEq6ziGeLq3p00,2156
17
- datamax/parser/pdf_parser.py,sha256=EbhXjTU09hMTr850_o1K7m7zD4QU9_A54MsbOF7pLT0,3992
18
- datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,3142
19
- datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
20
- datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
21
- datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
22
- datamax/parser/xlsx_parser.py,sha256=tyLU6wa3F31p7JaoCpML6TJyzYd2Lpeuhzs4036en2U,9274
23
- datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
24
- datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
25
- datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
26
- datamax/utils/env_setup.py,sha256=KrRQIbCMgtTjD8lKwzc9jv7jFPMMNMzikEb0_TfIstU,3460
27
- datamax/utils/gotocr_pdf.py,sha256=YCYio_5Yt77hky4nSyfREw5_Bh55XbGy7l2cypvGxNg,8479
28
- datamax/utils/mineru_operator.py,sha256=Rss7YVSAUnoWmDnCGPJlgsMNmJWmb6blYuS4UB7PgQ8,2241
29
- datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-gfIdVtYoSs,3555
30
- datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
31
- datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
32
- datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
33
- pydatamax-0.1.13.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
34
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
- tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
36
- pydatamax-0.1.13.dist-info/METADATA,sha256=knte2YZ9jdSGxmO0fzBVtMFAcq1exCKyEdfBde4aCjA,9731
37
- pydatamax-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
- pydatamax-0.1.13.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
39
- pydatamax-0.1.13.dist-info/RECORD,,
tests/__init__.py DELETED
File without changes
tests/test_basic.py DELETED
@@ -1,20 +0,0 @@
1
- """
2
- DataMax 基础测试
3
- """
4
-
5
- from datamax import DataMax
6
-
7
-
8
- def test_import():
9
- """测试模块导入"""
10
- assert DataMax is not None
11
-
12
-
13
- def test_version():
14
- """测试版本号"""
15
- import datamax
16
-
17
- assert hasattr(datamax, "__version__") or True # 版本号检查
18
-
19
-
20
- # 更多测试用例...