pydatamax 0.1.5__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.11.dist-info/METADATA +271 -0
- pydatamax-0.1.11.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
File without changes
|
tests/__init__.py
ADDED
File without changes
|
tests/test_basic.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
"""
|
2
|
+
DataMax 基础测试
|
3
|
+
"""
|
4
|
+
|
5
|
+
from datamax import DataMax
|
6
|
+
|
7
|
+
|
8
|
+
def test_import():
|
9
|
+
"""测试模块导入"""
|
10
|
+
assert DataMax is not None
|
11
|
+
|
12
|
+
|
13
|
+
def test_version():
|
14
|
+
"""测试版本号"""
|
15
|
+
import datamax
|
16
|
+
|
17
|
+
assert hasattr(datamax, "__version__") or True # 版本号检查
|
18
|
+
|
19
|
+
|
20
|
+
# 更多测试用例...
|
@@ -1,282 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: pydatamax
|
3
|
-
Version: 0.1.5
|
4
|
-
Summary: A library for parsing and converting various file formats.
|
5
|
-
Home-page: https://github.com/cosco/datamax
|
6
|
-
Author: hzb | ccy
|
7
|
-
Author-email: zhibaohe@hotmail.com | cy.kron@foxmail.com
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
10
|
-
Classifier: Operating System :: OS Independent
|
11
|
-
Requires-Python: >=3.9
|
12
|
-
Description-Content-Type: text/markdown
|
13
|
-
License-File: LICENSE
|
14
|
-
Requires-Dist: ebooklib
|
15
|
-
Requires-Dist: python-docx
|
16
|
-
Requires-Dist: beautifulsoup4
|
17
|
-
Requires-Dist: python-dotenv
|
18
|
-
Requires-Dist: minio
|
19
|
-
Requires-Dist: loguru
|
20
|
-
Requires-Dist: tqdm
|
21
|
-
Requires-Dist: oss2
|
22
|
-
Requires-Dist: python-docx
|
23
|
-
Requires-Dist: openai
|
24
|
-
Requires-Dist: jionlp
|
25
|
-
Requires-Dist: chardet
|
26
|
-
Requires-Dist: python-pptx
|
27
|
-
Requires-Dist: paddlepaddle
|
28
|
-
Requires-Dist: paddlepaddle-gpu
|
29
|
-
Requires-Dist: openpyxl
|
30
|
-
Requires-Dist: paddleocr==2.8.1
|
31
|
-
Requires-Dist: pymupdf
|
32
|
-
Requires-Dist: langchain_community==0.2.9
|
33
|
-
Requires-Dist: premailer
|
34
|
-
Requires-Dist: setuptools==75.3.0
|
35
|
-
Requires-Dist: docx2markdown
|
36
|
-
Requires-Dist: tiktoken
|
37
|
-
|
38
|
-
# DataMax
|
39
|
-
|
40
|
-
<p align="center">
|
41
|
-
<a href="./README.md"><img alt="README in English" src="https://img.shields.io/badge/English-d9d9d9"></a>
|
42
|
-
<a href="./README_zh.md"><img alt="简体中文版自述文件" src="https://img.shields.io/badge/简体中文-d9d9d9"></a>
|
43
|
-
</p>
|
44
|
-
|
45
|
-
## Project Introduction
|
46
|
-
The Text Processing Tool is a multifunctional data processing project designed to provide an efficient, easy-to-use way to handle text files in various formats. The project supports reading, converting, and extracting content from common formats such as PDF, HTML, DOCX/DOC, PPT/PPTX, EPUB, images (.jpg | .png), and TXT.
|
47
|
-
|
48
|
-
|
49
|
-
## Installation Guide
|
50
|
-
|
51
|
-
## Install
|
52
|
-
|
53
|
-
### Linux(Debian/Ubuntu)
|
54
|
-
```
|
55
|
-
sudo apt-get update
|
56
|
-
sudo apt-get install libreoffice
|
57
|
-
```
|
58
|
-
### Windows
|
59
|
-
```
|
60
|
-
Install LibreOffice on Windows: https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh
|
61
|
-
$env:PATH += ";C:\Program Files\LibreOffice\program"
|
62
|
-
```
|
63
|
-
### Check LibreOffice Installation
|
64
|
-
```
|
65
|
-
soffice --version
|
66
|
-
```
|
67
|
-
### Install Dependencies
|
68
|
-
```
|
69
|
-
pip install --no-cache-dir -r requirements.txt -i http://repo-reader:6Np2jcK%248Fqp@nexus.csntcorp.com/repository/aliyun_pipy/simple/ --trusted-host nexus.csntcorp.com
|
70
|
-
```
|
71
|
-
|
72
|
-
1. Clone the repository to local:
|
73
|
-
```bash
|
74
|
-
git clone
|
75
|
-
```
|
76
|
-
2. Enter the project directory and create a virtual environment (e.g., uv):
|
77
|
-
```bash
|
78
|
-
cd DataMax
|
79
|
-
uv init
|
80
|
-
uv add requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
81
|
-
uv sync
|
82
|
-
. .venv/bin/activate
|
83
|
-
```
|
84
|
-
|
85
|
-
|
86
|
-
## Features
|
87
|
-
- **Multi-format Support**: Capable of handling PDF, HTML, DOCX, and TXT, among other text file types.
|
88
|
-
- **Content Extraction**: Provides powerful content extraction capabilities to accurately extract needed information from complex document structures.
|
89
|
-
- **Data Conversion**: Supports conversion of processed data to Markdown format for further data analysis.
|
90
|
-
- **Batch Processing**: Capable of processing a large number of files at once, improving work efficiency.
|
91
|
-
- **Custom Configuration**: Users can adjust processing parameters according to their needs to meet various business requirements.
|
92
|
-
- **Cross-platform Compatibility**: This SDK can run on multiple operating systems including Windows, MacOS, and Linux.
|
93
|
-
|
94
|
-
|
95
|
-
## Technology Stack
|
96
|
-
- **Programming Language**: Python >= 3.9
|
97
|
-
- **Dependency Libraries**:
|
98
|
-
- PyMuPDF: For parsing PDF files.
|
99
|
-
- BeautifulSoup: For parsing HTML files.
|
100
|
-
- python-docx: For parsing DOCX files.
|
101
|
-
- pandas: For data processing and conversion.
|
102
|
-
- paddleocr: For parsing PDF scans, forms, and images.
|
103
|
-
- **Development Environment**: Visual Studio Code or PyCharm
|
104
|
-
- **Version Control**: Git
|
105
|
-
|
106
|
-
|
107
|
-
## Usage Instructions
|
108
|
-
### Install SDK
|
109
|
-
- Installation Commands:
|
110
|
-
```bash
|
111
|
-
## Local installation
|
112
|
-
python setup.py sdist bdist_wheel
|
113
|
-
pip install dist/datamax-0.1.3-py3-none-any.whl
|
114
|
-
```
|
115
|
-
- Import Code
|
116
|
-
```python
|
117
|
-
from datamax import DataMaxParser
|
118
|
-
## Handling a single file in two ways
|
119
|
-
# 1. List of length 1
|
120
|
-
data = DataMaxParser(file_path=[r"docx_files_example/ShipViewOverview.doc"])
|
121
|
-
data = data.get_data()
|
122
|
-
# 2. String
|
123
|
-
data = DataMaxParser(file_path=r"docx_files_example/ShipViewOverview.doc")
|
124
|
-
data = data.get_data()
|
125
|
-
|
126
|
-
## Handling multiple files
|
127
|
-
## 1. List of length n
|
128
|
-
data = DataMaxParser(file_path=[r"docx_files_example/ShipViewOverview1.doc", r"docx_files_example/ShipViewOverview2.doc"])
|
129
|
-
data = data.get_data()
|
130
|
-
|
131
|
-
## 2. Passing a folder string
|
132
|
-
data = DataMaxParser(file_path=r"docx_files_example/")
|
133
|
-
data = data.get_data()
|
134
|
-
```
|
135
|
-
|
136
|
-
## Examples
|
137
|
-
```python
|
138
|
-
## docx | doc | epub | html | txt | jpg | png | ppt | pptx
|
139
|
-
from datamax import DataMaxParser
|
140
|
-
data = DataMaxParser(file_path=r"docx_files_example/ShipViewOverview.doc", to_markdown=True)
|
141
|
-
"""
|
142
|
-
Parameters:
|
143
|
-
file_path: File relative path / File absolute path
|
144
|
-
to_markdown: Whether to convert to Markdown (default False, which returns plain text directly). This parameter only supports Word files (doc | docx).
|
145
|
-
"""
|
146
|
-
|
147
|
-
|
148
|
-
## pdf
|
149
|
-
from datamax import DataMaxParser
|
150
|
-
data = DataMaxParser(file_path=r"docx_files_example/ShipViewOverview.pdf", use_ocr=True, use_gpu=True, gpu_id=0)
|
151
|
-
"""
|
152
|
-
Parameters:
|
153
|
-
file_path: File relative path / File absolute path
|
154
|
-
use_ocr: Whether to use OCR (True returns markdown, False returns plain text)
|
155
|
-
use_gpu: Whether to use GPU (ensure CUDA environment is available)
|
156
|
-
gpu_id: GPU card (default 6th card)
|
157
|
-
"""
|
158
|
-
```
|
159
|
-
|
160
|
-
## Contribution Guidelines
|
161
|
-
We welcome any form of contribution, whether it's reporting bugs, suggesting new features, or directly submitting code improvements. Please read our [Contributor Guidelines](CONTRIBUTING.md) to get started.
|
162
|
-
|
163
|
-
## License
|
164
|
-
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
165
|
-
|
166
|
-
## Contact
|
167
|
-
If you encounter any issues during use, or have any suggestions and feedback, please contact us through the following methods:
|
168
|
-
- Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
|
169
|
-
- Project Homepage: [GitHub Project Link](xxxx)
|
170
|
-
|
171
|
-
## RoadMap
|
172
|
-
- [x] Implement OSS Data Class
|
173
|
-
- [x] Implement OBS Data Class
|
174
|
-
- [x] Implement PGSQL Data Class
|
175
|
-
- [x] Implement Local Data Class
|
176
|
-
- [x] OSS Data Class should be able to obtain bucket information, metadata, and file lists from the OSS data source
|
177
|
-
- [x] OBS Data Class should be able to obtain bucket information, metadata, and file lists from the OBS data source
|
178
|
-
- [ ] PGSQL Data Class should be able to obtain schema information, metadata, and table structures from the PGSQL data source
|
179
|
-
|
180
|
-
|
181
|
-
- (Optional) Data Classes (Access Data Sources) / Direct Local Reading DataSourceClass
|
182
|
-
- MinIO
|
183
|
-
- OSS
|
184
|
-
- OBS
|
185
|
-
- PostgreSQL
|
186
|
-
|
187
|
-
DataLoader.load(fp: file_path, ) -> local path
|
188
|
-
|
189
|
-
DataLoader.load(source: DataSourceClass, ) -> Access the data source and return its metadata: file size, file path, file download address, and the storage space occupied by the data source
|
190
|
-
|
191
|
-
- Data Loading Class (Load and read PDF, Word, Excel. Able to load the following types of data from the data source on demand)
|
192
|
-
- .pdf
|
193
|
-
- .pdf(Image type / Scanned copy)
|
194
|
-
- .docx
|
195
|
-
- .html
|
196
|
-
- .pptx
|
197
|
-
- .epub
|
198
|
-
- .txt
|
199
|
-
- .md
|
200
|
-
---
|
201
|
-
- .csv
|
202
|
-
- .json
|
203
|
-
- .xlsx
|
204
|
-
---
|
205
|
-
- http / https
|
206
|
-
---
|
207
|
-
- Multimodal
|
208
|
-
- .png
|
209
|
-
- .jpg
|
210
|
-
- .jpeg
|
211
|
-
- .bmp
|
212
|
-
- .gif
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
class DataLoaderClass:
|
217
|
-
def read_docx(self, DocxInputVo): # TODO: ccy
|
218
|
-
...
|
219
|
-
|
220
|
-
def read_pdf():
|
221
|
-
...
|
222
|
-
|
223
|
-
|
224
|
-
- Data Parsing Class (Parsing algorithms, parsing logic & output string objects)
|
225
|
-
- .pdf
|
226
|
-
- .pdf (Image type / Scanned copy) PaddleOCR / AI LAB OCR / xx OCR
|
227
|
-
- .doc
|
228
|
-
- .docx
|
229
|
-
- .html
|
230
|
-
- .ppt
|
231
|
-
- .pptx
|
232
|
-
- .epub
|
233
|
-
- .txt
|
234
|
-
- .md
|
235
|
-
---
|
236
|
-
- .csv
|
237
|
-
- .json
|
238
|
-
- .xlsx
|
239
|
-
- .xls
|
240
|
-
---
|
241
|
-
- http / https
|
242
|
-
---
|
243
|
-
- Multimodal
|
244
|
-
- .png
|
245
|
-
- .jpg
|
246
|
-
- .jpeg
|
247
|
-
- .bmp
|
248
|
-
- .gif
|
249
|
-
|
250
|
-
|
251
|
-
- Data Cleaning Class (Input: String object, according to cleaning rules, get cleaned markdown format text. Output: markdown)
|
252
|
-
|
253
|
-
- Large Model Class (Pre-tagging)
|
254
|
-
|
255
|
-
- Data Output Class (Unified format)
|
256
|
-
|
257
|
-
|
258
|
-
## Structure
|
259
|
-
```bash
|
260
|
-
├── api # Various interfaces exposed by the project
|
261
|
-
├── datamax # Core functions and classes of various SDKs
|
262
|
-
├── dockerfiles # Docker configuration files
|
263
|
-
├── docs # Project documentation
|
264
|
-
├── example # Example code
|
265
|
-
├── README.md # Project description file
|
266
|
-
├── scripts # Various scripts
|
267
|
-
└── test # Test code
|
268
|
-
```
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
## Standards
|
273
|
-
|
274
|
-
1. Class names are declared using PascalCase
|
275
|
-
2. Function names are declared using lowercase and underscores
|
276
|
-
3. Function inputs and outputs must declare their data types using type annotations
|
277
|
-
4. Remove print logger before pushing code
|
278
|
-
5. Rebase:
|
279
|
-
1. git add .
|
280
|
-
2. git commit -m ":boom: new feature"
|
281
|
-
3. git pull --rebase
|
282
|
-
4. git push
|
pydatamax-0.1.5.dist-info/RECORD
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
datamax/__init__.py,sha256=1rbhISrgmv_MT11nIiajPUxsX5kKUUY1B5Q1A-oscuc,34
|
2
|
-
datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
|
3
|
-
datamax/loader/OssHandler.py,sha256=JWBulEPHv3ThA6RUdwA_EGW8ROnB5OGG8P91J1VmP8U,7236
|
4
|
-
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
|
6
|
-
datamax/parser/__init__.py,sha256=n4VG9tdHOgnq7KjzD5adQKw6nIQCaB6dLZJWhCtWNiY,117
|
7
|
-
datamax/parser/base.py,sha256=TOJHRq7ytQSyHIagChLKHkcuSj_46ivqMiXq9NDlt-w,2525
|
8
|
-
datamax/parser/core.py,sha256=acIyW_UXaucDNvaSGj__XDBXp-k_KGW5MiS1OGQzUiU,4521
|
9
|
-
datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
|
10
|
-
datamax/parser/doc_parser.py,sha256=U_QCqeXdgJ5KDkO8glPmpmy4lLG-avd4-7wrUrrKd5c,3571
|
11
|
-
datamax/parser/docx_parser.py,sha256=yyRW-vPOTn0f_Kk0A1VUX7_sIrOHaJCgjfAAuMUDDG8,1949
|
12
|
-
datamax/parser/epub_parser.py,sha256=4TIi6ufQ_7O27z0o2WKxjHkDDI3FT2nvldlfffWycCI,1776
|
13
|
-
datamax/parser/html_parser.py,sha256=-y1AtPTZpvtdtvFJtxDoamzI0FAYrWb8Jb4RDt2DX1Q,1608
|
14
|
-
datamax/parser/image_parser.py,sha256=XyzZlWvLJL3itOTsy0_eYsljTfgA5O0vacJXjtxeYiU,1260
|
15
|
-
datamax/parser/json_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
|
16
|
-
datamax/parser/md_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
|
17
|
-
datamax/parser/pdf_parser.py,sha256=cNXRbpTpncvV56-fMbinslpMuUsXIkv_DqHVtZfUt00,2594
|
18
|
-
datamax/parser/ppt_parser.py,sha256=Q_wVl9-PmLbU3ezwltKf4DuVmVQrG_L5vEDEUfKhKEw,3305
|
19
|
-
datamax/parser/pptx_parser.py,sha256=y2ea39FCsxO-6gekd4BmjiYzpoG-saBWmd2IonNde2A,2014
|
20
|
-
datamax/parser/txt_parser.py,sha256=9NidWdHUOCW17WSMLnDSZqVhlXQwSQhumQ99GxypjEY,1901
|
21
|
-
datamax/parser/xlsx_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
|
22
|
-
datamax/utils/__init__.py,sha256=eifKmh0wYX6I7xcaZtZcw3rh1I1US_gPJyr5yKv3VCg,997
|
23
|
-
datamax/utils/data_cleaner.py,sha256=XxaDn5QHBOcNKXrLMdmbo5uEvPWCtovZitdrKBqnF4U,7307
|
24
|
-
datamax/utils/paddleocr_pdf_operator.py,sha256=-Vqb1L9DhuUWRjM_aDD2Nok8XwUGyuYOVI-hagfb5II,3482
|
25
|
-
datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
|
26
|
-
datamax/utils/tokenizer.py,sha256=5Gr7bfmtr6cfQOrJVf0n8PPTdW0Dj1pN5xO5hjJhOg8,858
|
27
|
-
pydatamax-0.1.5.dist-info/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
|
28
|
-
pydatamax-0.1.5.dist-info/METADATA,sha256=9Fzp3sCiaUaFwBAhWshWdd3Z87ZRmoYZUHlp5Rqol2s,8955
|
29
|
-
pydatamax-0.1.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
30
|
-
pydatamax-0.1.5.dist-info/top_level.txt,sha256=N9TrwI3GKnWW07RRXHr0xX5Bm8dIM_sahfAnf9j8J9M,8
|
31
|
-
pydatamax-0.1.5.dist-info/RECORD,,
|