pydatamax 0.1.5__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {pydatamax-0.1.5 → pydatamax-0.1.12}/LICENSE +0 -0
  2. pydatamax-0.1.12/PKG-INFO +281 -0
  3. pydatamax-0.1.12/README.md +221 -0
  4. pydatamax-0.1.12/datamax/__init__.py +1 -0
  5. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/loader/MinioHandler.py +0 -0
  6. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/loader/OssHandler.py +85 -51
  7. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/loader/__init__.py +0 -0
  8. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/loader/core.py +0 -0
  9. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/__init__.py +1 -1
  10. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/base.py +2 -2
  11. pydatamax-0.1.12/datamax/parser/core.py +288 -0
  12. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/csv_parser.py +0 -0
  13. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/doc_parser.py +2 -5
  14. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/docx_parser.py +3 -6
  15. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/epub_parser.py +2 -5
  16. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/html_parser.py +2 -5
  17. pydatamax-0.1.12/datamax/parser/image_parser.py +34 -0
  18. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/json_parser.py +0 -0
  19. pydatamax-0.1.12/datamax/parser/md_parser.py +73 -0
  20. pydatamax-0.1.12/datamax/parser/pdf_parser.py +101 -0
  21. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/ppt_parser.py +3 -5
  22. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/pptx_parser.py +10 -13
  23. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/parser/txt_parser.py +2 -5
  24. pydatamax-0.1.12/datamax/parser/xls_parser.py +26 -0
  25. pydatamax-0.1.12/datamax/parser/xlsx_parser.py +71 -0
  26. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/utils/__init__.py +1 -0
  27. pydatamax-0.1.12/datamax/utils/constants.py +58 -0
  28. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/utils/data_cleaner.py +45 -28
  29. pydatamax-0.1.12/datamax/utils/env_setup.py +80 -0
  30. pydatamax-0.1.12/datamax/utils/gotocr_pdf.py +265 -0
  31. pydatamax-0.1.12/datamax/utils/mineru_operator.py +62 -0
  32. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/utils/paddleocr_pdf_operator.py +2 -1
  33. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/utils/ppt_extract.py +0 -0
  34. pydatamax-0.1.12/datamax/utils/qa_generator.py +376 -0
  35. {pydatamax-0.1.5 → pydatamax-0.1.12}/datamax/utils/tokenizer.py +1 -1
  36. pydatamax-0.1.12/pydatamax.egg-info/PKG-INFO +281 -0
  37. {pydatamax-0.1.5 → pydatamax-0.1.12}/pydatamax.egg-info/SOURCES.txt +9 -1
  38. {pydatamax-0.1.5 → pydatamax-0.1.12}/pydatamax.egg-info/dependency_links.txt +0 -0
  39. pydatamax-0.1.12/pydatamax.egg-info/requires.txt +36 -0
  40. {pydatamax-0.1.5 → pydatamax-0.1.12}/pydatamax.egg-info/top_level.txt +1 -0
  41. {pydatamax-0.1.5 → pydatamax-0.1.12}/setup.cfg +0 -0
  42. pydatamax-0.1.12/setup.py +57 -0
  43. pydatamax-0.1.12/tests/__init__.py +0 -0
  44. pydatamax-0.1.12/tests/test_basic.py +20 -0
  45. pydatamax-0.1.5/PKG-INFO +0 -282
  46. pydatamax-0.1.5/README.md +0 -245
  47. pydatamax-0.1.5/datamax/__init__.py +0 -1
  48. pydatamax-0.1.5/datamax/parser/core.py +0 -114
  49. pydatamax-0.1.5/datamax/parser/image_parser.py +0 -30
  50. pydatamax-0.1.5/datamax/parser/md_parser.py +0 -10
  51. pydatamax-0.1.5/datamax/parser/pdf_parser.py +0 -62
  52. pydatamax-0.1.5/datamax/parser/xlsx_parser.py +0 -10
  53. pydatamax-0.1.5/pydatamax.egg-info/PKG-INFO +0 -282
  54. pydatamax-0.1.5/pydatamax.egg-info/requires.txt +0 -23
  55. pydatamax-0.1.5/setup.py +0 -44
File without changes
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.12
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/Hi-Dolphin/datamax
6
+ Author: ccy
7
+ Author-email: cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: oss2<3.0.0,>=2.19.1
15
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
16
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
17
+ Requires-Dist: crcmod<2.0.0,>=1.7
18
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
19
+ Requires-Dist: loguru<1.0.0,>=0.7.3
20
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
21
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
22
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
23
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
24
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
25
+ Requires-Dist: pandas<3.0.0,>=2.2.3
26
+ Requires-Dist: numpy<3.0.0,>=2.2.6
27
+ Requires-Dist: requests<3.0.0,>=2.32.3
28
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
29
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
30
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
31
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
32
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
33
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
34
+ Requires-Dist: packaging<25.0,>=24.2
35
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
36
+ Requires-Dist: minio<8.0.0,>=7.2.15
37
+ Requires-Dist: openai<2.0.0,>=1.82.0
38
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
39
+ Requires-Dist: chardet<6.0.0,>=5.2.0
40
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
41
+ Requires-Dist: docx2markdown<1.0.0,>=0.1.1
42
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
43
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
44
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
45
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
46
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
47
+ Requires-Dist: markdown<4.0.0,>=3.8
48
+ Requires-Dist: langchain<1.0.0,>=0.3.0
49
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
50
+ Dynamic: author
51
+ Dynamic: author-email
52
+ Dynamic: classifier
53
+ Dynamic: description
54
+ Dynamic: description-content-type
55
+ Dynamic: home-page
56
+ Dynamic: license-file
57
+ Dynamic: requires-dist
58
+ Dynamic: requires-python
59
+ Dynamic: summary
60
+
61
+ # DataMax
62
+
63
+ ## Overview
64
+ DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
65
+
66
+ ## Key Features
67
+
68
+ ### File Processing Capabilities
69
+ Currently supports reading, conversion, and extraction from:
70
+ - PDF, HTML
71
+ - DOCX/DOC, PPT/PPTX
72
+ - EPUB
73
+ - Images
74
+ - XLS/XLSX spreadsheets
75
+ - Plain text (TXT)
76
+
77
+ ### Data Cleaning Pipeline
78
+ Three-tiered cleaning process:
79
+ 1. Anomaly detection and handling
80
+ 2. Privacy protection processing
81
+ 3. Text filtering and normalization
82
+
83
+ ### AI-Powered Data Annotation
84
+ Uses an LLM + prompt approach to:
85
+ - Continuously generate pre-labeled datasets
86
+ - Provide optimized training data for model fine-tuning
87
+
88
+
89
+ ## Installation Guide (Key Dependencies)
90
+ Dependencies include libreoffice, datamax, and MinerU.
91
+
92
+ ### 1. Installing libreoffice Dependency
93
+ **Note:** Without LibreOffice, .doc files will not be supported.
94
+
95
+ #### Linux (Debian/Ubuntu)
96
+ ```bash
97
+ sudo apt-get update
98
+ sudo apt-get install libreoffice
99
+ ```
100
+ #### Windows
101
+ ```text
102
+ Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
103
+ Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
104
+ ```
105
+ ### Checking LibreOffice Installation
106
+ ```bash
107
+ soffice --version
108
+ ```
109
+
110
+ ## 2. Installing MinerU Dependency
111
+ Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
112
+ ### Create a Virtual Environment and Install Basic Dependencies
113
+ ```bash
114
+ conda create -n mineru python=3.10
115
+ conda activate mineru
116
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
117
+ ```
118
+ ### Installing Model Weight Files
119
+ https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
120
+ ```bash
121
+ pip install modelscope
122
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
123
+ python download_models.py
124
+ ```
125
+
126
+ ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
127
+ ```json
128
+ {
129
+ "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
130
+ "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
131
+ "device-mode": "cpu",
132
+ ...
133
+ }
134
+ ```
135
+
136
+ ## 3. Installing Basic Dependencies for datamax
137
+ 1. Clone the repository to your local machine:
138
+ ```bash
139
+ git clone <repository-url>
140
+ ```
141
+ 2. Install dependencies into conda:
142
+ ```bash
143
+ cd datamax
144
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
145
+ ```
146
+
147
+
148
+ ## Features
149
+ - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
150
+ - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
151
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
152
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
153
+ - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
154
+ - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
155
+
156
+
157
+ ## Technology Stack
158
+
159
+ - **Programming Language**: Python >= 3.10
160
+ - **Dependency Libraries**:
161
+ - PyMuPDF: For PDF file parsing.
162
+ - BeautifulSoup: For HTML file parsing.
163
+ - python-docx: For DOCX file parsing.
164
+ - pandas: For data processing and conversion.
165
+ - paddleocr: For parsing scanned PDFs, tables, and images.
166
+ - **Development Environment**: Visual Studio Code or PyCharm
167
+ - **Version Control**: Git
168
+
169
+ ## Usage Instructions
170
+ ### Installing the SDK
171
+ - **Installation Commands**:
172
+ ```bash
173
+ ## Local Installation
174
+ python setup.py sdist bdist_wheel
175
+ pip install dist/pydatamax-0.1.12-py3-none-any.whl
176
+
177
+ ## Pip Installation
178
+ pip install pydatamax
179
+ ```
180
+
181
+
182
+ - **Importing the Code**:
183
+ ```python
184
+ # File Parsing
185
+ from datamax import DataMax
186
+
187
+ ## Handling a Single File in Two Ways
188
+ # 1. Using a List of Length 1
189
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
190
+ data = data.get_data()
191
+
192
+ # 2. Using a String
193
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
194
+ data = data.get_data()
195
+
196
+ ## Handling Multiple Files
197
+ # 1. Using a List of Length n
198
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
199
+ data = data.get_data()
200
+
201
+ # 2. Passing a Folder Path as a String
202
+ data = DataMax(file_path=r"docx_files_example/")
203
+ data = data.get_data()
204
+
205
+ # Data Cleaning
206
+ """
207
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
208
+ abnormal: Abnormal cleaning
209
+ private: Privacy processing
210
+ filter: Text filtering
211
+ """
212
+ # Direct Use: Clean the text parameter directly and return a string
213
+ dm = DataMax()
214
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
215
+
216
+ # Process Use: Use after get_data() to return the complete data structure
217
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
218
+ data2 = dm.get_data()
219
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
220
+
221
+ # Large Model Pre-annotation: supports any model that can be called via the OpenAI SDK
222
+ data = DataMax(file_path=r"path\to\xxx.docx")
223
+ parsed_data = data.get_data()
224
+ # If no custom messages are passed, the default messages in the SDK will be used
225
+ messages = [
226
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
227
+ {'role': 'user', 'content': 'Who are you?'}
228
+ ]
229
+ qa_datas = data.get_pre_label(
230
+ api_key="sk-xxx",
231
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
232
+ model_name="qwen-max",
233
+ chunk_size=500,
234
+ chunk_overlap=100,
235
+ question_number=5,
236
+ max_workers=5,
237
+ # message=[]
238
+ )
239
+ print(f'Annotated result:{qa_datas}')
240
+ ```
241
+
242
+
243
+ ## Examples
244
+ ```python
245
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
246
+ from datamax import DataMax
247
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
248
+ """
249
+ Parameters:
250
+ file_path: Relative file path / Absolute file path
251
+ to_markdown: Whether to convert to markdown (default False, which returns plain text). This parameter only supports Word files (doc | docx).
252
+ """
253
+
254
+ ## jpg | jpeg | png | ...(image types)
255
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
256
+ """
257
+ Parameters:
258
+ file_path: Relative file path / Absolute file path
259
+ use_mineru: Whether to use MinerU enhancement
260
+ """
261
+
262
+ ## pdf
263
+ from datamax import DataMax
264
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
265
+ """
266
+ Parameters:
267
+ file_path: Relative file path / Absolute file path
268
+ use_mineru: Whether to use MinerU enhancement
269
+ """
270
+ ```
271
+
272
+ ## Contribution Guide
273
+ We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
274
+ ## License
275
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
276
+
277
+ ## Contact Information
278
+ If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
279
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
280
+ - Project Homepage: [GitHub Project Link](https://github.com/Hi-Dolphin/datamax)
281
+
@@ -0,0 +1,221 @@
1
+ # DataMax
2
+
3
+ ## Overview
4
+ DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
5
+
6
+ ## Key Features
7
+
8
+ ### File Processing Capabilities
9
+ Currently supports reading, conversion, and extraction from:
10
+ - PDF, HTML
11
+ - DOCX/DOC, PPT/PPTX
12
+ - EPUB
13
+ - Images
14
+ - XLS/XLSX spreadsheets
15
+ - Plain text (TXT)
16
+
17
+ ### Data Cleaning Pipeline
18
+ Three-tiered cleaning process:
19
+ 1. Anomaly detection and handling
20
+ 2. Privacy protection processing
21
+ 3. Text filtering and normalization
22
+
23
+ ### AI-Powered Data Annotation
24
+ Uses an LLM + prompt approach to:
25
+ - Continuously generate pre-labeled datasets
26
+ - Provide optimized training data for model fine-tuning
27
+
28
+
29
+ ## Installation Guide (Key Dependencies)
30
+ Dependencies include libreoffice, datamax, and MinerU.
31
+
32
+ ### 1. Installing libreoffice Dependency
33
+ **Note:** Without LibreOffice, .doc files will not be supported.
34
+
35
+ #### Linux (Debian/Ubuntu)
36
+ ```bash
37
+ sudo apt-get update
38
+ sudo apt-get install libreoffice
39
+ ```
40
+ #### Windows
41
+ ```text
42
+ Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
43
+ Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
44
+ ```
45
+ ### Checking LibreOffice Installation
46
+ ```bash
47
+ soffice --version
48
+ ```
49
+
50
+ ## 2. Installing MinerU Dependency
51
+ Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
52
+ ### Create a Virtual Environment and Install Basic Dependencies
53
+ ```bash
54
+ conda create -n mineru python=3.10
55
+ conda activate mineru
56
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
57
+ ```
58
+ ### Installing Model Weight Files
59
+ https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
60
+ ```bash
61
+ pip install modelscope
62
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
63
+ python download_models.py
64
+ ```
65
+
66
+ ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
67
+ ```json
68
+ {
69
+ "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
70
+ "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
71
+ "device-mode": "cpu",
72
+ ...
73
+ }
74
+ ```
75
+
76
+ ## 3. Installing Basic Dependencies for datamax
77
+ 1. Clone the repository to your local machine:
78
+ ```bash
79
+ git clone <repository-url>
80
+ ```
81
+ 2. Install dependencies into conda:
82
+ ```bash
83
+ cd datamax
84
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
85
+ ```
86
+
87
+
88
+ ## Features
89
+ - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
90
+ - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
91
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
92
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
93
+ - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
94
+ - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
95
+
96
+
97
+ ## Technology Stack
98
+
99
+ - **Programming Language**: Python >= 3.10
100
+ - **Dependency Libraries**:
101
+ - PyMuPDF: For PDF file parsing.
102
+ - BeautifulSoup: For HTML file parsing.
103
+ - python-docx: For DOCX file parsing.
104
+ - pandas: For data processing and conversion.
105
+ - paddleocr: For parsing scanned PDFs, tables, and images.
106
+ - **Development Environment**: Visual Studio Code or PyCharm
107
+ - **Version Control**: Git
108
+
109
+ ## Usage Instructions
110
+ ### Installing the SDK
111
+ - **Installation Commands**:
112
+ ```bash
113
+ ## Local Installation
114
+ python setup.py sdist bdist_wheel
115
+ pip install dist/pydatamax-0.1.12-py3-none-any.whl
116
+
117
+ ## Pip Installation
118
+ pip install pydatamax
119
+ ```
120
+
121
+
122
+ - **Importing the Code**:
123
+ ```python
124
+ # File Parsing
125
+ from datamax import DataMax
126
+
127
+ ## Handling a Single File in Two Ways
128
+ # 1. Using a List of Length 1
129
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
130
+ data = data.get_data()
131
+
132
+ # 2. Using a String
133
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
134
+ data = data.get_data()
135
+
136
+ ## Handling Multiple Files
137
+ # 1. Using a List of Length n
138
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
139
+ data = data.get_data()
140
+
141
+ # 2. Passing a Folder Path as a String
142
+ data = DataMax(file_path=r"docx_files_example/")
143
+ data = data.get_data()
144
+
145
+ # Data Cleaning
146
+ """
147
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
148
+ abnormal: Abnormal cleaning
149
+ private: Privacy processing
150
+ filter: Text filtering
151
+ """
152
+ # Direct Use: Clean the text parameter directly and return a string
153
+ dm = DataMax()
154
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
155
+
156
+ # Process Use: Use after get_data() to return the complete data structure
157
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
158
+ data2 = dm.get_data()
159
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
160
+
161
+ # Large Model Pre-annotation: supports any model that can be called via the OpenAI SDK
162
+ data = DataMax(file_path=r"path\to\xxx.docx")
163
+ parsed_data = data.get_data()
164
+ # If no custom messages are passed, the default messages in the SDK will be used
165
+ messages = [
166
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
167
+ {'role': 'user', 'content': 'Who are you?'}
168
+ ]
169
+ qa_datas = data.get_pre_label(
170
+ api_key="sk-xxx",
171
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
172
+ model_name="qwen-max",
173
+ chunk_size=500,
174
+ chunk_overlap=100,
175
+ question_number=5,
176
+ max_workers=5,
177
+ # message=[]
178
+ )
179
+ print(f'Annotated result:{qa_datas}')
180
+ ```
181
+
182
+
183
+ ## Examples
184
+ ```python
185
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
186
+ from datamax import DataMax
187
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
188
+ """
189
+ Parameters:
190
+ file_path: Relative file path / Absolute file path
191
+ to_markdown: Whether to convert to markdown (default False, which returns plain text). This parameter only supports Word files (doc | docx).
192
+ """
193
+
194
+ ## jpg | jpeg | png | ...(image types)
195
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
196
+ """
197
+ Parameters:
198
+ file_path: Relative file path / Absolute file path
199
+ use_mineru: Whether to use MinerU enhancement
200
+ """
201
+
202
+ ## pdf
203
+ from datamax import DataMax
204
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
205
+ """
206
+ Parameters:
207
+ file_path: Relative file path / Absolute file path
208
+ use_mineru: Whether to use MinerU enhancement
209
+ """
210
+ ```
211
+
212
+ ## Contribution Guide
213
+ We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
214
+ ## License
215
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
216
+
217
+ ## Contact Information
218
+ If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
219
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
220
+ - Project Homepage: [GitHub Project Link](https://github.com/Hi-Dolphin/datamax)
221
+
@@ -0,0 +1 @@
1
+ from .parser import DataMax