pydatamax 0.1.5__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {pydatamax-0.1.5 → pydatamax-0.1.11}/LICENSE +0 -0
  2. pydatamax-0.1.11/PKG-INFO +271 -0
  3. pydatamax-0.1.11/README.md +221 -0
  4. pydatamax-0.1.11/datamax/__init__.py +1 -0
  5. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/loader/MinioHandler.py +0 -0
  6. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/loader/OssHandler.py +85 -51
  7. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/loader/__init__.py +0 -0
  8. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/loader/core.py +0 -0
  9. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/__init__.py +1 -1
  10. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/base.py +2 -2
  11. pydatamax-0.1.11/datamax/parser/core.py +288 -0
  12. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/csv_parser.py +0 -0
  13. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/doc_parser.py +2 -5
  14. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/docx_parser.py +3 -6
  15. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/epub_parser.py +2 -5
  16. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/html_parser.py +2 -5
  17. pydatamax-0.1.11/datamax/parser/image_parser.py +34 -0
  18. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/json_parser.py +0 -0
  19. pydatamax-0.1.11/datamax/parser/md_parser.py +73 -0
  20. pydatamax-0.1.11/datamax/parser/pdf_parser.py +101 -0
  21. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/ppt_parser.py +3 -5
  22. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/pptx_parser.py +10 -13
  23. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/parser/txt_parser.py +2 -5
  24. pydatamax-0.1.11/datamax/parser/xls_parser.py +26 -0
  25. pydatamax-0.1.11/datamax/parser/xlsx_parser.py +71 -0
  26. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/utils/__init__.py +1 -0
  27. pydatamax-0.1.11/datamax/utils/constants.py +58 -0
  28. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/utils/data_cleaner.py +45 -28
  29. pydatamax-0.1.11/datamax/utils/env_setup.py +80 -0
  30. pydatamax-0.1.11/datamax/utils/gotocr_pdf.py +265 -0
  31. pydatamax-0.1.11/datamax/utils/mineru_operator.py +62 -0
  32. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/utils/paddleocr_pdf_operator.py +2 -1
  33. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/utils/ppt_extract.py +0 -0
  34. pydatamax-0.1.11/datamax/utils/qa_generator.py +376 -0
  35. {pydatamax-0.1.5 → pydatamax-0.1.11}/datamax/utils/tokenizer.py +1 -1
  36. pydatamax-0.1.11/pydatamax.egg-info/PKG-INFO +271 -0
  37. {pydatamax-0.1.5 → pydatamax-0.1.11}/pydatamax.egg-info/SOURCES.txt +9 -1
  38. {pydatamax-0.1.5 → pydatamax-0.1.11}/pydatamax.egg-info/dependency_links.txt +0 -0
  39. {pydatamax-0.1.5 → pydatamax-0.1.11}/pydatamax.egg-info/requires.txt +6 -3
  40. {pydatamax-0.1.5 → pydatamax-0.1.11}/pydatamax.egg-info/top_level.txt +1 -0
  41. {pydatamax-0.1.5 → pydatamax-0.1.11}/setup.cfg +0 -0
  42. {pydatamax-0.1.5 → pydatamax-0.1.11}/setup.py +9 -6
  43. pydatamax-0.1.11/tests/__init__.py +0 -0
  44. pydatamax-0.1.11/tests/test_basic.py +20 -0
  45. pydatamax-0.1.5/PKG-INFO +0 -282
  46. pydatamax-0.1.5/README.md +0 -245
  47. pydatamax-0.1.5/datamax/__init__.py +0 -1
  48. pydatamax-0.1.5/datamax/parser/core.py +0 -114
  49. pydatamax-0.1.5/datamax/parser/image_parser.py +0 -30
  50. pydatamax-0.1.5/datamax/parser/md_parser.py +0 -10
  51. pydatamax-0.1.5/datamax/parser/pdf_parser.py +0 -62
  52. pydatamax-0.1.5/datamax/parser/xlsx_parser.py +0 -10
  53. pydatamax-0.1.5/pydatamax.egg-info/PKG-INFO +0 -282
File without changes
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.11
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/cosco/datamax
6
+ Author: hzb | ccy
7
+ Author-email: zhibaohe@hotmail.com | cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: ebooklib
15
+ Requires-Dist: python-docx
16
+ Requires-Dist: beautifulsoup4
17
+ Requires-Dist: python-dotenv
18
+ Requires-Dist: minio
19
+ Requires-Dist: loguru
20
+ Requires-Dist: tqdm
21
+ Requires-Dist: oss2
22
+ Requires-Dist: python-docx
23
+ Requires-Dist: openai
24
+ Requires-Dist: jionlp
25
+ Requires-Dist: chardet
26
+ Requires-Dist: python-pptx
27
+ Requires-Dist: openpyxl
28
+ Requires-Dist: pymupdf
29
+ Requires-Dist: langchain_community==0.2.9
30
+ Requires-Dist: premailer
31
+ Requires-Dist: setuptools==75.3.0
32
+ Requires-Dist: docx2markdown
33
+ Requires-Dist: tiktoken
34
+ Requires-Dist: markitdown
35
+ Requires-Dist: pandas
36
+ Requires-Dist: xlrd
37
+ Requires-Dist: tabulate
38
+ Requires-Dist: unstructured[all]
39
+ Requires-Dist: markdown
40
+ Dynamic: author
41
+ Dynamic: author-email
42
+ Dynamic: classifier
43
+ Dynamic: description
44
+ Dynamic: description-content-type
45
+ Dynamic: home-page
46
+ Dynamic: license-file
47
+ Dynamic: requires-dist
48
+ Dynamic: requires-python
49
+ Dynamic: summary
50
+
51
+ # DataMax
52
+
53
+ ## Overview
54
+ DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
55
+
56
+ ## Key Features
57
+
58
+ ### File Processing Capabilities
59
+ Currently supports reading, conversion, and extraction from:
60
+ - PDF, HTML
61
+ - DOCX/DOC, PPT/PPTX
62
+ - EPUB
63
+ - Images
64
+ - XLS/XLSX spreadsheets
65
+ - Plain text (TXT)
66
+
67
+ ### Data Cleaning Pipeline
68
+ Three-tiered cleaning process:
69
+ 1. Anomaly detection and handling
70
+ 2. Privacy protection processing
71
+ 3. Text filtering and normalization
72
+
73
+ ### AI-Powered Data Annotation
74
+ Uses an LLM-plus-prompt workflow to:
75
+ - Continuously generate pre-labeled datasets
76
+ - Provide optimized training data for model fine-tuning
77
+
78
+
79
+ ## Installation Guide (Key Dependencies)
80
+ Dependencies include libreoffice, datamax, and MinerU.
81
+
82
+ ## 1. Installing LibreOffice Dependency
83
+ **Note:** Without LibreOffice, .doc files will not be supported.
84
+
85
+ ### Linux (Debian/Ubuntu)
86
+ ```bash
87
+ sudo apt-get update
88
+ sudo apt-get install libreoffice
89
+ ```
90
+ ### Windows
91
+ ```text
92
+ Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
93
+ Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
94
+ ```
95
+ ### Checking LibreOffice Installation
96
+ ```bash
97
+ soffice --version
98
+ ```
99
+
100
+ ## 2. Installing MinerU Dependency
101
+ Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
102
+ ### Create a Virtual Environment and Install Basic Dependencies
103
+ ```bash
104
+ conda create -n mineru python=3.10
105
+ conda activate mineru
106
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
107
+ ```
108
+ ### Installing Model Weight Files
109
+ https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
110
+ ```bash
111
+ pip install modelscope
112
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
113
+ python download_models.py
114
+ ```
115
+
116
+ ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
117
+ ```json
118
+ {
119
+ "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
120
+ "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
121
+ "device-mode": "cpu",
122
+ ...
123
+ }
124
+ ```
125
+
126
+ ## 3. Installing Basic Dependencies for datamax
127
+ 1. Clone the repository to your local machine:
128
+ ```bash
129
+ git clone <repository-url>
130
+ ```
131
+ 2. Install dependencies into conda:
132
+ ```bash
133
+ cd datamax
134
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
135
+ ```
136
+
137
+
138
+ ## Features
139
+ - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
140
+ - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
141
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
142
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
143
+ - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
144
+ - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
145
+
146
+
147
+ ## Technology Stack
148
+
149
+ - **Programming Language**: Python >= 3.10
150
+ - **Dependency Libraries**:
151
+ - PyMuPDF: For PDF file parsing.
152
+ - BeautifulSoup: For HTML file parsing.
153
+ - python-docx: For DOCX file parsing.
154
+ - pandas: For data processing and conversion.
155
+ - paddleocr: For parsing scanned PDFs, tables, and images.
156
+ - **Development Environment**: Visual Studio Code or PyCharm
157
+ - **Version Control**: Git
158
+
159
+ ## Usage Instructions
160
+ ### Installing the SDK
161
+ - **Installation Commands**:
162
+ ```bash
163
+ ## Local Installation
164
+ python setup.py sdist bdist_wheel
165
+ pip install dist/pydatamax-0.1.11-py3-none-any.whl
166
+
167
+ ## Pip Installation
168
+ pip install pydatamax
169
+ ```
170
+
171
+
172
+ - **Importing the Code**:
173
+ ```python
174
+ # File Parsing
175
+ from datamax import DataMax
176
+
177
+ ## Handling a Single File in Two Ways
178
+ # 1. Using a List of Length 1
179
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
180
+ data = data.get_data()
181
+
182
+ # 2. Using a String
183
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
184
+ data = data.get_data()
185
+
186
+ ## Handling Multiple Files
187
+ # 1. Using a List of Length n
188
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
189
+ data = data.get_data()
190
+
191
+ # 2. Passing a Folder Path as a String
192
+ data = DataMax(file_path=r"docx_files_example/")
193
+ data = data.get_data()
194
+
195
+ # Data Cleaning
196
+ """
197
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
198
+ abnormal: Abnormal cleaning
199
+ private: Privacy processing
200
+ filter: Text filtering
201
+ """
202
+ # Direct Use: Clean the text parameter directly and return a string
203
+ dm = DataMax()
204
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
205
+
206
+ # Process Use: Use after get_data() to return the complete data structure
207
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
208
+ data2 = dm.get_data()
209
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
210
+
211
+ # Large-model pre-annotation: supports any model that can be called via the OpenAI SDK
212
+ data = DataMax(file_path=r"path\to\xxx.docx")
213
+ parsed_data = data.get_data()
214
+ # If no custom messages are passed, the default messages in the SDK will be used
215
+ messages = [
216
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
217
+ {'role': 'user', 'content': 'Who are you?'}
218
+ ]
219
+ qa_datas = data.get_pre_label(
220
+ api_key="sk-xxx",
221
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
222
+ model_name="qwen-max",
223
+ chunk_size=500,
224
+ chunk_overlap=100,
225
+ question_number=5,
226
+ max_workers=5,
227
+ # message=[]
228
+ )
229
+ print(f'Annotated result:{qa_datas}')
230
+ ```
231
+
232
+
233
+ ## Examples
234
+ ```python
235
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
236
+ from datamax import DataMax
237
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
238
+ """
239
+ Parameters:
240
+ file_path: Relative file path / Absolute file path
241
+ to_markdown: Whether to convert to markdown (default: False, which returns plain text). This parameter is only supported for Word files (doc | docx).
242
+ """
243
+
244
+ ## jpg | jpeg | png | ...(image types)
245
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
246
+ """
247
+ Parameters:
248
+ file_path: Relative file path / Absolute file path
249
+ use_mineru: Whether to use MinerU enhancement
250
+ """
251
+
252
+ ## pdf
253
+ from datamax import DataMax
254
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
255
+ """
256
+ Parameters:
257
+ file_path: Relative file path / Absolute file path
258
+ use_mineru: Whether to use MinerU enhancement
259
+ """
260
+ ```
261
+
262
+ ## Contribution Guide
263
+ We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
264
+ ## License
265
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
266
+
267
+ ## Contact Information
268
+ If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
269
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
270
+ - Project Homepage: https://github.com/cosco/datamax
271
+
@@ -0,0 +1,221 @@
1
+ # DataMax
2
+
3
+ ## Overview
4
+ DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
5
+
6
+ ## Key Features
7
+
8
+ ### File Processing Capabilities
9
+ Currently supports reading, conversion, and extraction from:
10
+ - PDF, HTML
11
+ - DOCX/DOC, PPT/PPTX
12
+ - EPUB
13
+ - Images
14
+ - XLS/XLSX spreadsheets
15
+ - Plain text (TXT)
16
+
17
+ ### Data Cleaning Pipeline
18
+ Three-tiered cleaning process:
19
+ 1. Anomaly detection and handling
20
+ 2. Privacy protection processing
21
+ 3. Text filtering and normalization
22
+
23
+ ### AI-Powered Data Annotation
24
+ Uses an LLM-plus-prompt workflow to:
25
+ - Continuously generate pre-labeled datasets
26
+ - Provide optimized training data for model fine-tuning
27
+
28
+
29
+ ## Installation Guide (Key Dependencies)
30
+ Dependencies include libreoffice, datamax, and MinerU.
31
+
32
+ ## 1. Installing LibreOffice Dependency
33
+ **Note:** Without LibreOffice, .doc files will not be supported.
34
+
35
+ ### Linux (Debian/Ubuntu)
36
+ ```bash
37
+ sudo apt-get update
38
+ sudo apt-get install libreoffice
39
+ ```
40
+ ### Windows
41
+ ```text
42
+ Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
43
+ Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
44
+ ```
45
+ ### Checking LibreOffice Installation
46
+ ```bash
47
+ soffice --version
48
+ ```
49
+
50
+ ## 2. Installing MinerU Dependency
51
+ Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
52
+ ### Create a Virtual Environment and Install Basic Dependencies
53
+ ```bash
54
+ conda create -n mineru python=3.10
55
+ conda activate mineru
56
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
57
+ ```
58
+ ### Installing Model Weight Files
59
+ https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
60
+ ```bash
61
+ pip install modelscope
62
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
63
+ python download_models.py
64
+ ```
65
+
66
+ ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
67
+ ```json
68
+ {
69
+ "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
70
+ "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
71
+ "device-mode": "cpu",
72
+ ...
73
+ }
74
+ ```
75
+
76
+ ## 3. Installing Basic Dependencies for datamax
77
+ 1. Clone the repository to your local machine:
78
+ ```bash
79
+ git clone <repository-url>
80
+ ```
81
+ 2. Install dependencies into conda:
82
+ ```bash
83
+ cd datamax
84
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
85
+ ```
86
+
87
+
88
+ ## Features
89
+ - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
90
+ - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
91
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
92
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
93
+ - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
94
+ - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
95
+
96
+
97
+ ## Technology Stack
98
+
99
+ - **Programming Language**: Python >= 3.10
100
+ - **Dependency Libraries**:
101
+ - PyMuPDF: For PDF file parsing.
102
+ - BeautifulSoup: For HTML file parsing.
103
+ - python-docx: For DOCX file parsing.
104
+ - pandas: For data processing and conversion.
105
+ - paddleocr: For parsing scanned PDFs, tables, and images.
106
+ - **Development Environment**: Visual Studio Code or PyCharm
107
+ - **Version Control**: Git
108
+
109
+ ## Usage Instructions
110
+ ### Installing the SDK
111
+ - **Installation Commands**:
112
+ ```bash
113
+ ## Local Installation
114
+ python setup.py sdist bdist_wheel
115
+ pip install dist/pydatamax-0.1.11-py3-none-any.whl
116
+
117
+ ## Pip Installation
118
+ pip install pydatamax
119
+ ```
120
+
121
+
122
+ - **Importing the Code**:
123
+ ```python
124
+ # File Parsing
125
+ from datamax import DataMax
126
+
127
+ ## Handling a Single File in Two Ways
128
+ # 1. Using a List of Length 1
129
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
130
+ data = data.get_data()
131
+
132
+ # 2. Using a String
133
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
134
+ data = data.get_data()
135
+
136
+ ## Handling Multiple Files
137
+ # 1. Using a List of Length n
138
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
139
+ data = data.get_data()
140
+
141
+ # 2. Passing a Folder Path as a String
142
+ data = DataMax(file_path=r"docx_files_example/")
143
+ data = data.get_data()
144
+
145
+ # Data Cleaning
146
+ """
147
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
148
+ abnormal: Abnormal cleaning
149
+ private: Privacy processing
150
+ filter: Text filtering
151
+ """
152
+ # Direct Use: Clean the text parameter directly and return a string
153
+ dm = DataMax()
154
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
155
+
156
+ # Process Use: Use after get_data() to return the complete data structure
157
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
158
+ data2 = dm.get_data()
159
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
160
+
161
+ # Large-model pre-annotation: supports any model that can be called via the OpenAI SDK
162
+ data = DataMax(file_path=r"path\to\xxx.docx")
163
+ parsed_data = data.get_data()
164
+ # If no custom messages are passed, the default messages in the SDK will be used
165
+ messages = [
166
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
167
+ {'role': 'user', 'content': 'Who are you?'}
168
+ ]
169
+ qa_datas = data.get_pre_label(
170
+ api_key="sk-xxx",
171
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
172
+ model_name="qwen-max",
173
+ chunk_size=500,
174
+ chunk_overlap=100,
175
+ question_number=5,
176
+ max_workers=5,
177
+ # message=[]
178
+ )
179
+ print(f'Annotated result:{qa_datas}')
180
+ ```
181
+
182
+
183
+ ## Examples
184
+ ```python
185
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
186
+ from datamax import DataMax
187
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
188
+ """
189
+ Parameters:
190
+ file_path: Relative file path / Absolute file path
191
+ to_markdown: Whether to convert to markdown (default: False, which returns plain text). This parameter is only supported for Word files (doc | docx).
192
+ """
193
+
194
+ ## jpg | jpeg | png | ...(image types)
195
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
196
+ """
197
+ Parameters:
198
+ file_path: Relative file path / Absolute file path
199
+ use_mineru: Whether to use MinerU enhancement
200
+ """
201
+
202
+ ## pdf
203
+ from datamax import DataMax
204
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
205
+ """
206
+ Parameters:
207
+ file_path: Relative file path / Absolute file path
208
+ use_mineru: Whether to use MinerU enhancement
209
+ """
210
+ ```
211
+
212
+ ## Contribution Guide
213
+ We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
214
+ ## License
215
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
216
+
217
+ ## Contact Information
218
+ If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
219
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
220
+ - Project Homepage: https://github.com/cosco/datamax
221
+
@@ -0,0 +1 @@
1
+ from .parser import DataMax