pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
+ import json
+ import re
+ import threading
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
+
+ import requests
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
+ from loguru import logger
+ from tqdm import tqdm  # For progress bar display
+
+ lock = threading.Lock()
+
+
+ # ------------prompt-----------------
+ def get_system_prompt_for_question(query_text, question_number):
+     """Generate system prompt for the question-generation task"""
+     system_prompt = f"""
+ # Role and Mission
+ You are a professional text-analysis expert, skilled at extracting key information from complex text and generating structured data for model fine-tuning (questions only).
+
+ ## Core Task
+ Based on the text provided by the user, generate no fewer than {question_number} high-quality questions.
+
+ ## Constraints (Important!)
+ - Questions must be generated directly from the text content
+ - Each question must point to a definite answer
+ - Questions must cover different aspects of the text
+ - Hypothetical, duplicate, or near-duplicate questions are forbidden
+ - Ensure the completeness of the generated output
+
+ ## Workflow
+ 1. [Text parsing] Process the content in segments, identifying key entities and core concepts
+ 2. [Question generation] Pick the best question points based on information density
+ 3. [Quality check] Ensure that:
+    - Each question's answer can be grounded in the original text
+    - Labels are strongly related to the question content
+    - There are no formatting errors
+
+ ## Output Format
+ - The JSON array must be syntactically valid
+ - Field names use double quotes
+ - The output JSON array must strictly follow this structure:
+ ```json
+ ["Question 1", "Question 2", "..."]
+ ```
+
+ ## Output Example
+ ```json
+ ["What core elements should an AI ethics framework include?", "What new provisions does the Civil Code introduce for personal data protection?"]
+ ```
+
+ ## Text to Process
+ {query_text}
+
+ ## Restrictions
+ - Output strictly in the JSON format specified above; do not output any unrelated content
+ - Generate no fewer than {question_number} high-quality questions
+ - Questions must not refer to the material itself; questions about the author, chapters, or table of contents are forbidden
+ - Questions must not use phrasing such as "as mentioned in the report/article/literature/table"; each must read as a natural, standalone question
+ """
+     return system_prompt
+
+
+ def get_system_prompt_for_answer(text, query_question):
+     """Generate system prompt for the answer-generation task"""
+     system_prompt = f"""
+ # Role: Fine-tuning Dataset Generation Expert
+ ## Profile:
+ - Description: You are an expert in generating fine-tuning datasets. You excel at producing accurate answers to questions from the given content, ensuring accuracy and relevance. Answer the user's question directly; all information has been internalized as your professional knowledge.
+
+ ## Skills:
+ 1. The answer must be based on the given content
+ 2. The answer must be accurate; do not fabricate anything
+ 3. The answer must be relevant to the question
+ 4. The answer must be logical
+ 5. Based on the given reference content, integrate it into a complete answer in natural, fluent language, without mentioning sources or citation markers
+
+ ## Workflow:
+ 1. Take a deep breath and work on this problem step-by-step.
+ 2. First, analyze the given file content
+ 3. Then, extract the key information from the content
+ 4. Next, generate an accurate answer relevant to the question
+ 5. Finally, verify the accuracy and relevance of the answer
+
+ ## Reference Content:
+ {text}
+
+ ## Question
+ {query_question}
+
+ ## Constraints:
+ 1. The answer must be based on the given content
+ 2. The answer must be accurate and relevant to the question; do not fabricate anything
+ 3. The answer must be thorough and detailed, contain all necessary information, and be suitable for fine-tuning large models
+ 4. The answer must not contain any citation phrasing such as "according to / based on / as mentioned in the literature"; present only the final result
+ """
+     return system_prompt
+
+
+ # ------------splitter----------------
+ def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
+     """
+     Parse Markdown using UnstructuredMarkdownLoader,
+     with a chunking strategy that preserves the original paragraph structure.
+
+     Args:
+         md_path: Path to the markdown file
+         chunk_size: Size of each chunk
+         chunk_overlap: Overlap between chunks
+
+     Returns:
+         List of document chunks
+     """
+     try:
+         # Use LangChain's UnstructuredMarkdownLoader to load the Markdown file
+         loader = UnstructuredMarkdownLoader(md_path)
+         documents = loader.load()
+         # Further split documents if needed
+         splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         return splitter.split_documents(documents)
+     except Exception as e:
+         logger.error(f"Failed to load {Path(md_path).name}: {str(e)}")
+         return []
+
+
+ # ------------llm generator-------------------
+ def extract_json_from_llm_output(output: str):
+     """
+     Extract JSON content from LLM output, handling multiple possible formats.
+
+     Args:
+         output: Raw output string from the LLM
+
+     Returns:
+         Parsed JSON list if successful, None otherwise
+     """
+     # Try to parse the entire output directly
+     try:
+         return json.loads(output)
+     except json.JSONDecodeError:
+         pass
+
+     # Try to extract content wrapped in ```json ... ```
+     json_match = re.search(r"```json\n([\s\S]*?)\n```", output)
+     if json_match:
+         try:
+             return json.loads(json_match.group(1))
+         except json.JSONDecodeError as e:
+             logger.warning(f"Error while parsing JSON: {e}")
+
+     # Fall back to the outermost bracketed span, the most JSON-like part
+     json_start = output.find("[")
+     json_end = output.rfind("]") + 1
+     if json_start != -1 and json_end != 0:
+         try:
+             return json.loads(output[json_start:json_end])
+         except json.JSONDecodeError:
+             pass
+
+     logger.warning(f"Model did not produce the standard output format: {output}")
+     return None
+
+
+ def llm_generator(
+     api_key: str,
+     model: str,
+     base_url: str,
+     prompt: str,
+     type: str,
+     message: list = None,
+     temperature: float = 0.7,
+     top_p: float = 0.9,
+     max_token: int = 2048,
+ ) -> list:
+     """Generate content via an OpenAI-compatible chat-completions API"""
+     try:
+         if not message:
+             message = [
+                 {"role": "system", "content": prompt},
+                 {"role": "user", "content": "Please generate the content strictly according to the requirements"},
+             ]
+         headers = {
+             "Authorization": f"Bearer {api_key}",
+             "Content-Type": "application/json",
+         }
+         data = {
+             "model": model,
+             "messages": message,
+             "max_tokens": max_token,
+             "temperature": temperature,
+             "top_p": top_p,
+         }
+         response = requests.post(base_url, headers=headers, json=data, timeout=30)
+         response.raise_for_status()
+         result = response.json()
+
+         # Parse the LLM response
+         if "choices" in result and len(result["choices"]) > 0:
+             output = result["choices"][0]["message"]["content"]
+             if type == "question":
+                 # Questions are expected as a JSON array
+                 return extract_json_from_llm_output(output)
+             # Answers are returned as plain text
+             return output
+         return []
+
+     except Exception as e:
+         logger.error(f"LLM generation failed: {e} (line {e.__traceback__.tb_lineno})")
+         return []
+
+
+ # ------------thread_process-------------
+
+
+ def process_questions(
+     api_key: str,
+     model: str,
+     base_url: str,
+     page_content: list,
+     question_number: int,
+     message: list,
+     max_workers: int = 5,
+ ) -> list:
+     """Generate questions using multi-threading"""
+     total_questions = []
+
+     def _generate_questions(page):
+         """Inner function for question generation"""
+         prompt = get_system_prompt_for_question(page, question_number)
+         questions = llm_generator(
+             api_key=api_key,
+             model=model,
+             base_url=base_url,
+             message=message,
+             prompt=prompt,
+             type="question",
+         )
+         return [{"question": q, "page": page} for q in questions] if questions else []
+
+     logger.info(f"Starting question generation (threads: {max_workers})...")
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = [executor.submit(_generate_questions, page) for page in page_content]
+
+         with tqdm(as_completed(futures), total=len(futures), desc="Generating questions") as pbar:
+             for future in pbar:
+                 result = future.result()
+                 if result:
+                     with lock:
+                         total_questions.extend(result)
+                         pbar.set_postfix({"questions generated": len(total_questions)})
+
+     return total_questions
+
+
+ def process_answers(
+     api_key: str,
+     model: str,
+     base_url: str,
+     question_items: list,
+     message: list = None,
+     max_workers=5,
+ ) -> dict:
+     """Generate answers using multi-threading"""
+     qa_pairs = {}
+
+     def _generate_answer(item):
+         """Inner function for answer generation"""
+         prompt = get_system_prompt_for_answer(item["page"], item["question"])
+         answer = llm_generator(
+             api_key=api_key,
+             model=model,
+             base_url=base_url,
+             prompt=prompt,
+             message=message,
+             type="answer",
+         )
+         return item["question"], answer
+
+     logger.info(f"Starting answer generation (threads: {max_workers})...")
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = {
+             executor.submit(_generate_answer, item): item for item in question_items
+         }
+
+         with tqdm(as_completed(futures), total=len(futures), desc="Generating answers") as pbar:
+             for future in pbar:
+                 question, answer = future.result()
+                 if answer:
+                     with lock:
+                         qa_pairs[question] = answer
+                         pbar.set_postfix({"answers generated": len(qa_pairs)})
+     return qa_pairs
+
+
+ def generatr_qa_pairs(
+     file_path: str,
+     api_key: str,
+     base_url: str,
+     model_name: str,
+     chunk_size=500,
+     chunk_overlap=100,
+     question_number=5,
+     message: list = None,
+     max_workers=5,
+ ):
+     """Main function to generate QA pairs from a markdown file"""
+     # 1. Split markdown text into chunks
+     pages = load_and_split_markdown(
+         md_path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+     )
+     page_content = [i.page_content for i in pages]
+     logger.info(f"The markdown was split into {len(page_content)} chunks")
+
+     # 2. Generate questions using multi-threading
+     questions = process_questions(
+         page_content=page_content,
+         message=message,
+         question_number=question_number,
+         max_workers=max_workers,
+         api_key=api_key,
+         base_url=base_url,
+         model=model_name,
+     )
+     if not questions:
+         logger.error("No questions were generated; check the input document and API settings")
+         return []
+
+     # 3. Generate answers using multi-threading
+     qa_pairs = process_answers(
+         question_items=questions,
+         message=message,
+         max_workers=max_workers,
+         api_key=api_key,
+         base_url=base_url,
+         model=model_name,
+     )
+
+     # 4. Save results
+     res_list = []
+     # Path(...).stem drops the .md suffix; str.strip(".md") would instead strip
+     # the characters '.', 'm', 'd' from both ends of the file name
+     output_path = f"{Path(file_path).stem}.jsonl"
+     with open(output_path, "w", encoding="utf-8") as f:
+         for question, answer in qa_pairs.items():
+             # Build a properly formatted JSON object
+             qa_entry = {"instruction": question, "input": "", "output": answer}
+             res_list.append(qa_entry)
+             # Write to the JSONL file (one JSON object per line)
+             f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
+
+     logger.success(f"Done! Generated {len(qa_pairs)} QA pairs, saved to {output_path}")
+
+     return res_list
+
+
+ if __name__ == "__main__":
+     generatr_qa_pairs(
+         file_path=r"C:\Users\cykro\Desktop\文档整理\知识图谱\知识图谱概要设计.md",
+         api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
+         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
+         model_name="qwen-max",
+         chunk_size=500,
+         chunk_overlap=100,
+         question_number=5,
+         max_workers=5,
+         # message=[]
+     )
@@ -17,6 +17,6 @@ class DashScopeClient:
      p50k_base corresponds to models text-davinci-002 and text-davinci-003
      r50k_base corresponds to model gpt2
      '''
-     encoding = tiktoken.get_encoding(encoding_name="o200k_base")
+     encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
      num_tokens = len(encoding.encode(content))
      return num_tokens
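
The hunk above swaps the tokenizer from `o200k_base` (the GPT-4o encoding) to `cl100k_base` (the encoding used by the gpt-3.5-turbo / gpt-4 family). A minimal sketch of the counting logic, assuming nothing beyond `tiktoken` itself:

```python
import tiktoken

# cl100k_base is the BPE encoding for gpt-3.5-turbo / gpt-4
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode("Hello, DataMax"))  # token count for the string
print(num_tokens)
```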
@@ -0,0 +1,281 @@
+ Metadata-Version: 2.4
+ Name: pydatamax
+ Version: 0.1.12
+ Summary: A library for parsing and converting various file formats.
+ Home-page: https://github.com/Hi-Dolphin/datamax
+ Author: ccy
+ Author-email: cy.kron@foxmail.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: oss2<3.0.0,>=2.19.1
+ Requires-Dist: aliyun-python-sdk-core<3.0.0,>=2.16.0
+ Requires-Dist: aliyun-python-sdk-kms<3.0.0,>=2.16.5
+ Requires-Dist: crcmod<2.0.0,>=1.7
+ Requires-Dist: langdetect<2.0.0,>=1.0.9
+ Requires-Dist: loguru<1.0.0,>=0.7.3
+ Requires-Dist: python-docx<2.0.0,>=1.1.2
+ Requires-Dist: python-dotenv<2.0.0,>=1.1.0
+ Requires-Dist: pymupdf<2.0.0,>=1.26.0
+ Requires-Dist: pypdf<6.0.0,>=5.5.0
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5
+ Requires-Dist: pandas<3.0.0,>=2.2.3
+ Requires-Dist: numpy<3.0.0,>=2.2.6
+ Requires-Dist: requests<3.0.0,>=2.32.3
+ Requires-Dist: tqdm<5.0.0,>=4.67.1
+ Requires-Dist: pydantic<3.0.0,>=2.11.5
+ Requires-Dist: pydantic-settings<3.0.0,>=2.9.1
+ Requires-Dist: python-magic<1.0.0,>=0.4.27
+ Requires-Dist: PyYAML<7.0.0,>=6.0.2
+ Requires-Dist: Pillow<12.0.0,>=11.2.1
+ Requires-Dist: packaging<25.0,>=24.2
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.13.4
+ Requires-Dist: minio<8.0.0,>=7.2.15
+ Requires-Dist: openai<2.0.0,>=1.82.0
+ Requires-Dist: jionlp<2.0.0,>=1.5.23
+ Requires-Dist: chardet<6.0.0,>=5.2.0
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2
+ Requires-Dist: docx2markdown<1.0.0,>=0.1.1
+ Requires-Dist: tiktoken<1.0.0,>=0.9.0
+ Requires-Dist: markitdown<1.0.0,>=0.1.1
+ Requires-Dist: xlrd<3.0.0,>=2.0.1
+ Requires-Dist: tabulate<1.0.0,>=0.9.0
+ Requires-Dist: unstructured<1.0.0,>=0.17.2
+ Requires-Dist: markdown<4.0.0,>=3.8
+ Requires-Dist: langchain<1.0.0,>=0.3.0
+ Requires-Dist: langchain-community<1.0.0,>=0.3.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # DataMax
+
+ ## Overview
+ DataMax is a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
+
+ ## Key Features
+
+ ### File Processing Capabilities
+ Currently supports reading, conversion, and extraction from:
+ - PDF, HTML
+ - DOCX/DOC, PPT/PPTX
+ - EPUB
+ - Images
+ - XLS/XLSX spreadsheets
+ - Plain text (TXT)
+
+ ### Data Cleaning Pipeline
+ Three-tiered cleaning process (a minimal invocation sketch follows the list):
+ 1. Anomaly detection and handling
+ 2. Privacy protection processing
+ 3. Text filtering and normalization
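+
+ A minimal sketch of driving these tiers through the `clean_data` API (shown in full under Usage Instructions below; the `method_list` keys are `abnormal`, `private`, and `filter`):
+ ```python
+ from datamax import DataMax
+
+ # Each method_list entry enables one tier:
+ # "abnormal" -> anomaly handling, "private" -> privacy masking, "filter" -> text filtering
+ dm = DataMax()
+ cleaned = dm.clean_data(
+     method_list=["abnormal", "private", "filter"],
+     text="<div></div>你好 18717777777 \n\n\n\n",
+ )
+ ```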
+
+ ### AI-Powered Data Annotation
+ Uses an LLM plus prompt templates to (see the sketch after this list):
+ - Continuously generate pre-labeled datasets
+ - Provide optimized training data for model fine-tuning
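+
+ The annotation loop is also available as the standalone helper `generatr_qa_pairs` added in this release. A minimal invocation sketch (the import path is an assumption; the function and its signature come from the new module):
+ ```python
+ # NOTE: import path is hypothetical; the function is defined in the
+ # QA-generation module added in this version
+ from datamax.utils.qa_generator import generatr_qa_pairs
+
+ qa_list = generatr_qa_pairs(
+     file_path="example.md",
+     api_key="sk-xxx",
+     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
+     model_name="qwen-max",
+     question_number=5,  # questions per chunk
+     max_workers=5,      # concurrent LLM calls
+ )
+ ```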
+
+
+ ## Installation Guide (Key Dependencies)
+ Key dependencies are LibreOffice, MinerU, and datamax itself.
+
+ ### 1. Installing the LibreOffice Dependency
+ **Note:** Without LibreOffice, .doc files will not be supported.
+
+ #### Linux (Debian/Ubuntu)
+ ```bash
+ sudo apt-get update
+ sudo apt-get install libreoffice
+ ```
+ #### Windows
+ Install LibreOffice from the [download page](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh), then add it to the PATH environment variable:
+ ```powershell
+ $env:PATH += ";C:\Program Files\LibreOffice\program"
+ ```
+ #### Checking the LibreOffice Installation
+ ```bash
+ soffice --version
+ ```
+
+ ### 2. Installing the MinerU Dependency
+ **Note:** Without MinerU, advanced OCR parsing for PDFs will not be supported.
+ #### Create a Virtual Environment and Install Basic Dependencies
+ ```bash
+ conda create -n mineru python=3.10
+ conda activate mineru
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+ ```
+ #### Installing Model Weight Files
+ See https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
+ ```bash
+ pip install modelscope
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
+ python download_models.py
+ ```
+
+ #### Modify the Configuration File `magic-pdf.json` (located in the user's home directory; template below)
+ ```json
+ {
+   "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
+   "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
+   "device-mode": "cpu",
+   ...
+ }
+ ```
+
+ ### 3. Installing Basic Dependencies for datamax
+ 1. Clone the repository to your local machine:
+ ```bash
+ git clone <repository-url>
+ ```
+ 2. Install the dependencies into the conda environment:
+ ```bash
+ cd datamax
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+ ```
+
+
+ ## Features
+ - **Multi-format Support**: Handles various file types such as PDF, HTML, DOCX, and TXT.
+ - **Content Extraction**: Provides powerful content extraction that accurately retrieves information from complex document structures.
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
+ - **Customizable Configuration**: Users can adjust processing parameters to meet different business requirements.
+ - **Cross-platform Compatibility**: The SDK runs on multiple operating systems, including Windows, macOS, and Linux.
+
+
+ ## Technology Stack
+
+ - **Programming Language**: Python >= 3.10
+ - **Dependency Libraries**:
+   - PyMuPDF: PDF file parsing.
+   - BeautifulSoup: HTML file parsing.
+   - python-docx: DOCX file parsing.
+   - pandas: Data processing and conversion.
+   - paddleocr: Parsing scanned PDFs, tables, and images.
+ - **Development Environment**: Visual Studio Code or PyCharm
+ - **Version Control**: Git
+
+ ## Usage Instructions
+ ### Installing the SDK
+ - **Installation Commands**:
+ ```bash
+ ## Local installation
+ python setup.py sdist bdist_wheel
+ pip install dist/pydatamax-0.1.12-py3-none-any.whl
+
+ ## Pip installation
+ pip install pydatamax
+ ```
+
+
+ - **Importing the Code**:
+ ```python
+ # File parsing
+ from datamax import DataMax
+
+ ## Handling a single file, two ways
+ # 1. Using a list of length 1
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
+ data = data.get_data()
+
+ # 2. Using a string
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
+ data = data.get_data()
+
+ ## Handling multiple files
+ # 1. Using a list of length n
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
+ data = data.get_data()
+
+ # 2. Passing a folder path as a string
+ data = DataMax(file_path=r"docx_files_example/")
+ data = data.get_data()
+
+ # Data cleaning
+ """
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
+ abnormal: anomaly cleaning
+ private: privacy processing
+ filter: text filtering
+ """
+ # Direct use: clean the text parameter directly and return a string
+ dm = DataMax()
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
+
+ # Pipeline use: call after get_data() to return the complete data structure
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
+ data2 = dm.get_data()
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
+
+ # Large-model pre-annotation: supports any model callable via the OpenAI SDK
+ data = DataMax(file_path=r"path\to\xxx.docx")
+ parsed_data = data.get_data()
+ # If no custom messages are passed, the default messages in the SDK are used
+ messages = [
+     {'role': 'system', 'content': 'You are a helpful assistant.'},
+     {'role': 'user', 'content': 'Who are you?'}
+ ]
+ qa_datas = data.get_pre_label(
+     api_key="sk-xxx",
+     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
+     model_name="qwen-max",
+     chunk_size=500,
+     chunk_overlap=100,
+     question_number=5,
+     max_workers=5,
+     # message=[]
+ )
+ print(f'Annotated result: {qa_datas}')
+ ```
+
+
+ ## Examples
+ ```python
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
+ from datamax import DataMax
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
+ """
+ Parameters:
+ file_path: relative or absolute file path
+ to_markdown: whether to convert to markdown (default False, returns plain text); only supported for Word files (doc | docx)
+ """
+
+ ## jpg | jpeg | png | ...(image types)
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
+ """
+ Parameters:
+ file_path: relative or absolute file path
+ use_mineru: whether to use MinerU enhancement
+ """
+
+ ## pdf
+ from datamax import DataMax
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
+ """
+ Parameters:
+ file_path: relative or absolute file path
+ use_mineru: whether to use MinerU enhancement
+ """
+ ```
+
+ ## Contribution Guide
+ We welcome contributions of any kind, whether reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
+ ## License
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
+
+ ## Contact Information
+ If you encounter any issues during use, or have suggestions or feedback, please contact us through any of the following channels:
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
+ - Project Homepage: [GitHub](https://github.com/Hi-Dolphin/datamax)
+