pydatamax 0.1.5__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
1
+ import json
2
+ import os.path
3
+ import re
4
+ import threading
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from pathlib import Path
7
+
8
+ import requests
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
11
+ from loguru import logger
13
+ from tqdm import tqdm # For progress bar display
14
+
15
+ lock = threading.Lock()
16
+
17
+
18
+ # ------------prompt-----------------
19
+ def get_system_prompt_for_question(query_text, question_number):
20
+ """Generate system prompt for question generation task"""
21
+ system_prompt = f"""
22
+ # 角色使命
23
+ 你是一位专业的文本分析专家,擅长从复杂文本中提取关键信息并生成可用于模型微调的结构化数据(仅生成问题)。
24
+
25
+ ## 核心任务
26
+ 根据用户提供的文本,生成不少于 {question_number} 个高质量问题。
27
+
28
+ ## 约束条件(重要!)
29
+ - 必须基于文本内容直接生成
30
+ - 问题应具有明确答案指向性
31
+ - 需覆盖文本的不同方面
32
+ - 禁止生成假设性、重复或相似问题
33
+ - 确保生成的完整性
34
+
35
+ ## 处理流程
36
+ 1. 【文本解析】分段处理内容,识别关键实体和核心概念
37
+ 2. 【问题生成】基于信息密度选择最佳提问点
38
+ 3. 【质量检查】确保:
39
+ - 问题答案可在原文中找到依据
40
+ - 标签与问题内容强相关
41
+ - 无格式错误
42
+
43
+ ## 输出格式
44
+ - JSON 数组格式必须正确
45
+ - 字段名使用英文双引号
46
+ - 输出的 JSON 数组必须严格符合以下结构:
47
+ ```json
48
+ ["问题1", "问题2", "..."]
49
+ ```
50
+
51
+ ## 输出示例
52
+ ```json
53
+ ["人工智能伦理框架应包含哪些核心要素?", "民法典对个人数据保护有哪些新规定?"]
54
+ ```
55
+
56
+ ## 待处理文本
57
+ {query_text}
58
+
59
+ ## 限制
60
+ - 必须按照规定的 JSON 格式输出,不要输出任何其他不相关内容
61
+ - 生成不少于{question_number}个高质量问题
62
+ - 问题不要和材料本身相关,例如禁止出现作者、章节、目录等相关问题
63
+ - 问题不得包含【报告、文章、文献、表格】中提到的这种话术,必须是一个自然的问题
64
+ """
65
+ return system_prompt
66
+
67
+
68
+ def get_system_prompt_for_answer(text, query_question):
69
+ """Generate system prompt for answer generation task"""
70
+ system_prompt = f"""
71
+ # Role: 微调数据集生成专家
72
+ ## Profile:
73
+ - Description: 你是一名微调数据集生成专家,擅长从给定的内容中生成准确的问题答案,确保答案的准确性和相关性,你要直接回答用户问题,所有信息已内化为你的专业知识。
74
+
75
+ ## Skills :
76
+ 1. 答案必须基于给定的内容
77
+ 2. 答案必须准确,不能胡编乱造
78
+ 3. 答案必须与问题相关
79
+ 4. 答案必须符合逻辑
80
+ 5. 基于给定参考内容,用自然流畅的语言整合成一个完整答案,不需要提及文献来源或引用标记
81
+
82
+ ## Workflow:
83
+ 1. Take a deep breath and work on this problem step-by-step.
84
+ 2. 首先,分析给定的文件内容
85
+ 3. 然后,从内容中提取关键信息
86
+ 4. 接着,生成与问题相关的准确答案
87
+ 5. 最后,确保答案的准确性和相关性
88
+
89
+ ## 参考内容:
90
+ {text}
91
+
92
+ ## 问题
93
+ {query_question}
94
+
95
+ ## Constrains:
96
+ 1. 答案必须基于给定的内容
97
+ 2. 答案必须准确,必须与问题相关,不能胡编乱造
98
+ 3. 答案必须充分、详细、包含所有必要的信息、适合微调大模型训练使用
99
+ 4. 答案中不得出现 ' 参考 / 依据 / 文献中提到 ' 等任何引用性表述,只需呈现最终结果
100
+ """
101
+ return system_prompt
102
+
103
+
104
+ # ------------spliter----------------
105
+ def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
106
+ """
107
+ Parse Markdown using UnstructuredMarkdownLoader
108
+ Chunking strategy that preserves original paragraph structure
109
+
110
+ Args:
111
+ md_path: Path to the markdown file
112
+ chunk_size: Size of each chunk
113
+ chunk_overlap: Overlap between chunks
114
+
115
+ Returns:
116
+ List of document chunks
117
+ """
118
+ try:
119
+ # Use LangChain's MarkdownLoader to load Markdown file
120
+ loader = UnstructuredMarkdownLoader(md_path)
121
+ documents = loader.load()
122
+ # Further split documents if needed
123
+ splitter = RecursiveCharacterTextSplitter(
124
+ chunk_size=chunk_size,
125
+ chunk_overlap=chunk_overlap,
126
+ length_function=len,
127
+ is_separator_regex=False,
128
+ )
129
+ return splitter.split_documents(documents)
130
+ except Exception as e:
131
+ logger.error(f"加载 {Path(md_path).name} 失败: {str(e)}")
132
+ return []
133
+
134
+
135
+ # ------------llm generator-------------------
136
+ def extract_json_from_llm_output(output: str):
137
+ """
138
+ Extract JSON content from LLM output, handling multiple possible formats
139
+
140
+ Args:
141
+ output: Raw output string from LLM
142
+
143
+ Returns:
144
+ Parsed JSON list if successful, None otherwise
145
+ """
146
+ # Try to parse the entire output directly
147
+ try:
148
+ return json.loads(output)
149
+ except json.JSONDecodeError:
150
+ pass
151
+
152
+ # Try to extract content wrapped in ```json ```
153
+ json_match = re.search(r"```json\n([\s\S]*?)\n```", output)
154
+ if json_match:
155
+ try:
156
+ return json.loads(json_match.group(1))
157
+ except json.JSONDecodeError as e:
158
+ print(f"解析 JSON 时出错: {e}")
159
+
160
+ # Try to extract the most JSON-like part
161
+ json_start = output.find("[")
162
+ json_end = output.rfind("]") + 1
163
+ if json_start != -1 and json_end != 0:
164
+ try:
165
+ return json.loads(output[json_start:json_end])
166
+ except json.JSONDecodeError:
167
+ pass
168
+
169
+ print("模型未按标准格式输出:", output)
170
+ return None
171
+
172
+
173
+ def llm_generator(
174
+ api_key: str,
175
+ model: str,
176
+ base_url: str,
177
+ prompt: str,
178
+ type: str,
179
+ message: list = None,
180
+ temperature: float = 0.7,
181
+ top_p: float = 0.9,
182
+ max_token: int = 2048,
183
+ ) -> list:
184
+ """Generate content using LLM API"""
185
+ try:
186
+ if not message:
187
+ message = [
188
+ {"role": "system", "content": prompt},
189
+ {"role": "user", "content": "请严格按照要求生成内容"},
190
+ ]
191
+ headers = {
192
+ "Authorization": f"Bearer {api_key}",
193
+ "Content-Type": "application/json",
194
+ }
195
+ data = {
196
+ "model": model,
197
+ "messages": message,
198
+ "max_tokens": max_token,
199
+ "temperature": temperature,
200
+ "top_p": top_p,
201
+ }
202
+ response = requests.post(base_url, headers=headers, json=data, timeout=30)
203
+ response.raise_for_status()
204
+ result = response.json()
205
+
206
+ # Parse LLM response
207
+ if "choices" in result and len(result["choices"]) > 0:
208
+ output = result["choices"][0]["message"]["content"]
209
+ if type == "question":
210
+ fmt_output = extract_json_from_llm_output(output)
211
+ else:
212
+ return output
213
+ return fmt_output
214
+ return []
215
+
216
+ except Exception as e:
217
+ print(f"LLM提取关键词失败: {e, e.__traceback__.tb_lineno}")
218
+ return []
219
+
220
+
221
+ # ------------thread_process-------------
222
+
223
+
224
+ def process_questions(
225
+ api_key: str,
226
+ model: str,
227
+ base_url: str,
228
+ page_content: list,
229
+ question_number: int,
230
+ message: list,
231
+ max_workers: int = 5,
232
+ ) -> list:
233
+ """Generate questions using multi-threading"""
234
+ total_questions = []
235
+
236
+ def _generate_questions(page):
237
+ """Inner function for question generation"""
238
+ prompt = get_system_prompt_for_question(page, question_number)
239
+ questions = llm_generator(
240
+ api_key=api_key,
241
+ model=model,
242
+ base_url=base_url,
243
+ message=message,
244
+ prompt=prompt,
245
+ type="question",
246
+ )
247
+ return [{"question": q, "page": page} for q in questions] if questions else []
248
+
249
+ logger.info(f"开始生成问题 (线程数: {max_workers})...")
250
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
251
+ futures = [executor.submit(_generate_questions, page) for page in page_content]
252
+
253
+ with tqdm(as_completed(futures), total=len(futures), desc="生成问题") as pbar:
254
+ for future in pbar:
255
+ result = future.result()
256
+ if result:
257
+ with lock:
258
+ total_questions.extend(result)
259
+ pbar.set_postfix({"已生成问题": len(total_questions)})
260
+
261
+ return total_questions
262
+
263
+
264
+ def process_answers(
265
+ api_key: str,
266
+ model: str,
267
+ base_url: str,
268
+ question_items: list,
269
+ message: list = None,
270
+ max_workers=5,
271
+ ) -> dict:
272
+ """Generate answers using multi-threading"""
273
+ qa_pairs = {}
274
+
275
+ def _generate_answer(item):
276
+ """Inner function for answer generation"""
277
+ prompt = get_system_prompt_for_answer(item["page"], item["question"])
278
+ answer = llm_generator(
279
+ api_key=api_key,
280
+ model=model,
281
+ base_url=base_url,
282
+ prompt=prompt,
283
+ message=message,
284
+ type="answer",
285
+ )
286
+ return item["question"], answer
287
+
288
+ logger.info(f"开始生成答案 (线程数: {max_workers})...")
289
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
290
+ futures = {
291
+ executor.submit(_generate_answer, item): item for item in question_items
292
+ }
293
+
294
+ with tqdm(as_completed(futures), total=len(futures), desc="生成答案") as pbar:
295
+ for future in pbar:
296
+ question, answer = future.result()
297
+ if answer:
298
+ with lock:
299
+ qa_pairs[question] = answer
300
+ pbar.set_postfix({"已生成答案": len(qa_pairs)})
301
+ return qa_pairs
302
+
303
+
304
+ def generatr_qa_pairs(
305
+ file_path: str,
306
+ api_key: str,
307
+ base_url: str,
308
+ model_name: str,
309
+ chunk_size=500,
310
+ chunk_overlap=100,
311
+ question_number=5,
312
+ message: list = None,
313
+ max_workers=5,
314
+ ):
315
+ """Main function to generate QA pairs from markdown file"""
316
+ # 1. Split markdown text into chunks
317
+ pages = load_and_split_markdown(
318
+ md_path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap
319
+ )
320
+ page_content = [i.page_content for i in pages]
321
+ logger.info(f"markdown被分解了{len(page_content)}个chunk")
322
+
323
+ # 2. Generate questions using multi-threading
324
+ questions = process_questions(
325
+ page_content=page_content,
326
+ message=message,
327
+ question_number=question_number,
328
+ max_workers=max_workers,
329
+ api_key=api_key,
330
+ base_url=base_url,
331
+ model=model_name,
332
+ )
333
+ if not questions:
334
+ logger.error("未能生成任何问题,请检查输入文档和API设置")
335
+
336
+ # 3. Generate answers using multi-threading
337
+ qa_pairs = process_answers(
338
+ question_items=questions,
339
+ message=message,
340
+ max_workers=max_workers,
341
+ api_key=api_key,
342
+ base_url=base_url,
343
+ model=model_name,
344
+ )
345
+
346
+ # 4. Save results
347
+ res_list = []
348
+ with open(
349
+ f"{os.path.basename(file_path).strip('.md')}.jsonl", "w", encoding="utf-8"
350
+ ) as f:
351
+ for question, answer in qa_pairs.items():
352
+ # Build properly formatted JSON object
353
+ qa_entry = {"instruction": question, "input": "", "output": answer}
354
+ res_list.append(qa_entry)
355
+ # Write to JSONL file (one JSON object per line)
356
+ f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
357
+
358
+ logger.success(
359
+ f"完成! 共生成 {len(qa_pairs)} 个问答对,已保存到 {os.path.basename(file_path).strip('.md')}.jsonl"
360
+ )
361
+
362
+ return res_list
363
+
364
+
365
+ if __name__ == "__main__":
366
+ generatr_qa_pairs(
367
+ file_path=r"C:\Users\cykro\Desktop\文档整理\知识图谱\知识图谱概要设计.md",
368
+ api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
369
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
370
+ model_name="qwen-max",
371
+ chunk_size=500,
372
+ chunk_overlap=100,
373
+ question_number=5,
374
+ max_workers=5,
375
+ # message=[]
376
+ )
@@ -17,6 +17,6 @@ class DashScopeClient:
17
17
  p50k_base corresponds to models text-davinci-002 and text-davinci-003
18
18
  r50k_base corresponds to model gpt2
19
19
  '''
20
- encoding = tiktoken.get_encoding(encoding_name="o200k_base")
20
+ encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
21
21
  num_tokens = len(encoding.encode(content))
22
22
  return num_tokens
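
For context, `cl100k_base` is the encoding used by the gpt-3.5-turbo / gpt-4 generation of models, while the previous `o200k_base` value belongs to the newer gpt-4o family. A minimal standalone sketch of the same token count using the public tiktoken API (the sample string is illustrative):

```python
import tiktoken

# Same counting logic as DashScopeClient, with the cl100k_base encoding
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode("some content to measure"))
print(num_tokens)
```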
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: pydatamax
3
+ Version: 0.1.11
4
+ Summary: A library for parsing and converting various file formats.
5
+ Home-page: https://github.com/cosco/datamax
6
+ Author: hzb | ccy
7
+ Author-email: zhibaohe@hotmail.com | cy.kron@foxmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: ebooklib
15
+ Requires-Dist: python-docx
16
+ Requires-Dist: beautifulsoup4
17
+ Requires-Dist: python-dotenv
18
+ Requires-Dist: minio
19
+ Requires-Dist: loguru
20
+ Requires-Dist: tqdm
21
+ Requires-Dist: oss2
22
+ Requires-Dist: python-docx
23
+ Requires-Dist: openai
24
+ Requires-Dist: jionlp
25
+ Requires-Dist: chardet
26
+ Requires-Dist: python-pptx
27
+ Requires-Dist: openpyxl
28
+ Requires-Dist: pymupdf
29
+ Requires-Dist: langchain_community==0.2.9
30
+ Requires-Dist: premailer
31
+ Requires-Dist: setuptools==75.3.0
32
+ Requires-Dist: docx2markdown
33
+ Requires-Dist: tiktoken
34
+ Requires-Dist: markitdown
35
+ Requires-Dist: pandas
36
+ Requires-Dist: xlrd
37
+ Requires-Dist: tabulate
38
+ Requires-Dist: unstructured[all]
39
+ Requires-Dist: markdown
40
+ Dynamic: author
41
+ Dynamic: author-email
42
+ Dynamic: classifier
43
+ Dynamic: description
44
+ Dynamic: description-content-type
45
+ Dynamic: home-page
46
+ Dynamic: license-file
47
+ Dynamic: requires-dist
48
+ Dynamic: requires-python
49
+ Dynamic: summary
50
+
51
+ # DataMax
52
+
53
+ ## Overview
54
+ DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.
55
+
56
+ ## Key Features
57
+
58
+ ### File Processing Capabilities
59
+ Currently supports reading, conversion, and extraction from:
60
+ - PDF, HTML
61
+ - DOCX/DOC, PPT/PPTX
62
+ - EPUB
63
+ - Images
64
+ - XLS/XLSX spreadsheets
65
+ - Plain text (TXT)
66
+
67
+ ### Data Cleaning Pipeline
68
+ Three-tiered cleaning process:
69
+ 1. Anomaly detection and handling
70
+ 2. Privacy protection processing
71
+ 3. Text filtering and normalization
72
+
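
The three tiers above map onto the `method_list` names accepted by `clean_data` (`abnormal`, `private`, `filter`); a minimal sketch, mirroring the usage section further below:

```python
from datamax import DataMax

# Run all three cleaning tiers directly on a raw text snippet
dm = DataMax()
cleaned = dm.clean_data(
    method_list=["abnormal", "private", "filter"],
    text="<div></div>hello 18717777777 \n\n\n\n",
)
print(cleaned)
```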
73
+ ### AI-Powered Data Annotation
74
+ Uses an LLM + prompt pipeline to:
75
+ - Continuously generate pre-labeled datasets
76
+ - Provide optimized training data for model fine-tuning
77
+
78
+
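
Judging from `datamax/utils/qa_generator.py` in this diff (presumably the helper behind `get_pre_label`), each pre-labeled record is written as one JSON object per line in an instruction-tuning layout; the values below are placeholders:

```json
{"instruction": "<generated question>", "input": "", "output": "<generated answer>"}
```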
79
+ ## Installation Guide (Key Dependencies)
80
+ The key dependencies are LibreOffice, MinerU, and datamax itself.
81
+
82
+ ## 1. Installing libreoffice Dependency
83
+ **Note:** Without LibreOffice, .doc files will not be supported.
84
+
85
+ ### Linux (Debian/Ubuntu)
86
+ ```bash
87
+ sudo apt-get update
88
+ sudo apt-get install libreoffice
89
+ ```
90
+ ### Windows
91
+ ```text
92
+ Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
93
+ Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
94
+ ```
95
+ ### Checking LibreOffice Installation
96
+ ```bash
97
+ soffice --version
98
+ ```
99
+
100
+ ## 2. Installing MinerU Dependency
101
+ Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
102
+ ### Create a Virtual Environment and Install Basic Dependencies
103
+ ```bash
104
+ conda create -n mineru python=3.10
105
+ conda activate mineru
106
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
107
+ ```
108
+ ### Installing Model Weight Files
109
+ https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
110
+ ```bash
111
+ pip install modelscope
112
+ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
113
+ python download_models.py
114
+ ```
115
+
116
+ ### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
117
+ ```json
118
+ {
119
+ "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
120
+ "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
121
+ "device-mode": "cpu",
122
+ ...
123
+ }
124
+ ```
125
+
126
+ ## 3. Installing Basic Dependencies for datamax
127
+ 1. Clone the repository to your local machine:
128
+ ```bash
129
+ git clone <repository-url>
130
+ ```
131
+ 2. Install the dependencies into your conda environment:
132
+ ```bash
133
+ cd datamax
134
+ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
135
+ ```
136
+
137
+
138
+ ## Features
139
+ - **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
140
+ - **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
141
+ - **Data Conversion**: Supports converting processed data into markdown format for further analysis.
142
+ - **Batch Processing**: Can handle multiple files at once, improving work efficiency.
143
+ - **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
144
+ - **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.
145
+
146
+
147
+ ## Technology Stack
148
+
149
+ - **Programming Language**: Python >= 3.10
150
+ - **Dependency Libraries**:
151
+ - PyMuPDF: For PDF file parsing.
152
+ - BeautifulSoup: For HTML file parsing.
153
+ - python-docx: For DOCX file parsing.
154
+ - pandas: For data processing and conversion.
155
+ - paddleocr: For parsing scanned PDFs, tables, and images.
156
+ - **Development Environment**: Visual Studio Code or PyCharm
157
+ - **Version Control**: Git
158
+
159
+ ## Usage Instructions
160
+ ### Installing the SDK
161
+ - **Installation Commands**:
162
+ ```bash
163
+ ## Local Installation
164
+ python setup.py sdist bdist_wheel
165
+ pip install dist/datamax-0.1.3-py3-none-any.whl
166
+
167
+ ## Pip Installation
168
+ pip install pydatamax
169
+ ```
170
+
171
+
172
+ - **Importing the Code**:
173
+ ```python
174
+ # File Parsing
175
+ from datamax import DataMax
176
+
177
+ ## Handling a Single File in Two Ways
178
+ # 1. Using a List of Length 1
179
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
180
+ data = data.get_data()
181
+
182
+ # 2. Using a String
183
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
184
+ data = data.get_data()
185
+
186
+ ## Handling Multiple Files
187
+ # 1. Using a List of Length n
188
+ data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
189
+ data = data.get_data()
190
+
191
+ # 2. Passing a Folder Path as a String
192
+ data = DataMax(file_path=r"docx_files_example/")
193
+ data = data.get_data()
194
+
195
+ # Data Cleaning
196
+ """
197
+ Cleaning rules can be found in datamax/utils/data_cleaner.py
198
+ abnormal: Abnormal cleaning
199
+ private: Privacy processing
200
+ filter: Text filtering
201
+ """
202
+ # Direct Use: Clean the text parameter directly and return a string
203
+ dm = DataMax()
204
+ data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")
205
+
206
+ # Process Use: Use after get_data() to return the complete data structure
207
+ dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
208
+ data2 = dm.get_data()
209
+ cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])
210
+
211
+ # Large-model pre-annotation: supports any model that can be called via the OpenAI SDK
212
+ data = DataMax(file_path=r"path\to\xxx.docx")
213
+ parsed_data = data.get_data()
214
+ # If no custom messages are passed, the default messages in the SDK will be used
215
+ messages = [
216
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
217
+ {'role': 'user', 'content': 'Who are you?'}
218
+ ]
219
+ qa_datas = data.get_pre_label(
220
+ api_key="sk-xxx",
221
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
222
+ model_name="qwen-max",
223
+ chunk_size=500,
224
+ chunk_overlap=100,
225
+ question_number=5,
226
+ max_workers=5,
227
+ # message=[]
228
+ )
229
+ print(f'Annotated result:{qa_datas}')
230
+ ```
231
+
232
+
233
+ ## Examples
234
+ ```python
235
+ ## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
236
+ from datamax import DataMax
237
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
238
+ """
239
+ Parameters:
240
+ file_path: Relative file path / Absolute file path
241
+ to_markdown: Whether to convert to markdown (default False; returns plain text otherwise). This parameter only supports Word files (doc | docx)
242
+ """
243
+
244
+ ## jpg | jpeg | png | ...(image types)
245
+ data = DataMax(file_path=r"image.jpg", use_mineru=True)
246
+ """
247
+ Parameters:
248
+ file_path: Relative file path / Absolute file path
249
+ use_mineru: Whether to use MinerU enhancement
250
+ """
251
+
252
+ ## pdf
253
+ from datamax import DataMax
254
+ data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
255
+ """
256
+ Parameters:
257
+ file_path: Relative file path / Absolute file path
258
+ use_mineru: Whether to use MinerU enhancement
259
+ """
260
+ ```
261
+
262
+ ## Contribution Guide
263
+ We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
264
+ ## License
265
+ This project is licensed under the MIT License. For more details, see the LICENSE file.
266
+
267
+ ## Contact Information
268
+ If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
269
+ - Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
270
+ - Project Homepage: GitHub Project Link
271
+
@@ -0,0 +1,39 @@
1
+ datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
2
+ datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
3
+ datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
4
+ datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
6
+ datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
7
+ datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
8
+ datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
9
+ datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
10
+ datamax/parser/doc_parser.py,sha256=VwTOdq5pGPbOI-98SoTwwTcXIjD1BrZDfFGEhTi3T44,3348
11
+ datamax/parser/docx_parser.py,sha256=OhqcMeZ8JkwDJtvrMirM15j-EDnGNUj6U1-nX3gisKA,1727
12
+ datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
13
+ datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
14
+ datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
15
+ datamax/parser/json_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
16
+ datamax/parser/md_parser.py,sha256=lgRlcvtV_9gkB2BnygzcdqIfj94tWjEq6ziGeLq3p00,2156
17
+ datamax/parser/pdf_parser.py,sha256=EbhXjTU09hMTr850_o1K7m7zD4QU9_A54MsbOF7pLT0,3992
18
+ datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,3142
19
+ datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
20
+ datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
21
+ datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
22
+ datamax/parser/xlsx_parser.py,sha256=9ZqwCSF01thjEb_RleWGCiNOSuA8KZ3QFqzUKldb3wE,2183
23
+ datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
24
+ datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
25
+ datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
26
+ datamax/utils/env_setup.py,sha256=KrRQIbCMgtTjD8lKwzc9jv7jFPMMNMzikEb0_TfIstU,3460
27
+ datamax/utils/gotocr_pdf.py,sha256=YCYio_5Yt77hky4nSyfREw5_Bh55XbGy7l2cypvGxNg,8479
28
+ datamax/utils/mineru_operator.py,sha256=Rss7YVSAUnoWmDnCGPJlgsMNmJWmb6blYuS4UB7PgQ8,2241
29
+ datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-gfIdVtYoSs,3555
30
+ datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
31
+ datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
32
+ datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
33
+ pydatamax-0.1.11.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
34
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
36
+ pydatamax-0.1.11.dist-info/METADATA,sha256=n6BKLg2Dh015V7wRoFSOUPOeKo7hEUdetnK7FBCAK1c,9039
37
+ pydatamax-0.1.11.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
38
+ pydatamax-0.1.11.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
39
+ pydatamax-0.1.11.dist-info/RECORD,,