pydatamax 0.1.5__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.11.dist-info/METADATA +271 -0
- pydatamax-0.1.11.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
datamax/utils/qa_generator.py
ADDED
@@ -0,0 +1,376 @@
```python
import json
import os.path
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from loguru import logger
from pyexpat.errors import messages
from tqdm import tqdm  # For progress bar display

lock = threading.Lock()


# ------------prompt-----------------
def get_system_prompt_for_question(query_text, question_number):
    """Generate system prompt for question generation task"""
    system_prompt = f"""
# 角色使命
你是一位专业的文本分析专家,擅长从复杂文本中提取关键信息并生成可用于模型微调的结构化数据(仅生成问题)。

## 核心任务
根据用户提供的文本,生成不少于 ${question_number} 个高质量问题。

## 约束条件(重要!)
- 必须基于文本内容直接生成
- 问题应具有明确答案指向性
- 需覆盖文本的不同方面
- 禁止生成假设性、重复或相似问题
- 确保生成得完整性

## 处理流程
1. 【文本解析】分段处理内容,识别关键实体和核心概念
2. 【问题生成】基于信息密度选择最佳提问点
3. 【质量检查】确保:
   - 问题答案可在原文中找到依据
   - 标签与问题内容强相关
   - 无格式错误

## 输出格式
- JSON 数组格式必须正确
- 字段名使用英文双引号
- 输出的 JSON 数组必须严格符合以下结构:
\`\`\`json
["问题1", "问题2", "..."]
\`\`\`

## 输出示例
\`\`\`json
[ "人工智能伦理框架应包含哪些核心要素?","民法典对个人数据保护有哪些新规定?"]
\`\`\`

## 待处理文本
${query_text}

## 限制
- 必须按照规定的 JSON 格式输出,不要输出任何其他不相关内容
- 生成不少于${question_number}个高质量问题
- 问题不要和材料本身相关,例如禁止出现作者、章节、目录等相关问题
- 问题不得包含【报告、文章、文献、表格】中提到的这种话术,必须是一个自然的问题
    """
    return system_prompt


def get_system_prompt_for_answer(text, query_question):
    """Generate system prompt for answer generation task"""
    system_prompt = f"""
# Role: 微调数据集生成专家
## Profile:
- Description: 你是一名微调数据集生成专家,擅长从给定的内容中生成准确的问题答案,确保答案的准确性和相关性,你要直接回答用户问题,所有信息已内化为你的专业知识。

## Skills :
1. 答案必须基于给定的内容
2. 答案必须准确,不能胡编乱造
3. 答案必须与问题相关
4. 答案必须符合逻辑
5. 基于给定参考内容,用自然流畅的语言整合成一个完整答案,不需要提及文献来源或引用标记

## Workflow:
1. Take a deep breath and work on this problem step-by-step.
2. 首先,分析给定的文件内容
3. 然后,从内容中提取关键信息
4. 接着,生成与问题相关的准确答案
5. 最后,确保答案的准确性和相关性

## 参考内容:
${text}

## 问题
${query_question}

## Constrains:
1. 答案必须基于给定的内容
2. 答案必须准确,必须与问题相关,不能胡编乱造
3. 答案必须充分、详细、包含所有必要的信息、适合微调大模型训练使用
4. 答案中不得出现 ' 参考 / 依据 / 文献中提到 ' 等任何引用性表述,只需呈现最终结果
    """
    return system_prompt


# ------------spliter----------------
def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
    """
    Parse Markdown using UnstructuredMarkdownLoader
    Chunking strategy that preserves original paragraph structure

    Args:
        md_path: Path to the markdown file
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between chunks

    Returns:
        List of document chunks
    """
    try:
        # Use LangChain's MarkdownLoader to load Markdown file
        loader = UnstructuredMarkdownLoader(md_path)
        documents = loader.load()
        # Further split documents if needed
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return splitter.split_documents(documents)
    except Exception as e:
        logger.error(f"加载 {Path(md_path).name} 失败: {str(e)}")
        return []


# ------------llm generator-------------------
def extract_json_from_llm_output(output: str):
    """
    Extract JSON content from LLM output, handling multiple possible formats

    Args:
        output: Raw output string from LLM

    Returns:
        Parsed JSON list if successful, None otherwise
    """
    # Try to parse the entire output directly
    try:
        return json.loads(output)
    except json.JSONDecodeError:
        pass

    # Try to extract content wrapped in ```json ```
    json_match = re.search(r"```json\n([\s\S]*?)\n```", output)
    if json_match:
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            print(f"解析 JSON 时出错: {e}")

    # Try to extract the most JSON-like part
    json_start = output.find("[")
    json_end = output.rfind("]") + 1
    if json_start != -1 and json_end != 0:
        try:
            return json.loads(output[json_start:json_end])
        except json.JSONDecodeError:
            pass

    print("模型未按标准格式输出:", output)
    return None


def llm_generator(
    api_key: str,
    model: str,
    base_url: str,
    prompt: str,
    type: str,
    message: list = None,
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_token: int = 2048,
) -> list:
    """Generate content using LLM API"""
    try:
        if not message:
            message = [
                {"role": "system", "content": prompt},
                {"role": "user", "content": "请严格按照要求生成内容"},
            ]
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        data = {
            "model": model,
            "messages": message,
            "max_tokens": max_token,
            "temperature": temperature,
            "top_p": top_p,
        }
        response = requests.post(base_url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()

        # Parse LLM response
        if "choices" in result and len(result["choices"]) > 0:
            output = result["choices"][0]["message"]["content"]
            if type == "question":
                fmt_output = extract_json_from_llm_output(output)
            else:
                return output
            return fmt_output
        return []

    except Exception as e:
        print(f"LLM提取关键词失败: {e, e.__traceback__.tb_lineno}")
        return []


# ------------thread_process-------------


def process_questions(
    api_key: str,
    model: str,
    base_url: str,
    page_content: list,
    question_number: int,
    message: list,
    max_workers: int = 5,
) -> list:
    """Generate questions using multi-threading"""
    total_questions = []

    def _generate_questions(page):
        """Inner function for question generation"""
        prompt = get_system_prompt_for_question(page, question_number)
        questions = llm_generator(
            api_key=api_key,
            model=model,
            base_url=base_url,
            message=message,
            prompt=prompt,
            type="question",
        )
        return [{"question": q, "page": page} for q in questions] if questions else []

    logger.info(f"开始生成问题 (线程数: {max_workers})...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_generate_questions, page) for page in page_content]

        with tqdm(as_completed(futures), total=len(futures), desc="生成问题") as pbar:
            for future in pbar:
                result = future.result()
                if result:
                    with lock:
                        total_questions.extend(result)
                        pbar.set_postfix({"已生成问题": len(total_questions)})

    return total_questions


def process_answers(
    api_key: str,
    model: str,
    base_url: str,
    question_items: list,
    message: list = None,
    max_workers=5,
) -> dict:
    """Generate answers using multi-threading"""
    qa_pairs = {}

    def _generate_answer(item):
        """Inner function for answer generation"""
        prompt = get_system_prompt_for_answer(item["page"], item["question"])
        answer = llm_generator(
            api_key=api_key,
            model=model,
            base_url=base_url,
            prompt=prompt,
            message=message,
            type="answer",
        )
        return item["question"], answer

    logger.info(f"开始生成答案 (线程数: {max_workers})...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(_generate_answer, item): item for item in question_items
        }

        with tqdm(as_completed(futures), total=len(futures), desc="生成答案") as pbar:
            for future in pbar:
                question, answer = future.result()
                if answer:
                    with lock:
                        qa_pairs[question] = answer
                        pbar.set_postfix({"已生成答案": len(qa_pairs)})
    return qa_pairs


def generatr_qa_pairs(
    file_path: str,
    api_key: str,
    base_url: str,
    model_name: str,
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    message: list = None,
    max_workers=5,
):
    """Main function to generate QA pairs from markdown file"""
    # 1. Split markdown text into chunks
    pages = load_and_split_markdown(
        md_path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    page_content = [i.page_content for i in pages]
    logger.info(f"markdown被分解了{len(page_content)}个chunk")

    # 2. Generate questions using multi-threading
    questions = process_questions(
        page_content=page_content,
        message=message,
        question_number=question_number,
        max_workers=max_workers,
        api_key=api_key,
        base_url=base_url,
        model=model_name,
    )
    if not questions:
        logger.error("未能生成任何问题,请检查输入文档和API设置")

    # 3. Generate answers using multi-threading
    qa_pairs = process_answers(
        question_items=questions,
        message=message,
        max_workers=max_workers,
        api_key=api_key,
        base_url=base_url,
        model=model_name,
    )

    # 4. Save results
    res_list = []
    with open(
        f"{os.path.basename(file_path).strip('.md')}.jsonl", "w", encoding="utf-8"
    ) as f:
        for question, answer in qa_pairs.items():
            # Build properly formatted JSON object
            qa_entry = {"instruction": question, "input": "", "output": answer}
            res_list.append(qa_entry)
            # Write to JSONL file (one JSON object per line)
            f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")

    logger.success(
        f"完成! 共生成 {len(qa_pairs)} 个问答对,已保存到 {os.path.basename(file_path).strip('.md')}.jsonl"
    )

    return res_list


if __name__ == "__main__":
    generatr_qa_pairs(
        file_path=r"C:\Users\cykro\Desktop\文档整理\知识图谱\知识图谱概要设计.md",
        api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
        api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
        model_name="qwen-max",
        chunk_size=500,
        chunk_overlap=100,
        question_number=5,
        max_workers=5,
        # message=[]
    )
```
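Editorial aside, not part of the released file: the fallback chain in `extract_json_from_llm_output` above first tries to parse the whole reply, then unwraps a json-fenced code block, and finally falls back to the outermost `[...]` span, returning `None` if all three attempts fail. A quick illustration, assuming the module is importable from the installed wheel at the path listed at the top of this diff:

```python
# Sketch only: exercising extract_json_from_llm_output from datamax/utils/qa_generator.py.
from datamax.utils.qa_generator import extract_json_from_llm_output

# 1) The whole reply is valid JSON -> parsed directly.
print(extract_json_from_llm_output('["Q1", "Q2"]'))                     # ['Q1', 'Q2']

# 2) The reply wraps JSON in a fenced block -> the fence is unwrapped first.
print(extract_json_from_llm_output('```json\n["Q1"]\n```'))             # ['Q1']

# 3) Extra prose around the array -> the outermost [...] span is parsed.
print(extract_json_from_llm_output('Sure, here you go: ["Q1", "Q2"]'))  # ['Q1', 'Q2']
```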
datamax/utils/tokenizer.py
CHANGED
```diff
@@ -17,6 +17,6 @@ class DashScopeClient:
         p50k_base corresponds to models text-davinci-002 and text-davinci-003
         r50k_base corresponds to model gpt2
         '''
-        encoding = tiktoken.get_encoding(encoding_name="
+        encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
         num_tokens = len(encoding.encode(content))
         return num_tokens
```
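For context (editorial, not part of the diff): the fixed line selects tiktoken's `cl100k_base` encoding and counts the tokens of the input text, roughly as in this minimal sketch. tiktoken is declared in Requires-Dist in the 0.1.11 METADATA below.

```python
# Minimal sketch of the token counting the fixed line performs.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode("DataMax counts tokens with cl100k_base."))
print(num_tokens)
```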
pydatamax-0.1.11.dist-info/METADATA
ADDED
@@ -0,0 +1,271 @@
````text
Metadata-Version: 2.4
Name: pydatamax
Version: 0.1.11
Summary: A library for parsing and converting various file formats.
Home-page: https://github.com/cosco/datamax
Author: hzb | ccy
Author-email: zhibaohe@hotmail.com | cy.kron@foxmail.com
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: ebooklib
Requires-Dist: python-docx
Requires-Dist: beautifulsoup4
Requires-Dist: python-dotenv
Requires-Dist: minio
Requires-Dist: loguru
Requires-Dist: tqdm
Requires-Dist: oss2
Requires-Dist: python-docx
Requires-Dist: openai
Requires-Dist: jionlp
Requires-Dist: chardet
Requires-Dist: python-pptx
Requires-Dist: openpyxl
Requires-Dist: pymupdf
Requires-Dist: langchain_community==0.2.9
Requires-Dist: premailer
Requires-Dist: setuptools==75.3.0
Requires-Dist: docx2markdown
Requires-Dist: tiktoken
Requires-Dist: markitdown
Requires-Dist: pandas
Requires-Dist: xlrd
Requires-Dist: tabulate
Requires-Dist: unstructured[all]
Requires-Dist: markdown
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# DataMax

## Overview
DataMax is designed as a comprehensive solution for processing diverse file formats, performing data cleaning, and facilitating data annotation.

## Key Features

### File Processing Capabilities
Currently supports reading, conversion, and extraction from:
- PDF, HTML
- DOCX/DOC, PPT/PPTX
- EPUB
- Images
- XLS/XLSX spreadsheets
- Plain text (TXT)

### Data Cleaning Pipeline
Three-tiered cleaning process:
1. Anomaly detection and handling
2. Privacy protection processing
3. Text filtering and normalization

### AI-Powered Data Annotation
Implements an LLM+Prompt to:
- Continuously generate pre-labeled datasets
- Provide optimized training data for model fine-tuning


## Installation Guide (Key Dependencies)
Dependencies include libreoffice, datamax, and MinerU.

### 1. Installing libreoffice Dependency
**Note:** Without datamax, .doc files will not be supported.

#### Linux (Debian/Ubuntu)
```bash
sudo apt-get update
sudo apt-get install libreoffice
```
### Windows
```text
Install LibreOffice from: [Download LibreOffice](https://www.libreoffice.org/download/download-libreoffice/?spm=5176.28103460.0.0.5b295d275bpHzh)
Add to environment variable: `$env:PATH += ";C:\Program Files\LibreOffice\program"`
```
### Checking LibreOffice Installation
```bash
soffice --version
```

## 2. Installing MinerU Dependency
Note: Without MinerU, advanced OCR parsing for PDFs will not be supported.
### Create a Virtual Environment and Install Basic Dependencies
```bash
conda create -n mineru python=3.10
conda activate mineru
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
```
### Installing Model Weight Files
https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md
```bash
pip install modelscope
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
python download_models.py
```

### Modify the Configuration File magic-pdf.json (Located in the User Directory, Template Preview Below)
```json
{
  "models-dir": "path\\to\\folder\\PDF-Extract-Kit-1___0\\models",
  "layoutreader-model-dir": "path\\to\\folder\\layoutreader",
  "device-mode": "cpu",
  ...
}
```

## 3. Installing Basic Dependencies for datamax
1. Clone the repository to your local machine:
```bash
git clone <repository-url>
```
2. Install dependencies into conda:
```bash
cd datamax
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```


## Features
- **Multi-format Support**: Capable of handling various text file types such as PDF, HTML, DOCX, and TXT.
- **Content Extraction**: Provides powerful content extraction capabilities to accurately retrieve information from complex document structures.
- **Data Conversion**: Supports converting processed data into markdown format for further analysis.
- **Batch Processing**: Can handle multiple files at once, improving work efficiency.
- **Customizable Configuration**: Users can adjust processing parameters according to their needs to meet different business requirements.
- **Cross-platform Compatibility**: This SDK can run on multiple operating systems, including Windows, MacOS, and Linux.


## Technology Stack

- **Programming Language**: Python >= 3.10
- **Dependency Libraries**:
  - PyMuPDF: For PDF file parsing.
  - BeautifulSoup: For HTML file parsing.
  - python-docx: For DOCX file parsing.
  - pandas: For data processing and conversion.
  - paddleocr: For parsing scanned PDFs, tables, and images.
- **Development Environment**: Visual Studio Code or PyCharm
- **Version Control**: Git

## Usage Instructions
### Installing the SDK
- **Installation Commands**:
```bash
## Local Installation
python setup.py sdist bdist_wheel
pip install dist/datamax-0.1.3-py3-none-any.whl

## Pip Installation
pip install pydatamax
```


- **Importing the Code**:
```python
# File Parsing
from datamax import DataMax

## Handling a Single File in Two Ways
# 1. Using a List of Length 1
data = DataMax(file_path=[r"docx_files_example/船视宝概述.doc"])
data = data.get_data()

# 2. Using a String
data = DataMax(file_path=r"docx_files_example/船视宝概述.doc")
data = data.get_data()

## Handling Multiple Files
# 1. Using a List of Length n
data = DataMax(file_path=[r"docx_files_example/船视宝概述1.doc", r"docx_files_example/船视宝概述2.doc"])
data = data.get_data()

# 2. Passing a Folder Path as a String
data = DataMax(file_path=r"docx_files_example/")
data = data.get_data()

# Data Cleaning
"""
Cleaning rules can be found in datamax/utils/data_cleaner.py
abnormal: Abnormal cleaning
private: Privacy processing
filter: Text filtering
"""
# Direct Use: Clean the text parameter directly and return a string
dm = DataMax()
data = dm.clean_data(method_list=["abnormal", "private"], text="<div></div>你好 18717777777 \n\n\n\n")

# Process Use: Use after get_data() to return the complete data structure
dm = DataMax(file_path=r"C:\Users\cykro\Desktop\数据库开发手册.pdf", use_ocr=True)
data2 = dm.get_data()
cleaned_data = dm.clean_data(method_list=["abnormal", "filter", "private"])

# Large Model Pre-annotation Supporting any model that can be called via OpenAI SDK
data = DataMax(file_path=r"path\to\xxx.docx")
parsed_data = data.get_data()
# If no custom messages are passed, the default messages in the SDK will be used
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Who are you?'}
]
qa_datas = data.get_pre_label(
    api_key="sk-xxx",
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
    model_name="qwen-max",
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
    # message=[]
)
print(f'Annotated result:{qa_datas}')
```


## Examples
```python
## docx | doc | epub | html | txt | ppt | pptx | xls | xlsx
from datamax import DataMax
data = DataMax(file_path=r"docx_files_example/船视宝概述.doc", to_markdown=True)
"""
Parameters:
file_path: Relative file path / Absolute file path
to_markdown: Whether to convert to markdown (default value False, directly returns text) This parameter only supports word files (doc | docx)
"""

## jpg | jpeg | png | ...(image types)
data = DataMax(file_path=r"image.jpg", use_mineru=True)
"""
Parameters:
file_path: Relative file path / Absolute file path
use_mineru: Whether to use MinerU enhancement
"""

## pdf
from datamax import DataMax
data = DataMax(file_path=r"docx_files_example/船视宝概述.pdf", use_mineru=True)
"""
Parameters:
file_path: Relative file path / Absolute file path
use_mineru: Whether to use MinerU enhancement
"""
```

## Contribution Guide
We welcome any form of contribution, whether it is reporting bugs, suggesting new features, or submitting code improvements. Please read our Contributor's Guide to learn how to get started.
## License
This project is licensed under the MIT License. For more details, see the LICENSE file.

## Contact Information
If you encounter any issues during use, or have any suggestions or feedback, please contact us through the following means:
- Email: cy.kron@foxmail.com | zhibaohe@hotmail.com
- Project Homepage: GitHub Project Link
````
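(Editorial note, not part of the package files.) The pre-labeling flow documented in the README above ultimately produces Alpaca-style records: `generatr_qa_pairs` in datamax/utils/qa_generator.py returns the QA list and also writes it as JSONL, one object with `instruction`, `input`, and `output` fields per line. A minimal sketch of reading such a file back, using a hypothetical output name derived from the input markdown filename:

```python
import json

# Hypothetical filename; generatr_qa_pairs names the output after the input .md file.
with open("知识图谱概要设计.jsonl", encoding="utf-8") as f:
    qa_records = [json.loads(line) for line in f]

# Each record follows the instruction / input / output fine-tuning schema.
for rec in qa_records[:3]:
    print(rec["instruction"], "->", rec["output"][:40])
```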
pydatamax-0.1.11.dist-info/RECORD
ADDED
@@ -0,0 +1,39 @@
```text
datamax/__init__.py,sha256=Kbs8ITE6suPy0VL8WzKH8A_iAGqukC0jIHcFGLgoBw8,28
datamax/loader/MinioHandler.py,sha256=0XLvtKayhI2cjPU7S7m91tSzPcaOC924ByJNbabY6So,6386
datamax/loader/OssHandler.py,sha256=YFbUfH2vkKRVg8OhkYpOrlYHrIX4zofbHFVe8nPpsQ4,7457
datamax/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datamax/loader/core.py,sha256=tSIkOw5D3EVFYme1b7joFt0e_LxJdf-mdUzxpyVt0VI,5098
datamax/parser/__init__.py,sha256=Jilq2PLBNonmoXKATzsIHWWvFuBdlcV2dbSP1cOZ6zg,111
datamax/parser/base.py,sha256=riGcMn4m295_qf9O0-NbHU2BcHGBXvoF4T3fWj9vgUQ,2514
datamax/parser/core.py,sha256=9rzIjsVTRacPTUTAVa5gm5fx0h95LxYnw0lEGqjIIB4,11437
datamax/parser/csv_parser.py,sha256=IcyVq8mGE6auIcUInXGYWDnh0H0XJ_3SyQrLVRrS7i0,190
datamax/parser/doc_parser.py,sha256=VwTOdq5pGPbOI-98SoTwwTcXIjD1BrZDfFGEhTi3T44,3348
datamax/parser/docx_parser.py,sha256=OhqcMeZ8JkwDJtvrMirM15j-EDnGNUj6U1-nX3gisKA,1727
datamax/parser/epub_parser.py,sha256=ljCGxLBPwE5gXVKARJec93VpP4dE9R2GspzuSZBkqPQ,1557
datamax/parser/html_parser.py,sha256=xQaaK8674QbQwE-Up9X0DJIH0Gg0mR2KoI7fJ6iw2m0,1393
datamax/parser/image_parser.py,sha256=qGCndc_21PwsfuxFG03wHSsV0uc-XMBaW3VDbsJQd90,1233
datamax/parser/json_parser.py,sha256=MFamKCkP5Ny1kJyJlPkd_vNqk31ngPRf8NoYw8SxMY4,190
datamax/parser/md_parser.py,sha256=lgRlcvtV_9gkB2BnygzcdqIfj94tWjEq6ziGeLq3p00,2156
datamax/parser/pdf_parser.py,sha256=EbhXjTU09hMTr850_o1K7m7zD4QU9_A54MsbOF7pLT0,3992
datamax/parser/ppt_parser.py,sha256=Niu3Ina6I6m6lAMS1Z-A7rUbR_iFGmNTaASBoNH_vZ0,3142
datamax/parser/pptx_parser.py,sha256=sFWyOa3QNIs4BgtpmSzFQgsgPmunfGqCqi6fulbLFW0,1811
datamax/parser/txt_parser.py,sha256=4DIP1LVOw21NDdtqG2RTD_hMcHufkvC8kr048AkuLFs,1682
datamax/parser/xls_parser.py,sha256=pRlqgg96f76H8UqXQfheQT9O0ThdP7958hKUCEyQfPM,954
datamax/parser/xlsx_parser.py,sha256=9ZqwCSF01thjEb_RleWGCiNOSuA8KZ3QFqzUKldb3wE,2183
datamax/utils/__init__.py,sha256=d69SJvqOXzItyg9rEcLc4z67Lw9vACispOe3x7NvZLA,1051
datamax/utils/constants.py,sha256=A0S56mkIfeT6oQmOd-VGTChzLOSBUqsG4skMmLt6uNk,4507
datamax/utils/data_cleaner.py,sha256=zlk2dXmhU-_9KVfqmqMGr967v-nc7Iv8ZKRdMkIJsGM,7784
datamax/utils/env_setup.py,sha256=KrRQIbCMgtTjD8lKwzc9jv7jFPMMNMzikEb0_TfIstU,3460
datamax/utils/gotocr_pdf.py,sha256=YCYio_5Yt77hky4nSyfREw5_Bh55XbGy7l2cypvGxNg,8479
datamax/utils/mineru_operator.py,sha256=Rss7YVSAUnoWmDnCGPJlgsMNmJWmb6blYuS4UB7PgQ8,2241
datamax/utils/paddleocr_pdf_operator.py,sha256=Tnb-5SzUd6OXM-XeaL8vdPnsOhgG_GKz-gfIdVtYoSs,3555
datamax/utils/ppt_extract.py,sha256=nd6KSqEzxANrPhNPUZY4ogAyxHzKCbdsI5ZfDQCz0Cw,6164
datamax/utils/qa_generator.py,sha256=d75an9JEyT6sxlSjdmWYveQshfyTb0v4aGSuTpTJa0A,12561
datamax/utils/tokenizer.py,sha256=Y8XB06XQVsNuG8IPl_4iBZj2yu1xzXldVbmZtXFMQM4,859
pydatamax-0.1.11.dist-info/licenses/LICENSE,sha256=LvCq2Pc7MejIvfNeRl_kAM5l_KXWlQFiC-Sjp7kqFf8,1067
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_basic.py,sha256=4AByx25-MIt6_zmzxpFRoSCBqLtIjyfTwFLb1UCJz6k,303
pydatamax-0.1.11.dist-info/METADATA,sha256=n6BKLg2Dh015V7wRoFSOUPOeKo7hEUdetnK7FBCAK1c,9039
pydatamax-0.1.11.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
pydatamax-0.1.11.dist-info/top_level.txt,sha256=DvdVIUrxJDgRUYiGA5vznYZIP-K8ZnDkTZfrqYLNZMQ,14
pydatamax-0.1.11.dist-info/RECORD,,
```