pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/qa_generator.py
CHANGED
@@ -1,15 +1,44 @@
+"""
+QA Generator for DataMax
+
+This module provides functionality to generate question-answer pairs from processed content.
+
+Features:
+- Chinese QA generation (default): Uses Chinese prompts for generating Chinese Q&A pairs
+- English QA generation (enhanced): Uses reinforced English-only prompts with strict language constraints
+  to ensure pure English output without any Chinese characters
+
+Language Selection:
+- language="zh": Generates Chinese questions and answers (default)
+- language="en": Generates English questions and answers with enhanced prompts that strictly
+  enforce English-only output to prevent any Chinese character leakage
+
+Enhanced English Mode Features:
+- Multiple language enforcement checks in prompts
+- Explicit "ENGLISH ONLY" requirements in system prompts
+- Enhanced user messages for English generation
+- Stricter quality controls for English output
+
+Usage:
+    # For Chinese QA pairs
+    qa_data = generate_qa_from_content(content=text, language="zh", ...)
+
+    # For English QA pairs (enhanced)
+    qa_data = generate_qa_from_content(content=text, language="en", ...)
+"""
+
import json
import os.path
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
+from pyexpat.errors import messages

import requests
+from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import UnstructuredMarkdownLoader
from loguru import logger
-from pyexpat.errors import messages
from tqdm import tqdm  # For progress bar display

lock = threading.Lock()
@@ -17,13 +46,13 @@ lock = threading.Lock()

# ------------prompt-----------------
def get_system_prompt_for_question(query_text, question_number):
-    """Generate system prompt for question generation task"""
+    """Generate system prompt for question generation task (Chinese)"""
    system_prompt = f"""
    # 角色使命
    你是一位专业的文本分析专家,擅长从复杂文本中提取关键信息并生成可用于模型微调的结构化数据(仅生成问题)。

    ## 核心任务
-    根据用户提供的文本,生成不少于
+    根据用户提供的文本,生成不少于 {question_number} 个高质量问题。

    ## 约束条件(重要!)
    - 必须基于文本内容直接生成
@@ -54,25 +83,84 @@ def get_system_prompt_for_question(query_text, question_number):
    \`\`\`

    ## 待处理文本
-
+    {query_text}

    ## 限制
    - 必须按照规定的 JSON 格式输出,不要输出任何其他不相关内容
-    -
+    - 生成不少于{question_number}个高质量问题
    - 问题不要和材料本身相关,例如禁止出现作者、章节、目录等相关问题
    - 问题不得包含【报告、文章、文献、表格】中提到的这种话术,必须是一个自然的问题
    """
    return system_prompt


+def get_system_prompt_for_question_en(query_text, question_number):
+    """Generate system prompt for question generation task (English)"""
+    system_prompt = f"""
+    # Role Mission
+    You are a professional text analysis expert, skilled in extracting key information from complex texts and generating structured data suitable for model fine-tuning (question generation only).
+
+    ## CRITICAL REQUIREMENT: GENERATE ONLY ENGLISH QUESTIONS
+    - ALL questions MUST be in English language only
+    - NO Chinese characters or other languages allowed
+    - Output format must be English JSON array
+
+    ## Core Task
+    Based on the text provided by the user, generate at least {question_number} high-quality questions IN ENGLISH ONLY.
+
+    ## Constraints (Important!)
+    - Must be generated directly based on text content
+    - Questions should have clear answer directionality
+    - Need to cover different aspects of the text
+    - Prohibit generating hypothetical, repetitive or similar questions
+    - Ensure generation completeness
+    - **MANDATORY: All questions must be in English language**
+
+    ## Processing Flow
+    1. [Text Analysis] Process content in segments, identify key entities and core concepts
+    2. [Question Generation] Select optimal questioning points based on information density
+    3. [Quality Check] Ensure:
+       - Question answers can be found in the original text
+       - Labels are strongly related to question content
+       - No format errors
+       - **All questions are in English**
+
+    ## Output Format
+    - JSON array format must be correct
+    - Field names use English double quotes
+    - Output JSON array must strictly conform to the following structure:
+    - **ALL CONTENT MUST BE IN ENGLISH**
+    \`\`\`json
+    ["English Question 1", "English Question 2", "..."]
+    \`\`\`
+
+    ## Output Example
+    \`\`\`json
+    ["What are the core elements that should be included in an AI ethics framework?", "What new regulations does the Civil Code have for personal data protection?", "How do machine learning algorithms impact data privacy?"]
+    \`\`\`
+
+    ## Text to Process
+    {query_text}
+
+    ## Restrictions
+    - Must output according to the specified JSON format, do not output any other unrelated content
+    - Generate at least {question_number} high-quality questions **IN ENGLISH ONLY**
+    - Questions should not be related to the material itself, for example, prohibit questions about authors, chapters, catalogs, etc.
+    - Questions must not contain phrases like "mentioned in [report, article, literature, table]", must be natural questions
+    - **CRITICAL: Absolutely no Chinese characters or non-English content allowed**
+    - All questions must be grammatically correct English
+    """
+    return system_prompt
+
+
def get_system_prompt_for_answer(text, query_question):
-    """Generate system prompt for answer generation task"""
+    """Generate system prompt for answer generation task (Chinese)"""
    system_prompt = f"""
    # Role: 微调数据集生成专家
    ## Profile:
    - Description: 你是一名微调数据集生成专家,擅长从给定的内容中生成准确的问题答案,确保答案的准确性和相关性,你要直接回答用户问题,所有信息已内化为你的专业知识。

-    ## Skills
+    ## Skills:
    1. 答案必须基于给定的内容
    2. 答案必须准确,不能胡编乱造
    3. 答案必须与问题相关
@@ -87,12 +175,12 @@ def get_system_prompt_for_answer(text, query_question):
    5. 最后,确保答案的准确性和相关性

    ## 参考内容:
-
+    {text}

    ## 问题
-
+    {query_question}

-    ##
+    ## Constraints:
    1. 答案必须基于给定的内容
    2. 答案必须准确,必须与问题相关,不能胡编乱造
    3. 答案必须充分、详细、包含所有必要的信息、适合微调大模型训练使用
@@ -101,14 +189,62 @@ def get_system_prompt_for_answer(text, query_question):
    return system_prompt


+def get_system_prompt_for_answer_en(text, query_question):
+    """Generate system prompt for answer generation task (English)"""
+    system_prompt = f"""
+    # Role: Fine-tuning Dataset Generation Expert
+
+    ## CRITICAL REQUIREMENT: GENERATE ONLY ENGLISH ANSWERS
+    - ALL answers MUST be in English language only
+    - NO Chinese characters or other languages allowed
+    - Response must be in fluent, natural English
+
+    ## Profile:
+    - Description: You are a fine-tuning dataset generation expert, skilled in generating accurate question answers from given content, ensuring answer accuracy and relevance. You should directly answer user questions in ENGLISH ONLY, with all information internalized as your professional knowledge.
+
+    ## Skills:
+    1. Answers must be based on the given content
+    2. Answers must be accurate and not fabricated
+    3. Answers must be relevant to the questions
+    4. Answers must be logical
+    5. Based on the given reference content, integrate into a complete answer using natural and fluent **ENGLISH** language, without mentioning literature sources or citation marks
+    6. **MANDATORY: All responses must be in English language only**
+
+    ## Workflow:
+    1. Take a deep breath and work on this problem step-by-step.
+    2. First, analyze the given file content
+    3. Then, extract key information from the content
+    4. Next, generate accurate answers related to the questions **IN ENGLISH ONLY**
+    5. Finally, ensure the accuracy and relevance of the answers in proper English
+
+    ## Reference Content:
+    {text}
+
+    ## Question
+    {query_question}
+
+    ## Constraints:
+    1. Answers must be based on the given content
+    2. Answers must be accurate and relevant to the questions, not fabricated
+    3. Answers must be comprehensive, detailed, contain all necessary information, and be suitable for fine-tuning large model training
+    4. Answers must not contain any referential expressions like 'referenced / based on / mentioned in literature', only present the final results
+    5. **CRITICAL: ALL answers must be in English language only - no Chinese characters or other languages allowed**
+    6. Use proper English grammar, vocabulary, and sentence structure
+    7. Ensure the response flows naturally in English
+
+    ## IMPORTANT REMINDER:
+    Your response must be entirely in English. Do not include any Chinese characters, phrases, or words in your answer.
+    """
+    return system_prompt
+
+
# ------------spliter----------------
-def
+def split_content_to_chunks(content: str, chunk_size: int, chunk_overlap: int) -> list:
    """
-
-    Chunking strategy that preserves original paragraph structure
+    Split content into chunks, replacing the file reading approach

    Args:
-
+        content: Processed text content
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between chunks

@@ -116,19 +252,48 @@ def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -
        List of document chunks
    """
    try:
-        #
-
-
-        #
+        # Handle potential escaped newlines
+        content = content.replace("\\n", "\n")
+
+        # Create document object for the splitter
+        document = Document(
+            page_content=content, metadata={"source": "processed_content"}
+        )
+
+        # Split the document
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
-        return splitter.split_documents(
+        return splitter.split_documents([document])
    except Exception as e:
-        logger.error(f"
+        logger.error(f"Failed to split content: {str(e)}")
+        return []
+
+
+def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
+    """
+    Parse Markdown file and split into chunks
+    This function is kept for backward compatibility
+
+    Args:
+        md_path: Path to the markdown file
+        chunk_size: Size of each chunk
+        chunk_overlap: Overlap between chunks
+
+    Returns:
+        List of document chunks
+    """
+    try:
+        # Read markdown file directly to avoid loader issues
+        with open(md_path, "r", encoding="utf-8") as file:
+            content = file.read()
+
+        return split_content_to_chunks(content, chunk_size, chunk_overlap)
+    except Exception as e:
+        logger.error(f"Failed to load {Path(md_path).name}: {str(e)}")
        return []


@@ -155,7 +320,7 @@ def extract_json_from_llm_output(output: str):
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
-            print(f"
+            print(f"Error parsing JSON: {e}")

    # Try to extract the most JSON-like part
    json_start = output.find("[")
@@ -166,7 +331,7 @@ def extract_json_from_llm_output(output: str):
        except json.JSONDecodeError:
            pass

-    print("
+    print("Model output not in standard format:", output)
    return None


@@ -184,9 +349,19 @@ def llm_generator(
    """Generate content using LLM API"""
    try:
        if not message:
+            # Determine if this is English mode based on prompt content
+            is_english_mode = (
+                "GENERATE ONLY ENGLISH" in prompt or "ENGLISH ONLY" in prompt
+            )
+
+            if is_english_mode:
+                user_message = "Please generate content strictly according to requirements. IMPORTANT: Generate ONLY English content - no Chinese characters or other languages allowed."
+            else:
+                user_message = "请严格按照要求生成内容"
+
            message = [
                {"role": "system", "content": prompt},
-                {"role": "user", "content":
+                {"role": "user", "content": user_message},
            ]
        headers = {
            "Authorization": f"Bearer {api_key}",
@@ -214,7 +389,7 @@ def llm_generator(
            return []

    except Exception as e:
-        print(f"LLM
+        print(f"LLM keyword extraction failed: {e, e.__traceback__.tb_lineno}")
        return []


@@ -227,7 +402,8 @@ def process_questions(
    base_url: str,
    page_content: list,
    question_number: int,
-
+    language: str = "zh",
+    message: list = None,
    max_workers: int = 5,
) -> list:
    """Generate questions using multi-threading"""
@@ -235,7 +411,11 @@ def process_questions(

    def _generate_questions(page):
        """Inner function for question generation"""
-
+        if language.lower() == "en":
+            prompt = get_system_prompt_for_question_en(page, question_number)
+        else:
+            prompt = get_system_prompt_for_question(page, question_number)
+
        questions = llm_generator(
            api_key=api_key,
            model=model,
@@ -246,17 +426,21 @@ def process_questions(
        )
        return [{"question": q, "page": page} for q in questions] if questions else []

-    logger.info(
+    logger.info(
+        f"Starting question generation (threads: {max_workers}, language: {language})..."
+    )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_generate_questions, page) for page in page_content]

-        with tqdm(
+        with tqdm(
+            as_completed(futures), total=len(futures), desc="Generating questions"
+        ) as pbar:
            for future in pbar:
                result = future.result()
                if result:
                    with lock:
                        total_questions.extend(result)
-                pbar.set_postfix({"
+                pbar.set_postfix({"Generated questions": len(total_questions)})

    return total_questions

@@ -266,6 +450,7 @@ def process_answers(
    model: str,
    base_url: str,
    question_items: list,
+    language: str = "zh",
    message: list = None,
    max_workers=5,
) -> dict:
@@ -274,7 +459,11 @@ def process_answers(

    def _generate_answer(item):
        """Inner function for answer generation"""
-
+        if language.lower() == "en":
+            prompt = get_system_prompt_for_answer_en(item["page"], item["question"])
+        else:
+            prompt = get_system_prompt_for_answer(item["page"], item["question"])
+
        answer = llm_generator(
            api_key=api_key,
            model=model,
@@ -285,22 +474,101 @@ def process_answers(
        )
        return item["question"], answer

-    logger.info(
+    logger.info(
+        f"Starting answer generation (threads: {max_workers}, language: {language})..."
+    )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(_generate_answer, item): item for item in question_items
        }

-        with tqdm(
+        with tqdm(
+            as_completed(futures), total=len(futures), desc="Generating answers"
+        ) as pbar:
            for future in pbar:
                question, answer = future.result()
                if answer:
                    with lock:
                        qa_pairs[question] = answer
-                pbar.set_postfix({"
+                pbar.set_postfix({"Generated answers": len(qa_pairs)})
    return qa_pairs


+def generate_qa_from_content(
+    content: str,
+    api_key: str,
+    base_url: str,
+    model_name: str,
+    chunk_size=500,
+    chunk_overlap=100,
+    question_number=5,
+    language: str = "zh",
+    message: list = None,
+    max_workers=5,
+):
+    """
+    Generate QA pairs from processed content
+
+    Args:
+        content: Processed text content (from get_data() content field)
+        api_key: API key
+        base_url: API base URL
+        model_name: Model name
+        chunk_size: Chunk size
+        chunk_overlap: Overlap length
+        question_number: Number of questions generated per chunk
+        language: Language for QA generation ("zh" for Chinese, "en" for English)
+        message: Custom message
+        max_workers: Number of concurrent workers
+
+    Returns:
+        List of QA pairs
+    """
+    # 1. Split content into chunks
+    pages = split_content_to_chunks(
+        content=content, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    page_content = [i.page_content for i in pages]
+    logger.info(f"Content split into {len(page_content)} chunks")
+
+    # 2. Generate questions using multi-threading
+    questions = process_questions(
+        page_content=page_content,
+        question_number=question_number,
+        language=language,
+        message=message,
+        max_workers=max_workers,
+        api_key=api_key,
+        base_url=base_url,
+        model=model_name,
+    )
+    if not questions:
+        logger.error(
+            "Failed to generate any questions, please check input content and API settings"
+        )
+        return []
+
+    # 3. Generate answers using multi-threading
+    qa_pairs = process_answers(
+        question_items=questions,
+        language=language,
+        message=message,
+        max_workers=max_workers,
+        api_key=api_key,
+        base_url=base_url,
+        model=model_name,
+    )
+
+    logger.success(f"Completed! Generated {len(qa_pairs)} QA pairs in {language}")
+
+    # Format results
+    res_list = []
+    for question, answer in qa_pairs.items():
+        qa_entry = {"instruction": question, "input": "", "output": answer}
+        res_list.append(qa_entry)
+    return res_list
+
+
def generatr_qa_pairs(
    file_path: str,
    api_key: str,
@@ -309,33 +577,38 @@ def generatr_qa_pairs(
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
+    language: str = "zh",
    message: list = None,
    max_workers=5,
):
-    """Main function to generate QA pairs from markdown file"""
-    # 1. Split markdown text into chunks
+    """Main function to generate QA pairs from markdown file (kept for backward compatibility)"""
+    # 1. Split markdown text into chunks
    pages = load_and_split_markdown(
        md_path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    page_content = [i.page_content for i in pages]
-    logger.info(f"
+    logger.info(f"Markdown split into {len(page_content)} chunks")

    # 2. Generate questions using multi-threading
    questions = process_questions(
        page_content=page_content,
-        message=message,
        question_number=question_number,
+        language=language,
+        message=message,
        max_workers=max_workers,
        api_key=api_key,
        base_url=base_url,
        model=model_name,
    )
    if not questions:
-        logger.error(
+        logger.error(
+            "Failed to generate any questions, please check input document and API settings"
+        )

    # 3. Generate answers using multi-threading
    qa_pairs = process_answers(
        question_items=questions,
+        language=language,
        message=message,
        max_workers=max_workers,
        api_key=api_key,
|
        model=model_name,
    )

-    logger.success(
-        f"完成! 共生成 {len(qa_pairs)} 个问答对"
-    )
+    logger.success(f"Completed! Generated {len(qa_pairs)} QA pairs in {language}")

-    #
+    # Format results
    res_list = []
    for question, answer in qa_pairs.items():
        qa_entry = {"instruction": question, "input": "", "output": answer}
@@ -356,14 +627,31 @@ def generatr_qa_pairs(


if __name__ == "__main__":
+    # Example 1: Generate Chinese QA pairs (default)
+    print("Generating Chinese QA pairs...")
    generatr_qa_pairs(
-        file_path=r"C:\Users\
+        file_path=r"C:\Users\example\Desktop\document\knowledge_graph\knowledge_graph_design.md",
        api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
        model_name="qwen-max",
        chunk_size=500,
        chunk_overlap=100,
        question_number=5,
+        language="zh",  # Chinese QA pairs
        max_workers=5,
        # message=[]
    )
+
+    # Example 2: Generate English QA pairs (Enhanced with strict English-only prompts)
+    print("\nGenerating English QA pairs with enhanced prompts...")
+    # generatr_qa_pairs(
+    #     file_path=r"C:\Users\example\Desktop\document\knowledge_graph\knowledge_graph_design.md",
+    #     api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
+    #     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
+    #     model_name="qwen-max",
+    #     chunk_size=500,
+    #     chunk_overlap=100,
+    #     question_number=5,
+    #     language="en",  # English QA pairs with enhanced prompts
+    #     max_workers=5,
+    # )
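
The headline addition in this release is `generate_qa_from_content`, which accepts already-parsed text instead of a markdown path. A minimal usage sketch based on the signature shown above; the sample content, API key, and endpoint are placeholders, not values shipped with the package:

```python
# Hypothetical usage sketch of the new content-based entry point in 0.1.16.post2.
from datamax.utils.qa_generator import generate_qa_from_content

qa_data = generate_qa_from_content(
    content="Parsed document text produced by a DataMax parser...",  # placeholder text
    api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",                         # placeholder key
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
    model_name="qwen-max",
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    language="en",   # "zh" (default) for Chinese, "en" for the enhanced English-only prompts
    max_workers=5,
)
# Each entry follows the format built above: {"instruction": question, "input": "", "output": answer}
print(len(qa_data))
```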
datamax/utils/tokenizer.py
CHANGED
@@ -3,20 +3,21 @@ import tiktoken

class DashScopeClient:
    _instance = None
+
    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(DashScopeClient, cls).__new__(cls)
        return cls._instance

    def get_tokenizer(self, content):
-
-
-
-
-
-
-
-
+        """
+        Note: tiktoken only supports the following models with different token calculations
+        A BPE word divider developed by tiktoken openai
+        o200k_base corresponds to models: gpt-4o, GPT-4O-MINI
+        cl100k_base models: GPT-4-Turbo, gpt-4, gpt-3.5-turbo...
+        p50k_base corresponds to models text-davinci-002 and text-davinci-003
+        r50k_base corresponds to model gpt2
+        """
        encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
        num_tokens = len(encoding.encode(content))
-        return num_tokens
+        return num_tokens