pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/qa_generator.py
@@ -1,15 +1,44 @@
+ """
+ QA Generator for DataMax
+
+ This module provides functionality to generate question-answer pairs from processed content.
+
+ Features:
+ - Chinese QA generation (default): Uses Chinese prompts for generating Chinese Q&A pairs
+ - English QA generation (enhanced): Uses reinforced English-only prompts with strict language constraints
+ to ensure pure English output without any Chinese characters
+
+ Language Selection:
+ - language="zh": Generates Chinese questions and answers (default)
+ - language="en": Generates English questions and answers with enhanced prompts that strictly
+ enforce English-only output to prevent any Chinese character leakage
+
+ Enhanced English Mode Features:
+ - Multiple language enforcement checks in prompts
+ - Explicit "ENGLISH ONLY" requirements in system prompts
+ - Enhanced user messages for English generation
+ - Stricter quality controls for English output
+
+ Usage:
+ # For Chinese QA pairs
+ qa_data = generate_qa_from_content(content=text, language="zh", ...)
+
+ # For English QA pairs (enhanced)
+ qa_data = generate_qa_from_content(content=text, language="en", ...)
+ """
+
  import json
  import os.path
  import re
  import threading
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
+ from pyexpat.errors import messages

  import requests
+ from langchain.schema import Document
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.document_loaders import UnstructuredMarkdownLoader
  from loguru import logger
- from pyexpat.errors import messages
  from tqdm import tqdm # For progress bar display

  lock = threading.Lock()
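
The module docstring above references the new content-based entry point, generate_qa_from_content, which this diff adds further down. A minimal usage sketch of the documented language switch (the key is a placeholder; the endpoint and model values are taken from the package's own __main__ example):

    from datamax.utils.qa_generator import generate_qa_from_content

    qa_data = generate_qa_from_content(
        content="Machine learning models learn statistical patterns from data...",
        api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",  # placeholder
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
        model_name="qwen-max",
        question_number=5,
        language="en",  # "zh" (default) for Chinese output, "en" for strict English-only output
    )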
@@ -17,13 +46,13 @@ lock = threading.Lock()

  # ------------prompt-----------------
  def get_system_prompt_for_question(query_text, question_number):
- """Generate system prompt for question generation task"""
+ """Generate system prompt for question generation task (Chinese)"""
  system_prompt = f"""
  # 角色使命
  你是一位专业的文本分析专家,擅长从复杂文本中提取关键信息并生成可用于模型微调的结构化数据(仅生成问题)。

  ## 核心任务
- 根据用户提供的文本,生成不少于 ${question_number} 个高质量问题。
+ 根据用户提供的文本,生成不少于 {question_number} 个高质量问题。

  ## 约束条件(重要!)
  - 必须基于文本内容直接生成
@@ -54,25 +83,84 @@ def get_system_prompt_for_question(query_text, question_number):
  \`\`\`

  ## 待处理文本
- ${query_text}
+ {query_text}

  ## 限制
  - 必须按照规定的 JSON 格式输出,不要输出任何其他不相关内容
- - 生成不少于${question_number}个高质量问题
+ - 生成不少于{question_number}个高质量问题
  - 问题不要和材料本身相关,例如禁止出现作者、章节、目录等相关问题
  - 问题不得包含【报告、文章、文献、表格】中提到的这种话术,必须是一个自然的问题
  """
  return system_prompt


+ def get_system_prompt_for_question_en(query_text, question_number):
+ """Generate system prompt for question generation task (English)"""
+ system_prompt = f"""
+ # Role Mission
+ You are a professional text analysis expert, skilled in extracting key information from complex texts and generating structured data suitable for model fine-tuning (question generation only).
+
+ ## CRITICAL REQUIREMENT: GENERATE ONLY ENGLISH QUESTIONS
+ - ALL questions MUST be in English language only
+ - NO Chinese characters or other languages allowed
+ - Output format must be English JSON array
+
+ ## Core Task
+ Based on the text provided by the user, generate at least {question_number} high-quality questions IN ENGLISH ONLY.
+
+ ## Constraints (Important!)
+ - Must be generated directly based on text content
+ - Questions should have clear answer directionality
+ - Need to cover different aspects of the text
+ - Prohibit generating hypothetical, repetitive or similar questions
+ - Ensure generation completeness
+ - **MANDATORY: All questions must be in English language**
+
+ ## Processing Flow
+ 1. [Text Analysis] Process content in segments, identify key entities and core concepts
+ 2. [Question Generation] Select optimal questioning points based on information density
+ 3. [Quality Check] Ensure:
+ - Question answers can be found in the original text
+ - Labels are strongly related to question content
+ - No format errors
+ - **All questions are in English**
+
+ ## Output Format
+ - JSON array format must be correct
+ - Field names use English double quotes
+ - Output JSON array must strictly conform to the following structure:
+ - **ALL CONTENT MUST BE IN ENGLISH**
+ \`\`\`json
+ ["English Question 1", "English Question 2", "..."]
+ \`\`\`
+
+ ## Output Example
+ \`\`\`json
+ ["What are the core elements that should be included in an AI ethics framework?", "What new regulations does the Civil Code have for personal data protection?", "How do machine learning algorithms impact data privacy?"]
+ \`\`\`
+
+ ## Text to Process
+ {query_text}
+
+ ## Restrictions
+ - Must output according to the specified JSON format, do not output any other unrelated content
+ - Generate at least {question_number} high-quality questions **IN ENGLISH ONLY**
+ - Questions should not be related to the material itself, for example, prohibit questions about authors, chapters, catalogs, etc.
+ - Questions must not contain phrases like "mentioned in [report, article, literature, table]", must be natural questions
+ - **CRITICAL: Absolutely no Chinese characters or non-English content allowed**
+ - All questions must be grammatically correct English
+ """
+ return system_prompt
+
+
  def get_system_prompt_for_answer(text, query_question):
- """Generate system prompt for answer generation task"""
+ """Generate system prompt for answer generation task (Chinese)"""
  system_prompt = f"""
  # Role: 微调数据集生成专家
  ## Profile:
  - Description: 你是一名微调数据集生成专家,擅长从给定的内容中生成准确的问题答案,确保答案的准确性和相关性,你要直接回答用户问题,所有信息已内化为你的专业知识。

- ## Skills :
+ ## Skills:
  1. 答案必须基于给定的内容
  2. 答案必须准确,不能胡编乱造
  3. 答案必须与问题相关
@@ -87,12 +175,12 @@ def get_system_prompt_for_answer(text, query_question):
  5. 最后,确保答案的准确性和相关性

  ## 参考内容:
- ${text}
+ {text}

  ## 问题
- ${query_question}
+ {query_question}

- ## Constrains:
+ ## Constraints:
  1. 答案必须基于给定的内容
  2. 答案必须准确,必须与问题相关,不能胡编乱造
  3. 答案必须充分、详细、包含所有必要的信息、适合微调大模型训练使用
@@ -101,14 +189,62 @@ def get_system_prompt_for_answer(text, query_question):
  return system_prompt


+ def get_system_prompt_for_answer_en(text, query_question):
+ """Generate system prompt for answer generation task (English)"""
+ system_prompt = f"""
+ # Role: Fine-tuning Dataset Generation Expert
+
+ ## CRITICAL REQUIREMENT: GENERATE ONLY ENGLISH ANSWERS
+ - ALL answers MUST be in English language only
+ - NO Chinese characters or other languages allowed
+ - Response must be in fluent, natural English
+
+ ## Profile:
+ - Description: You are a fine-tuning dataset generation expert, skilled in generating accurate question answers from given content, ensuring answer accuracy and relevance. You should directly answer user questions in ENGLISH ONLY, with all information internalized as your professional knowledge.
+
+ ## Skills:
+ 1. Answers must be based on the given content
+ 2. Answers must be accurate and not fabricated
+ 3. Answers must be relevant to the questions
+ 4. Answers must be logical
+ 5. Based on the given reference content, integrate into a complete answer using natural and fluent **ENGLISH** language, without mentioning literature sources or citation marks
+ 6. **MANDATORY: All responses must be in English language only**
+
+ ## Workflow:
+ 1. Take a deep breath and work on this problem step-by-step.
+ 2. First, analyze the given file content
+ 3. Then, extract key information from the content
+ 4. Next, generate accurate answers related to the questions **IN ENGLISH ONLY**
+ 5. Finally, ensure the accuracy and relevance of the answers in proper English
+
+ ## Reference Content:
+ {text}
+
+ ## Question
+ {query_question}
+
+ ## Constraints:
+ 1. Answers must be based on the given content
+ 2. Answers must be accurate and relevant to the questions, not fabricated
+ 3. Answers must be comprehensive, detailed, contain all necessary information, and be suitable for fine-tuning large model training
+ 4. Answers must not contain any referential expressions like 'referenced / based on / mentioned in literature', only present the final results
+ 5. **CRITICAL: ALL answers must be in English language only - no Chinese characters or other languages allowed**
+ 6. Use proper English grammar, vocabulary, and sentence structure
+ 7. Ensure the response flows naturally in English
+
+ ## IMPORTANT REMINDER:
+ Your response must be entirely in English. Do not include any Chinese characters, phrases, or words in your answer.
+ """
+ return system_prompt
+
+
  # ------------spliter----------------
- def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
+ def split_content_to_chunks(content: str, chunk_size: int, chunk_overlap: int) -> list:
  """
- Parse Markdown using UnstructuredMarkdownLoader
- Chunking strategy that preserves original paragraph structure
+ Split content into chunks, replacing the file reading approach

  Args:
- md_path: Path to the markdown file
+ content: Processed text content
  chunk_size: Size of each chunk
  chunk_overlap: Overlap between chunks

@@ -116,19 +252,48 @@ def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -
  List of document chunks
  """
  try:
- # Use LangChain's MarkdownLoader to load Markdown file
- loader = UnstructuredMarkdownLoader(md_path)
- documents = loader.load()
- # Further split documents if needed
+ # Handle potential escaped newlines
+ content = content.replace("\\n", "\n")
+
+ # Create document object for the splitter
+ document = Document(
+ page_content=content, metadata={"source": "processed_content"}
+ )
+
+ # Split the document
  splitter = RecursiveCharacterTextSplitter(
  chunk_size=chunk_size,
  chunk_overlap=chunk_overlap,
  length_function=len,
  is_separator_regex=False,
  )
- return splitter.split_documents(documents)
+ return splitter.split_documents([document])
  except Exception as e:
- logger.error(f"加载 {Path(md_path).name} 失败: {str(e)}")
+ logger.error(f"Failed to split content: {str(e)}")
+ return []
+
+
+ def load_and_split_markdown(md_path: str, chunk_size: int, chunk_overlap: int) -> list:
+ """
+ Parse Markdown file and split into chunks
+ This function is kept for backward compatibility
+
+ Args:
+ md_path: Path to the markdown file
+ chunk_size: Size of each chunk
+ chunk_overlap: Overlap between chunks
+
+ Returns:
+ List of document chunks
+ """
+ try:
+ # Read markdown file directly to avoid loader issues
+ with open(md_path, "r", encoding="utf-8") as file:
+ content = file.read()
+
+ return split_content_to_chunks(content, chunk_size, chunk_overlap)
+ except Exception as e:
+ logger.error(f"Failed to load {Path(md_path).name}: {str(e)}")
  return []

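A quick sketch of the new chunking path shown above (values illustrative; the function first unescapes literal "\n" sequences, then splits via RecursiveCharacterTextSplitter):

    from datamax.utils.qa_generator import split_content_to_chunks

    chunks = split_content_to_chunks(
        content="First paragraph.\\n\\nSecond paragraph.",  # literal \n becomes a real newline
        chunk_size=500,
        chunk_overlap=100,
    )
    print(len(chunks))  # number of Document chunks
    for chunk in chunks:
        print(chunk.page_content[:80])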
@@ -155,7 +320,7 @@ def extract_json_from_llm_output(output: str):
  try:
  return json.loads(json_match.group(1))
  except json.JSONDecodeError as e:
- print(f"解析 JSON 时出错: {e}")
+ print(f"Error parsing JSON: {e}")

  # Try to extract the most JSON-like part
  json_start = output.find("[")
@@ -166,7 +331,7 @@ def extract_json_from_llm_output(output: str):
  except json.JSONDecodeError:
  pass

- print("模型未按标准格式输出:", output)
+ print("Model output not in standard format:", output)
  return None

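Only fragments of extract_json_from_llm_output appear in these hunks; a hedged re-sketch of the visible strategy (the exact regex is an assumption, not the package's verbatim pattern): try a fenced json code block first, then fall back to the outermost bracket span.

    import json
    import re

    def extract_json_sketch(output: str):
        # Try a fenced json code block first (assumed pattern).
        json_match = re.search(r"```json\s*(.*?)\s*```", output, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
        # Fall back to the most JSON-like part: the outermost [...] span.
        json_start, json_end = output.find("["), output.rfind("]")
        if json_start != -1 and json_end > json_start:
            try:
                return json.loads(output[json_start : json_end + 1])
            except json.JSONDecodeError:
                pass
        print("Model output not in standard format:", output)
        return None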
@@ -184,9 +349,19 @@ def llm_generator(
  """Generate content using LLM API"""
  try:
  if not message:
+ # Determine if this is English mode based on prompt content
+ is_english_mode = (
+ "GENERATE ONLY ENGLISH" in prompt or "ENGLISH ONLY" in prompt
+ )
+
+ if is_english_mode:
+ user_message = "Please generate content strictly according to requirements. IMPORTANT: Generate ONLY English content - no Chinese characters or other languages allowed."
+ else:
+ user_message = "请严格按照要求生成内容"
+
  message = [
  {"role": "system", "content": prompt},
- {"role": "user", "content": "请严格按照要求生成内容"},
+ {"role": "user", "content": user_message},
  ]
  headers = {
  "Authorization": f"Bearer {api_key}",
@@ -214,7 +389,7 @@
  return []

  except Exception as e:
- print(f"LLM提取关键词失败: {e, e.__traceback__.tb_lineno}")
+ print(f"LLM keyword extraction failed: {e, e.__traceback__.tb_lineno}")
  return []

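The default user message is now chosen by scanning the system prompt for the English-only markers, so the _en prompt builders switch llm_generator into English mode automatically; a small illustration:

    from datamax.utils.qa_generator import get_system_prompt_for_question_en

    prompt = get_system_prompt_for_question_en("some source text", 5)
    is_english_mode = "GENERATE ONLY ENGLISH" in prompt or "ENGLISH ONLY" in prompt
    assert is_english_mode  # both markers are embedded in the _en prompts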
@@ -227,7 +402,8 @@ def process_questions(
  base_url: str,
  page_content: list,
  question_number: int,
- message: list,
+ language: str = "zh",
+ message: list = None,
  max_workers: int = 5,
  ) -> list:
  """Generate questions using multi-threading"""
@@ -235,7 +411,11 @@

  def _generate_questions(page):
  """Inner function for question generation"""
- prompt = get_system_prompt_for_question(page, question_number)
+ if language.lower() == "en":
+ prompt = get_system_prompt_for_question_en(page, question_number)
+ else:
+ prompt = get_system_prompt_for_question(page, question_number)
+
  questions = llm_generator(
  api_key=api_key,
  model=model,
@@ -246,17 +426,21 @@
  )
  return [{"question": q, "page": page} for q in questions] if questions else []

- logger.info(f"开始生成问题 (线程数: {max_workers})...")
+ logger.info(
+ f"Starting question generation (threads: {max_workers}, language: {language})..."
+ )
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
  futures = [executor.submit(_generate_questions, page) for page in page_content]

- with tqdm(as_completed(futures), total=len(futures), desc="生成问题") as pbar:
+ with tqdm(
+ as_completed(futures), total=len(futures), desc="Generating questions"
+ ) as pbar:
  for future in pbar:
  result = future.result()
  if result:
  with lock:
  total_questions.extend(result)
- pbar.set_postfix({"已生成问题": len(total_questions)})
+ pbar.set_postfix({"Generated questions": len(total_questions)})

  return total_questions

@@ -266,6 +450,7 @@ def process_answers(
  model: str,
  base_url: str,
  question_items: list,
+ language: str = "zh",
  message: list = None,
  max_workers=5,
  ) -> dict:
@@ -274,7 +459,11 @@

  def _generate_answer(item):
  """Inner function for answer generation"""
- prompt = get_system_prompt_for_answer(item["page"], item["question"])
+ if language.lower() == "en":
+ prompt = get_system_prompt_for_answer_en(item["page"], item["question"])
+ else:
+ prompt = get_system_prompt_for_answer(item["page"], item["question"])
+
  answer = llm_generator(
  api_key=api_key,
  model=model,
@@ -285,22 +474,101 @@
  )
  return item["question"], answer

- logger.info(f"开始生成答案 (线程数: {max_workers})...")
+ logger.info(
+ f"Starting answer generation (threads: {max_workers}, language: {language})..."
+ )
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
  futures = {
  executor.submit(_generate_answer, item): item for item in question_items
  }

- with tqdm(as_completed(futures), total=len(futures), desc="生成答案") as pbar:
+ with tqdm(
+ as_completed(futures), total=len(futures), desc="Generating answers"
+ ) as pbar:
  for future in pbar:
  question, answer = future.result()
  if answer:
  with lock:
  qa_pairs[question] = answer
- pbar.set_postfix({"已生成答案": len(qa_pairs)})
+ pbar.set_postfix({"Generated answers": len(qa_pairs)})
  return qa_pairs


+ def generate_qa_from_content(
+ content: str,
+ api_key: str,
+ base_url: str,
+ model_name: str,
+ chunk_size=500,
+ chunk_overlap=100,
+ question_number=5,
+ language: str = "zh",
+ message: list = None,
+ max_workers=5,
+ ):
+ """
+ Generate QA pairs from processed content
+
+ Args:
+ content: Processed text content (from get_data() content field)
+ api_key: API key
+ base_url: API base URL
+ model_name: Model name
+ chunk_size: Chunk size
+ chunk_overlap: Overlap length
+ question_number: Number of questions generated per chunk
+ language: Language for QA generation ("zh" for Chinese, "en" for English)
+ message: Custom message
+ max_workers: Number of concurrent workers
+
+ Returns:
+ List of QA pairs
+ """
+ # 1. Split content into chunks
+ pages = split_content_to_chunks(
+ content=content, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+ )
+ page_content = [i.page_content for i in pages]
+ logger.info(f"Content split into {len(page_content)} chunks")
+
+ # 2. Generate questions using multi-threading
+ questions = process_questions(
+ page_content=page_content,
+ question_number=question_number,
+ language=language,
+ message=message,
+ max_workers=max_workers,
+ api_key=api_key,
+ base_url=base_url,
+ model=model_name,
+ )
+ if not questions:
+ logger.error(
+ "Failed to generate any questions, please check input content and API settings"
+ )
+ return []
+
+ # 3. Generate answers using multi-threading
+ qa_pairs = process_answers(
+ question_items=questions,
+ language=language,
+ message=message,
+ max_workers=max_workers,
+ api_key=api_key,
+ base_url=base_url,
+ model=model_name,
+ )
+
+ logger.success(f"Completed! Generated {len(qa_pairs)} QA pairs in {language}")
+
+ # Format results
+ res_list = []
+ for question, answer in qa_pairs.items():
+ qa_entry = {"instruction": question, "input": "", "output": answer}
+ res_list.append(qa_entry)
+ return res_list
+
+
  def generatr_qa_pairs(
  file_path: str,
  api_key: str,
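Each record returned by generate_qa_from_content follows the instruction/input/output layout assembled at the end of the function (values illustrative):

    qa_entry = {
        "instruction": "How do machine learning algorithms impact data privacy?",
        "input": "",
        "output": "Machine learning algorithms affect data privacy by ...",
    }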
@@ -309,33 +577,38 @@ def generatr_qa_pairs(
  chunk_size=500,
  chunk_overlap=100,
  question_number=5,
+ language: str = "zh",
  message: list = None,
  max_workers=5,
  ):
- """Main function to generate QA pairs from markdown file"""
- # 1. Split markdown text into chunks`
+ """Main function to generate QA pairs from markdown file (kept for backward compatibility)"""
+ # 1. Split markdown text into chunks
  pages = load_and_split_markdown(
  md_path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap
  )
  page_content = [i.page_content for i in pages]
- logger.info(f"markdown被分解了{len(page_content)}个chunk")
+ logger.info(f"Markdown split into {len(page_content)} chunks")

  # 2. Generate questions using multi-threading
  questions = process_questions(
  page_content=page_content,
- message=message,
  question_number=question_number,
+ language=language,
+ message=message,
  max_workers=max_workers,
  api_key=api_key,
  base_url=base_url,
  model=model_name,
  )
  if not questions:
- logger.error("未能生成任何问题,请检查输入文档和API设置")
+ logger.error(
+ "Failed to generate any questions, please check input document and API settings"
+ )

  # 3. Generate answers using multi-threading
  qa_pairs = process_answers(
  question_items=questions,
+ language=language,
  message=message,
  max_workers=max_workers,
  api_key=api_key,
@@ -343,11 +616,9 @@
  model=model_name,
  )

- logger.success(
- f"完成! 共生成 {len(qa_pairs)} 个问答对"
- )
+ logger.success(f"Completed! Generated {len(qa_pairs)} QA pairs in {language}")

- #
+ # Format results
  res_list = []
  for question, answer in qa_pairs.items():
  qa_entry = {"instruction": question, "input": "", "output": answer}
@@ -356,14 +627,31 @@


  if __name__ == "__main__":
+ # Example 1: Generate Chinese QA pairs (default)
+ print("Generating Chinese QA pairs...")
  generatr_qa_pairs(
- file_path=r"C:\Users\cykro\Desktop\文档整理\知识图谱\知识图谱概要设计.md",
+ file_path=r"C:\Users\example\Desktop\document\knowledge_graph\knowledge_graph_design.md",
  api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
  base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
  model_name="qwen-max",
  chunk_size=500,
  chunk_overlap=100,
  question_number=5,
+ language="zh", # Chinese QA pairs
  max_workers=5,
  # message=[]
  )
+
+ # Example 2: Generate English QA pairs (Enhanced with strict English-only prompts)
+ print("\nGenerating English QA pairs with enhanced prompts...")
+ # generatr_qa_pairs(
+ # file_path=r"C:\Users\example\Desktop\document\knowledge_graph\knowledge_graph_design.md",
+ # api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxx",
+ # base_url="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
+ # model_name="qwen-max",
+ # chunk_size=500,
+ # chunk_overlap=100,
+ # question_number=5,
+ # language="en", # English QA pairs with enhanced prompts
+ # max_workers=5,
+ # )
datamax/utils/tokenizer.py
@@ -3,20 +3,21 @@ import tiktoken

  class DashScopeClient:
  _instance = None
+
  def __new__(cls, *args, **kwargs):
  if not cls._instance:
  cls._instance = super(DashScopeClient, cls).__new__(cls)
  return cls._instance

  def get_tokenizer(self, content):
- '''
- Note: tiktoken only supports the following models with different token calculations
- A BPE word divider developed by tiktoken openai
- o200k_base corresponds to models: gpt-4o, GPT-4O-MINI
- cl100k_base models: GPT-4-Turbo, gpt-4, gpt-3.5-turbo...
- p50k_base corresponds to models text-davinci-002 and text-davinci-003
- r50k_base corresponds to model gpt2
- '''
+ """
+ Note: tiktoken only supports the following models with different token calculations
+ A BPE word divider developed by tiktoken openai
+ o200k_base corresponds to models: gpt-4o, GPT-4O-MINI
+ cl100k_base models: GPT-4-Turbo, gpt-4, gpt-3.5-turbo...
+ p50k_base corresponds to models text-davinci-002 and text-davinci-003
+ r50k_base corresponds to model gpt2
+ """
  encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
  num_tokens = len(encoding.encode(content))
- return num_tokens
+ return num_tokens
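
A minimal sketch of the token counting the reformatted docstring describes (the encoding names and model mappings come from tiktoken itself):

    import tiktoken

    # cl100k_base covers gpt-4 / gpt-3.5-turbo; o200k_base covers gpt-4o.
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode("How many tokens is this sentence?"))
    print(num_tokens)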