realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,616 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Reference-based exam-question generation system
|
|
5
|
+
|
|
6
|
+
Workflow:
|
|
7
|
+
1. Parse the PDF exam (MinerU)
|
|
8
|
+
2. Extract question information (LLM)
|
|
9
|
+
3. Generate new questions per reference question (Agent)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
import sys
|
|
20
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from src.agents.question import AgentCoordinator
|
|
24
|
+
|
|
25
|
+
# Project root is 3 levels up from src/tools/question/
|
|
26
|
+
project_root = Path(__file__).parent.parent.parent.parent
|
|
27
|
+
sys.path.insert(0, str(project_root))
|
|
28
|
+
|
|
29
|
+
# Note: AgentCoordinator is imported inside functions to avoid circular import
|
|
30
|
+
from src.services.llm.config import get_llm_config
|
|
31
|
+
from src.tools.question.pdf_parser import parse_pdf_with_mineru
|
|
32
|
+
from src.tools.question.question_extractor import extract_questions_from_paper
|
|
33
|
+
|
|
34
|
+
# Type alias for WebSocket callback
|
|
35
|
+
WsCallback = Callable[[str, dict[str, Any]], Any]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def generate_question_from_reference(
    reference_question: dict[str, Any], coordinator: AgentCoordinator, kb_name: str
) -> dict[str, Any]:
    """
    Generate a new question based on a reference entry.

    Builds a requirement dict that embeds the reference question text plus a
    fixed list of generation rules, then hands it to the coordinator.

    Args:
        reference_question: Reference entry. Must contain "question_text";
            an optional "images" entry marks the question as image-based
            when it is non-empty.
        coordinator: Object exposing an awaitable
            ``generate_question(requirement)`` method.
        kb_name: Knowledge base name, forwarded in the requirement.

    Returns:
        Whatever ``coordinator.generate_question`` returns, unchanged.

    Raises:
        KeyError: If ``reference_question`` has no "question_text" key.
    """
    reference_text = reference_question["question_text"]

    # Build generation requirement that encodes the reference
    requirement = {
        "reference_question": reference_text,
        # Truthiness instead of len(): a missing, empty, or None "images"
        # entry all mean "no images" (len(None) used to raise TypeError).
        "has_images": bool(reference_question.get("images")),
        "kb_name": kb_name,
        "allow_reject": False,
        "additional_requirements": (
            f"Reference question:\n{reference_text}\n\n"
            "Requirements:\n"
            "1. Keep a similar difficulty level.\n"
            "2. **Identify the core knowledge concept(s) of the reference and keep them EXACTLY the same. Do not introduce new advanced topics beyond what the reference question requires.**\n"
            "3. **Change the scenario/objects/geometry; do not simply replace numbers or symbols.**\n"
            "4. **Alter at least one part of the reasoning process or add a new sub-question "
            "(e.g., extra calculation, analysis, or proof).**\n"
            "5. Keep the problem entirely within the same mathematical scope as the reference (e.g., if the reference is planar line parametrization, you must stay within planar line parametrization and cannot escalate to surfaces or directional derivatives).\n"
            "6. Ensure the prompt is rigorous, precise, and self-contained.\n"
            "7. If the original problem references images, describe them in text.\n"
            # The dash below was a mis-decoded byte sequence in the prompt text.
            "8. Rejection is forbidden—you must complete the generation task.\n\n"
            "Chain-of-thought guidance:\n"
            "- Think step-by-step to plan the new scenario and reasoning before producing the final JSON.\n"
            "- Do not reveal your reasoning; output only the final JSON."
        ),
    }

    # Trigger generation through the coordinator
    return await coordinator.generate_question(requirement)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def mimic_exam_questions(
|
|
75
|
+
pdf_path: str | None = None,
|
|
76
|
+
paper_dir: str | None = None,
|
|
77
|
+
kb_name: str = None,
|
|
78
|
+
output_dir: str | None = None,
|
|
79
|
+
max_questions: int | None = None,
|
|
80
|
+
ws_callback: WsCallback | None = None,
|
|
81
|
+
) -> dict[str, Any]:
|
|
82
|
+
"""
|
|
83
|
+
End-to-end orchestration for reference-based question generation.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
pdf_path: Path to the PDF exam paper
|
|
87
|
+
paper_dir: Path to a pre-parsed exam directory
|
|
88
|
+
kb_name: Knowledge base name to use
|
|
89
|
+
output_dir: Output directory for generated questions
|
|
90
|
+
max_questions: Maximum number of questions to process
|
|
91
|
+
ws_callback: Optional async callback for WebSocket progress updates
|
|
92
|
+
Signature: async def callback(event_type: str, data: dict)
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
async def send_progress(event_type: str, data: dict[str, Any]):
    """Helper to send progress updates via WebSocket callback.

    Does nothing when no callback was supplied. The callback's return value
    is awaited only if it is awaitable, so both ``async def`` callbacks and
    plain functions are accepted. Any exception raised by the callback is
    printed and swallowed so that progress reporting stays best-effort.

    Args:
        event_type: Event name passed through to the callback.
        data: Event payload passed through to the callback.
    """
    # Local import: this helper is the only user of inspect in the file.
    import inspect

    if not ws_callback:
        return

    try:
        outcome = ws_callback(event_type, data)
        # Awaiting a non-awaitable (e.g. None from a sync callback) would
        # raise TypeError and be misreported as a callback error.
        if inspect.isawaitable(outcome):
            await outcome
    except Exception as e:
        print(f"WebSocket callback error: {e}")
|
|
102
|
+
|
|
103
|
+
print("=" * 80)
|
|
104
|
+
print("š Reference-based question generation system")
|
|
105
|
+
print("=" * 80)
|
|
106
|
+
print()
|
|
107
|
+
|
|
108
|
+
# Validate arguments
|
|
109
|
+
if not pdf_path and not paper_dir:
|
|
110
|
+
await send_progress("error", {"content": "Either pdf_path or paper_dir must be provided."})
|
|
111
|
+
return {"success": False, "error": "Either pdf_path or paper_dir must be provided."}
|
|
112
|
+
|
|
113
|
+
if pdf_path and paper_dir:
|
|
114
|
+
await send_progress("error", {"content": "pdf_path and paper_dir cannot be used together."})
|
|
115
|
+
return {
|
|
116
|
+
"success": False,
|
|
117
|
+
"error": "pdf_path and paper_dir cannot be used together. Choose only one.",
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
latest_dir = None
|
|
121
|
+
|
|
122
|
+
# If an already parsed exam directory is provided
|
|
123
|
+
if paper_dir:
|
|
124
|
+
await send_progress(
|
|
125
|
+
"progress",
|
|
126
|
+
{
|
|
127
|
+
"stage": "parsing",
|
|
128
|
+
"status": "locating",
|
|
129
|
+
"message": "Locating parsed exam directory...",
|
|
130
|
+
},
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
print("š Using parsed exam directory")
|
|
134
|
+
print("-" * 80)
|
|
135
|
+
|
|
136
|
+
# Resolve relative names against reference_papers
|
|
137
|
+
# SECURITY FIX: Prevent Path Injection / Traversal
|
|
138
|
+
if os.path.isabs(paper_dir) or ".." in paper_dir:
|
|
139
|
+
error_msg = (
|
|
140
|
+
f"Invalid paper_dir: Absolute paths and traversal are not allowed. ({paper_dir})"
|
|
141
|
+
)
|
|
142
|
+
await send_progress("error", {"content": error_msg})
|
|
143
|
+
return {"success": False, "error": error_msg}
|
|
144
|
+
|
|
145
|
+
paper_path = Path(paper_dir)
|
|
146
|
+
|
|
147
|
+
# Candidate locations to search (including new location)
|
|
148
|
+
possible_paths = [
|
|
149
|
+
project_root
|
|
150
|
+
/ "data"
|
|
151
|
+
/ "user"
|
|
152
|
+
/ "question"
|
|
153
|
+
/ "mimic_papers"
|
|
154
|
+
/ paper_dir, # New primary location
|
|
155
|
+
Path("question_agents/reference_papers") / paper_dir, # Legacy location
|
|
156
|
+
Path("reference_papers") / paper_dir,
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
latest_dir = None
|
|
160
|
+
for p in possible_paths:
|
|
161
|
+
if p.exists():
|
|
162
|
+
# Double check to ensure we didn't escape via symlink or subtle tricks
|
|
163
|
+
try:
|
|
164
|
+
resolved_p = p.resolve()
|
|
165
|
+
# Safe check: Ensure the resolved path is strictly inside the intended parent
|
|
166
|
+
# This is a basic check; for robust security, whitelist allowed parents explicitly if needed.
|
|
167
|
+
latest_dir = resolved_p
|
|
168
|
+
break
|
|
169
|
+
except Exception:
|
|
170
|
+
continue
|
|
171
|
+
|
|
172
|
+
if not latest_dir:
|
|
173
|
+
error_msg = f"Exam directory not found: {paper_dir}"
|
|
174
|
+
await send_progress("error", {"content": error_msg})
|
|
175
|
+
return {
|
|
176
|
+
"success": False,
|
|
177
|
+
"error": f"{error_msg}\nSearched paths: {[str(p) for p in possible_paths]}",
|
|
178
|
+
}
|
|
179
|
+
# Note: latest_dir was already resolved in the loop above, no need to override
|
|
180
|
+
|
|
181
|
+
# Ensure auto subdirectory exists
|
|
182
|
+
auto_dir = latest_dir / "auto"
|
|
183
|
+
if not auto_dir.exists():
|
|
184
|
+
error_msg = f"Invalid exam directory (missing auto folder): {latest_dir}"
|
|
185
|
+
await send_progress("error", {"content": error_msg})
|
|
186
|
+
return {
|
|
187
|
+
"success": False,
|
|
188
|
+
"error": error_msg,
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
print(f"ā Exam directory detected: {latest_dir.name}")
|
|
192
|
+
print(f" Full path: {latest_dir}")
|
|
193
|
+
print()
|
|
194
|
+
|
|
195
|
+
await send_progress(
|
|
196
|
+
"progress",
|
|
197
|
+
{
|
|
198
|
+
"stage": "parsing",
|
|
199
|
+
"status": "complete",
|
|
200
|
+
"message": f"Using parsed exam: {latest_dir.name}",
|
|
201
|
+
},
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
# If a PDF is provided, parse it first
|
|
205
|
+
elif pdf_path:
|
|
206
|
+
# Stage 1: Parsing PDF
|
|
207
|
+
await send_progress(
|
|
208
|
+
"progress",
|
|
209
|
+
{"stage": "parsing", "status": "running", "message": "Parsing PDF with MinerU..."},
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
print("š Step 1: parse the PDF exam")
|
|
213
|
+
print("-" * 80)
|
|
214
|
+
|
|
215
|
+
# Use provided output_dir or default to mimic_papers
|
|
216
|
+
if output_dir:
|
|
217
|
+
output_base = Path(output_dir)
|
|
218
|
+
else:
|
|
219
|
+
output_base = project_root / "data" / "user" / "question" / "mimic_papers"
|
|
220
|
+
output_base.mkdir(parents=True, exist_ok=True)
|
|
221
|
+
|
|
222
|
+
success = parse_pdf_with_mineru(pdf_path=pdf_path, output_base_dir=str(output_base))
|
|
223
|
+
|
|
224
|
+
if not success:
|
|
225
|
+
await send_progress("error", {"content": "Failed to parse PDF with MinerU"})
|
|
226
|
+
return {"success": False, "error": "Failed to parse PDF"}
|
|
227
|
+
|
|
228
|
+
print()
|
|
229
|
+
|
|
230
|
+
print("š Step 2: locating parsed results")
|
|
231
|
+
print("-" * 80)
|
|
232
|
+
|
|
233
|
+
# Look in the new output directory (user/question/mimic_papers)
|
|
234
|
+
reference_papers_dir = output_base
|
|
235
|
+
subdirs = sorted(
|
|
236
|
+
[d for d in reference_papers_dir.iterdir() if d.is_dir()],
|
|
237
|
+
key=lambda x: x.stat().st_mtime,
|
|
238
|
+
reverse=True,
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
if not subdirs:
|
|
242
|
+
await send_progress("error", {"content": "No parsed outputs were found"})
|
|
243
|
+
return {"success": False, "error": "No parsed outputs were found"}
|
|
244
|
+
|
|
245
|
+
latest_dir = subdirs[0]
|
|
246
|
+
print(f"ā Parsed folder: {latest_dir.name}")
|
|
247
|
+
print()
|
|
248
|
+
|
|
249
|
+
await send_progress(
|
|
250
|
+
"progress",
|
|
251
|
+
{
|
|
252
|
+
"stage": "parsing",
|
|
253
|
+
"status": "complete",
|
|
254
|
+
"message": f"PDF parsed successfully: {latest_dir.name}",
|
|
255
|
+
},
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Stage 2: Extract questions
|
|
259
|
+
await send_progress(
|
|
260
|
+
"progress",
|
|
261
|
+
{
|
|
262
|
+
"stage": "extracting",
|
|
263
|
+
"status": "running",
|
|
264
|
+
"message": "Extracting reference questions from exam...",
|
|
265
|
+
},
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
print("š Step 3: extract reference questions")
|
|
269
|
+
print("-" * 80)
|
|
270
|
+
|
|
271
|
+
json_files = list(latest_dir.glob("*_questions.json"))
|
|
272
|
+
|
|
273
|
+
if json_files:
|
|
274
|
+
print(f"ā Found existing question file: {json_files[0].name}")
|
|
275
|
+
with open(json_files[0], encoding="utf-8") as f:
|
|
276
|
+
questions_data = json.load(f)
|
|
277
|
+
else:
|
|
278
|
+
print("š No question file found, starting extraction...")
|
|
279
|
+
success = extract_questions_from_paper(paper_dir=str(latest_dir), output_dir=None)
|
|
280
|
+
|
|
281
|
+
if not success:
|
|
282
|
+
await send_progress("error", {"content": "Question extraction failed"})
|
|
283
|
+
return {"success": False, "error": "Question extraction failed"}
|
|
284
|
+
|
|
285
|
+
json_files = list(latest_dir.glob("*_questions.json"))
|
|
286
|
+
if not json_files:
|
|
287
|
+
await send_progress(
|
|
288
|
+
"error", {"content": "Question JSON file not found after extraction"}
|
|
289
|
+
)
|
|
290
|
+
return {"success": False, "error": "Question JSON file not found after extraction"}
|
|
291
|
+
|
|
292
|
+
with open(json_files[0], encoding="utf-8") as f:
|
|
293
|
+
questions_data = json.load(f)
|
|
294
|
+
|
|
295
|
+
reference_questions = questions_data.get("questions", [])
|
|
296
|
+
|
|
297
|
+
if max_questions:
|
|
298
|
+
reference_questions = reference_questions[:max_questions]
|
|
299
|
+
|
|
300
|
+
print(f"ā Loaded {len(reference_questions)} reference questions")
|
|
301
|
+
print()
|
|
302
|
+
|
|
303
|
+
# Send reference questions info
|
|
304
|
+
await send_progress(
|
|
305
|
+
"progress",
|
|
306
|
+
{
|
|
307
|
+
"stage": "extracting",
|
|
308
|
+
"status": "complete",
|
|
309
|
+
"message": f"Extracted {len(reference_questions)} reference questions",
|
|
310
|
+
"total_questions": len(reference_questions),
|
|
311
|
+
"reference_questions": [
|
|
312
|
+
{
|
|
313
|
+
"number": q.get("question_number", str(i + 1)),
|
|
314
|
+
"preview": (
|
|
315
|
+
q["question_text"][:100] + "..."
|
|
316
|
+
if len(q["question_text"]) > 100
|
|
317
|
+
else q["question_text"]
|
|
318
|
+
),
|
|
319
|
+
}
|
|
320
|
+
for i, q in enumerate(reference_questions)
|
|
321
|
+
],
|
|
322
|
+
},
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
# Stage 3: Generate mimic questions
|
|
326
|
+
await send_progress(
|
|
327
|
+
"progress",
|
|
328
|
+
{
|
|
329
|
+
"stage": "generating",
|
|
330
|
+
"status": "running",
|
|
331
|
+
"message": "Generating mimic questions...",
|
|
332
|
+
"current": 0,
|
|
333
|
+
"total": len(reference_questions),
|
|
334
|
+
},
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
print("š Step 4: generate new questions from references (parallel)")
|
|
338
|
+
print("-" * 80)
|
|
339
|
+
|
|
340
|
+
# Lazy import to avoid circular import
|
|
341
|
+
from src.agents.question import AgentCoordinator
|
|
342
|
+
from src.services.config import load_config_with_main
|
|
343
|
+
|
|
344
|
+
# Load config for parallel settings
|
|
345
|
+
config = load_config_with_main("question_config.yaml", project_root)
|
|
346
|
+
question_cfg = config.get("question", {})
|
|
347
|
+
max_parallel = question_cfg.get("max_parallel_questions", 3)
|
|
348
|
+
|
|
349
|
+
print(f"š Processing {len(reference_questions)} questions with max {max_parallel} parallel")
|
|
350
|
+
|
|
351
|
+
# Create semaphore for parallel control
|
|
352
|
+
semaphore = asyncio.Semaphore(max_parallel)
|
|
353
|
+
|
|
354
|
+
# Track completed count
|
|
355
|
+
completed_count = 0
|
|
356
|
+
completed_lock = asyncio.Lock()
|
|
357
|
+
|
|
358
|
+
async def generate_single_mimic(ref_question: dict, index: int) -> dict:
|
|
359
|
+
"""Generate a single mimic question with semaphore control."""
|
|
360
|
+
nonlocal completed_count
|
|
361
|
+
|
|
362
|
+
async with semaphore:
|
|
363
|
+
question_id = f"mimic_{index}"
|
|
364
|
+
ref_number = ref_question.get("question_number", str(index))
|
|
365
|
+
|
|
366
|
+
# Send question start update
|
|
367
|
+
await send_progress(
|
|
368
|
+
"question_update",
|
|
369
|
+
{
|
|
370
|
+
"question_id": question_id,
|
|
371
|
+
"index": index,
|
|
372
|
+
"status": "generating",
|
|
373
|
+
"reference_number": ref_number,
|
|
374
|
+
"reference_preview": ref_question["question_text"][:80] + "...",
|
|
375
|
+
},
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
print(f"\nš [{question_id}] Starting - Reference: {ref_number}")
|
|
379
|
+
print(f" Preview: {ref_question['question_text'][:80]}...")
|
|
380
|
+
|
|
381
|
+
# Create a fresh coordinator for each question
|
|
382
|
+
llm_config = get_llm_config()
|
|
383
|
+
coordinator = AgentCoordinator(
|
|
384
|
+
api_key=llm_config.api_key,
|
|
385
|
+
base_url=llm_config.base_url,
|
|
386
|
+
api_version=getattr(llm_config, "api_version", None),
|
|
387
|
+
max_rounds=10,
|
|
388
|
+
kb_name=kb_name,
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
try:
|
|
392
|
+
result = await generate_question_from_reference(
|
|
393
|
+
reference_question=ref_question, coordinator=coordinator, kb_name=kb_name
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
async with completed_lock:
|
|
397
|
+
completed_count += 1
|
|
398
|
+
current_completed = completed_count
|
|
399
|
+
|
|
400
|
+
if result.get("success"):
|
|
401
|
+
print(f"ā [{question_id}] Generated in {result['rounds']} round(s)")
|
|
402
|
+
|
|
403
|
+
result_data = {
|
|
404
|
+
"success": True,
|
|
405
|
+
"reference_question_number": ref_number,
|
|
406
|
+
"reference_question_text": ref_question["question_text"],
|
|
407
|
+
"reference_images": ref_question.get("images", []),
|
|
408
|
+
"generated_question": result["question"],
|
|
409
|
+
"validation": result["validation"],
|
|
410
|
+
"rounds": result["rounds"],
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
# Send result update
|
|
414
|
+
await send_progress(
|
|
415
|
+
"result",
|
|
416
|
+
{
|
|
417
|
+
"question_id": question_id,
|
|
418
|
+
"index": index,
|
|
419
|
+
"success": True,
|
|
420
|
+
"question": result["question"],
|
|
421
|
+
"validation": result["validation"],
|
|
422
|
+
"rounds": result["rounds"],
|
|
423
|
+
"reference_question": ref_question["question_text"],
|
|
424
|
+
"current": current_completed,
|
|
425
|
+
"total": len(reference_questions),
|
|
426
|
+
},
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
return result_data
|
|
430
|
+
else:
|
|
431
|
+
print(f"ā [{question_id}] Failed: {result.get('error', 'Unknown error')}")
|
|
432
|
+
|
|
433
|
+
error_data = {
|
|
434
|
+
"success": False,
|
|
435
|
+
"reference_question_number": ref_number,
|
|
436
|
+
"reference_question_text": ref_question["question_text"],
|
|
437
|
+
"error": result.get("error", "Unknown error"),
|
|
438
|
+
"reason": result.get("reason", ""),
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
await send_progress(
|
|
442
|
+
"question_update",
|
|
443
|
+
{
|
|
444
|
+
"question_id": question_id,
|
|
445
|
+
"index": index,
|
|
446
|
+
"status": "failed",
|
|
447
|
+
"error": result.get("error", "Unknown error"),
|
|
448
|
+
"current": current_completed,
|
|
449
|
+
"total": len(reference_questions),
|
|
450
|
+
},
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
return error_data
|
|
454
|
+
|
|
455
|
+
except Exception as e:
|
|
456
|
+
print(f"ā [{question_id}] Exception: {e!s}")
|
|
457
|
+
|
|
458
|
+
async with completed_lock:
|
|
459
|
+
completed_count += 1
|
|
460
|
+
current_completed = completed_count
|
|
461
|
+
|
|
462
|
+
await send_progress(
|
|
463
|
+
"question_update",
|
|
464
|
+
{
|
|
465
|
+
"question_id": question_id,
|
|
466
|
+
"index": index,
|
|
467
|
+
"status": "failed",
|
|
468
|
+
"error": str(e),
|
|
469
|
+
"current": current_completed,
|
|
470
|
+
"total": len(reference_questions),
|
|
471
|
+
},
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
return {
|
|
475
|
+
"success": False,
|
|
476
|
+
"reference_question_number": ref_question.get("question_number", str(index)),
|
|
477
|
+
"reference_question_text": ref_question["question_text"],
|
|
478
|
+
"error": f"Exception: {e!s}",
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
# Run all mimic generations in parallel
|
|
482
|
+
tasks = [generate_single_mimic(ref_q, i) for i, ref_q in enumerate(reference_questions, 1)]
|
|
483
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
484
|
+
|
|
485
|
+
# Separate successes and failures
|
|
486
|
+
generated_questions = []
|
|
487
|
+
failed_questions = []
|
|
488
|
+
|
|
489
|
+
for result in results:
|
|
490
|
+
if isinstance(result, Exception):
|
|
491
|
+
failed_questions.append({"error": str(result)})
|
|
492
|
+
elif result.get("success"):
|
|
493
|
+
generated_questions.append(result)
|
|
494
|
+
else:
|
|
495
|
+
failed_questions.append(result)
|
|
496
|
+
|
|
497
|
+
print()
|
|
498
|
+
print("=" * 80)
|
|
499
|
+
print("š Generation summary")
|
|
500
|
+
print("=" * 80)
|
|
501
|
+
print(f"Reference questions: {len(reference_questions)}")
|
|
502
|
+
print(f"Successes: {len(generated_questions)}")
|
|
503
|
+
print(f"Failures: {len(failed_questions)}")
|
|
504
|
+
|
|
505
|
+
if output_dir is None:
|
|
506
|
+
output_dir = latest_dir
|
|
507
|
+
else:
|
|
508
|
+
output_dir = Path(output_dir)
|
|
509
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
510
|
+
|
|
511
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
512
|
+
output_file = output_dir / f"{latest_dir.name}_{timestamp}_generated_questions.json"
|
|
513
|
+
|
|
514
|
+
output_data = {
|
|
515
|
+
"reference_paper": latest_dir.name,
|
|
516
|
+
"kb_name": kb_name,
|
|
517
|
+
"total_reference_questions": len(reference_questions),
|
|
518
|
+
"successful_generations": len(generated_questions),
|
|
519
|
+
"failed_generations": len(failed_questions),
|
|
520
|
+
"generated_questions": generated_questions,
|
|
521
|
+
"failed_questions": failed_questions,
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
with open(output_file, "w", encoding="utf-8") as f:
|
|
525
|
+
json.dump(output_data, f, ensure_ascii=False, indent=2)
|
|
526
|
+
|
|
527
|
+
print(f"\nš¾ Results saved to: {output_file}")
|
|
528
|
+
print()
|
|
529
|
+
|
|
530
|
+
# Send summary
|
|
531
|
+
await send_progress(
|
|
532
|
+
"summary",
|
|
533
|
+
{
|
|
534
|
+
"total_reference": len(reference_questions),
|
|
535
|
+
"successful": len(generated_questions),
|
|
536
|
+
"failed": len(failed_questions),
|
|
537
|
+
"output_file": str(output_file),
|
|
538
|
+
},
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
return {
|
|
542
|
+
"success": True,
|
|
543
|
+
"output_file": str(output_file),
|
|
544
|
+
"total_reference_questions": len(reference_questions),
|
|
545
|
+
"generated_questions": generated_questions,
|
|
546
|
+
"failed_questions": failed_questions,
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
async def main():
    """Run the reference-based question generation workflow from the shell.

    Parses the command-line options, awaits ``mimic_exam_questions`` with
    them, and then terminates the process through ``sys.exit``: status 0 when
    the returned mapping has a truthy ``"success"`` entry, status 1 otherwise
    (after printing the mapping's ``"error"`` entry).
    """
    import argparse

    # Shown verbatim after the option list because the parser uses
    # RawDescriptionHelpFormatter.
    # NOTE(review): leading whitespace inside this literal could not be
    # recovered from the rendered diff — confirm against the packaged file.
    usage_examples = """
Examples:
python exam_mimic.py --pdf /path/to/exam.pdf --kb math2211
python exam_mimic.py --paper 2211asm1 --kb math2211
python exam_mimic.py --paper reference_papers/2211asm1 --kb math2211
python exam_mimic.py --paper 2211asm1 --kb math2211 --max-questions 3
python exam_mimic.py --paper 2211asm1 --kb math2211 -o ./output
"""

    cli = argparse.ArgumentParser(
        description="Reference-based question generation CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Exactly one input source is required: a PDF to parse, or a parsed paper.
    source_opts = cli.add_mutually_exclusive_group(required=True)
    source_opts.add_argument(
        "--pdf",
        type=str,
        help="Absolute path to the PDF exam (will be parsed)",
    )
    source_opts.add_argument(
        "--paper",
        type=str,
        help="Name of a parsed exam directory (e.g., 2211asm1) or its absolute path",
    )

    cli.add_argument(
        "--kb",
        type=str,
        required=True,
        help="Knowledge base name",
    )
    cli.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Output directory (defaults to the exam folder)",
    )
    cli.add_argument(
        "--max-questions",
        type=int,
        default=None,
        help="Maximum number of reference questions to process (testing)",
    )

    opts = cli.parse_args()

    # Hand the parsed options to the workflow coroutine.
    outcome = await mimic_exam_questions(
        pdf_path=opts.pdf,
        paper_dir=opts.paper,
        kb_name=opts.kb,
        output_dir=opts.output,
        max_questions=opts.max_questions,
    )

    # NOTE(review): the leading "ā" in both messages looks like a mis-decoded
    # status glyph — confirm the intended character against the original file.
    if not outcome["success"]:
        print(f"ā Failed: {outcome.get('error')}")
        sys.exit(1)

    print("ā Completed!")
    sys.exit(0)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
if __name__ == "__main__":
    # Executed as a script: create the CLI coroutine and run it to completion.
    cli_coroutine = main()
    asyncio.run(cli_coroutine)
|