realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,616 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Reference-based exam-question generation system
5
+
6
+ Workflow:
7
+ 1. Parse the PDF exam (MinerU)
8
+ 2. Extract question information (LLM)
9
+ 3. Generate new questions per reference question (Agent)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ from datetime import datetime
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+ import sys
20
+ from typing import TYPE_CHECKING, Any, Callable
21
+
22
+ if TYPE_CHECKING:
23
+ from src.agents.question import AgentCoordinator
24
+
25
+ # Project root is 3 levels up from src/tools/question/
26
+ project_root = Path(__file__).parent.parent.parent.parent
27
+ sys.path.insert(0, str(project_root))
28
+
29
+ # Note: AgentCoordinator is imported inside functions to avoid circular import
30
+ from src.services.llm.config import get_llm_config
31
+ from src.tools.question.pdf_parser import parse_pdf_with_mineru
32
+ from src.tools.question.question_extractor import extract_questions_from_paper
33
+
34
+ # Type alias for WebSocket callback
35
+ WsCallback = Callable[[str, dict[str, Any]], Any]
36
+
37
+
38
async def generate_question_from_reference(
    reference_question: dict[str, Any], coordinator: AgentCoordinator, kb_name: str
) -> dict[str, Any]:
    """
    Generate a new question based on a reference entry.

    Args:
        reference_question: Extracted reference question. Must contain
            ``question_text``. ``images`` is optional and may be missing,
            empty, or ``None``.
        coordinator: Object exposing an awaitable
            ``generate_question(requirement)`` method.
        kb_name: Knowledge base name forwarded inside the requirement.

    Returns:
        The value produced by ``coordinator.generate_question``, unchanged.

    Raises:
        KeyError: If ``reference_question`` has no ``question_text`` key.
    """
    question_text = reference_question["question_text"]

    # `or ()` makes an explicit null ("images": None) behave like a missing
    # or empty list instead of raising TypeError in len().
    has_images = bool(reference_question.get("images") or ())

    # Build generation requirement that encodes the reference
    requirement = {
        "reference_question": question_text,
        "has_images": has_images,
        "kb_name": kb_name,
        "allow_reject": False,
        "additional_requirements": (
            f"Reference question:\n{question_text}\n\n"
            "Requirements:\n"
            "1. Keep a similar difficulty level.\n"
            "2. **Identify the core knowledge concept(s) of the reference and keep them EXACTLY the same. Do not introduce new advanced topics beyond what the reference question requires.**\n"
            "3. **Change the scenario/objects/geometry; do not simply replace numbers or symbols.**\n"
            "4. **Alter at least one part of the reasoning process or add a new sub-question "
            "(e.g., extra calculation, analysis, or proof).**\n"
            "5. Keep the problem entirely within the same mathematical scope as the reference (e.g., if the reference is planar line parametrization, you must stay within planar line parametrization and cannot escalate to surfaces or directional derivatives).\n"
            "6. Ensure the prompt is rigorous, precise, and self-contained.\n"
            "7. If the original problem references images, describe them in text.\n"
            "8. Rejection is forbidden—you must complete the generation task.\n\n"
            "Chain-of-thought guidance:\n"
            "- Think step-by-step to plan the new scenario and reasoning before producing the final JSON.\n"
            "- Do not reveal your reasoning; output only the final JSON."
        ),
    }

    # Trigger generation through the coordinator
    return await coordinator.generate_question(requirement)
72
+
73
+
74
# Suffix of the result files written by mimic_exam_questions itself. They also
# match "*_questions.json", so they must be excluded when looking for the
# extracted reference questions.
_GENERATED_QUESTIONS_SUFFIX = "_generated_questions.json"


def _find_reference_questions_file(paper_dir: Path) -> Path | None:
    """Return the extracted-questions JSON in *paper_dir*, ignoring generated outputs."""
    candidates = sorted(
        f
        for f in paper_dir.glob("*_questions.json")
        if not f.name.endswith(_GENERATED_QUESTIONS_SUFFIX)
    )
    return candidates[0] if candidates else None


def _resolve_inside(candidate: Path, base: Path) -> Path | None:
    """Resolve *candidate*; return it only if it still lies under resolved *base*."""
    try:
        resolved = candidate.resolve()
        # relative_to raises ValueError when the resolved path escaped base
        # (for example through a symlink).
        resolved.relative_to(base.resolve())
    except (OSError, RuntimeError, ValueError):
        return None
    return resolved


async def mimic_exam_questions(
    pdf_path: str | None = None,
    paper_dir: str | None = None,
    kb_name: str | None = None,
    output_dir: str | None = None,
    max_questions: int | None = None,
    ws_callback: WsCallback | None = None,
) -> dict[str, Any]:
    """
    End-to-end orchestration for reference-based question generation.

    Exactly one of ``pdf_path`` and ``paper_dir`` must be given.

    Args:
        pdf_path: Path to the PDF exam paper
        paper_dir: Relative name of a pre-parsed exam directory. Absolute
            paths and ``..`` components are rejected.
        kb_name: Knowledge base name to use
        output_dir: Output directory for generated questions. When a PDF is
            parsed it is also used as the parse output base. Defaults to the
            exam directory for the result file.
        max_questions: Maximum number of questions to process
        ws_callback: Optional callback for WebSocket progress updates,
            called as ``callback(event_type: str, data: dict)``. It may be
            async or plain; awaitable results are awaited.

    Returns:
        On failure ``{"success": False, "error": ...}``. On success a dict with
        ``success``, ``output_file``, ``total_reference_questions``,
        ``generated_questions`` and ``failed_questions``.
    """
    import inspect

    async def send_progress(event_type: str, data: dict[str, Any]):
        """Helper to send progress updates via WebSocket callback."""
        if not ws_callback:
            return
        try:
            outcome = ws_callback(event_type, data)
            # Only await when the callback actually returned an awaitable, so
            # a synchronous callback does not fail on every event.
            if inspect.isawaitable(outcome):
                await outcome
        except Exception as e:
            # Progress reporting is best-effort and must not abort the run.
            print(f"WebSocket callback error: {e}")

    print("=" * 80)
    print("šŸ“š Reference-based question generation system")
    print("=" * 80)
    print()

    # Validate arguments
    if not pdf_path and not paper_dir:
        error_msg = "Either pdf_path or paper_dir must be provided."
        await send_progress("error", {"content": error_msg})
        return {"success": False, "error": error_msg}

    if pdf_path and paper_dir:
        await send_progress("error", {"content": "pdf_path and paper_dir cannot be used together."})
        return {
            "success": False,
            "error": "pdf_path and paper_dir cannot be used together. Choose only one.",
        }

    latest_dir = None

    # If an already parsed exam directory is provided
    if paper_dir:
        await send_progress(
            "progress",
            {
                "stage": "parsing",
                "status": "locating",
                "message": "Locating parsed exam directory...",
            },
        )

        print("šŸ” Using parsed exam directory")
        print("-" * 80)

        # SECURITY: reject absolute paths and any ".." path component before
        # joining paper_dir onto the search locations.
        if os.path.isabs(paper_dir) or ".." in Path(paper_dir).parts:
            error_msg = (
                f"Invalid paper_dir: Absolute paths and traversal are not allowed. ({paper_dir})"
            )
            await send_progress("error", {"content": error_msg})
            return {"success": False, "error": error_msg}

        # Candidate parents to search, in priority order
        search_bases = [
            project_root / "data" / "user" / "question" / "mimic_papers",  # New primary location
            Path("question_agents/reference_papers"),  # Legacy location
            Path("reference_papers"),
        ]
        possible_paths = [base / paper_dir for base in search_bases]

        for base, candidate in zip(search_bases, possible_paths):
            if not candidate.exists():
                continue
            # Accept the candidate only if, once resolved, it is still inside
            # its own parent location.
            latest_dir = _resolve_inside(candidate, base)
            if latest_dir is not None:
                break

        if not latest_dir:
            error_msg = f"Exam directory not found: {paper_dir}"
            await send_progress("error", {"content": error_msg})
            return {
                "success": False,
                "error": f"{error_msg}\nSearched paths: {[str(p) for p in possible_paths]}",
            }

        # Ensure auto subdirectory exists
        auto_dir = latest_dir / "auto"
        if not auto_dir.exists():
            error_msg = f"Invalid exam directory (missing auto folder): {latest_dir}"
            await send_progress("error", {"content": error_msg})
            return {
                "success": False,
                "error": error_msg,
            }

        print(f"āœ“ Exam directory detected: {latest_dir.name}")
        print(f" Full path: {latest_dir}")
        print()

        await send_progress(
            "progress",
            {
                "stage": "parsing",
                "status": "complete",
                "message": f"Using parsed exam: {latest_dir.name}",
            },
        )

    # If a PDF is provided, parse it first
    elif pdf_path:
        # Stage 1: Parsing PDF
        await send_progress(
            "progress",
            {"stage": "parsing", "status": "running", "message": "Parsing PDF with MinerU..."},
        )

        print("šŸ”„ Step 1: parse the PDF exam")
        print("-" * 80)

        # Use provided output_dir or default to mimic_papers
        if output_dir:
            output_base = Path(output_dir)
        else:
            output_base = project_root / "data" / "user" / "question" / "mimic_papers"
        output_base.mkdir(parents=True, exist_ok=True)

        success = parse_pdf_with_mineru(pdf_path=pdf_path, output_base_dir=str(output_base))

        if not success:
            await send_progress("error", {"content": "Failed to parse PDF with MinerU"})
            return {"success": False, "error": "Failed to parse PDF"}

        print()

        print("šŸ” Step 2: locating parsed results")
        print("-" * 80)

        # The most recently modified subdirectory of the output base is taken
        # to be the folder the parser just produced.
        subdirs = sorted(
            (d for d in output_base.iterdir() if d.is_dir()),
            key=lambda d: d.stat().st_mtime,
            reverse=True,
        )

        if not subdirs:
            await send_progress("error", {"content": "No parsed outputs were found"})
            return {"success": False, "error": "No parsed outputs were found"}

        latest_dir = subdirs[0]
        print(f"āœ“ Parsed folder: {latest_dir.name}")
        print()

        await send_progress(
            "progress",
            {
                "stage": "parsing",
                "status": "complete",
                "message": f"PDF parsed successfully: {latest_dir.name}",
            },
        )

    # Stage 2: Extract questions
    await send_progress(
        "progress",
        {
            "stage": "extracting",
            "status": "running",
            "message": "Extracting reference questions from exam...",
        },
    )

    print("šŸ”„ Step 3: extract reference questions")
    print("-" * 80)

    questions_file = _find_reference_questions_file(latest_dir)

    if questions_file is not None:
        print(f"āœ“ Found existing question file: {questions_file.name}")
    else:
        print("šŸ“„ No question file found, starting extraction...")
        success = extract_questions_from_paper(paper_dir=str(latest_dir), output_dir=None)

        if not success:
            await send_progress("error", {"content": "Question extraction failed"})
            return {"success": False, "error": "Question extraction failed"}

        questions_file = _find_reference_questions_file(latest_dir)
        if questions_file is None:
            await send_progress(
                "error", {"content": "Question JSON file not found after extraction"}
            )
            return {"success": False, "error": "Question JSON file not found after extraction"}

    with open(questions_file, encoding="utf-8") as f:
        questions_data = json.load(f)

    reference_questions = questions_data.get("questions", [])

    if max_questions:
        reference_questions = reference_questions[:max_questions]

    total_questions = len(reference_questions)

    print(f"āœ“ Loaded {total_questions} reference questions")
    print()

    # Build short previews; a malformed entry without question_text gets an
    # empty preview here and is reported as a per-question failure later.
    reference_previews = []
    for i, q in enumerate(reference_questions):
        text = q.get("question_text", "")
        reference_previews.append(
            {
                "number": q.get("question_number", str(i + 1)),
                "preview": text[:100] + "..." if len(text) > 100 else text,
            }
        )

    # Send reference questions info
    await send_progress(
        "progress",
        {
            "stage": "extracting",
            "status": "complete",
            "message": f"Extracted {total_questions} reference questions",
            "total_questions": total_questions,
            "reference_questions": reference_previews,
        },
    )

    # Stage 3: Generate mimic questions
    await send_progress(
        "progress",
        {
            "stage": "generating",
            "status": "running",
            "message": "Generating mimic questions...",
            "current": 0,
            "total": total_questions,
        },
    )

    print("šŸ”„ Step 4: generate new questions from references (parallel)")
    print("-" * 80)

    # Lazy import to avoid circular import
    from src.agents.question import AgentCoordinator
    from src.services.config import load_config_with_main

    # Load config for parallel settings
    config = load_config_with_main("question_config.yaml", project_root)
    question_cfg = config.get("question", {})
    max_parallel = question_cfg.get("max_parallel_questions", 3)

    print(f"šŸ“Š Processing {total_questions} questions with max {max_parallel} parallel")

    # Create semaphore for parallel control
    semaphore = asyncio.Semaphore(max_parallel)

    # Track completed count
    completed_count = 0
    completed_lock = asyncio.Lock()

    async def mark_completed() -> int:
        """Increment the shared completed counter and return its new value."""
        nonlocal completed_count
        async with completed_lock:
            completed_count += 1
            return completed_count

    async def generate_single_mimic(ref_question: dict, index: int) -> dict:
        """Generate a single mimic question with semaphore control."""
        async with semaphore:
            question_id = f"mimic_{index}"
            ref_number = ref_question.get("question_number", str(index))

            # Send question start update
            await send_progress(
                "question_update",
                {
                    "question_id": question_id,
                    "index": index,
                    "status": "generating",
                    "reference_number": ref_number,
                    "reference_preview": ref_question["question_text"][:80] + "...",
                },
            )

            print(f"\nšŸ“ [{question_id}] Starting - Reference: {ref_number}")
            print(f" Preview: {ref_question['question_text'][:80]}...")

            # Create a fresh coordinator for each question
            llm_config = get_llm_config()
            coordinator = AgentCoordinator(
                api_key=llm_config.api_key,
                base_url=llm_config.base_url,
                api_version=getattr(llm_config, "api_version", None),
                max_rounds=10,
                kb_name=kb_name,
            )

            # None until this question has been counted. Lets the except
            # branch avoid counting the same question a second time.
            current_completed = None

            try:
                result = await generate_question_from_reference(
                    reference_question=ref_question, coordinator=coordinator, kb_name=kb_name
                )

                current_completed = await mark_completed()

                if result.get("success"):
                    print(f"āœ“ [{question_id}] Generated in {result['rounds']} round(s)")

                    result_data = {
                        "success": True,
                        "reference_question_number": ref_number,
                        "reference_question_text": ref_question["question_text"],
                        "reference_images": ref_question.get("images", []),
                        "generated_question": result["question"],
                        "validation": result["validation"],
                        "rounds": result["rounds"],
                    }

                    # Send result update
                    await send_progress(
                        "result",
                        {
                            "question_id": question_id,
                            "index": index,
                            "success": True,
                            "question": result["question"],
                            "validation": result["validation"],
                            "rounds": result["rounds"],
                            "reference_question": ref_question["question_text"],
                            "current": current_completed,
                            "total": total_questions,
                        },
                    )

                    return result_data

                print(f"āœ— [{question_id}] Failed: {result.get('error', 'Unknown error')}")

                error_data = {
                    "success": False,
                    "reference_question_number": ref_number,
                    "reference_question_text": ref_question["question_text"],
                    "error": result.get("error", "Unknown error"),
                    "reason": result.get("reason", ""),
                }

                await send_progress(
                    "question_update",
                    {
                        "question_id": question_id,
                        "index": index,
                        "status": "failed",
                        "error": result.get("error", "Unknown error"),
                        "current": current_completed,
                        "total": total_questions,
                    },
                )

                return error_data

            except Exception as e:
                print(f"āœ— [{question_id}] Exception: {e!s}")

                # Count the question only if the try body had not done so yet.
                if current_completed is None:
                    current_completed = await mark_completed()

                await send_progress(
                    "question_update",
                    {
                        "question_id": question_id,
                        "index": index,
                        "status": "failed",
                        "error": str(e),
                        "current": current_completed,
                        "total": total_questions,
                    },
                )

                return {
                    "success": False,
                    "reference_question_number": ref_number,
                    "reference_question_text": ref_question["question_text"],
                    "error": f"Exception: {e!s}",
                }

    # Run all mimic generations in parallel
    tasks = [generate_single_mimic(ref_q, i) for i, ref_q in enumerate(reference_questions, 1)]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Separate successes and failures
    generated_questions = []
    failed_questions = []

    for result in results:
        # BaseException also covers asyncio.CancelledError, which gather can
        # hand back as a result when return_exceptions=True.
        if isinstance(result, BaseException):
            failed_questions.append({"error": str(result)})
        elif result.get("success"):
            generated_questions.append(result)
        else:
            failed_questions.append(result)

    print()
    print("=" * 80)
    print("šŸ“Š Generation summary")
    print("=" * 80)
    print(f"Reference questions: {total_questions}")
    print(f"Successes: {len(generated_questions)}")
    print(f"Failures: {len(failed_questions)}")

    if output_dir is None:
        output_path = latest_dir
    else:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_path / f"{latest_dir.name}_{timestamp}_generated_questions.json"

    output_data = {
        "reference_paper": latest_dir.name,
        "kb_name": kb_name,
        "total_reference_questions": total_questions,
        "successful_generations": len(generated_questions),
        "failed_generations": len(failed_questions),
        "generated_questions": generated_questions,
        "failed_questions": failed_questions,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\nšŸ’¾ Results saved to: {output_file}")
    print()

    # Send summary
    await send_progress(
        "summary",
        {
            "total_reference": total_questions,
            "successful": len(generated_questions),
            "failed": len(failed_questions),
            "output_file": str(output_file),
        },
    )

    return {
        "success": True,
        "output_file": str(output_file),
        "total_reference_questions": total_questions,
        "generated_questions": generated_questions,
        "failed_questions": failed_questions,
    }
548
+
549
+
550
async def main():
    """
    Command-line entry point.

    Parses the CLI arguments, runs ``mimic_exam_questions`` and terminates
    the process with exit status 0 on success or 1 on failure.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Reference-based question generation CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python exam_mimic.py --pdf /path/to/exam.pdf --kb math2211
  python exam_mimic.py --paper 2211asm1 --kb math2211
  python exam_mimic.py --paper reference_papers/2211asm1 --kb math2211
  python exam_mimic.py --paper 2211asm1 --kb math2211 --max-questions 3
  python exam_mimic.py --paper 2211asm1 --kb math2211 -o ./output
""",
    )

    # Input mode (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--pdf", type=str, help="Absolute path to the PDF exam (will be parsed)"
    )

    # mimic_exam_questions rejects absolute paths and ".." for paper_dir, so
    # the help text must not advertise them.
    input_group.add_argument(
        "--paper",
        type=str,
        help=(
            "Name of a parsed exam directory (e.g., 2211asm1), given as a relative path; "
            "absolute paths and '..' components are rejected"
        ),
    )

    parser.add_argument("--kb", type=str, required=True, help="Knowledge base name")

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Output directory (defaults to the exam folder)",
    )

    parser.add_argument(
        "--max-questions",
        type=int,
        default=None,
        help="Maximum number of reference questions to process (testing)",
    )

    args = parser.parse_args()

    # Execute the workflow
    result = await mimic_exam_questions(
        pdf_path=args.pdf,
        paper_dir=args.paper,
        kb_name=args.kb,
        output_dir=args.output,
        max_questions=args.max_questions,
    )

    if result["success"]:
        print("āœ“ Completed!")
        sys.exit(0)
    else:
        print(f"āœ— Failed: {result.get('error')}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())