realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,211 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Parse PDF files using MinerU and save results to reference_papers directory
5
+ """
6
+
7
+ import argparse
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+
14
+
15
def check_mineru_installed():
    """
    Check if MinerU is installed

    Probes the two CLI names this module knows about, ``magic-pdf`` first and
    then ``mineru``, by running ``<command> --version``.

    Returns:
        str | None: The first command name that could be started and exited
        with status 0, or None when neither command is usable.
    """
    for command in ("magic-pdf", "mineru"):
        try:
            # Security: Using partial path is intentional here - we need to find
            # the command in user's PATH. These are trusted CLI tools, not user input.
            result = subprocess.run(
                [command, "--version"],  # nosec B607
                check=False,
                capture_output=True,
                text=True,
                shell=False,
            )
        except OSError:
            # FileNotFoundError (command absent) is the common case, but a PATH
            # entry that exists without being executable raises
            # PermissionError instead. Either way this candidate is unusable,
            # so try the next one rather than aborting detection.
            continue
        if result.returncode == 0:
            return command

    return None
47
+
48
+
49
def parse_pdf_with_mineru(pdf_path: str, output_base_dir: str = None):
    """
    Parse PDF file using MinerU

    MinerU is run into a private scratch directory created under the output
    base directory; the generated content is then moved into
    ``<output_base_dir>/<pdf stem>``. A directory of that name that already
    exists is first renamed to ``<pdf stem>_backup_<timestamp>``.

    Args:
        pdf_path: Path to PDF file
        output_base_dir: Base path for output directory, defaults to reference_papers

    Returns:
        bool: Whether parsing was successful
    """
    import tempfile

    mineru_cmd = check_mineru_installed()
    if not mineru_cmd:
        print("āœ— Error: MinerU installation not detected")
        print("Please install MinerU first:")
        print(" pip install magic-pdf[full]")
        print("or")
        print(" pip install mineru")
        print("or visit: https://github.com/opendatalab/MinerU")
        return False

    print(f"āœ“ Detected MinerU command: {mineru_cmd}")

    pdf_path = Path(pdf_path).resolve()
    if not pdf_path.exists():
        print(f"āœ— Error: PDF file does not exist: {pdf_path}")
        return False

    if pdf_path.suffix.lower() != ".pdf":
        print(f"āœ— Error: File is not PDF format: {pdf_path}")
        return False

    if output_base_dir is None:
        # Project root is 3 levels up from src/tools/question/
        project_root = Path(__file__).parent.parent.parent.parent
        output_base_dir = project_root / "reference_papers"
    else:
        output_base_dir = Path(output_base_dir)

    output_base_dir.mkdir(parents=True, exist_ok=True)

    pdf_name = pdf_path.stem
    output_dir = output_base_dir / pdf_name

    if output_dir.exists():
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_dir = output_base_dir / f"{pdf_name}_backup_{timestamp}"
        print(f"āš ļø Directory already exists, backing up to: {backup_dir.name}")
        shutil.move(str(output_dir), str(backup_dir))

    print(f"šŸ“„ PDF file: {pdf_path}")
    print(f"šŸ“ Output directory: {output_dir}")
    print("→ Starting parsing...")

    temp_output = None
    try:
        # A unique scratch directory per run: files left behind by an earlier
        # interrupted run can no longer be mistaken for this run's output.
        temp_output = Path(
            tempfile.mkdtemp(prefix="temp_mineru_output_", dir=str(output_base_dir))
        )

        cmd = [mineru_cmd, "-p", str(pdf_path), "-o", str(temp_output)]

        print(f"šŸ”§ Executing command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=False)

        if result.returncode != 0:
            print("āœ— MinerU parsing failed:")
            print(f"Stdout: {result.stdout}")
            print(f"Stderr: {result.stderr}")
            return False

        print("āœ“ MinerU parsing completed!")

        # Sorted so the choice below does not depend on filesystem order.
        generated_entries = sorted(temp_output.iterdir())

        if not generated_entries:
            print("āš ļø Warning: No generated files found in temp directory")
            return False

        # If the first entry is a directory, its contents are the payload;
        # otherwise everything in the scratch directory is moved as-is.
        first_entry = generated_entries[0]
        source_folder = first_entry if first_entry.is_dir() else temp_output

        # Create target directory and move content
        output_dir.mkdir(parents=True, exist_ok=True)

        # Snapshot the listing first: entries are moved out of source_folder
        # while looping, and a live directory iterator must not be mutated.
        for item in list(source_folder.iterdir()):
            dest_item = output_dir / item.name
            if dest_item.exists():
                if dest_item.is_dir():
                    shutil.rmtree(dest_item)
                else:
                    dest_item.unlink()
            shutil.move(str(item), str(dest_item))
        print(f"šŸ“¦ Files saved to: {output_dir}")

        print("\nšŸ“‹ Generated files:")
        for item in output_dir.rglob("*"):
            if item.is_file():
                rel_path = item.relative_to(output_dir)
                print(f" - {rel_path}")

        return True

    except Exception as e:
        print(f"āœ— Error occurred during parsing: {e!s}")
        import traceback

        traceback.print_exc()
        return False

    finally:
        # Best-effort cleanup on every exit path (success, failure, exception).
        # A cleanup problem must not turn a successful parse into a failure.
        if temp_output is not None:
            shutil.rmtree(temp_output, ignore_errors=True)
171
+
172
+
173
def main():
    """Command-line entry point: parse one PDF and exit with 0 on success, 1 on failure."""
    cli = argparse.ArgumentParser(
        description="Parse PDF files using MinerU and save results to reference_papers directory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Parse a single PDF file
  python pdf_parser.py /path/to/paper.pdf

  # Parse PDF and specify output directory
  python pdf_parser.py /path/to/paper.pdf -o /custom/output/dir
        """,
    )
    cli.add_argument("pdf_path", type=str, help="Path to PDF file")
    cli.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Base path for output directory (default: reference_papers)",
    )
    options = cli.parse_args()

    # The process exit status mirrors the boolean result of the parse.
    if parse_pdf_with_mineru(options.pdf_path, options.output):
        print("\nāœ“ Parsing completed!")
        sys.exit(0)

    print("\nāœ— Parsing failed!")
    sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,397 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Extract question information from MinerU-parsed exam papers
5
+
6
+ This script reads MinerU-parsed markdown files and content_list.json,
7
+ uses LLM to analyze and extract all questions, including question content and related images.
8
+
9
+ Uses the unified LLM Factory for all LLM calls, supporting:
10
+ - Cloud providers (OpenAI, Anthropic, DeepSeek, etc.)
11
+ - Local providers (Ollama, LM Studio, vLLM, etc.)
12
+ - Automatic retry with exponential backoff
13
+ """
14
+
15
+ import argparse
16
+ import asyncio
17
+ from datetime import datetime
18
+ import json
19
+ from pathlib import Path
20
+ import sys
21
+ from typing import Any
22
+
23
+ # Project root is 3 levels up from src/tools/question/
24
+ project_root = Path(__file__).parent.parent.parent.parent
25
+ sys.path.insert(0, str(project_root))
26
+
27
+ from src.services.config import get_agent_params
28
+ from src.services.llm import complete as llm_complete
29
+ from src.services.llm.capabilities import supports_response_format
30
+ from src.services.llm.config import get_llm_config
31
+ from src.utils.json_parser import parse_json_response
32
+
33
+
34
+ def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, Path]:
35
+ """
36
+ Load MinerU-parsed exam paper files
37
+
38
+ Args:
39
+ paper_dir: MinerU output directory (e.g., reference_papers/paper_name_20241129/)
40
+
41
+ Returns:
42
+ (markdown_content, content_list, images_dir)
43
+ """
44
+ auto_dir = paper_dir / "auto"
45
+ if not auto_dir.exists():
46
+ auto_dir = paper_dir
47
+
48
+ md_files = list(auto_dir.glob("*.md"))
49
+ if not md_files:
50
+ print(f"āœ— Error: No markdown file found in {auto_dir}")
51
+ return None, None, auto_dir / "images"
52
+
53
+ md_file = md_files[0]
54
+ print(f"šŸ“„ Found markdown file: {md_file.name}")
55
+
56
+ with open(md_file, encoding="utf-8") as f:
57
+ markdown_content = f.read()
58
+
59
+ json_files = list(auto_dir.glob("*_content_list.json"))
60
+ content_list = None
61
+ if json_files:
62
+ json_file = json_files[0]
63
+ print(f"šŸ“‹ Found content_list file: {json_file.name}")
64
+ with open(json_file, encoding="utf-8") as f:
65
+ content_list = json.load(f)
66
+ else:
67
+ print("āš ļø Warning: content_list.json file not found, will use markdown content only")
68
+
69
+ images_dir = auto_dir / "images"
70
+ if images_dir.exists():
71
+ image_count = len(list(images_dir.glob("*")))
72
+ print(f"šŸ–¼ļø Found image directory: {image_count} images")
73
+ else:
74
+ print("āš ļø Warning: images directory not found")
75
+
76
+ return markdown_content, content_list, images_dir
77
+
78
+
79
def _run_coroutine_blocking(coro):
    """Run ``coro`` to completion from synchronous code and return its result."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() can create
        # and own a fresh loop for the duration of the call.
        return asyncio.run(coro)

    # A loop is already running in this thread and cannot be re-entered, so
    # drive the coroutine on its own loop in a worker thread and block here
    # until it finishes.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, coro).result()


def extract_questions_with_llm(
    markdown_content: str,
    content_list: list[dict] | None,
    images_dir: Path,
    api_key: str,
    base_url: str,
    model: str,
    api_version: str | None = None,
    binding: str | None = None,
) -> list[dict[str, Any]]:
    """
    Use LLM to analyze markdown content and extract questions

    Only the first 15000 characters of ``markdown_content`` are sent to the
    model. The call is made through the unified LLM Factory and blocks until
    the response is available, whether or not an event loop is already
    running in the calling thread.

    Args:
        markdown_content: Document content in Markdown format
        content_list: MinerU-generated content_list (optional); accepted for
            interface compatibility, not used when building the prompt
        images_dir: Image directory path
        api_key: OpenAI API key
        base_url: API endpoint URL
        model: Model name
        api_version: API version for Azure OpenAI (optional)
        binding: Provider binding type (optional); falls back to the
            LLM_BINDING environment variable, then to "openai"

    Returns:
        Question list, each question contains:
        {
            "question_number": Question number,
            "question_text": Question text content (multiple choice includes options),
            "images": [List of relative paths to related images]
        }

    Raises:
        ValueError: If the LLM response is empty or is not a JSON object.
    """
    import os

    binding = binding or os.getenv("LLM_BINDING", "openai")

    image_suffixes = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
    image_list = []
    if images_dir.exists():
        image_list = [
            img_file.name
            for img_file in sorted(images_dir.glob("*"))
            if img_file.suffix.lower() in image_suffixes
        ]

    system_prompt = """You are a professional exam paper analysis assistant. Your task is to extract all question information from the provided exam paper content.

Please carefully analyze the exam paper content and extract the following information for each question:
1. Question number (e.g., "1.", "Question 1", etc.)
2. Complete question text content (if multiple choice, include all options)
3. Related image file names (if the question references images)

For multiple choice questions, please merge the stem and all options into one complete question text, for example:
"1. Which of the following descriptions about neural networks is correct? ()\nA. Option A content\nB. Option B content\nC. Option C content\nD. Option D content"

Please return results in JSON format as follows:
```json
{
    "questions": [
        {
            "question_number": "1",
            "question_text": "Complete question content (including options)...",
            "images": ["image_001.jpg", "image_002.jpg"]
        },
        {
            "question_number": "2",
            "question_text": "Complete content of another question...",
            "images": []
        }
    ]
}
```

Important Notes:
1. Ensure all questions are extracted, do not miss any
2. Keep the original question text, do not modify or summarize
3. For multiple choice questions, must merge stem and options in question_text
4. If a question has no associated images, set images field to empty array []
5. Image file names should be actual existing file names
6. Ensure the returned format is valid JSON
"""

    user_prompt = f"""Exam paper content (Markdown format):

{markdown_content[:15000]}

Available image files:
{json.dumps(image_list, ensure_ascii=False, indent=2)}

Please analyze the above exam paper content, extract all question information, and return in JSON format.
"""

    print("\nšŸ¤– Using LLM to analyze questions...")
    print(f"šŸ“Š Model: {model}")
    print(f"šŸ“ Document length: {len(markdown_content)} characters")
    print(f"šŸ–¼ļø Available images: {len(image_list)}")

    # Get agent parameters from unified config
    agent_params = get_agent_params("question")

    # Build kwargs for LLM Factory
    llm_kwargs = {
        "temperature": agent_params["temperature"],
        "max_tokens": agent_params["max_tokens"],
    }

    # Only add response_format if the provider supports it
    if supports_response_format(binding, model):
        llm_kwargs["response_format"] = {"type": "json_object"}

    # Call LLM via unified Factory (async, so it is driven to completion here)
    result_text = _run_coroutine_blocking(
        llm_complete(
            prompt=user_prompt,
            system_prompt=system_prompt,
            model=model,
            api_key=api_key,
            base_url=base_url,
            api_version=api_version,
            binding=binding,
            **llm_kwargs,
        )
    )

    # Computed up front so the error path below never slices None.
    preview = (result_text or "")[:500]

    # Parse JSON response
    try:
        if not result_text:
            raise ValueError("LLM returned empty or None response")
        result = parse_json_response(result_text, logger_instance=None, fallback={})
        if result is None:
            raise ValueError("JSON parsing returned None")
        if not isinstance(result, dict):
            raise ValueError(
                f"Expected a JSON object with a 'questions' key, got {type(result).__name__}"
            )
    except Exception as e:
        print(f"āœ— JSON parsing error: {e!s}")
        print(f"LLM response content: {preview}...")
        raise ValueError(
            f"Failed to parse LLM JSON response: {e}. "
            f"Raw response (first 500 chars): {preview!r}"
        ) from e

    # A missing or null "questions" entry is treated as "no questions".
    questions = result.get("questions") or []
    print(f"āœ“ Successfully extracted {len(questions)} questions")

    return questions
263
+
264
+
265
def save_questions_json(questions: list[dict[str, Any]], output_dir: Path, paper_name: str) -> Path:
    """
    Save question information as JSON file

    The file is written to ``output_dir`` (created if missing) as
    ``<paper_name>_<YYYYmmdd_HHMMSS>_questions.json`` and holds the paper
    name, the extraction time, the question count and the questions.

    Args:
        questions: Question list
        output_dir: Output directory
        paper_name: Paper name

    Returns:
        Saved file path
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read the clock once so the timestamp in the file name and the
    # extraction_time stored in the payload always describe the same instant.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    output_data = {
        "paper_name": paper_name,
        "extraction_time": now.isoformat(),
        "total_questions": len(questions),
        "questions": questions,
    }

    output_file = output_dir / f"{paper_name}_{timestamp}_questions.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"šŸ’¾ Question information saved to: {output_file.name}")

    print("\nšŸ“‹ Question statistics:")
    print(f" Total questions: {len(questions)}")

    # A question counts as "with images" when its images list is non-empty.
    questions_with_images = sum(1 for q in questions if q.get("images"))
    print(f" Questions with images: {questions_with_images}")

    return output_file
301
+
302
+
303
def extract_questions_from_paper(paper_dir: str, output_dir: str | None = None) -> bool:
    """
    Run the full extraction for one parsed paper: load it, query the LLM, save the result

    Args:
        paper_dir: MinerU-parsed directory path
        output_dir: Output directory (default: paper_dir)

    Returns:
        True when questions were extracted and written; False when the
        directory is missing, no paper content could be loaded, the LLM
        configuration is rejected with ValueError, or no questions came back
    """
    paper_path = Path(paper_dir).resolve()
    if not paper_path.exists():
        print(f"āœ— Error: Directory does not exist: {paper_path}")
        return False

    print(f"šŸ“ Paper directory: {paper_path}")

    markdown_content, content_list, images_dir = load_parsed_paper(paper_path)
    if not markdown_content:
        print("āœ— Error: Unable to load paper content")
        return False

    try:
        llm_config = get_llm_config()
    except ValueError as e:
        print(f"āœ— {e!s}")
        print(
            "Tip: Please create .env file in project root and configure LLM-related environment variables"
        )
        return False

    # api_version and binding are optional attributes of the config object.
    questions = extract_questions_with_llm(
        markdown_content=markdown_content,
        content_list=content_list,
        images_dir=images_dir,
        api_key=llm_config.api_key,
        base_url=llm_config.base_url,
        model=llm_config.model,
        api_version=getattr(llm_config, "api_version", None),
        binding=getattr(llm_config, "binding", None),
    )
    if not questions:
        print("āš ļø Warning: No questions extracted")
        return False

    # Results go next to the parsed paper unless a directory was given.
    target_dir = paper_path if output_dir is None else Path(output_dir)
    output_file = save_questions_json(questions, target_dir, paper_path.name)

    print("\nāœ“ Question extraction completed!")
    print(f"šŸ“„ View results: {output_file}")

    return True
363
+
364
+
365
def main():
    """Command-line entry point: extract questions from one parsed paper, exit 0 on success, 1 on failure."""
    cli = argparse.ArgumentParser(
        description="Extract question information from MinerU-parsed exam papers",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract questions from parsed exam paper directory
  python question_extractor.py reference_papers/exam_20241129_143052

  # Specify output directory
  python question_extractor.py reference_papers/exam_20241129_143052 -o ./output
        """,
    )
    cli.add_argument("paper_dir", type=str, help="MinerU-parsed exam paper directory path")
    cli.add_argument(
        "-o", "--output", type=str, default=None, help="Output directory (default: paper directory)"
    )
    options = cli.parse_args()

    # The process exit status mirrors the boolean result of the extraction.
    succeeded = extract_questions_from_paper(options.paper_dir, options.output)
    sys.exit(0 if succeeded else 1)


if __name__ == "__main__":
    main()