realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,799 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ CitationManager - Citation management system
5
+ Responsible for extracting citation information from tool calls and managing citation JSON files
6
+ """
7
+
8
+ import asyncio
9
+ from datetime import datetime
10
+ import json
11
+ from pathlib import Path
12
+ import sys
13
+ from typing import Any
14
+
15
# Directory four levels above this file; it is pushed to the front of the
# import search path (presumably so project-level imports resolve - confirm).
project_root = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, f"{project_root}")
17
+
18
+
19
+ class CitationManager:
20
+ """Citation manager with global ID management"""
21
+
22
def __init__(self, research_id: str, cache_dir: Path | None = None):
    """
    Set up the citation store for one research task

    The cache directory is created when missing, and citations already
    saved in its citations.json are loaded.

    Args:
        research_id: Research task ID
        cache_dir: Directory that holds citations.json; when None,
            ./cache/<research_id> is used
    """
    self.research_id = research_id

    chosen_dir = Path("./cache") / research_id if cache_dir is None else cache_dir
    self.cache_dir = Path(chosen_dir)
    self.cache_dir.mkdir(parents=True, exist_ok=True)
    self.citations_file = self.cache_dir / "citations.json"

    # In-memory store: citation_id -> citation record
    self._citations: dict[str, dict[str, Any]] = {}

    # ID counters: one for PLAN-XX ids, one per block for CIT-X-XX ids
    self._plan_counter = 0
    self._block_counters: dict[str, int] = {}

    # citation_id -> reference number used for in-text citations
    self._ref_number_map: dict[str, int] = {}

    # asyncio lock for callers that serialize coroutine access; it is not
    # acquired by any of the synchronous methods visible in this part of the file
    self._lock = asyncio.Lock()

    self._load_citations()
50
+
51
def generate_plan_citation_id(self) -> str:
    """
    Allocate the next planning-stage citation ID

    Advances the plan counter by one and formats it with at least two
    digits.

    Returns:
        Citation ID in PLAN-XX format
    """
    next_number = self._plan_counter + 1
    self._plan_counter = next_number
    return f"PLAN-{next_number:02d}"
60
+
61
def generate_research_citation_id(self, block_id: str) -> str:
    """
    Allocate the next research-stage citation ID for a block

    The block number is the integer found after the first underscore of
    block_id; when block_id is empty, has no underscore, or that part is
    not an integer, block number 0 is used.

    Args:
        block_id: Block ID (e.g., "block_3")

    Returns:
        Citation ID in CIT-X-XX format
    """
    block_num = 0
    if block_id and "_" in block_id:
        try:
            block_num = int(block_id.split("_")[1])
        except (ValueError, IndexError):
            block_num = 0

    # Counters are keyed by the block number rendered as a string
    counter_key = str(block_num)
    sequence = self._block_counters.get(counter_key, 0) + 1
    self._block_counters[counter_key] = sequence

    return f"CIT-{block_num}-{sequence:02d}"
86
+
87
def get_next_citation_id(self, stage: str = "research", block_id: str = "") -> str:
    """
    Allocate the next citation ID for the given stage

    Args:
        stage: "planning" yields a PLAN-XX id; any other value is treated
            as the research stage
        block_id: Block ID used for research-stage ids

    Returns:
        Next available citation ID
    """
    if stage != "planning":
        return self.generate_research_citation_id(block_id)
    return self.generate_plan_citation_id()
101
+
102
def citation_exists(self, citation_id: str) -> bool:
    """
    Report whether a citation with this ID is stored

    Args:
        citation_id: Citation ID to look up

    Returns:
        True if the ID is a key of the in-memory citation store,
        False otherwise
    """
    stored = self._citations
    return citation_id in stored
113
+
114
+ def _load_citations(self):
115
+ """Load citation information from JSON file and restore counters"""
116
+ if self.citations_file.exists():
117
+ try:
118
+ with open(self.citations_file, encoding="utf-8") as f:
119
+ data = json.load(f)
120
+ self._citations = data.get("citations", {})
121
+
122
+ # Try to restore counters from saved state first
123
+ counters = data.get("counters", {})
124
+ if counters:
125
+ self._plan_counter = counters.get("plan_counter", 0)
126
+ self._block_counters = counters.get("block_counters", {})
127
+ else:
128
+ # Fallback: restore counters from existing citations
129
+ self._restore_counters_from_citations()
130
+ except Exception as e:
131
+ print(f"⚠️ Failed to load citation file: {e}")
132
+ self._citations = {}
133
+ else:
134
+ self._citations = {}
135
+
136
+ def _restore_counters_from_citations(self):
137
+ """Restore citation counters from existing citations to avoid ID conflicts"""
138
+ for citation_id in self._citations.keys():
139
+ if citation_id.startswith("PLAN-"):
140
+ try:
141
+ num = int(citation_id.replace("PLAN-", ""))
142
+ self._plan_counter = max(self._plan_counter, num)
143
+ except ValueError:
144
+ pass
145
+ elif citation_id.startswith("CIT-"):
146
+ try:
147
+ parts = citation_id.replace("CIT-", "").split("-")
148
+ if len(parts) == 2:
149
+ block_num = parts[0]
150
+ seq_num = int(parts[1])
151
+ if block_num not in self._block_counters:
152
+ self._block_counters[block_num] = 0
153
+ self._block_counters[block_num] = max(
154
+ self._block_counters[block_num], seq_num
155
+ )
156
+ except (ValueError, IndexError):
157
+ pass
158
+
159
+ def _save_citations(self):
160
+ """Save citation information to JSON file"""
161
+ try:
162
+ data = {
163
+ "research_id": self.research_id,
164
+ "updated_at": datetime.now().isoformat(),
165
+ "citations": self._citations,
166
+ "counters": {
167
+ "plan_counter": self._plan_counter,
168
+ "block_counters": self._block_counters,
169
+ },
170
+ }
171
+ with open(self.citations_file, "w", encoding="utf-8") as f:
172
+ json.dump(data, f, ensure_ascii=False, indent=2)
173
+ except Exception as e:
174
+ print(f"⚠️ Failed to save citation file: {e}")
175
+
176
def validate_citation_references(self, text: str) -> dict[str, Any]:
    """
    Check every [[ID]] citation reference in text against the stored citations

    Args:
        text: Text containing citation references like [[CIT-X-XX]]

    Returns:
        Dictionary with validation results:
        {
            "valid_citations": [...],    # referenced ids that are stored
            "invalid_citations": [...],  # referenced ids that are not stored
            "is_valid": bool,            # True when no invalid id was found
            "total_found": int           # number of references found
        }
        Repeated references are listed once per occurrence.
    """
    import re

    references = re.findall(r"\[\[([A-Z]+-\d+-?\d*)\]\]", text)

    known = []
    unknown = []
    for reference in references:
        bucket = known if self.citation_exists(reference) else unknown
        bucket.append(reference)

    return {
        "valid_citations": known,
        "invalid_citations": unknown,
        "is_valid": not unknown,
        "total_found": len(references),
    }
212
+
213
def fix_invalid_citations(self, text: str) -> str:
    """
    Strip linked citation references whose ID is not stored

    Only references written in the linked form [[ID]](#ref-id) are
    examined; a bare [[ID]] without the link part is left as it is.

    Args:
        text: Text containing citation references

    Returns:
        Text in which every linked reference to an unknown citation has
        been replaced by an empty string
    """
    import re

    linked_reference = re.compile(r"\[\[([A-Z]+-\d+-?\d*)\]\]\(#ref-[a-z]+-\d+-?\d*\)")

    def keep_if_known(found):
        # Group 1 is the citation ID; group 0 is the whole linked reference
        return found.group(0) if self.citation_exists(found.group(1)) else ""

    return linked_reference.sub(keep_if_known, text)
234
+
235
def add_citation(
    self,
    citation_id: str,
    tool_type: str,
    tool_trace: Any,
    raw_answer: str,  # Raw answer JSON string
) -> bool:
    """
    Build a citation record for a tool call, store it, and save the file

    The extractor is chosen from the lower-cased tool type; tool types
    without a dedicated extractor get the generic record.

    Args:
        citation_id: Citation ID
        tool_type: Tool type
        tool_trace: ToolTrace object
        raw_answer: Raw answer (JSON string)

    Returns:
        True when a record was stored; False when the extractor returned
        nothing or any exception occurred (a warning is printed then)
    """
    # Extractors that also read the raw answer, by lower-cased tool type
    answer_based_extractors = {
        "rag_naive": "_extract_rag_citation",
        "rag_hybrid": "_extract_rag_citation",
        "query_item": "_extract_rag_citation",
        "web_search": "_extract_web_citation",
        "paper_search": "_extract_paper_citation",
    }

    try:
        kind = tool_type.lower()

        if kind in answer_based_extractors:
            extract = getattr(self, answer_based_extractors[kind])
            record = extract(citation_id, tool_type, raw_answer, tool_trace)
        elif kind == "run_code":
            record = self._extract_code_citation(citation_id, tool_type, tool_trace)
        else:
            # Unknown tool type, use generic format
            record = self._extract_generic_citation(citation_id, tool_type, tool_trace)

        if not record:
            return False

        self._citations[citation_id] = record
        self._save_citations()
        return True
    except Exception as e:
        print(f"⚠️ Failed to add citation (citation_id={citation_id}): {e}")
        return False
283
+
284
+ def _extract_rag_citation(
285
+ self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
286
+ ) -> dict[str, Any]:
287
+ """Extract citation information for RAG retrieval with source documents"""
288
+ citation_info = {
289
+ "citation_id": citation_id,
290
+ "tool_type": tool_type,
291
+ "query": tool_trace.query,
292
+ "summary": tool_trace.summary,
293
+ "timestamp": tool_trace.timestamp,
294
+ "sources": [], # List of source documents
295
+ }
296
+
297
+ try:
298
+ # Parse raw_answer to extract source information
299
+ answer_data = json.loads(raw_answer)
300
+
301
+ # Extract source documents if available
302
+ # Common fields in RAG responses: chunks, documents, sources, context
303
+ sources = []
304
+
305
+ # Try different field names for source documents
306
+ for field_name in ["chunks", "documents", "sources", "context", "retrieved_docs"]:
307
+ if field_name in answer_data:
308
+ source_list = answer_data[field_name]
309
+ if isinstance(source_list, list):
310
+ for i, doc in enumerate(source_list[:5]): # Limit to 5 sources
311
+ source_info = {}
312
+ if isinstance(doc, dict):
313
+ source_info["title"] = doc.get("title", doc.get("doc_title", ""))
314
+ source_info["content_preview"] = doc.get(
315
+ "content", doc.get("text", "")
316
+ )[:200]
317
+ source_info["source_file"] = doc.get(
318
+ "source", doc.get("file_path", doc.get("filename", ""))
319
+ )
320
+ source_info["page"] = doc.get("page", doc.get("page_number", ""))
321
+ source_info["chunk_id"] = doc.get("chunk_id", doc.get("id", i))
322
+ source_info["score"] = doc.get("score", doc.get("similarity", ""))
323
+ elif isinstance(doc, str):
324
+ source_info["content_preview"] = doc[:200]
325
+ if source_info:
326
+ sources.append(source_info)
327
+ break
328
+
329
+ # Also extract kb_name if available
330
+ citation_info["kb_name"] = answer_data.get("kb_name", "")
331
+ citation_info["sources"] = sources
332
+ citation_info["total_sources"] = len(sources)
333
+
334
+ except (json.JSONDecodeError, Exception) as e:
335
+ # If parsing fails, still return basic citation info
336
+ print(f"⚠️ Failed to parse RAG source info: {e}")
337
+
338
+ return citation_info
339
+
340
+ def _extract_web_citation(
341
+ self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
342
+ ) -> dict[str, Any]:
343
+ """Extract citation information for web search with URLs"""
344
+ citation_info = {
345
+ "citation_id": citation_id,
346
+ "tool_type": tool_type,
347
+ "query": tool_trace.query,
348
+ "summary": tool_trace.summary,
349
+ "timestamp": tool_trace.timestamp,
350
+ "web_sources": [], # List of web sources with URLs
351
+ }
352
+
353
+ try:
354
+ # Parse raw_answer to extract web source information
355
+ answer_data = json.loads(raw_answer)
356
+
357
+ web_sources = []
358
+
359
+ # Try different field names for web results
360
+ for field_name in ["results", "web_results", "search_results", "urls"]:
361
+ if field_name in answer_data:
362
+ result_list = answer_data[field_name]
363
+ if isinstance(result_list, list):
364
+ for result in result_list[:5]: # Limit to 5 sources
365
+ if isinstance(result, dict):
366
+ web_source = {
367
+ "title": result.get("title", ""),
368
+ "url": result.get("url", result.get("link", "")),
369
+ "snippet": result.get("snippet", result.get("description", ""))[
370
+ :200
371
+ ],
372
+ "domain": result.get("domain", ""),
373
+ }
374
+ if web_source["url"]: # Only add if URL exists
375
+ web_sources.append(web_source)
376
+ break
377
+
378
+ citation_info["web_sources"] = web_sources
379
+ citation_info["total_sources"] = len(web_sources)
380
+
381
+ except (json.JSONDecodeError, Exception) as e:
382
+ # If parsing fails, still return basic citation info
383
+ print(f"⚠️ Failed to parse web source info: {e}")
384
+
385
+ return citation_info
386
+
387
+ def _extract_paper_citation(
388
+ self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
389
+ ) -> dict[str, Any]:
390
+ """Extract citation information for paper search - supports multiple papers"""
391
+ citation_info = {
392
+ "citation_id": citation_id,
393
+ "tool_type": tool_type,
394
+ "query": tool_trace.query,
395
+ "summary": tool_trace.summary,
396
+ "timestamp": tool_trace.timestamp,
397
+ "papers": [], # Store all papers, not just the first one
398
+ }
399
+
400
+ try:
401
+ # Parse raw_answer JSON
402
+ answer_data = json.loads(raw_answer)
403
+ papers = answer_data.get("papers", [])
404
+
405
+ if not papers:
406
+ # If no papers, return basic info
407
+ return citation_info
408
+
409
+ # Process ALL papers (up to 5 for practicality)
410
+ processed_papers = []
411
+ for paper in papers[:5]:
412
+ # Format authors
413
+ authors = paper.get("authors", [])
414
+ author_str = ", ".join(authors[:3]) # Display at most 3 authors
415
+ if len(authors) > 3:
416
+ author_str += " et al."
417
+
418
+ paper_info = {
419
+ "title": paper.get("title", ""),
420
+ "authors": author_str,
421
+ "authors_list": authors,
422
+ "year": paper.get("year", ""),
423
+ "url": paper.get("url", ""),
424
+ "arxiv_id": paper.get("arxiv_id", ""),
425
+ "abstract": paper.get("abstract", "")[:300], # Truncate abstract
426
+ "doi": paper.get("doi", ""),
427
+ "venue": paper.get("venue", paper.get("journal", "")),
428
+ }
429
+ processed_papers.append(paper_info)
430
+
431
+ citation_info["papers"] = processed_papers
432
+ citation_info["total_papers"] = len(processed_papers)
433
+
434
+ # Keep primary paper info at top level for backward compatibility
435
+ if processed_papers:
436
+ primary = processed_papers[0]
437
+ citation_info["title"] = primary["title"]
438
+ citation_info["authors"] = primary["authors"]
439
+ citation_info["authors_list"] = primary["authors_list"]
440
+ citation_info["year"] = primary["year"]
441
+ citation_info["url"] = primary["url"]
442
+ citation_info["arxiv_id"] = primary["arxiv_id"]
443
+
444
+ return citation_info
445
+ except Exception as e:
446
+ print(f"⚠️ Failed to parse paper citation: {e}")
447
+ # Still return the basic citation info
448
+ return citation_info
449
+
450
+ def _extract_code_citation(
451
+ self, citation_id: str, tool_type: str, tool_trace: Any
452
+ ) -> dict[str, Any]:
453
+ """Extract citation information for code execution"""
454
+ return {
455
+ "citation_id": citation_id,
456
+ "tool_type": tool_type,
457
+ "query": tool_trace.query, # Code content
458
+ "summary": tool_trace.summary,
459
+ "timestamp": tool_trace.timestamp,
460
+ }
461
+
462
+ def _extract_generic_citation(
463
+ self, citation_id: str, tool_type: str, tool_trace: Any
464
+ ) -> dict[str, Any]:
465
+ """Extract generic citation information (unknown tool type)"""
466
+ return {
467
+ "citation_id": citation_id,
468
+ "tool_type": tool_type,
469
+ "query": tool_trace.query,
470
+ "summary": tool_trace.summary,
471
+ "timestamp": tool_trace.timestamp,
472
+ }
473
+
474
+ def get_citation(self, citation_id: str) -> dict[str, Any] | None:
475
+ """Get citation information for specified citation ID"""
476
+ return self._citations.get(citation_id)
477
+
478
def get_all_citations(self) -> dict[str, dict[str, Any]]:
    """Return a shallow copy of the citation store (the records themselves are shared)"""
    return dict(self._citations)
481
+
482
def get_citations_file_path(self) -> Path:
    """Return the path of the citations JSON file"""
    json_path = self.citations_file
    return json_path
485
+
486
+ def format_citation_for_report(self, citation_id: str) -> str | None:
487
+ """
488
+ Format citation information for report display
489
+
490
+ Args:
491
+ citation_id: Citation ID
492
+
493
+ Returns:
494
+ Formatted citation string, or None if not found
495
+ """
496
+ citation = self.get_citation(citation_id)
497
+ if not citation:
498
+ return None
499
+
500
+ tool_type = citation.get("tool_type", "").lower()
501
+
502
+ if tool_type == "paper_search":
503
+ # Standard academic citation format
504
+ title = citation.get("title", "")
505
+ authors = citation.get("authors", "")
506
+ year = citation.get("year", "")
507
+ url = citation.get("url", "")
508
+ arxiv_id = citation.get("arxiv_id", "")
509
+
510
+ # Build citation string
511
+ parts = []
512
+ if authors:
513
+ parts.append(authors)
514
+ if year:
515
+ parts.append(f"({year})")
516
+ if title:
517
+ parts.append(f'"{title}"')
518
+ if arxiv_id:
519
+ parts.append(f"arXiv:{arxiv_id}")
520
+ if url:
521
+ parts.append(f"<{url}>")
522
+
523
+ # Add note about additional papers if available
524
+ total_papers = citation.get("total_papers", 1)
525
+ if total_papers > 1:
526
+ parts.append(f"[+{total_papers - 1} more papers]")
527
+
528
+ return " ".join(parts) if parts else None
529
+
530
+ if tool_type in ("rag_naive", "rag_hybrid", "query_item"):
531
+ # RAG citation with source info
532
+ query = citation.get("query", "")
533
+ kb_name = citation.get("kb_name", "")
534
+ sources = citation.get("sources", [])
535
+
536
+ tool_type_display = {
537
+ "rag_naive": "RAG Retrieval",
538
+ "rag_hybrid": "Hybrid RAG Retrieval",
539
+ "query_item": "Knowledge Base Query",
540
+ }.get(tool_type, tool_type)
541
+
542
+ parts = [f"{tool_type_display}: {query}"]
543
+ if kb_name:
544
+ parts.append(f"[KB: {kb_name}]")
545
+ if sources:
546
+ source_titles = [s.get("title", s.get("source_file", "")) for s in sources[:3] if s]
547
+ source_titles = [t for t in source_titles if t]
548
+ if source_titles:
549
+ parts.append(f"[Sources: {', '.join(source_titles)}]")
550
+
551
+ return " ".join(parts)
552
+
553
+ if tool_type == "web_search":
554
+ # Web search with URLs
555
+ query = citation.get("query", "")
556
+ web_sources = citation.get("web_sources", [])
557
+
558
+ parts = [f"Web Search: {query}"]
559
+ if web_sources:
560
+ urls = [s.get("url", "") for s in web_sources[:3] if s.get("url")]
561
+ if urls:
562
+ parts.append(f"[URLs: {', '.join(urls)}]")
563
+
564
+ return " ".join(parts)
565
+
566
+ # Other types of citation formats
567
+ tool_type_display = {
568
+ "run_code": "Code Execution",
569
+ }.get(tool_type, tool_type)
570
+
571
+ query = citation.get("query", "")
572
+ return f"{tool_type_display}: {query}"
573
+
574
+ # ========== Reference Number Mapping Methods ==========
575
+
576
+ def _get_citation_dedup_key(self, citation: dict, paper: dict = None) -> str:
577
+ """
578
+ Generate unique key for citation deduplication
579
+
580
+ Deduplication is ONLY applied to paper_search citations where the same paper
581
+ (title + first author) is cited multiple times. All other citation types
582
+ get unique ref_numbers based on their citation_id.
583
+
584
+ Args:
585
+ citation: The citation dict
586
+ paper: Optional paper dict for paper_search citations
587
+
588
+ Returns:
589
+ Unique string key for deduplication
590
+ """
591
+ tool_type = citation.get("tool_type", "").lower()
592
+ citation_id = citation.get("citation_id", "")
593
+
594
+ if tool_type == "paper_search" and paper:
595
+ # For papers: use title + first author (normalized) - allow dedup for same paper
596
+ title = paper.get("title", "").lower().strip()
597
+ authors = paper.get("authors", "").lower().strip()
598
+ # Extract first author if multiple
599
+ first_author = authors.split(",")[0].strip() if authors else ""
600
+ if title: # Only dedup if we have a title
601
+ return f"paper:{title}|{first_author}"
602
+ # No title? Use citation_id to ensure unique
603
+ return f"unique:{citation_id}"
604
+ elif tool_type == "paper_search":
605
+ # Fallback for paper_search without paper dict
606
+ title = citation.get("title", "").lower().strip()
607
+ authors = citation.get("authors", "").lower().strip()
608
+ first_author = authors.split(",")[0].strip() if authors else ""
609
+ if title: # Only dedup if we have a title
610
+ return f"paper:{title}|{first_author}"
611
+ return f"unique:{citation_id}"
612
+ else:
613
+ # For RAG/web_search/etc: each citation gets unique ref_number
614
+ # Use citation_id to ensure each citation is unique
615
+ return f"unique:{citation_id}"
616
+
617
+ def _extract_citation_sort_key(self, citation_id: str) -> tuple:
618
+ """
619
+ Extract numeric sort key from citation ID for ordering
620
+
621
+ Args:
622
+ citation_id: Citation ID (e.g., "PLAN-01", "CIT-1-02")
623
+
624
+ Returns:
625
+ Tuple for sorting (stage, block_num, seq_num)
626
+ """
627
+ try:
628
+ if citation_id.startswith("PLAN-"):
629
+ # PLAN-XX format: put at the beginning
630
+ num = int(citation_id.replace("PLAN-", ""))
631
+ return (0, 0, num)
632
+ # CIT-X-XX format
633
+ parts = citation_id.replace("CIT-", "").split("-")
634
+ if len(parts) == 2:
635
+ return (1, int(parts[0]), int(parts[1]))
636
+ except (ValueError, IndexError):
637
+ pass
638
+ return (999, 999, 999)
639
+
640
+ def build_ref_number_map(self) -> dict[str, int]:
641
+ """
642
+ Build citation_id to reference number mapping with deduplication.
643
+ This is the single source of truth for ref_number assignment.
644
+
645
+ Returns:
646
+ Dictionary mapping citation_id to reference number (1-based)
647
+ """
648
+ if not self._citations:
649
+ self._ref_number_map = {}
650
+ return self._ref_number_map
651
+
652
+ # Sort all citation IDs by their numeric parts
653
+ sorted_citation_ids = sorted(self._citations.keys(), key=self._extract_citation_sort_key)
654
+
655
+ # Track seen dedup keys and their assigned ref_numbers
656
+ seen_keys: dict[str, int] = {}
657
+ ref_idx = 0
658
+ ref_map: dict[str, int] = {}
659
+
660
+ for citation_id in sorted_citation_ids:
661
+ citation = self._citations.get(citation_id)
662
+ if not citation:
663
+ continue
664
+
665
+ tool_type = citation.get("tool_type", "").lower()
666
+
667
+ if tool_type == "paper_search":
668
+ # paper_search may have multiple papers - each paper gets a separate ref_number
669
+ papers = citation.get("papers", [])
670
+ if papers:
671
+ for paper_idx, paper in enumerate(papers):
672
+ # Check for duplicate using dedup key
673
+ dedup_key = self._get_citation_dedup_key(citation, paper)
674
+
675
+ if dedup_key in seen_keys:
676
+ # Map to existing ref_number
677
+ existing_ref = seen_keys[dedup_key]
678
+ if paper_idx == 0:
679
+ ref_map[citation_id] = existing_ref
680
+ ref_map[f"{citation_id}-{paper_idx + 1}"] = existing_ref
681
+ else:
682
+ # New unique citation
683
+ ref_idx += 1
684
+ seen_keys[dedup_key] = ref_idx
685
+ if paper_idx == 0:
686
+ ref_map[citation_id] = ref_idx
687
+ ref_map[f"{citation_id}-{paper_idx + 1}"] = ref_idx
688
+ else:
689
+ # Paper search without papers array
690
+ dedup_key = self._get_citation_dedup_key(citation)
691
+ if dedup_key in seen_keys:
692
+ ref_map[citation_id] = seen_keys[dedup_key]
693
+ else:
694
+ ref_idx += 1
695
+ seen_keys[dedup_key] = ref_idx
696
+ ref_map[citation_id] = ref_idx
697
+ else:
698
+ # Non-paper citations
699
+ dedup_key = self._get_citation_dedup_key(citation)
700
+ if dedup_key in seen_keys:
701
+ ref_map[citation_id] = seen_keys[dedup_key]
702
+ else:
703
+ ref_idx += 1
704
+ seen_keys[dedup_key] = ref_idx
705
+ ref_map[citation_id] = ref_idx
706
+
707
+ self._ref_number_map = ref_map
708
+ return ref_map
709
+
710
+ def get_ref_number(self, citation_id: str) -> int:
711
+ """
712
+ Get the reference number for a citation ID.
713
+ If the map hasn't been built yet, build it first.
714
+
715
+ Args:
716
+ citation_id: Citation ID
717
+
718
+ Returns:
719
+ Reference number (1-based), or 0 if not found
720
+ """
721
+ if not self._ref_number_map:
722
+ self.build_ref_number_map()
723
+ return self._ref_number_map.get(citation_id, 0)
724
+
725
+ def get_ref_number_map(self) -> dict[str, int]:
726
+ """
727
+ Get the full reference number map.
728
+ If the map hasn't been built yet, build it first.
729
+
730
+ Returns:
731
+ Dictionary mapping citation_id to reference number
732
+ """
733
+ if not self._ref_number_map:
734
+ self.build_ref_number_map()
735
+ return self._ref_number_map.copy()
736
+
737
+ # ========== Async thread-safe methods for parallel mode ==========
738
+
739
async def generate_plan_citation_id_async(self) -> str:
    """
    Async wrapper for generate_plan_citation_id used in parallel mode;
    the call runs while self._lock is held

    Returns:
        Citation ID in PLAN-XX format
    """
    async with self._lock:
        plan_citation_id = self.generate_plan_citation_id()
    return plan_citation_id
748
+
749
async def generate_research_citation_id_async(self, block_id: str) -> str:
    """
    Async wrapper for generate_research_citation_id used in parallel mode;
    the call runs while self._lock is held

    Args:
        block_id: Block ID (e.g., "block_3")

    Returns:
        Citation ID in CIT-X-XX format
    """
    async with self._lock:
        research_citation_id = self.generate_research_citation_id(block_id)
    return research_citation_id
761
+
762
async def get_next_citation_id_async(self, stage: str = "research", block_id: str = "") -> str:
    """
    Async wrapper for get_next_citation_id used in parallel mode;
    the call runs while self._lock is held

    Args:
        stage: "planning" or "research"
        block_id: Block ID (required for research stage)

    Returns:
        Next available citation ID
    """
    async with self._lock:
        next_citation_id = self.get_next_citation_id(stage, block_id)
    return next_citation_id
775
+
776
async def add_citation_async(
    self,
    citation_id: str,
    tool_type: str,
    tool_trace: Any,
    raw_answer: str,
) -> bool:
    """
    Async wrapper for add_citation used in parallel mode;
    the call runs while self._lock is held

    Args:
        citation_id: Citation ID
        tool_type: Tool type
        tool_trace: ToolTrace object
        raw_answer: Raw answer (JSON string)

    Returns:
        Whether addition was successful
    """
    async with self._lock:
        was_added = self.add_citation(citation_id, tool_type, tool_trace, raw_answer)
    return was_added
797
+
798
+
799
# Names exported by `from <module> import *`: only CitationManager.
__all__ = ["CitationManager"]