realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,1333 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ ReportingAgent - Report generation Agent (DR-in-KG 2.0)
5
+ - Deduplication and cleaning
6
+ - Generate linear outline (introduction → sections → conclusion)
7
+ - Write final report (prefer LLM JSON return markdown, fallback to local assembly on failure)
8
+ - Inline citations and References anchors (based on citation_id)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Callable
14
+ from pathlib import Path
15
+ import re
16
+ from string import Template
17
+ import sys
18
+ from typing import Any
19
+
20
+ project_root = Path(__file__).parent.parent.parent.parent
21
+ sys.path.insert(0, str(project_root))
22
+
23
+ from src.agents.base_agent import BaseAgent
24
+ from src.agents.research.data_structures import DynamicTopicQueue, TopicBlock
25
+
26
+ from ..utils.json_utils import ensure_json_dict, ensure_keys, extract_json_from_text
27
+
28
+
29
+ class ReportingAgent(BaseAgent):
30
+ """Report generation Agent"""
31
+
32
+ @staticmethod
33
+ def _escape_braces(text: str) -> str:
34
+ """
35
+ Escape curly braces in text to prevent str.format() from interpreting them.
36
+ This is needed because JSON data may contain LaTeX formulas with braces like {L}, {x}.
37
+
38
+ Args:
39
+ text: Input text that may contain curly braces
40
+
41
+ Returns:
42
+ Text with braces escaped ({{ and }})
43
+ """
44
+ return text.replace("{", "{{").replace("}", "}}")
45
+
46
+ @staticmethod
47
+ def _convert_to_template_format(template_str: str) -> str:
48
+ """
49
+ Convert {var} style placeholders to $var style for string.Template.
50
+ This avoids conflicts with LaTeX braces like {\rho}, {L}.
51
+ """
52
+ return re.sub(r"\{(\w+)\}", r"$\1", template_str)
53
+
54
+ def _safe_format(self, template_str: str, **kwargs) -> str:
55
+ """
56
+ Safe string formatting using string.Template to avoid LaTeX brace conflicts.
57
+ Converts {var} to $var format, then uses safe_substitute.
58
+ """
59
+ converted = self._convert_to_template_format(template_str)
60
+ return Template(converted).safe_substitute(**kwargs)
61
+
62
def __init__(
    self,
    config: dict[str, Any],
    api_key: str | None = None,
    base_url: str | None = None,
    api_version: str | None = None,
):
    """
    Set up the reporting agent from the shared configuration.

    Args:
        config: Configuration dictionary; the "system" and "reporting"
            sub-dictionaries are read here
        api_key: Forwarded unchanged to the base agent
        base_url: Forwarded unchanged to the base agent
        api_version: Forwarded unchanged to the base agent
    """
    system_settings = config.get("system", {})
    super().__init__(
        module_name="research",
        agent_name="reporting_agent",
        api_key=api_key,
        base_url=base_url,
        api_version=api_version,
        # "zh" is used when the config does not name a language
        language=system_settings.get("language", "zh"),
        config=config,
    )

    reporting_settings = config.get("reporting", {})
    self.reporting_config = reporting_settings
    # Filled in later through set_citation_manager()
    self.citation_manager = None

    # Both citation features stay off unless the config enables them
    self.enable_citation_list = reporting_settings.get("enable_citation_list", False)
    self.enable_inline_citations = reporting_settings.get("enable_inline_citations", False)
85
+
86
+ def set_citation_manager(self, citation_manager):
87
+ """Set citation manager"""
88
+ self.citation_manager = citation_manager
89
+
90
async def process(
    self,
    queue: DynamicTopicQueue,
    topic: str,
    progress_callback: Callable[[dict[str, Any]], None] | None = None,
) -> dict[str, Any]:
    """
    Generate final report

    Runs three steps in order: deduplicate ``queue.blocks``, generate an
    outline, then write the report. A progress event is sent through
    ``self._notify_progress`` at the start, after each step, and once the
    statistics are ready.

    Args:
        queue: Topic queue; only ``queue.blocks`` is read here
        topic: Research topic, passed on to the outline and writing steps
        progress_callback: Optional callback forwarded to
            ``self._notify_progress`` and also kept on
            ``self._progress_callback``

    Returns:
        {
            "report": str,       # markdown returned by _write_report
            "word_count": int,   # len(report), i.e. a character count
            "sections": int,     # number of blocks kept after deduplication
            "citations": int,    # total tool traces across the kept blocks
            "outline": dict      # included while self._current_outline is set
        }
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("📄 ReportingAgent - Report Generation")
    print(f"{banner}")
    print(f"Topic: {topic}")
    print(f"Topic Blocks: {len(queue.blocks)}\n")

    # Store progress_callback for use in _write_report
    self._progress_callback = progress_callback

    self._notify_progress(
        progress_callback, "reporting_started", topic=topic, total_blocks=len(queue.blocks)
    )

    # 1) Deduplication
    print("🔄 Step 1: Deduplication and cleaning...")
    cleaned_blocks = await self._deduplicate_blocks(queue.blocks)
    print(f"✓ Cleaning completed: {len(cleaned_blocks)} topic blocks")
    self._notify_progress(
        progress_callback, "deduplicate_completed", kept_blocks=len(cleaned_blocks)
    )

    # 2) Outline
    print("\n📋 Step 2: Generating outline...")
    outline = await self._generate_outline(topic, cleaned_blocks)
    print("✓ Outline generation completed")
    self._notify_progress(
        progress_callback, "outline_completed", sections=len(outline.get("sections", []))
    )

    # Save outline for later use
    self._current_outline = outline

    # 3) Writing
    # The pencil emoji was mis-encoded in this message; restored so it renders
    # like the other step banners.
    print("\n✏️ Step 3: Writing report...")
    report_markdown = await self._write_report(topic, cleaned_blocks, outline)
    print("✓ Report writing completed")
    self._notify_progress(progress_callback, "writing_completed")

    # NOTE: len() of the markdown counts characters, not whitespace-separated words.
    word_count = len(report_markdown)
    sections = len(cleaned_blocks)
    citations = sum(len(b.tool_traces) for b in cleaned_blocks)

    print("\n📊 Report Statistics:")
    print(f" Word Count: {word_count}")
    print(f" Sections: {sections}")
    print(f" Citations: {citations}")
    self._notify_progress(
        progress_callback,
        "reporting_completed",
        word_count=word_count,
        sections=sections,
        citations=citations,
    )

    result = {
        "report": report_markdown,
        "word_count": word_count,
        "sections": sections,
        "citations": citations,
    }

    # If outline has been generated, add it to result and drop the temporary attribute
    if hasattr(self, "_current_outline"):
        result["outline"] = self._current_outline
        delattr(self, "_current_outline")

    return result
173
+
174
+ async def _deduplicate_blocks(self, blocks: list[TopicBlock]) -> list[TopicBlock]:
175
+ if len(blocks) <= 1:
176
+ return blocks
177
+ system_prompt = self.get_prompt("system", "role")
178
+ if not system_prompt:
179
+ raise ValueError(
180
+ "ReportingAgent missing system prompt, please configure system.role in prompts/{lang}/reporting_agent.yaml"
181
+ )
182
+ user_prompt = self.get_prompt("process", "deduplicate")
183
+ if not user_prompt:
184
+ raise ValueError(
185
+ "ReportingAgent missing deduplicate prompt, please configure process.deduplicate in prompts/{lang}/reporting_agent.yaml"
186
+ )
187
+ topics_text = "\n".join(
188
+ [f"{i + 1}. {b.sub_topic}: {b.overview[:200]}" for i, b in enumerate(blocks)]
189
+ )
190
+ filled = self._safe_format(user_prompt, topics=topics_text, total_topics=len(blocks))
191
+ resp = await self.call_llm(filled, system_prompt, stage="deduplicate", verbose=False)
192
+ data = extract_json_from_text(resp)
193
+ try:
194
+ obj = ensure_json_dict(data)
195
+ ensure_keys(obj, ["keep_indices"])
196
+ keep_indices = obj.get("keep_indices", [])
197
+ return [blocks[i] for i in keep_indices if isinstance(i, int) and i < len(blocks)]
198
+ except Exception:
199
+ return blocks
200
+
201
async def _generate_outline(self, topic: str, blocks: list[TopicBlock]) -> dict[str, Any]:
    """Generate report outline based on complete subtopic, overview and all tool_trace summaries

    Supports three-level heading system:
    - Level 1 (#): Report main title
    - Level 2 (##): Main sections (Introduction, Core Sections, Conclusion)
    - Level 3 (###): Subsections within each section

    Args:
        topic: Research topic; also used as the title when the LLM returns an empty one
        blocks: Topic blocks whose sub_topic, overview and trace summaries are sent to the LLM

    Returns:
        The outline dictionary from the LLM with heading prefixes normalized,
        or the default outline when the answer cannot be used

    Raises:
        ValueError: If the system or generate_outline prompt is not configured
    """

    def _as_heading(text: str, level: int) -> str:
        """Return text carrying exactly `level` leading '#' characters."""
        marker = "#" * level
        body = text.lstrip("#")
        hashes = len(text) - len(body)
        if hashes == 0:
            return f"{marker} {text}"
        if hashes == level:
            return text
        # Wrong depth (e.g. "###" given for a section): replace the prefix
        # instead of keeping it or stacking a second one in front.
        return f"{marker} {body.strip()}"

    system_prompt = self.get_prompt("system", "role")
    if not system_prompt:
        raise ValueError(
            "ReportingAgent missing system prompt, please configure system.role in prompts/{lang}/reporting_agent.yaml"
        )
    user_prompt = self.get_prompt("process", "generate_outline")
    if not user_prompt:
        raise ValueError(
            "ReportingAgent missing generate_outline prompt, please configure process.generate_outline in prompts/{lang}/reporting_agent.yaml"
        )

    # Build complete topic information, including subtopic, overview and all tool_trace summaries
    topics_data = [
        {
            "index": i,
            "block_id": block.block_id,
            "sub_topic": block.sub_topic,
            "overview": block.overview,
            "tool_summaries": [trace.summary for trace in block.tool_traces or []],
        }
        for i, block in enumerate(blocks, 1)
    ]

    import json as _json

    topics_json = _json.dumps(topics_data, ensure_ascii=False, indent=2)
    # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
    filled = self._safe_format(
        user_prompt, topic=topic, topics_json=topics_json, total_topics=len(blocks)
    )

    resp = await self.call_llm(filled, system_prompt, stage="generate_outline", verbose=False)
    data = extract_json_from_text(resp)
    try:
        obj = ensure_json_dict(data)
        ensure_keys(obj, ["title", "introduction", "sections", "conclusion"])
        # The key is guaranteed to exist, so fall back on an empty value
        # rather than relying on a .get() default
        obj["title"] = _as_heading(obj.get("title") or topic, 1)
        # Introduction and conclusion are level-2 headings
        for key in ("introduction", "conclusion"):
            if obj.get(key):
                obj[key] = _as_heading(obj[key], 2)

        # Sections are level 2, their subsections level 3
        for section in obj.get("sections", []):
            if section.get("title"):
                section["title"] = _as_heading(section["title"], 2)
            for subsection in section.get("subsections", []):
                if subsection.get("title"):
                    subsection["title"] = _as_heading(subsection["title"], 3)

        return obj
    except Exception:
        # Fallback to default outline with subsections
        return self._create_default_outline(topic, blocks)
270
+
271
+ def _create_default_outline(self, topic: str, blocks: list[TopicBlock]) -> dict[str, Any]:
272
+ """Create a default outline with three-level heading structure"""
273
+ sections = []
274
+ for i, b in enumerate(blocks, 1):
275
+ section = {
276
+ "title": f"## {i}. {b.sub_topic}",
277
+ "instruction": f"Provide detailed introduction to {b.sub_topic}, including core concepts, key mechanisms, and practical applications",
278
+ "block_id": b.block_id,
279
+ "subsections": [
280
+ {
281
+ "title": f"### {i}.1 Core Concepts and Definitions",
282
+ "instruction": f"Explain the fundamental concepts and definitions related to {b.sub_topic}",
283
+ },
284
+ {
285
+ "title": f"### {i}.2 Key Mechanisms and Principles",
286
+ "instruction": f"Analyze the underlying mechanisms and theoretical principles of {b.sub_topic}",
287
+ },
288
+ ],
289
+ }
290
+ sections.append(section)
291
+
292
+ return {
293
+ "title": f"# {topic}",
294
+ "introduction": "## Introduction",
295
+ "introduction_instruction": "Present the research background, motivation, objectives, and report structure",
296
+ "sections": sections,
297
+ "conclusion": "## Conclusion and Future Directions",
298
+ "conclusion_instruction": "Summarize core findings, research contributions, limitations, and future directions",
299
+ }
300
+
301
+ def _ser_block(self, b: TopicBlock) -> dict[str, Any]:
302
+ """Serialize TopicBlock to dictionary, including complete tool traces
303
+
304
+ If self._citation_map is available (built by _build_citation_number_map),
305
+ each trace will include a ref_number field for inline citation use.
306
+ """
307
+ traces = []
308
+ for t in b.tool_traces:
309
+ cid = getattr(t, "citation_id", None) or f"CIT-{b.block_id.split('_')[-1]}-01"
310
+ trace_data = {
311
+ "citation_id": cid,
312
+ "tool_type": t.tool_type,
313
+ "query": t.query,
314
+ "raw_answer": t.raw_answer, # Include complete original response
315
+ "summary": t.summary,
316
+ }
317
+ # Add ref_number if citation map is available
318
+ if hasattr(self, "_citation_map") and self._citation_map:
319
+ ref_num = self._citation_map.get(cid, 0)
320
+ if ref_num > 0:
321
+ trace_data["ref_number"] = ref_num
322
+ traces.append(trace_data)
323
+ return {
324
+ "block_id": b.block_id,
325
+ "sub_topic": b.sub_topic,
326
+ "overview": b.overview,
327
+ "traces": traces,
328
+ }
329
+
330
+ def _build_citation_table(self, block: TopicBlock) -> str:
331
+ """Build a clear citation reference table for LLM to understand the mapping
332
+
333
+ This creates an easy-to-read table showing:
334
+ - Reference number to use in text (use [N] format)
335
+ - Tool type
336
+ - Query summary (truncated)
337
+
338
+ Args:
339
+ block: TopicBlock containing tool traces
340
+
341
+ Returns:
342
+ Formatted citation table string
343
+ """
344
+ if not block.tool_traces:
345
+ return " (No citations available for this section)"
346
+
347
+ lines = []
348
+ for trace in block.tool_traces:
349
+ cid = getattr(trace, "citation_id", None)
350
+ if not cid:
351
+ continue
352
+
353
+ ref_num = self._citation_map.get(cid, 0) if hasattr(self, "_citation_map") else 0
354
+ if ref_num <= 0:
355
+ continue
356
+
357
+ # Truncate query for readability
358
+ query_preview = trace.query[:60] + "..." if len(trace.query) > 60 else trace.query
359
+ tool_display = {
360
+ "rag_naive": "RAG",
361
+ "rag_hybrid": "Hybrid RAG",
362
+ "query_item": "KB Query",
363
+ "paper_search": "Paper",
364
+ "web_search": "Web",
365
+ "run_code": "Code",
366
+ }.get(trace.tool_type.lower(), trace.tool_type)
367
+
368
+ # Use clear format: cite as [N] -> source description
369
+ lines.append(f" - Cite as [{ref_num}] → ({tool_display}) {query_preview}")
370
+
371
+ if not lines:
372
+ return " (No citations available for this section)"
373
+
374
+ return "\n".join(lines)
375
+
376
async def _write_introduction(
    self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
) -> str:
    """
    Write report introduction section

    Args:
        topic: Research topic inserted into the prompt
        blocks: Topic blocks; sub_topic, overview and the number of tool
            traces of each are summarized for the LLM
        outline: Report outline; "introduction_instruction" is used when
            non-empty, otherwise the "introduction" heading

    Returns:
        The non-empty "introduction" string from the LLM's JSON answer

    Raises:
        ValueError: If the prompt template is missing, or the LLM answer
            lacks a usable "introduction" field
    """
    # NOTE(review): the third get_prompt argument is presumably a fallback
    # value used when the prompt is not configured — confirm in BaseAgent.
    system_prompt = self.get_prompt(
        "system",
        "role",
        "You are an academic writing expert specializing in writing the introduction section of research reports.",
    )
    tmpl = self.get_prompt("process", "write_introduction", "")
    if not tmpl:
        raise ValueError(
            "Cannot get introduction writing prompt template, report generation failed"
        )

    import json as _json

    # Prepare context for introduction: overview information of all topics
    topics_summary = [
        {"sub_topic": b.sub_topic, "overview": b.overview, "tool_count": len(b.tool_traces)}
        for b in blocks
    ]

    # Use introduction_instruction if available, otherwise fall back to introduction title
    intro_instruction = outline.get("introduction_instruction", "") or outline.get(
        "introduction", ""
    )

    # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
    topics_summary_json = _json.dumps(topics_summary, ensure_ascii=False, indent=2)
    filled = self._safe_format(
        tmpl,
        topic=topic,
        introduction_instruction=intro_instruction,
        topics_summary=topics_summary_json,
        total_topics=len(blocks),
    )

    resp = await self.call_llm(filled, system_prompt, stage="write_introduction", verbose=False)
    data = extract_json_from_text(resp)

    try:
        obj = ensure_json_dict(data)
        ensure_keys(obj, ["introduction"])
        intro = obj.get("introduction", "")
        if isinstance(intro, str) and intro.strip():
            return intro
        raise ValueError("LLM returned empty or invalid introduction field")
    except Exception as e:
        # Chain the cause explicitly so the original failure stays in the traceback
        raise ValueError(
            f"Unable to parse LLM returned introduction content: {e!s}. Report generation failed."
        ) from e
429
+
430
async def _write_section_body(
    self, topic: str, block: TopicBlock, section_outline: dict[str, Any]
) -> str:
    """
    Write main content of a single section

    Args:
        topic: Research topic inserted into the prompt
        block: Topic block whose serialized data (including tool traces) is
            given to the LLM
        section_outline: Outline entry for this section; "title" falls back
            to ``block.sub_topic`` and "instruction" to an empty string

    Returns:
        The non-empty "section_content" string from the LLM's JSON answer

    Raises:
        ValueError: If the prompt template is missing, or the LLM answer
            lacks a usable "section_content" field
    """
    # NOTE(review): the third get_prompt argument is presumably a fallback
    # value used when the prompt is not configured — confirm in BaseAgent.
    system_prompt = self.get_prompt(
        "system",
        "role",
        "You are an academic writing expert specializing in writing chapter content for research reports.",
    )
    tmpl = self.get_prompt("process", "write_section_body", "")
    if not tmpl:
        raise ValueError("Cannot get section writing prompt template, report generation failed")

    import json as _json

    block_data = self._ser_block(block)

    # Dynamically build citation instructions based on configuration
    if self.enable_inline_citations:
        # Build clear citation reference table for this block
        citation_table = self._build_citation_table(block)

        citation_instruction_template = self.get_prompt("citation", "enabled_instruction")
        if citation_instruction_template:
            try:
                citation_instruction = citation_instruction_template.format(
                    citation_table=citation_table
                )
            except (KeyError, IndexError, ValueError, AttributeError):
                # str.format() rejects any other brace in the template (LaTeX,
                # JSON examples); fill only the known placeholder instead of
                # aborting the section.
                citation_instruction = self._safe_format(
                    citation_instruction_template, citation_table=citation_table
                )
        else:
            # Fallback if YAML not configured
            citation_instruction = f"**Citation Reference Table**:\n{citation_table}"
        citation_output_hint = ", citations"
    else:
        citation_instruction = self.get_prompt("citation", "disabled_instruction") or ""
        citation_output_hint = ""

    # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
    block_data_json = _json.dumps(block_data, ensure_ascii=False, indent=2)
    filled = self._safe_format(
        tmpl,
        topic=topic,
        section_title=section_outline.get("title", block.sub_topic),
        section_instruction=section_outline.get("instruction", ""),
        block_data=block_data_json,
        min_section_length=self.reporting_config.get("min_section_length", 500),
        citation_instruction=citation_instruction,
        citation_output_hint=citation_output_hint,
    )

    resp = await self.call_llm(filled, system_prompt, stage="write_section_body", verbose=False)
    data = extract_json_from_text(resp)

    try:
        obj = ensure_json_dict(data)
        ensure_keys(obj, ["section_content"])
        content = obj.get("section_content", "")
        if isinstance(content, str) and content.strip():
            return content
        raise ValueError("LLM returned empty or invalid section_content field")
    except Exception as e:
        # Chain the cause explicitly so the original failure stays in the traceback
        raise ValueError(
            f"Unable to parse LLM returned section content: {e!s}. Report generation failed."
        ) from e
492
+
493
+ async def _write_conclusion(
494
+ self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
495
+ ) -> str:
496
+ """Write report conclusion section"""
497
+ system_prompt = self.get_prompt(
498
+ "system",
499
+ "role",
500
+ "You are an academic writing expert specializing in writing the conclusion section of research reports.",
501
+ )
502
+ tmpl = self.get_prompt("process", "write_conclusion", "")
503
+ if not tmpl:
504
+ raise ValueError(
505
+ "Cannot get conclusion writing prompt template, report generation failed"
506
+ )
507
+
508
+ import json as _json
509
+
510
+ # Prepare context for conclusion: key findings of all topics
511
+ topics_findings = []
512
+ for b in blocks:
513
+ findings = {
514
+ "sub_topic": b.sub_topic,
515
+ "overview": b.overview,
516
+ "key_findings": [
517
+ t.summary for t in b.tool_traces[:3]
518
+ ], # Top 3 key findings for each topic
519
+ }
520
+ topics_findings.append(findings)
521
+
522
+ # Use conclusion_instruction if available, otherwise fall back to conclusion title
523
+ conclusion_instruction = outline.get("conclusion_instruction", "") or outline.get(
524
+ "conclusion", ""
525
+ )
526
+
527
+ # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
528
+ topics_findings_json = _json.dumps(topics_findings, ensure_ascii=False, indent=2)
529
+ filled = self._safe_format(
530
+ tmpl,
531
+ topic=topic,
532
+ conclusion_instruction=conclusion_instruction,
533
+ topics_findings=topics_findings_json,
534
+ total_topics=len(blocks),
535
+ )
536
+
537
+ resp = await self.call_llm(filled, system_prompt, stage="write_conclusion", verbose=False)
538
+ data = extract_json_from_text(resp)
539
+
540
+ try:
541
+ obj = ensure_json_dict(data)
542
+ ensure_keys(obj, ["conclusion"])
543
+ conclusion = obj.get("conclusion", "")
544
+ if isinstance(conclusion, str) and conclusion.strip():
545
+ return conclusion
546
+ raise ValueError("LLM returned empty or invalid conclusion field")
547
+ except Exception as e:
548
+ raise ValueError(
549
+ f"Unable to parse LLM returned conclusion content: {e!s}. Report generation failed."
550
+ )
551
+
552
+ def _build_citation_number_map(self, blocks: list[TopicBlock]) -> dict[str, int]:
553
+ """Build citation_id to reference number mapping with deduplication
554
+
555
+ This method delegates to CitationManager for unified mapping logic.
556
+ The mapping is built once and cached in CitationManager.
557
+
558
+ Returns:
559
+ Dictionary mapping citation_id (e.g., "CIT-1-01") to reference number (e.g., 1)
560
+ """
561
+ if self.citation_manager:
562
+ # Use CitationManager's unified mapping (single source of truth)
563
+ return self.citation_manager.build_ref_number_map()
564
+
565
+ # Fallback: build from blocks when no CitationManager available
566
+ citation_map = {}
567
+
568
+ def extract_citation_number(cit_id):
569
+ try:
570
+ if cit_id.startswith("PLAN-"):
571
+ num = int(cit_id.replace("PLAN-", ""))
572
+ return (0, 0, num)
573
+ parts_list = cit_id.replace("CIT-", "").split("-")
574
+ if len(parts_list) == 2:
575
+ return (1, int(parts_list[0]), int(parts_list[1]))
576
+ except:
577
+ pass
578
+ return (999, 999, 999)
579
+
580
+ all_citations = []
581
+ for block in blocks:
582
+ if block.tool_traces:
583
+ for trace in block.tool_traces:
584
+ citation_id = getattr(trace, "citation_id", None)
585
+ if citation_id and citation_id not in [c["citation_id"] for c in all_citations]:
586
+ all_citations.append({"citation_id": citation_id})
587
+
588
+ all_citations.sort(key=lambda x: extract_citation_number(x["citation_id"]))
589
+
590
+ for idx, cit in enumerate(all_citations, 1):
591
+ citation_map[cit["citation_id"]] = idx
592
+
593
+ return citation_map
594
+
595
+ def _generate_references(self, blocks: list[TopicBlock]) -> str:
596
+ """Generate References section"""
597
+ parts = ["## References\n"]
598
+
599
+ # If using CitationManager, generate from JSON file
600
+ if self.citation_manager:
601
+ return self._generate_references_from_manager(blocks)
602
+
603
+ # Otherwise use original method of extracting from blocks (backward compatible)
604
+ return self._generate_references_from_blocks(blocks)
605
+
606
+ def _get_citation_dedup_key(self, citation: dict, paper: dict = None) -> str:
607
+ """Generate unique key for citation deduplication
608
+
609
+ Args:
610
+ citation: The citation dict
611
+ paper: Optional paper dict for paper_search citations
612
+
613
+ Returns:
614
+ Unique string key for deduplication
615
+ """
616
+ tool_type = citation.get("tool_type", "").lower()
617
+
618
+ if tool_type == "paper_search" and paper:
619
+ # For papers: use title + first author (normalized)
620
+ title = paper.get("title", "").lower().strip()
621
+ authors = paper.get("authors", "").lower().strip()
622
+ # Extract first author if multiple
623
+ first_author = authors.split(",")[0].strip() if authors else ""
624
+ return f"paper:{title}|{first_author}"
625
+ elif tool_type == "paper_search":
626
+ # Fallback for paper_search without paper dict
627
+ title = citation.get("title", "").lower().strip()
628
+ authors = citation.get("authors", "").lower().strip()
629
+ first_author = authors.split(",")[0].strip() if authors else ""
630
+ return f"paper:{title}|{first_author}"
631
+ else:
632
+ # For RAG/web_search/etc: use tool_type + query (normalized)
633
+ query = citation.get("query", "").lower().strip()
634
+ # Use first 100 chars of query for dedup
635
+ return f"{tool_type}:{query[:100]}"
636
+
637
+ def _generate_references_from_manager(self, blocks: list[TopicBlock]) -> str:
638
+ """Generate References section from CitationManager in academic paper style
639
+
640
+ Uses CitationManager's ref_number_map to ensure consistency between
641
+ in-text citations and the References section.
642
+
643
+ Format:
644
+ - Ordered by reference number (consistent with in-text citations)
645
+ - Paper citations: APA format
646
+ - RAG/Query citations: Tool name, query, summary
647
+ - Web search: Tool name, query, summary + collapsible links
648
+ """
649
+ parts = ["## References\n\n"]
650
+
651
+ # Get all citations and the ref_number_map
652
+ all_citations = self.citation_manager.get_all_citations()
653
+
654
+ if not all_citations:
655
+ return "## References\n\n*No citations available.*\n"
656
+
657
+ # Get the ref_number_map from CitationManager (single source of truth)
658
+ ref_map = self.citation_manager.get_ref_number_map()
659
+
660
+ # Build reverse map: ref_number -> (citation_id, paper_idx or None)
661
+ # This groups citations by their ref_number for consistent output
662
+ ref_to_citations: dict[int, list[tuple[str, dict, dict | None]]] = {}
663
+
664
+ for citation_id, citation in all_citations.items():
665
+ tool_type = citation.get("tool_type", "").lower()
666
+
667
+ if tool_type == "paper_search":
668
+ papers = citation.get("papers", [])
669
+ if papers:
670
+ for paper_idx, paper in enumerate(papers):
671
+ # Check if this paper has a ref_number
672
+ paper_ref_key = f"{citation_id}-{paper_idx + 1}"
673
+ ref_num = ref_map.get(paper_ref_key) or ref_map.get(citation_id, 0)
674
+ if ref_num > 0:
675
+ if ref_num not in ref_to_citations:
676
+ ref_to_citations[ref_num] = []
677
+ ref_to_citations[ref_num].append((citation_id, citation, paper))
678
+ else:
679
+ ref_num = ref_map.get(citation_id, 0)
680
+ if ref_num > 0:
681
+ if ref_num not in ref_to_citations:
682
+ ref_to_citations[ref_num] = []
683
+ ref_to_citations[ref_num].append((citation_id, citation, None))
684
+ else:
685
+ ref_num = ref_map.get(citation_id, 0)
686
+ if ref_num > 0:
687
+ if ref_num not in ref_to_citations:
688
+ ref_to_citations[ref_num] = []
689
+ ref_to_citations[ref_num].append((citation_id, citation, None))
690
+
691
+ # Generate references in order of ref_number
692
+ for ref_num in sorted(ref_to_citations.keys()):
693
+ entries = ref_to_citations[ref_num]
694
+ if not entries:
695
+ continue
696
+
697
+ # Use the first entry for this ref_number (others are duplicates)
698
+ citation_id, citation, paper = entries[0]
699
+ tool_type = citation.get("tool_type", "").lower()
700
+
701
+ anchor = f"ref-{ref_num}"
702
+ parts.append(f'<a id="{anchor}"></a>**[{ref_num}]** ')
703
+
704
+ if tool_type == "paper_search":
705
+ if paper:
706
+ formatted = self._format_single_paper_apa(paper)
707
+ else:
708
+ formatted = self._format_paper_citation_apa(citation)
709
+ parts.append(formatted)
710
+ elif tool_type == "web_search":
711
+ formatted = self._format_web_search_citation(citation)
712
+ parts.append(formatted)
713
+ elif tool_type in ("rag_naive", "rag_hybrid", "query_item"):
714
+ formatted = self._format_rag_citation(citation)
715
+ parts.append(formatted)
716
+ elif tool_type == "run_code":
717
+ formatted = self._format_code_citation(citation)
718
+ parts.append(formatted)
719
+ else:
720
+ # Generic format
721
+ query = citation.get("query", "")
722
+ summary = citation.get("summary", "")
723
+ parts.append(f"**{tool_type}**\n\n")
724
+ parts.append(f"- **Query**: {query}\n")
725
+ if summary:
726
+ clean_summary = self._strip_markdown(summary)
727
+ parts.append(
728
+ f"- **Summary**: {clean_summary[:300]}{'...' if len(clean_summary) > 300 else ''}\n"
729
+ )
730
+
731
+ parts.append("\n\n")
732
+
733
+ return "".join(parts)
734
+
735
+ def _format_single_paper_apa(self, paper: dict) -> str:
736
+ """Format a single paper in APA style
737
+
738
+ Format: Authors (Year). *Title*. Venue. arXiv:ID. URL
739
+ """
740
+ authors = paper.get("authors", "Unknown Author")
741
+ year = paper.get("year", "n.d.")
742
+ title = paper.get("title", "Untitled")
743
+ url = paper.get("url", "")
744
+ arxiv_id = paper.get("arxiv_id", "")
745
+ venue = paper.get("venue", "")
746
+ doi = paper.get("doi", "")
747
+
748
+ # APA format
749
+ result = f"{authors} ({year}). *{title}*."
750
+ if venue:
751
+ result += f" {venue}."
752
+ if arxiv_id:
753
+ result += f" arXiv:{arxiv_id}."
754
+ if doi:
755
+ result += f" https://doi.org/{doi}"
756
+ elif url:
757
+ result += f" {url}"
758
+
759
+ return result
760
+
761
+ def _format_paper_citation_apa(self, citation: dict) -> str:
762
+ """Format paper citation in APA style (fallback for citations without papers array)
763
+
764
+ Format: Authors (Year). *Title*. Venue. arXiv:ID. URL
765
+ """
766
+ # Use top-level fields (backward compatibility)
767
+ authors = citation.get("authors", "Unknown Author")
768
+ year = citation.get("year", "n.d.")
769
+ title = citation.get("title", "Untitled")
770
+ url = citation.get("url", "")
771
+ arxiv_id = citation.get("arxiv_id", "")
772
+ venue = citation.get("venue", "")
773
+ doi = citation.get("doi", "")
774
+
775
+ result = f"{authors} ({year}). *{title}*."
776
+ if venue:
777
+ result += f" {venue}."
778
+ if arxiv_id:
779
+ result += f" arXiv:{arxiv_id}."
780
+ if doi:
781
+ result += f" https://doi.org/{doi}"
782
+ elif url:
783
+ result += f" {url}"
784
+ return result
785
+
786
+ def _format_web_search_citation(self, citation: dict) -> str:
787
+ """Format web search citation with collapsible links"""
788
+ query = citation.get("query", "")
789
+ summary = citation.get("summary", "")
790
+ web_sources = citation.get("web_sources", [])
791
+
792
+ result = "**Web Search**\n\n"
793
+ result += f"- **Query**: {query}\n"
794
+ if summary:
795
+ # Clean summary to avoid markdown rendering issues
796
+ clean_summary = self._strip_markdown(summary)
797
+ summary_text = clean_summary[:300] + ("..." if len(clean_summary) > 300 else "")
798
+ result += f"- **Summary**: {summary_text}\n"
799
+
800
+ # Add collapsible links section
801
+ if web_sources:
802
+ result += "\n<details>\n<summary>📎 Retrieved Sources ({} links)</summary>\n\n".format(
803
+ len(web_sources)
804
+ )
805
+ for i, source in enumerate(web_sources, 1):
806
+ title = source.get("title", "Untitled")
807
+ url = source.get("url", "")
808
+ snippet = source.get("snippet", "")
809
+ if url:
810
+ result += f"{i}. [{title}]({url})"
811
+ if snippet:
812
+ clean_snippet = self._strip_markdown(snippet)
813
+ result += f"\n > {clean_snippet[:150]}{'...' if len(clean_snippet) > 150 else ''}"
814
+ result += "\n\n"
815
+ result += "</details>"
816
+
817
+ return result
818
+
819
+ def _format_rag_citation(self, citation: dict) -> str:
820
+ """Format RAG/Query citation"""
821
+ tool_type = citation.get("tool_type", "")
822
+ query = citation.get("query", "")
823
+ summary = citation.get("summary", "")
824
+ kb_name = citation.get("kb_name", "")
825
+ sources = citation.get("sources", [])
826
+
827
+ # Tool name display
828
+ tool_display = {
829
+ "rag_naive": "RAG Retrieval",
830
+ "rag_hybrid": "Hybrid RAG Retrieval",
831
+ "query_item": "Knowledge Base Query",
832
+ }.get(tool_type, tool_type)
833
+
834
+ result = f"**{tool_display}**"
835
+ if kb_name:
836
+ result += f" (KB: {kb_name})"
837
+ result += "\n\n"
838
+ result += f"- **Query**: {query}\n"
839
+ if summary:
840
+ # Clean summary: remove markdown formatting to avoid rendering issues
841
+ clean_summary = self._strip_markdown(summary)
842
+ summary_text = clean_summary[:300] + ("..." if len(clean_summary) > 300 else "")
843
+ result += f"- **Summary**: {summary_text}\n"
844
+
845
+ # Add source documents if available
846
+ if sources:
847
+ result += "\n<details>\n<summary>📄 Source Documents ({} docs)</summary>\n\n".format(
848
+ len(sources)
849
+ )
850
+ for i, source in enumerate(sources, 1):
851
+ title = source.get("title", "") or source.get("source_file", f"Document {i}")
852
+ content = source.get("content_preview", "")
853
+ page = source.get("page", "")
854
+ result += f"{i}. **{title}**"
855
+ if page:
856
+ result += f" (Page {page})"
857
+ if content:
858
+ clean_content = self._strip_markdown(content)
859
+ result += (
860
+ f"\n > {clean_content[:150]}{'...' if len(clean_content) > 150 else ''}"
861
+ )
862
+ result += "\n\n"
863
+ result += "</details>"
864
+
865
+ return result
866
+
867
+ def _strip_markdown(self, text: str) -> str:
868
+ """Strip markdown formatting from text to get plain text"""
869
+ import re
870
+
871
+ if not text:
872
+ return ""
873
+
874
+ # Remove bold/italic markers
875
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) # **bold**
876
+ text = re.sub(r"\*([^*]+)\*", r"\1", text) # *italic*
877
+ text = re.sub(r"__([^_]+)__", r"\1", text) # __bold__
878
+ text = re.sub(r"_([^_]+)_", r"\1", text) # _italic_
879
+
880
+ # Remove headers
881
+ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
882
+
883
+ # Remove links but keep text
884
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
885
+
886
+ # Remove inline code
887
+ text = re.sub(r"`([^`]+)`", r"\1", text)
888
+
889
+ # Remove bullet points
890
+ text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
891
+
892
+ # Remove numbered lists
893
+ text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
894
+
895
+ # Remove blockquotes
896
+ text = re.sub(r"^>\s*", "", text, flags=re.MULTILINE)
897
+
898
+ # Normalize whitespace
899
+ text = re.sub(r"\n{3,}", "\n\n", text)
900
+ text = re.sub(r" +", " ", text)
901
+
902
+ return text.strip()
903
+
904
+ def _format_code_citation(self, citation: dict) -> str:
905
+ """Format code execution citation"""
906
+ query = citation.get("query", "") # This is usually the code
907
+ summary = citation.get("summary", "")
908
+
909
+ result = "**Code Execution**\n\n"
910
+ if query:
911
+ # Truncate long code
912
+ code_preview = query[:300] + ("..." if len(query) > 300 else "")
913
+ result += f"- **Code**: `{code_preview}`\n"
914
+ if summary:
915
+ summary_text = summary[:300] + ("..." if len(summary) > 300 else "")
916
+ result += f"- **Result**: {summary_text}\n"
917
+
918
+ return result
919
+
920
+ def _generate_references_from_blocks(self, blocks: list[TopicBlock]) -> str:
921
+ """Generate References section from blocks (backward compatible, academic paper style)"""
922
+ parts = ["## References\n\n"]
923
+
924
+ # Collect all citations
925
+ all_citations = []
926
+ for block in blocks:
927
+ if block.tool_traces:
928
+ for trace in block.tool_traces:
929
+ citation_id = (
930
+ getattr(trace, "citation_id", None)
931
+ or f"CIT-{block.block_id.split('_')[-1]}-01"
932
+ )
933
+ all_citations.append(
934
+ {"citation_id": citation_id, "block": block, "trace": trace}
935
+ )
936
+
937
+ if not all_citations:
938
+ return "## References\n\n*No citations available.*\n"
939
+
940
+ # Sort by citation_id (extract numeric parts for sorting)
941
+ def extract_citation_number(cit_id):
942
+ try:
943
+ if cit_id.startswith("PLAN-"):
944
+ num = int(cit_id.replace("PLAN-", ""))
945
+ return (0, 0, num)
946
+ # CIT-X-XX format
947
+ parts_list = cit_id.replace("CIT-", "").split("-")
948
+ if len(parts_list) == 2:
949
+ return (1, int(parts_list[0]), int(parts_list[1]))
950
+ except:
951
+ pass
952
+ return (999, 999, 999)
953
+
954
+ all_citations.sort(key=lambda x: extract_citation_number(x["citation_id"]))
955
+
956
+ # Generate numbered references in academic paper style
957
+ # Using simple ref-N anchor format for clickable inline citations
958
+ for idx, cit in enumerate(all_citations, 1):
959
+ trace = cit["trace"]
960
+ citation_id = cit["citation_id"]
961
+
962
+ # Use simple ref-N anchor format (consistent with _generate_references_from_manager)
963
+ anchor = f"ref-{idx}"
964
+ tool_type = trace.tool_type.lower() if trace.tool_type else ""
965
+
966
+ # Tool name display
967
+ tool_display = {
968
+ "rag_naive": "RAG Retrieval",
969
+ "rag_hybrid": "Hybrid RAG Retrieval",
970
+ "query_item": "Knowledge Base Query",
971
+ "paper_search": "Paper Search",
972
+ "web_search": "Web Search",
973
+ "run_code": "Code Execution",
974
+ }.get(tool_type, tool_type)
975
+
976
+ parts.append(f'<a id="{anchor}"></a>**[{idx}]** **{tool_display}**\n\n')
977
+ parts.append(f"- **Query**: {trace.query}\n")
978
+ if trace.summary:
979
+ summary_text = trace.summary[:500] + ("..." if len(trace.summary) > 500 else "")
980
+ parts.append(f"- **Summary**: {summary_text}\n")
981
+ parts.append("\n")
982
+
983
+ return "".join(parts)
984
+
985
+ def _convert_citation_format(self, text: str) -> str:
986
+ """
987
+ Convert various citation formats to clickable [[N]](#ref-N) format.
988
+
989
+ Handles:
990
+ - [N] format (simple number in brackets)
991
+ - [ref=N] format (from citation table)
992
+
993
+ Args:
994
+ text: Text with citations in various formats
995
+
996
+ Returns:
997
+ Text with [[N]](#ref-N) clickable citations
998
+ """
999
+ import re
1000
+
1001
+ # Get valid ref_numbers from the citation map
1002
+ valid_refs = set()
1003
+ if hasattr(self, "_citation_map") and self._citation_map:
1004
+ valid_refs = set(self._citation_map.values())
1005
+
1006
+ def replace_citation(match):
1007
+ # Get the number from the match
1008
+ ref_num = match.group(1)
1009
+
1010
+ # Only convert if it's a valid ref_number
1011
+ try:
1012
+ num = int(ref_num)
1013
+ if num in valid_refs:
1014
+ return f"[[{ref_num}]](#ref-{ref_num})"
1015
+ except ValueError:
1016
+ pass
1017
+
1018
+ # Return unchanged if not a valid reference
1019
+ return match.group(0)
1020
+
1021
+ # First, convert [ref=N] format to clickable format
1022
+ # Pattern: [ref=N] where N is a number
1023
+ ref_pattern = r"\[ref=(\d+)\]"
1024
+ text = re.sub(ref_pattern, replace_citation, text)
1025
+
1026
+ # Then, convert simple [N] format (but NOT already converted [[N]])
1027
+ # Pattern to match [N] where N is a number, but NOT already in [[N]] format
1028
+ # Use negative lookbehind and lookahead to avoid matching [[N]] or [N](#ref-N)
1029
+ simple_pattern = r"(?<!\[)\[(\d+)\](?!\(#ref-)"
1030
+ text = re.sub(simple_pattern, replace_citation, text)
1031
+
1032
+ return text
1033
+
1034
+ def _validate_and_fix_citations(self, text: str) -> tuple[str, dict]:
1035
+ """
1036
+ Validate citations in text and fix invalid ones.
1037
+
1038
+ Args:
1039
+ text: Text with citations
1040
+
1041
+ Returns:
1042
+ Tuple of (fixed_text, validation_result)
1043
+ """
1044
+ import re
1045
+
1046
+ # Get valid ref_numbers
1047
+ valid_refs = set()
1048
+ if hasattr(self, "_citation_map") and self._citation_map:
1049
+ valid_refs = set(self._citation_map.values())
1050
+
1051
+ # Find all citations in [[N]](#ref-N) format
1052
+ pattern = r"\[\[(\d+)\]\]\(#ref-\d+\)"
1053
+ found_citations = re.findall(pattern, text)
1054
+
1055
+ valid = []
1056
+ invalid = []
1057
+
1058
+ for ref in found_citations:
1059
+ try:
1060
+ num = int(ref)
1061
+ if num in valid_refs:
1062
+ valid.append(num)
1063
+ else:
1064
+ invalid.append(num)
1065
+ except ValueError:
1066
+ invalid.append(ref)
1067
+
1068
+ # Remove invalid citations
1069
+ if invalid:
1070
+
1071
+ def remove_invalid(match):
1072
+ ref_num = match.group(1)
1073
+ try:
1074
+ num = int(ref_num)
1075
+ if num not in valid_refs:
1076
+ return "" # Remove invalid citation
1077
+ except ValueError:
1078
+ return ""
1079
+ return match.group(0)
1080
+
1081
+ text = re.sub(pattern, remove_invalid, text)
1082
+
1083
+ validation_result = {
1084
+ "valid_citations": valid,
1085
+ "invalid_citations": invalid,
1086
+ "is_valid": len(invalid) == 0,
1087
+ "total_found": len(found_citations),
1088
+ }
1089
+
1090
+ return text, validation_result
1091
+
1092
+ async def _write_report(
1093
+ self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
1094
+ ) -> str:
1095
+ """Write complete report using step-by-step method with three-level heading support"""
1096
+ parts = []
1097
+
1098
+ # Build citation number map before writing (for consistent ref_number in traces)
1099
+ if self.enable_inline_citations:
1100
+ self._citation_map = self._build_citation_number_map(blocks)
1101
+ print(f" 📋 Built citation map with {len(self._citation_map)} entries")
1102
+ else:
1103
+ self._citation_map = {}
1104
+
1105
+ # 1. Add main title (from outline, or use topic if not available)
1106
+ title = outline.get("title", f"# {topic}")
1107
+ if not title.startswith("#"):
1108
+ title = f"# {title}"
1109
+ parts.append(f"{title}\n\n")
1110
+
1111
+ # 2. Write introduction
1112
+ print(" 📝 Writing introduction...")
1113
+ self._notify_progress(
1114
+ getattr(self, "_progress_callback", None),
1115
+ "writing_section",
1116
+ current_section="Introduction",
1117
+ section_index=0,
1118
+ total_sections=len(outline.get("sections", [])) + 2, # +2 for intro and conclusion
1119
+ )
1120
+ introduction = await self._write_introduction(topic, blocks, outline)
1121
+ # Get introduction title from outline, or use default if not available
1122
+ intro_title = outline.get("introduction", "## Introduction")
1123
+ if not intro_title.startswith("##"):
1124
+ intro_title = f"## {intro_title}"
1125
+ parts.append(f"{intro_title}\n\n")
1126
+ parts.append(introduction)
1127
+ parts.append("\n\n")
1128
+
1129
+ # 3. Write each section with subsection support
1130
+ sections = outline.get("sections", [])
1131
+ for i, section in enumerate(sections, 1):
1132
+ block_id = section.get("block_id")
1133
+ block = next((b for b in blocks if b.block_id == block_id), None)
1134
+ if not block:
1135
+ print(
1136
+ f" âš ī¸ Warning: Cannot find topic block with block_id={block_id}, skipping this section"
1137
+ )
1138
+ continue
1139
+
1140
+ section_title = section.get("title", block.sub_topic)
1141
+ # Clean section title for display (remove markdown markers)
1142
+ display_title = section_title.replace("##", "").strip()
1143
+ print(f" 📝 Writing section {i}/{len(sections)}: {section_title}...")
1144
+ self._notify_progress(
1145
+ getattr(self, "_progress_callback", None),
1146
+ "writing_section",
1147
+ current_section=display_title,
1148
+ section_index=i, # 1-based, after introduction
1149
+ total_sections=len(sections) + 2, # +2 for intro and conclusion
1150
+ )
1151
+
1152
+ # Check if section has subsections defined in outline
1153
+ subsections = section.get("subsections", [])
1154
+
1155
+ if subsections:
1156
+ # Write section with explicit subsection structure
1157
+ section_content = await self._write_section_with_subsections(
1158
+ topic, block, section, subsections
1159
+ )
1160
+ else:
1161
+ # Write section normally (LLM will generate its own subsection structure)
1162
+ section_content = await self._write_section_body(topic, block, section)
1163
+
1164
+ # Section content already includes ## level title, append directly
1165
+ parts.append(section_content)
1166
+ parts.append("\n\n")
1167
+
1168
+ # 4. Write conclusion
1169
+ print(" 📝 Writing conclusion...")
1170
+ total_sections = len(sections) + 2
1171
+ self._notify_progress(
1172
+ getattr(self, "_progress_callback", None),
1173
+ "writing_section",
1174
+ current_section="Conclusion",
1175
+ section_index=total_sections - 1, # Last section
1176
+ total_sections=total_sections,
1177
+ )
1178
+ conclusion = await self._write_conclusion(topic, blocks, outline)
1179
+ # Get conclusion title from outline, or use default if not available
1180
+ conclusion_title = outline.get("conclusion", "## Conclusion")
1181
+ if not conclusion_title.startswith("##"):
1182
+ conclusion_title = f"## {conclusion_title}"
1183
+ parts.append(f"{conclusion_title}\n\n")
1184
+ parts.append(conclusion)
1185
+ parts.append("\n\n")
1186
+
1187
+ # 5. Generate References based on configuration
1188
+ if self.enable_citation_list:
1189
+ print(" 📝 Generating citation list...")
1190
+ references = self._generate_references(blocks)
1191
+ parts.append(references)
1192
+ else:
1193
+ print(" â„šī¸ Citation list disabled, skipping generation")
1194
+
1195
+ # Combine all parts
1196
+ report = "".join(parts)
1197
+
1198
+ # 6. Post-process citations (convert [N] to [[N]](#ref-N) format)
1199
+ if self.enable_inline_citations:
1200
+ print(" 🔗 Converting citation format...")
1201
+ report = self._convert_citation_format(report)
1202
+
1203
+ # Validate and fix invalid citations
1204
+ print(" ✓ Validating citations...")
1205
+ report, validation = self._validate_and_fix_citations(report)
1206
+
1207
+ if not validation["is_valid"]:
1208
+ print(
1209
+ f" âš ī¸ Removed {len(validation['invalid_citations'])} invalid citations: {validation['invalid_citations']}"
1210
+ )
1211
+ else:
1212
+ print(f" ✓ All {validation['total_found']} citations are valid")
1213
+
1214
+ return report
1215
+
1216
+ async def _write_section_with_subsections(
1217
+ self,
1218
+ topic: str,
1219
+ block: TopicBlock,
1220
+ section: dict[str, Any],
1221
+ subsections: list[dict[str, Any]],
1222
+ ) -> str:
1223
+ """Write a section that has explicitly defined subsections in the outline
1224
+
1225
+ This method writes the section as a whole, passing subsection structure to the LLM
1226
+ to guide the content organization while maintaining coherence.
1227
+ """
1228
+ import json as _json
1229
+
1230
+ # Enhance section instruction with subsection information
1231
+ subsection_info = []
1232
+ for j, sub in enumerate(subsections, 1):
1233
+ subsection_info.append(
1234
+ {
1235
+ "title": sub.get("title", f"### Subsection {j}"),
1236
+ "instruction": sub.get("instruction", ""),
1237
+ }
1238
+ )
1239
+
1240
+ # Create enhanced section data with subsection guidance
1241
+ enhanced_section = {
1242
+ "title": section.get("title", block.sub_topic),
1243
+ "instruction": section.get("instruction", ""),
1244
+ "subsection_structure": subsection_info,
1245
+ }
1246
+
1247
+ # Prepare block data with subsection hints
1248
+ block_data = self._ser_block(block)
1249
+ block_data["expected_subsections"] = subsection_info
1250
+
1251
+ system_prompt = self.get_prompt(
1252
+ "system",
1253
+ "role",
1254
+ "You are an academic writing expert specializing in writing comprehensive research report sections with structured subsections.",
1255
+ )
1256
+ tmpl = self.get_prompt("process", "write_section_body", "")
1257
+ if not tmpl:
1258
+ raise ValueError("Cannot get section writing prompt template, report generation failed")
1259
+
1260
+ # Build enhanced instruction including subsection structure
1261
+ section_instruction = section.get("instruction", "")
1262
+ if subsection_info:
1263
+ subsection_guide = "\n\n**Expected subsection structure:**\n"
1264
+ for sub in subsection_info:
1265
+ subsection_guide += f"- {sub['title']}: {sub['instruction']}\n"
1266
+ section_instruction += subsection_guide
1267
+
1268
+ # Dynamically build citation instructions based on configuration
1269
+ if self.enable_inline_citations:
1270
+ # Build clear citation reference table for this block
1271
+ citation_table = self._build_citation_table(block)
1272
+
1273
+ citation_instruction_template = self.get_prompt("citation", "enabled_instruction")
1274
+ if citation_instruction_template:
1275
+ citation_instruction = citation_instruction_template.format(
1276
+ citation_table=citation_table
1277
+ )
1278
+ else:
1279
+ # Fallback if YAML not configured
1280
+ citation_instruction = f"**Citation Reference Table**:\n{citation_table}"
1281
+ citation_output_hint = ", citations"
1282
+ else:
1283
+ citation_instruction = self.get_prompt("citation", "disabled_instruction") or ""
1284
+ citation_output_hint = ""
1285
+
1286
+ # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
1287
+ block_data_json = _json.dumps(block_data, ensure_ascii=False, indent=2)
1288
+ filled = self._safe_format(
1289
+ tmpl,
1290
+ topic=topic,
1291
+ section_title=section.get("title", block.sub_topic),
1292
+ section_instruction=section_instruction,
1293
+ block_data=block_data_json,
1294
+ min_section_length=self.reporting_config.get("min_section_length", 800),
1295
+ citation_instruction=citation_instruction,
1296
+ citation_output_hint=citation_output_hint,
1297
+ )
1298
+
1299
+ # TODO Implement retry logic for LLM calls when JSON parsing or post-processing fails (e.g., malformed output, schema violations).
1300
+ resp = await self.call_llm(
1301
+ filled,
1302
+ system_prompt,
1303
+ stage="write_section_with_subsections",
1304
+ verbose=False,
1305
+ )
1306
+ data = extract_json_from_text(resp)
1307
+
1308
+ try:
1309
+ obj = ensure_json_dict(data)
1310
+ ensure_keys(obj, ["section_content"])
1311
+ content = obj.get("section_content", "")
1312
+ if isinstance(content, str) and content.strip():
1313
+ return content
1314
+ raise ValueError("LLM returned empty or invalid section_content field")
1315
+ except Exception as e:
1316
+ raise ValueError(
1317
+ f"Unable to parse LLM returned section content: {e!s}. Report generation failed."
1318
+ )
1319
+
1320
+ def _notify_progress(
1321
+ self, callback: Callable[[dict[str, Any]], None] | None, status: str, **payload: Any
1322
+ ) -> None:
1323
+ if not callback:
1324
+ return
1325
+ event = {"status": status}
1326
+ event.update({k: v for k, v in payload.items() if v is not None})
1327
+ try:
1328
+ callback(event)
1329
+ except Exception:
1330
+ pass
1331
+
1332
+
1333
+ __all__ = ["ReportingAgent"]