realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,1333 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
ReportingAgent - Report generation Agent (DR-in-KG 2.0)
|
|
5
|
+
- Deduplication and cleaning
|
|
6
|
+
- Generate linear outline (introduction â sections â conclusion)
|
|
7
|
+
- Write final report (prefer LLM JSON return markdown, fallback to local assembly on failure)
|
|
8
|
+
- Inline citations and References anchors (based on citation_id)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections.abc import Callable
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
import re
|
|
16
|
+
from string import Template
|
|
17
|
+
import sys
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
project_root = Path(__file__).parent.parent.parent.parent
|
|
21
|
+
sys.path.insert(0, str(project_root))
|
|
22
|
+
|
|
23
|
+
from src.agents.base_agent import BaseAgent
|
|
24
|
+
from src.agents.research.data_structures import DynamicTopicQueue, TopicBlock
|
|
25
|
+
|
|
26
|
+
from ..utils.json_utils import ensure_json_dict, ensure_keys, extract_json_from_text
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ReportingAgent(BaseAgent):
|
|
30
|
+
"""Report generation Agent"""
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def _escape_braces(text: str) -> str:
|
|
34
|
+
"""
|
|
35
|
+
Escape curly braces in text to prevent str.format() from interpreting them.
|
|
36
|
+
This is needed because JSON data may contain LaTeX formulas with braces like {L}, {x}.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
text: Input text that may contain curly braces
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
Text with braces escaped ({{ and }})
|
|
43
|
+
"""
|
|
44
|
+
return text.replace("{", "{{").replace("}", "}}")
|
|
45
|
+
|
|
46
|
+
@staticmethod
|
|
47
|
+
def _convert_to_template_format(template_str: str) -> str:
|
|
48
|
+
"""
|
|
49
|
+
Convert {var} style placeholders to $var style for string.Template.
|
|
50
|
+
This avoids conflicts with LaTeX braces like {\rho}, {L}.
|
|
51
|
+
"""
|
|
52
|
+
return re.sub(r"\{(\w+)\}", r"$\1", template_str)
|
|
53
|
+
|
|
54
|
+
def _safe_format(self, template_str: str, **kwargs) -> str:
|
|
55
|
+
"""
|
|
56
|
+
Safe string formatting using string.Template to avoid LaTeX brace conflicts.
|
|
57
|
+
Converts {var} to $var format, then uses safe_substitute.
|
|
58
|
+
"""
|
|
59
|
+
converted = self._convert_to_template_format(template_str)
|
|
60
|
+
return Template(converted).safe_substitute(**kwargs)
|
|
61
|
+
|
|
62
|
+
def __init__(
|
|
63
|
+
self,
|
|
64
|
+
config: dict[str, Any],
|
|
65
|
+
api_key: str | None = None,
|
|
66
|
+
base_url: str | None = None,
|
|
67
|
+
api_version: str | None = None,
|
|
68
|
+
):
|
|
69
|
+
language = config.get("system", {}).get("language", "zh")
|
|
70
|
+
super().__init__(
|
|
71
|
+
module_name="research",
|
|
72
|
+
agent_name="reporting_agent",
|
|
73
|
+
api_key=api_key,
|
|
74
|
+
base_url=base_url,
|
|
75
|
+
api_version=api_version,
|
|
76
|
+
language=language,
|
|
77
|
+
config=config,
|
|
78
|
+
)
|
|
79
|
+
self.reporting_config = config.get("reporting", {})
|
|
80
|
+
self.citation_manager = None # Will be set during process
|
|
81
|
+
|
|
82
|
+
# Citation configuration: read from config, default off
|
|
83
|
+
self.enable_citation_list = self.reporting_config.get("enable_citation_list", False)
|
|
84
|
+
self.enable_inline_citations = self.reporting_config.get("enable_inline_citations", False)
|
|
85
|
+
|
|
86
|
+
def set_citation_manager(self, citation_manager):
|
|
87
|
+
"""Set citation manager"""
|
|
88
|
+
self.citation_manager = citation_manager
|
|
89
|
+
|
|
90
|
+
async def process(
|
|
91
|
+
self,
|
|
92
|
+
queue: DynamicTopicQueue,
|
|
93
|
+
topic: str,
|
|
94
|
+
progress_callback: Callable[[dict[str, Any]], None] | None = None,
|
|
95
|
+
) -> dict[str, Any]:
|
|
96
|
+
"""
|
|
97
|
+
Generate final report
|
|
98
|
+
Returns:
|
|
99
|
+
{
|
|
100
|
+
"report": str,
|
|
101
|
+
"word_count": int,
|
|
102
|
+
"sections": int,
|
|
103
|
+
"citations": int
|
|
104
|
+
}
|
|
105
|
+
"""
|
|
106
|
+
print(f"\n{'=' * 70}")
|
|
107
|
+
print("đ ReportingAgent - Report Generation")
|
|
108
|
+
print(f"{'=' * 70}")
|
|
109
|
+
print(f"Topic: {topic}")
|
|
110
|
+
print(f"Topic Blocks: {len(queue.blocks)}\n")
|
|
111
|
+
|
|
112
|
+
# Store progress_callback for use in _write_report
|
|
113
|
+
self._progress_callback = progress_callback
|
|
114
|
+
|
|
115
|
+
self._notify_progress(
|
|
116
|
+
progress_callback, "reporting_started", topic=topic, total_blocks=len(queue.blocks)
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# 1) Deduplication
|
|
120
|
+
print("đ Step 1: Deduplication and cleaning...")
|
|
121
|
+
cleaned_blocks = await self._deduplicate_blocks(queue.blocks)
|
|
122
|
+
print(f"â Cleaning completed: {len(cleaned_blocks)} topic blocks")
|
|
123
|
+
self._notify_progress(
|
|
124
|
+
progress_callback, "deduplicate_completed", kept_blocks=len(cleaned_blocks)
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# 2) Outline
|
|
128
|
+
print("\nđ Step 2: Generating outline...")
|
|
129
|
+
outline = await self._generate_outline(topic, cleaned_blocks)
|
|
130
|
+
print("â Outline generation completed")
|
|
131
|
+
self._notify_progress(
|
|
132
|
+
progress_callback, "outline_completed", sections=len(outline.get("sections", []))
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# Save outline for later use
|
|
136
|
+
self._current_outline = outline
|
|
137
|
+
|
|
138
|
+
# 3) Writing
|
|
139
|
+
print("\nâī¸ Step 3: Writing report...")
|
|
140
|
+
report_markdown = await self._write_report(topic, cleaned_blocks, outline)
|
|
141
|
+
print("â Report writing completed")
|
|
142
|
+
self._notify_progress(progress_callback, "writing_completed")
|
|
143
|
+
|
|
144
|
+
word_count = len(report_markdown)
|
|
145
|
+
sections = len(cleaned_blocks)
|
|
146
|
+
citations = sum(len(b.tool_traces) for b in cleaned_blocks)
|
|
147
|
+
|
|
148
|
+
print("\nđ Report Statistics:")
|
|
149
|
+
print(f" Word Count: {word_count}")
|
|
150
|
+
print(f" Sections: {sections}")
|
|
151
|
+
print(f" Citations: {citations}")
|
|
152
|
+
self._notify_progress(
|
|
153
|
+
progress_callback,
|
|
154
|
+
"reporting_completed",
|
|
155
|
+
word_count=word_count,
|
|
156
|
+
sections=sections,
|
|
157
|
+
citations=citations,
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
result = {
|
|
161
|
+
"report": report_markdown,
|
|
162
|
+
"word_count": word_count,
|
|
163
|
+
"sections": sections,
|
|
164
|
+
"citations": citations,
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
# If outline has been generated, add it to result
|
|
168
|
+
if hasattr(self, "_current_outline"):
|
|
169
|
+
result["outline"] = self._current_outline
|
|
170
|
+
delattr(self, "_current_outline")
|
|
171
|
+
|
|
172
|
+
return result
|
|
173
|
+
|
|
174
|
+
async def _deduplicate_blocks(self, blocks: list[TopicBlock]) -> list[TopicBlock]:
|
|
175
|
+
if len(blocks) <= 1:
|
|
176
|
+
return blocks
|
|
177
|
+
system_prompt = self.get_prompt("system", "role")
|
|
178
|
+
if not system_prompt:
|
|
179
|
+
raise ValueError(
|
|
180
|
+
"ReportingAgent missing system prompt, please configure system.role in prompts/{lang}/reporting_agent.yaml"
|
|
181
|
+
)
|
|
182
|
+
user_prompt = self.get_prompt("process", "deduplicate")
|
|
183
|
+
if not user_prompt:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
"ReportingAgent missing deduplicate prompt, please configure process.deduplicate in prompts/{lang}/reporting_agent.yaml"
|
|
186
|
+
)
|
|
187
|
+
topics_text = "\n".join(
|
|
188
|
+
[f"{i + 1}. {b.sub_topic}: {b.overview[:200]}" for i, b in enumerate(blocks)]
|
|
189
|
+
)
|
|
190
|
+
filled = self._safe_format(user_prompt, topics=topics_text, total_topics=len(blocks))
|
|
191
|
+
resp = await self.call_llm(filled, system_prompt, stage="deduplicate", verbose=False)
|
|
192
|
+
data = extract_json_from_text(resp)
|
|
193
|
+
try:
|
|
194
|
+
obj = ensure_json_dict(data)
|
|
195
|
+
ensure_keys(obj, ["keep_indices"])
|
|
196
|
+
keep_indices = obj.get("keep_indices", [])
|
|
197
|
+
return [blocks[i] for i in keep_indices if isinstance(i, int) and i < len(blocks)]
|
|
198
|
+
except Exception:
|
|
199
|
+
return blocks
|
|
200
|
+
|
|
201
|
+
async def _generate_outline(self, topic: str, blocks: list[TopicBlock]) -> dict[str, Any]:
|
|
202
|
+
"""Generate report outline based on complete subtopic, overview and all tool_trace summaries
|
|
203
|
+
|
|
204
|
+
Supports three-level heading system:
|
|
205
|
+
- Level 1 (#): Report main title
|
|
206
|
+
- Level 2 (##): Main sections (Introduction, Core Sections, Conclusion)
|
|
207
|
+
- Level 3 (###): Subsections within each section
|
|
208
|
+
"""
|
|
209
|
+
system_prompt = self.get_prompt("system", "role")
|
|
210
|
+
if not system_prompt:
|
|
211
|
+
raise ValueError(
|
|
212
|
+
"ReportingAgent missing system prompt, please configure system.role in prompts/{lang}/reporting_agent.yaml"
|
|
213
|
+
)
|
|
214
|
+
user_prompt = self.get_prompt("process", "generate_outline")
|
|
215
|
+
if not user_prompt:
|
|
216
|
+
raise ValueError(
|
|
217
|
+
"ReportingAgent missing generate_outline prompt, please configure process.generate_outline in prompts/{lang}/reporting_agent.yaml"
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
# Build complete topic information, including subtopic, overview and all tool_trace summaries
|
|
221
|
+
topics_data = []
|
|
222
|
+
for i, block in enumerate(blocks, 1):
|
|
223
|
+
topic_info = {
|
|
224
|
+
"index": i,
|
|
225
|
+
"block_id": block.block_id,
|
|
226
|
+
"sub_topic": block.sub_topic,
|
|
227
|
+
"overview": block.overview,
|
|
228
|
+
"tool_summaries": (
|
|
229
|
+
[trace.summary for trace in block.tool_traces] if block.tool_traces else []
|
|
230
|
+
),
|
|
231
|
+
}
|
|
232
|
+
topics_data.append(topic_info)
|
|
233
|
+
|
|
234
|
+
import json as _json
|
|
235
|
+
|
|
236
|
+
topics_json = _json.dumps(topics_data, ensure_ascii=False, indent=2)
|
|
237
|
+
# Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
|
|
238
|
+
filled = self._safe_format(
|
|
239
|
+
user_prompt, topic=topic, topics_json=topics_json, total_topics=len(blocks)
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
resp = await self.call_llm(filled, system_prompt, stage="generate_outline", verbose=False)
|
|
243
|
+
data = extract_json_from_text(resp)
|
|
244
|
+
try:
|
|
245
|
+
obj = ensure_json_dict(data)
|
|
246
|
+
ensure_keys(obj, ["title", "introduction", "sections", "conclusion"])
|
|
247
|
+
# Ensure title uses markdown format (# prefix)
|
|
248
|
+
if not obj.get("title", "").startswith("#"):
|
|
249
|
+
obj["title"] = f"# {obj.get('title', topic)}"
|
|
250
|
+
# Ensure introduction and conclusion use markdown format (## prefix)
|
|
251
|
+
if obj.get("introduction") and not obj["introduction"].startswith("##"):
|
|
252
|
+
obj["introduction"] = f"## {obj['introduction']}"
|
|
253
|
+
if obj.get("conclusion") and not obj["conclusion"].startswith("##"):
|
|
254
|
+
obj["conclusion"] = f"## {obj['conclusion']}"
|
|
255
|
+
|
|
256
|
+
# Process sections to ensure proper formatting
|
|
257
|
+
for section in obj.get("sections", []):
|
|
258
|
+
# Ensure section title has ## prefix
|
|
259
|
+
if section.get("title") and not section["title"].startswith("##"):
|
|
260
|
+
section["title"] = f"## {section['title']}"
|
|
261
|
+
# Process subsections if present
|
|
262
|
+
for subsection in section.get("subsections", []):
|
|
263
|
+
if subsection.get("title") and not subsection["title"].startswith("###"):
|
|
264
|
+
subsection["title"] = f"### {subsection['title']}"
|
|
265
|
+
|
|
266
|
+
return obj
|
|
267
|
+
except Exception:
|
|
268
|
+
# Fallback to default outline with subsections
|
|
269
|
+
return self._create_default_outline(topic, blocks)
|
|
270
|
+
|
|
271
|
+
def _create_default_outline(self, topic: str, blocks: list[TopicBlock]) -> dict[str, Any]:
|
|
272
|
+
"""Create a default outline with three-level heading structure"""
|
|
273
|
+
sections = []
|
|
274
|
+
for i, b in enumerate(blocks, 1):
|
|
275
|
+
section = {
|
|
276
|
+
"title": f"## {i}. {b.sub_topic}",
|
|
277
|
+
"instruction": f"Provide detailed introduction to {b.sub_topic}, including core concepts, key mechanisms, and practical applications",
|
|
278
|
+
"block_id": b.block_id,
|
|
279
|
+
"subsections": [
|
|
280
|
+
{
|
|
281
|
+
"title": f"### {i}.1 Core Concepts and Definitions",
|
|
282
|
+
"instruction": f"Explain the fundamental concepts and definitions related to {b.sub_topic}",
|
|
283
|
+
},
|
|
284
|
+
{
|
|
285
|
+
"title": f"### {i}.2 Key Mechanisms and Principles",
|
|
286
|
+
"instruction": f"Analyze the underlying mechanisms and theoretical principles of {b.sub_topic}",
|
|
287
|
+
},
|
|
288
|
+
],
|
|
289
|
+
}
|
|
290
|
+
sections.append(section)
|
|
291
|
+
|
|
292
|
+
return {
|
|
293
|
+
"title": f"# {topic}",
|
|
294
|
+
"introduction": "## Introduction",
|
|
295
|
+
"introduction_instruction": "Present the research background, motivation, objectives, and report structure",
|
|
296
|
+
"sections": sections,
|
|
297
|
+
"conclusion": "## Conclusion and Future Directions",
|
|
298
|
+
"conclusion_instruction": "Summarize core findings, research contributions, limitations, and future directions",
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
def _ser_block(self, b: TopicBlock) -> dict[str, Any]:
|
|
302
|
+
"""Serialize TopicBlock to dictionary, including complete tool traces
|
|
303
|
+
|
|
304
|
+
If self._citation_map is available (built by _build_citation_number_map),
|
|
305
|
+
each trace will include a ref_number field for inline citation use.
|
|
306
|
+
"""
|
|
307
|
+
traces = []
|
|
308
|
+
for t in b.tool_traces:
|
|
309
|
+
cid = getattr(t, "citation_id", None) or f"CIT-{b.block_id.split('_')[-1]}-01"
|
|
310
|
+
trace_data = {
|
|
311
|
+
"citation_id": cid,
|
|
312
|
+
"tool_type": t.tool_type,
|
|
313
|
+
"query": t.query,
|
|
314
|
+
"raw_answer": t.raw_answer, # Include complete original response
|
|
315
|
+
"summary": t.summary,
|
|
316
|
+
}
|
|
317
|
+
# Add ref_number if citation map is available
|
|
318
|
+
if hasattr(self, "_citation_map") and self._citation_map:
|
|
319
|
+
ref_num = self._citation_map.get(cid, 0)
|
|
320
|
+
if ref_num > 0:
|
|
321
|
+
trace_data["ref_number"] = ref_num
|
|
322
|
+
traces.append(trace_data)
|
|
323
|
+
return {
|
|
324
|
+
"block_id": b.block_id,
|
|
325
|
+
"sub_topic": b.sub_topic,
|
|
326
|
+
"overview": b.overview,
|
|
327
|
+
"traces": traces,
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
def _build_citation_table(self, block: TopicBlock) -> str:
|
|
331
|
+
"""Build a clear citation reference table for LLM to understand the mapping
|
|
332
|
+
|
|
333
|
+
This creates an easy-to-read table showing:
|
|
334
|
+
- Reference number to use in text (use [N] format)
|
|
335
|
+
- Tool type
|
|
336
|
+
- Query summary (truncated)
|
|
337
|
+
|
|
338
|
+
Args:
|
|
339
|
+
block: TopicBlock containing tool traces
|
|
340
|
+
|
|
341
|
+
Returns:
|
|
342
|
+
Formatted citation table string
|
|
343
|
+
"""
|
|
344
|
+
if not block.tool_traces:
|
|
345
|
+
return " (No citations available for this section)"
|
|
346
|
+
|
|
347
|
+
lines = []
|
|
348
|
+
for trace in block.tool_traces:
|
|
349
|
+
cid = getattr(trace, "citation_id", None)
|
|
350
|
+
if not cid:
|
|
351
|
+
continue
|
|
352
|
+
|
|
353
|
+
ref_num = self._citation_map.get(cid, 0) if hasattr(self, "_citation_map") else 0
|
|
354
|
+
if ref_num <= 0:
|
|
355
|
+
continue
|
|
356
|
+
|
|
357
|
+
# Truncate query for readability
|
|
358
|
+
query_preview = trace.query[:60] + "..." if len(trace.query) > 60 else trace.query
|
|
359
|
+
tool_display = {
|
|
360
|
+
"rag_naive": "RAG",
|
|
361
|
+
"rag_hybrid": "Hybrid RAG",
|
|
362
|
+
"query_item": "KB Query",
|
|
363
|
+
"paper_search": "Paper",
|
|
364
|
+
"web_search": "Web",
|
|
365
|
+
"run_code": "Code",
|
|
366
|
+
}.get(trace.tool_type.lower(), trace.tool_type)
|
|
367
|
+
|
|
368
|
+
# Use clear format: cite as [N] -> source description
|
|
369
|
+
lines.append(f" - Cite as [{ref_num}] â ({tool_display}) {query_preview}")
|
|
370
|
+
|
|
371
|
+
if not lines:
|
|
372
|
+
return " (No citations available for this section)"
|
|
373
|
+
|
|
374
|
+
return "\n".join(lines)
|
|
375
|
+
|
|
376
|
+
async def _write_introduction(
|
|
377
|
+
self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
|
|
378
|
+
) -> str:
|
|
379
|
+
"""Write report introduction section"""
|
|
380
|
+
system_prompt = self.get_prompt(
|
|
381
|
+
"system",
|
|
382
|
+
"role",
|
|
383
|
+
"You are an academic writing expert specializing in writing the introduction section of research reports.",
|
|
384
|
+
)
|
|
385
|
+
tmpl = self.get_prompt("process", "write_introduction", "")
|
|
386
|
+
if not tmpl:
|
|
387
|
+
raise ValueError(
|
|
388
|
+
"Cannot get introduction writing prompt template, report generation failed"
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
import json as _json
|
|
392
|
+
|
|
393
|
+
# Prepare context for introduction: overview information of all topics
|
|
394
|
+
topics_summary = []
|
|
395
|
+
for b in blocks:
|
|
396
|
+
topics_summary.append(
|
|
397
|
+
{"sub_topic": b.sub_topic, "overview": b.overview, "tool_count": len(b.tool_traces)}
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
# Use introduction_instruction if available, otherwise fall back to introduction title
|
|
401
|
+
intro_instruction = outline.get("introduction_instruction", "") or outline.get(
|
|
402
|
+
"introduction", ""
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
# Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
|
|
406
|
+
topics_summary_json = _json.dumps(topics_summary, ensure_ascii=False, indent=2)
|
|
407
|
+
filled = self._safe_format(
|
|
408
|
+
tmpl,
|
|
409
|
+
topic=topic,
|
|
410
|
+
introduction_instruction=intro_instruction,
|
|
411
|
+
topics_summary=topics_summary_json,
|
|
412
|
+
total_topics=len(blocks),
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
resp = await self.call_llm(filled, system_prompt, stage="write_introduction", verbose=False)
|
|
416
|
+
data = extract_json_from_text(resp)
|
|
417
|
+
|
|
418
|
+
try:
|
|
419
|
+
obj = ensure_json_dict(data)
|
|
420
|
+
ensure_keys(obj, ["introduction"])
|
|
421
|
+
intro = obj.get("introduction", "")
|
|
422
|
+
if isinstance(intro, str) and intro.strip():
|
|
423
|
+
return intro
|
|
424
|
+
raise ValueError("LLM returned empty or invalid introduction field")
|
|
425
|
+
except Exception as e:
|
|
426
|
+
raise ValueError(
|
|
427
|
+
f"Unable to parse LLM returned introduction content: {e!s}. Report generation failed."
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
async def _write_section_body(
|
|
431
|
+
self, topic: str, block: TopicBlock, section_outline: dict[str, Any]
|
|
432
|
+
) -> str:
|
|
433
|
+
"""Write main content of a single section"""
|
|
434
|
+
system_prompt = self.get_prompt(
|
|
435
|
+
"system",
|
|
436
|
+
"role",
|
|
437
|
+
"You are an academic writing expert specializing in writing chapter content for research reports.",
|
|
438
|
+
)
|
|
439
|
+
tmpl = self.get_prompt("process", "write_section_body", "")
|
|
440
|
+
if not tmpl:
|
|
441
|
+
raise ValueError("Cannot get section writing prompt template, report generation failed")
|
|
442
|
+
|
|
443
|
+
import json as _json
|
|
444
|
+
|
|
445
|
+
block_data = self._ser_block(block)
|
|
446
|
+
|
|
447
|
+
# Dynamically build citation instructions based on configuration
|
|
448
|
+
if self.enable_inline_citations:
|
|
449
|
+
# Build clear citation reference table for this block
|
|
450
|
+
citation_table = self._build_citation_table(block)
|
|
451
|
+
|
|
452
|
+
citation_instruction_template = self.get_prompt("citation", "enabled_instruction")
|
|
453
|
+
if citation_instruction_template:
|
|
454
|
+
citation_instruction = citation_instruction_template.format(
|
|
455
|
+
citation_table=citation_table
|
|
456
|
+
)
|
|
457
|
+
else:
|
|
458
|
+
# Fallback if YAML not configured
|
|
459
|
+
citation_instruction = f"**Citation Reference Table**:\n{citation_table}"
|
|
460
|
+
citation_output_hint = ", citations"
|
|
461
|
+
else:
|
|
462
|
+
citation_instruction = self.get_prompt("citation", "disabled_instruction") or ""
|
|
463
|
+
citation_output_hint = ""
|
|
464
|
+
|
|
465
|
+
# Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
|
|
466
|
+
block_data_json = _json.dumps(block_data, ensure_ascii=False, indent=2)
|
|
467
|
+
filled = self._safe_format(
|
|
468
|
+
tmpl,
|
|
469
|
+
topic=topic,
|
|
470
|
+
section_title=section_outline.get("title", block.sub_topic),
|
|
471
|
+
section_instruction=section_outline.get("instruction", ""),
|
|
472
|
+
block_data=block_data_json,
|
|
473
|
+
min_section_length=self.reporting_config.get("min_section_length", 500),
|
|
474
|
+
citation_instruction=citation_instruction,
|
|
475
|
+
citation_output_hint=citation_output_hint,
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
resp = await self.call_llm(filled, system_prompt, stage="write_section_body", verbose=False)
|
|
479
|
+
data = extract_json_from_text(resp)
|
|
480
|
+
|
|
481
|
+
try:
|
|
482
|
+
obj = ensure_json_dict(data)
|
|
483
|
+
ensure_keys(obj, ["section_content"])
|
|
484
|
+
content = obj.get("section_content", "")
|
|
485
|
+
if isinstance(content, str) and content.strip():
|
|
486
|
+
return content
|
|
487
|
+
raise ValueError("LLM returned empty or invalid section_content field")
|
|
488
|
+
except Exception as e:
|
|
489
|
+
raise ValueError(
|
|
490
|
+
f"Unable to parse LLM returned section content: {e!s}. Report generation failed."
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
async def _write_conclusion(
|
|
494
|
+
self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
|
|
495
|
+
) -> str:
|
|
496
|
+
"""Write report conclusion section"""
|
|
497
|
+
system_prompt = self.get_prompt(
|
|
498
|
+
"system",
|
|
499
|
+
"role",
|
|
500
|
+
"You are an academic writing expert specializing in writing the conclusion section of research reports.",
|
|
501
|
+
)
|
|
502
|
+
tmpl = self.get_prompt("process", "write_conclusion", "")
|
|
503
|
+
if not tmpl:
|
|
504
|
+
raise ValueError(
|
|
505
|
+
"Cannot get conclusion writing prompt template, report generation failed"
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
import json as _json
|
|
509
|
+
|
|
510
|
+
# Prepare context for conclusion: key findings of all topics
|
|
511
|
+
topics_findings = []
|
|
512
|
+
for b in blocks:
|
|
513
|
+
findings = {
|
|
514
|
+
"sub_topic": b.sub_topic,
|
|
515
|
+
"overview": b.overview,
|
|
516
|
+
"key_findings": [
|
|
517
|
+
t.summary for t in b.tool_traces[:3]
|
|
518
|
+
], # Top 3 key findings for each topic
|
|
519
|
+
}
|
|
520
|
+
topics_findings.append(findings)
|
|
521
|
+
|
|
522
|
+
# Use conclusion_instruction if available, otherwise fall back to conclusion title
|
|
523
|
+
conclusion_instruction = outline.get("conclusion_instruction", "") or outline.get(
|
|
524
|
+
"conclusion", ""
|
|
525
|
+
)
|
|
526
|
+
|
|
527
|
+
# Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
|
|
528
|
+
topics_findings_json = _json.dumps(topics_findings, ensure_ascii=False, indent=2)
|
|
529
|
+
filled = self._safe_format(
|
|
530
|
+
tmpl,
|
|
531
|
+
topic=topic,
|
|
532
|
+
conclusion_instruction=conclusion_instruction,
|
|
533
|
+
topics_findings=topics_findings_json,
|
|
534
|
+
total_topics=len(blocks),
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
resp = await self.call_llm(filled, system_prompt, stage="write_conclusion", verbose=False)
|
|
538
|
+
data = extract_json_from_text(resp)
|
|
539
|
+
|
|
540
|
+
try:
|
|
541
|
+
obj = ensure_json_dict(data)
|
|
542
|
+
ensure_keys(obj, ["conclusion"])
|
|
543
|
+
conclusion = obj.get("conclusion", "")
|
|
544
|
+
if isinstance(conclusion, str) and conclusion.strip():
|
|
545
|
+
return conclusion
|
|
546
|
+
raise ValueError("LLM returned empty or invalid conclusion field")
|
|
547
|
+
except Exception as e:
|
|
548
|
+
raise ValueError(
|
|
549
|
+
f"Unable to parse LLM returned conclusion content: {e!s}. Report generation failed."
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
def _build_citation_number_map(self, blocks: list[TopicBlock]) -> dict[str, int]:
|
|
553
|
+
"""Build citation_id to reference number mapping with deduplication
|
|
554
|
+
|
|
555
|
+
This method delegates to CitationManager for unified mapping logic.
|
|
556
|
+
The mapping is built once and cached in CitationManager.
|
|
557
|
+
|
|
558
|
+
Returns:
|
|
559
|
+
Dictionary mapping citation_id (e.g., "CIT-1-01") to reference number (e.g., 1)
|
|
560
|
+
"""
|
|
561
|
+
if self.citation_manager:
|
|
562
|
+
# Use CitationManager's unified mapping (single source of truth)
|
|
563
|
+
return self.citation_manager.build_ref_number_map()
|
|
564
|
+
|
|
565
|
+
# Fallback: build from blocks when no CitationManager available
|
|
566
|
+
citation_map = {}
|
|
567
|
+
|
|
568
|
+
def extract_citation_number(cit_id):
|
|
569
|
+
try:
|
|
570
|
+
if cit_id.startswith("PLAN-"):
|
|
571
|
+
num = int(cit_id.replace("PLAN-", ""))
|
|
572
|
+
return (0, 0, num)
|
|
573
|
+
parts_list = cit_id.replace("CIT-", "").split("-")
|
|
574
|
+
if len(parts_list) == 2:
|
|
575
|
+
return (1, int(parts_list[0]), int(parts_list[1]))
|
|
576
|
+
except:
|
|
577
|
+
pass
|
|
578
|
+
return (999, 999, 999)
|
|
579
|
+
|
|
580
|
+
all_citations = []
|
|
581
|
+
for block in blocks:
|
|
582
|
+
if block.tool_traces:
|
|
583
|
+
for trace in block.tool_traces:
|
|
584
|
+
citation_id = getattr(trace, "citation_id", None)
|
|
585
|
+
if citation_id and citation_id not in [c["citation_id"] for c in all_citations]:
|
|
586
|
+
all_citations.append({"citation_id": citation_id})
|
|
587
|
+
|
|
588
|
+
all_citations.sort(key=lambda x: extract_citation_number(x["citation_id"]))
|
|
589
|
+
|
|
590
|
+
for idx, cit in enumerate(all_citations, 1):
|
|
591
|
+
citation_map[cit["citation_id"]] = idx
|
|
592
|
+
|
|
593
|
+
return citation_map
|
|
594
|
+
|
|
595
|
+
def _generate_references(self, blocks: list[TopicBlock]) -> str:
|
|
596
|
+
"""Generate References section"""
|
|
597
|
+
parts = ["## References\n"]
|
|
598
|
+
|
|
599
|
+
# If using CitationManager, generate from JSON file
|
|
600
|
+
if self.citation_manager:
|
|
601
|
+
return self._generate_references_from_manager(blocks)
|
|
602
|
+
|
|
603
|
+
# Otherwise use original method of extracting from blocks (backward compatible)
|
|
604
|
+
return self._generate_references_from_blocks(blocks)
|
|
605
|
+
|
|
606
|
+
def _get_citation_dedup_key(self, citation: dict, paper: dict = None) -> str:
|
|
607
|
+
"""Generate unique key for citation deduplication
|
|
608
|
+
|
|
609
|
+
Args:
|
|
610
|
+
citation: The citation dict
|
|
611
|
+
paper: Optional paper dict for paper_search citations
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
Unique string key for deduplication
|
|
615
|
+
"""
|
|
616
|
+
tool_type = citation.get("tool_type", "").lower()
|
|
617
|
+
|
|
618
|
+
if tool_type == "paper_search" and paper:
|
|
619
|
+
# For papers: use title + first author (normalized)
|
|
620
|
+
title = paper.get("title", "").lower().strip()
|
|
621
|
+
authors = paper.get("authors", "").lower().strip()
|
|
622
|
+
# Extract first author if multiple
|
|
623
|
+
first_author = authors.split(",")[0].strip() if authors else ""
|
|
624
|
+
return f"paper:{title}|{first_author}"
|
|
625
|
+
elif tool_type == "paper_search":
|
|
626
|
+
# Fallback for paper_search without paper dict
|
|
627
|
+
title = citation.get("title", "").lower().strip()
|
|
628
|
+
authors = citation.get("authors", "").lower().strip()
|
|
629
|
+
first_author = authors.split(",")[0].strip() if authors else ""
|
|
630
|
+
return f"paper:{title}|{first_author}"
|
|
631
|
+
else:
|
|
632
|
+
# For RAG/web_search/etc: use tool_type + query (normalized)
|
|
633
|
+
query = citation.get("query", "").lower().strip()
|
|
634
|
+
# Use first 100 chars of query for dedup
|
|
635
|
+
return f"{tool_type}:{query[:100]}"
|
|
636
|
+
|
|
637
|
+
def _generate_references_from_manager(self, blocks: list[TopicBlock]) -> str:
|
|
638
|
+
"""Generate References section from CitationManager in academic paper style
|
|
639
|
+
|
|
640
|
+
Uses CitationManager's ref_number_map to ensure consistency between
|
|
641
|
+
in-text citations and the References section.
|
|
642
|
+
|
|
643
|
+
Format:
|
|
644
|
+
- Ordered by reference number (consistent with in-text citations)
|
|
645
|
+
- Paper citations: APA format
|
|
646
|
+
- RAG/Query citations: Tool name, query, summary
|
|
647
|
+
- Web search: Tool name, query, summary + collapsible links
|
|
648
|
+
"""
|
|
649
|
+
parts = ["## References\n\n"]
|
|
650
|
+
|
|
651
|
+
# Get all citations and the ref_number_map
|
|
652
|
+
all_citations = self.citation_manager.get_all_citations()
|
|
653
|
+
|
|
654
|
+
if not all_citations:
|
|
655
|
+
return "## References\n\n*No citations available.*\n"
|
|
656
|
+
|
|
657
|
+
# Get the ref_number_map from CitationManager (single source of truth)
|
|
658
|
+
ref_map = self.citation_manager.get_ref_number_map()
|
|
659
|
+
|
|
660
|
+
# Build reverse map: ref_number -> (citation_id, paper_idx or None)
|
|
661
|
+
# This groups citations by their ref_number for consistent output
|
|
662
|
+
ref_to_citations: dict[int, list[tuple[str, dict, dict | None]]] = {}
|
|
663
|
+
|
|
664
|
+
for citation_id, citation in all_citations.items():
|
|
665
|
+
tool_type = citation.get("tool_type", "").lower()
|
|
666
|
+
|
|
667
|
+
if tool_type == "paper_search":
|
|
668
|
+
papers = citation.get("papers", [])
|
|
669
|
+
if papers:
|
|
670
|
+
for paper_idx, paper in enumerate(papers):
|
|
671
|
+
# Check if this paper has a ref_number
|
|
672
|
+
paper_ref_key = f"{citation_id}-{paper_idx + 1}"
|
|
673
|
+
ref_num = ref_map.get(paper_ref_key) or ref_map.get(citation_id, 0)
|
|
674
|
+
if ref_num > 0:
|
|
675
|
+
if ref_num not in ref_to_citations:
|
|
676
|
+
ref_to_citations[ref_num] = []
|
|
677
|
+
ref_to_citations[ref_num].append((citation_id, citation, paper))
|
|
678
|
+
else:
|
|
679
|
+
ref_num = ref_map.get(citation_id, 0)
|
|
680
|
+
if ref_num > 0:
|
|
681
|
+
if ref_num not in ref_to_citations:
|
|
682
|
+
ref_to_citations[ref_num] = []
|
|
683
|
+
ref_to_citations[ref_num].append((citation_id, citation, None))
|
|
684
|
+
else:
|
|
685
|
+
ref_num = ref_map.get(citation_id, 0)
|
|
686
|
+
if ref_num > 0:
|
|
687
|
+
if ref_num not in ref_to_citations:
|
|
688
|
+
ref_to_citations[ref_num] = []
|
|
689
|
+
ref_to_citations[ref_num].append((citation_id, citation, None))
|
|
690
|
+
|
|
691
|
+
# Generate references in order of ref_number
|
|
692
|
+
for ref_num in sorted(ref_to_citations.keys()):
|
|
693
|
+
entries = ref_to_citations[ref_num]
|
|
694
|
+
if not entries:
|
|
695
|
+
continue
|
|
696
|
+
|
|
697
|
+
# Use the first entry for this ref_number (others are duplicates)
|
|
698
|
+
citation_id, citation, paper = entries[0]
|
|
699
|
+
tool_type = citation.get("tool_type", "").lower()
|
|
700
|
+
|
|
701
|
+
anchor = f"ref-{ref_num}"
|
|
702
|
+
parts.append(f'<a id="{anchor}"></a>**[{ref_num}]** ')
|
|
703
|
+
|
|
704
|
+
if tool_type == "paper_search":
|
|
705
|
+
if paper:
|
|
706
|
+
formatted = self._format_single_paper_apa(paper)
|
|
707
|
+
else:
|
|
708
|
+
formatted = self._format_paper_citation_apa(citation)
|
|
709
|
+
parts.append(formatted)
|
|
710
|
+
elif tool_type == "web_search":
|
|
711
|
+
formatted = self._format_web_search_citation(citation)
|
|
712
|
+
parts.append(formatted)
|
|
713
|
+
elif tool_type in ("rag_naive", "rag_hybrid", "query_item"):
|
|
714
|
+
formatted = self._format_rag_citation(citation)
|
|
715
|
+
parts.append(formatted)
|
|
716
|
+
elif tool_type == "run_code":
|
|
717
|
+
formatted = self._format_code_citation(citation)
|
|
718
|
+
parts.append(formatted)
|
|
719
|
+
else:
|
|
720
|
+
# Generic format
|
|
721
|
+
query = citation.get("query", "")
|
|
722
|
+
summary = citation.get("summary", "")
|
|
723
|
+
parts.append(f"**{tool_type}**\n\n")
|
|
724
|
+
parts.append(f"- **Query**: {query}\n")
|
|
725
|
+
if summary:
|
|
726
|
+
clean_summary = self._strip_markdown(summary)
|
|
727
|
+
parts.append(
|
|
728
|
+
f"- **Summary**: {clean_summary[:300]}{'...' if len(clean_summary) > 300 else ''}\n"
|
|
729
|
+
)
|
|
730
|
+
|
|
731
|
+
parts.append("\n\n")
|
|
732
|
+
|
|
733
|
+
return "".join(parts)
|
|
734
|
+
|
|
735
|
+
def _format_single_paper_apa(self, paper: dict) -> str:
|
|
736
|
+
"""Format a single paper in APA style
|
|
737
|
+
|
|
738
|
+
Format: Authors (Year). *Title*. Venue. arXiv:ID. URL
|
|
739
|
+
"""
|
|
740
|
+
authors = paper.get("authors", "Unknown Author")
|
|
741
|
+
year = paper.get("year", "n.d.")
|
|
742
|
+
title = paper.get("title", "Untitled")
|
|
743
|
+
url = paper.get("url", "")
|
|
744
|
+
arxiv_id = paper.get("arxiv_id", "")
|
|
745
|
+
venue = paper.get("venue", "")
|
|
746
|
+
doi = paper.get("doi", "")
|
|
747
|
+
|
|
748
|
+
# APA format
|
|
749
|
+
result = f"{authors} ({year}). *{title}*."
|
|
750
|
+
if venue:
|
|
751
|
+
result += f" {venue}."
|
|
752
|
+
if arxiv_id:
|
|
753
|
+
result += f" arXiv:{arxiv_id}."
|
|
754
|
+
if doi:
|
|
755
|
+
result += f" https://doi.org/{doi}"
|
|
756
|
+
elif url:
|
|
757
|
+
result += f" {url}"
|
|
758
|
+
|
|
759
|
+
return result
|
|
760
|
+
|
|
761
|
+
def _format_paper_citation_apa(self, citation: dict) -> str:
|
|
762
|
+
"""Format paper citation in APA style (fallback for citations without papers array)
|
|
763
|
+
|
|
764
|
+
Format: Authors (Year). *Title*. Venue. arXiv:ID. URL
|
|
765
|
+
"""
|
|
766
|
+
# Use top-level fields (backward compatibility)
|
|
767
|
+
authors = citation.get("authors", "Unknown Author")
|
|
768
|
+
year = citation.get("year", "n.d.")
|
|
769
|
+
title = citation.get("title", "Untitled")
|
|
770
|
+
url = citation.get("url", "")
|
|
771
|
+
arxiv_id = citation.get("arxiv_id", "")
|
|
772
|
+
venue = citation.get("venue", "")
|
|
773
|
+
doi = citation.get("doi", "")
|
|
774
|
+
|
|
775
|
+
result = f"{authors} ({year}). *{title}*."
|
|
776
|
+
if venue:
|
|
777
|
+
result += f" {venue}."
|
|
778
|
+
if arxiv_id:
|
|
779
|
+
result += f" arXiv:{arxiv_id}."
|
|
780
|
+
if doi:
|
|
781
|
+
result += f" https://doi.org/{doi}"
|
|
782
|
+
elif url:
|
|
783
|
+
result += f" {url}"
|
|
784
|
+
return result
|
|
785
|
+
|
|
786
|
+
def _format_web_search_citation(self, citation: dict) -> str:
|
|
787
|
+
"""Format web search citation with collapsible links"""
|
|
788
|
+
query = citation.get("query", "")
|
|
789
|
+
summary = citation.get("summary", "")
|
|
790
|
+
web_sources = citation.get("web_sources", [])
|
|
791
|
+
|
|
792
|
+
result = "**Web Search**\n\n"
|
|
793
|
+
result += f"- **Query**: {query}\n"
|
|
794
|
+
if summary:
|
|
795
|
+
# Clean summary to avoid markdown rendering issues
|
|
796
|
+
clean_summary = self._strip_markdown(summary)
|
|
797
|
+
summary_text = clean_summary[:300] + ("..." if len(clean_summary) > 300 else "")
|
|
798
|
+
result += f"- **Summary**: {summary_text}\n"
|
|
799
|
+
|
|
800
|
+
# Add collapsible links section
|
|
801
|
+
if web_sources:
|
|
802
|
+
result += "\n<details>\n<summary>đ Retrieved Sources ({} links)</summary>\n\n".format(
|
|
803
|
+
len(web_sources)
|
|
804
|
+
)
|
|
805
|
+
for i, source in enumerate(web_sources, 1):
|
|
806
|
+
title = source.get("title", "Untitled")
|
|
807
|
+
url = source.get("url", "")
|
|
808
|
+
snippet = source.get("snippet", "")
|
|
809
|
+
if url:
|
|
810
|
+
result += f"{i}. [{title}]({url})"
|
|
811
|
+
if snippet:
|
|
812
|
+
clean_snippet = self._strip_markdown(snippet)
|
|
813
|
+
result += f"\n > {clean_snippet[:150]}{'...' if len(clean_snippet) > 150 else ''}"
|
|
814
|
+
result += "\n\n"
|
|
815
|
+
result += "</details>"
|
|
816
|
+
|
|
817
|
+
return result
|
|
818
|
+
|
|
819
|
+
def _format_rag_citation(self, citation: dict) -> str:
|
|
820
|
+
"""Format RAG/Query citation"""
|
|
821
|
+
tool_type = citation.get("tool_type", "")
|
|
822
|
+
query = citation.get("query", "")
|
|
823
|
+
summary = citation.get("summary", "")
|
|
824
|
+
kb_name = citation.get("kb_name", "")
|
|
825
|
+
sources = citation.get("sources", [])
|
|
826
|
+
|
|
827
|
+
# Tool name display
|
|
828
|
+
tool_display = {
|
|
829
|
+
"rag_naive": "RAG Retrieval",
|
|
830
|
+
"rag_hybrid": "Hybrid RAG Retrieval",
|
|
831
|
+
"query_item": "Knowledge Base Query",
|
|
832
|
+
}.get(tool_type, tool_type)
|
|
833
|
+
|
|
834
|
+
result = f"**{tool_display}**"
|
|
835
|
+
if kb_name:
|
|
836
|
+
result += f" (KB: {kb_name})"
|
|
837
|
+
result += "\n\n"
|
|
838
|
+
result += f"- **Query**: {query}\n"
|
|
839
|
+
if summary:
|
|
840
|
+
# Clean summary: remove markdown formatting to avoid rendering issues
|
|
841
|
+
clean_summary = self._strip_markdown(summary)
|
|
842
|
+
summary_text = clean_summary[:300] + ("..." if len(clean_summary) > 300 else "")
|
|
843
|
+
result += f"- **Summary**: {summary_text}\n"
|
|
844
|
+
|
|
845
|
+
# Add source documents if available
|
|
846
|
+
if sources:
|
|
847
|
+
result += "\n<details>\n<summary>đ Source Documents ({} docs)</summary>\n\n".format(
|
|
848
|
+
len(sources)
|
|
849
|
+
)
|
|
850
|
+
for i, source in enumerate(sources, 1):
|
|
851
|
+
title = source.get("title", "") or source.get("source_file", f"Document {i}")
|
|
852
|
+
content = source.get("content_preview", "")
|
|
853
|
+
page = source.get("page", "")
|
|
854
|
+
result += f"{i}. **{title}**"
|
|
855
|
+
if page:
|
|
856
|
+
result += f" (Page {page})"
|
|
857
|
+
if content:
|
|
858
|
+
clean_content = self._strip_markdown(content)
|
|
859
|
+
result += (
|
|
860
|
+
f"\n > {clean_content[:150]}{'...' if len(clean_content) > 150 else ''}"
|
|
861
|
+
)
|
|
862
|
+
result += "\n\n"
|
|
863
|
+
result += "</details>"
|
|
864
|
+
|
|
865
|
+
return result
|
|
866
|
+
|
|
867
|
+
def _strip_markdown(self, text: str) -> str:
|
|
868
|
+
"""Strip markdown formatting from text to get plain text"""
|
|
869
|
+
import re
|
|
870
|
+
|
|
871
|
+
if not text:
|
|
872
|
+
return ""
|
|
873
|
+
|
|
874
|
+
# Remove bold/italic markers
|
|
875
|
+
text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) # **bold**
|
|
876
|
+
text = re.sub(r"\*([^*]+)\*", r"\1", text) # *italic*
|
|
877
|
+
text = re.sub(r"__([^_]+)__", r"\1", text) # __bold__
|
|
878
|
+
text = re.sub(r"_([^_]+)_", r"\1", text) # _italic_
|
|
879
|
+
|
|
880
|
+
# Remove headers
|
|
881
|
+
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
|
|
882
|
+
|
|
883
|
+
# Remove links but keep text
|
|
884
|
+
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
|
885
|
+
|
|
886
|
+
# Remove inline code
|
|
887
|
+
text = re.sub(r"`([^`]+)`", r"\1", text)
|
|
888
|
+
|
|
889
|
+
# Remove bullet points
|
|
890
|
+
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
|
|
891
|
+
|
|
892
|
+
# Remove numbered lists
|
|
893
|
+
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
|
|
894
|
+
|
|
895
|
+
# Remove blockquotes
|
|
896
|
+
text = re.sub(r"^>\s*", "", text, flags=re.MULTILINE)
|
|
897
|
+
|
|
898
|
+
# Normalize whitespace
|
|
899
|
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
900
|
+
text = re.sub(r" +", " ", text)
|
|
901
|
+
|
|
902
|
+
return text.strip()
|
|
903
|
+
|
|
904
|
+
def _format_code_citation(self, citation: dict) -> str:
|
|
905
|
+
"""Format code execution citation"""
|
|
906
|
+
query = citation.get("query", "") # This is usually the code
|
|
907
|
+
summary = citation.get("summary", "")
|
|
908
|
+
|
|
909
|
+
result = "**Code Execution**\n\n"
|
|
910
|
+
if query:
|
|
911
|
+
# Truncate long code
|
|
912
|
+
code_preview = query[:300] + ("..." if len(query) > 300 else "")
|
|
913
|
+
result += f"- **Code**: `{code_preview}`\n"
|
|
914
|
+
if summary:
|
|
915
|
+
summary_text = summary[:300] + ("..." if len(summary) > 300 else "")
|
|
916
|
+
result += f"- **Result**: {summary_text}\n"
|
|
917
|
+
|
|
918
|
+
return result
|
|
919
|
+
|
|
920
|
+
def _generate_references_from_blocks(self, blocks: list[TopicBlock]) -> str:
|
|
921
|
+
"""Generate References section from blocks (backward compatible, academic paper style)"""
|
|
922
|
+
parts = ["## References\n\n"]
|
|
923
|
+
|
|
924
|
+
# Collect all citations
|
|
925
|
+
all_citations = []
|
|
926
|
+
for block in blocks:
|
|
927
|
+
if block.tool_traces:
|
|
928
|
+
for trace in block.tool_traces:
|
|
929
|
+
citation_id = (
|
|
930
|
+
getattr(trace, "citation_id", None)
|
|
931
|
+
or f"CIT-{block.block_id.split('_')[-1]}-01"
|
|
932
|
+
)
|
|
933
|
+
all_citations.append(
|
|
934
|
+
{"citation_id": citation_id, "block": block, "trace": trace}
|
|
935
|
+
)
|
|
936
|
+
|
|
937
|
+
if not all_citations:
|
|
938
|
+
return "## References\n\n*No citations available.*\n"
|
|
939
|
+
|
|
940
|
+
# Sort by citation_id (extract numeric parts for sorting)
|
|
941
|
+
def extract_citation_number(cit_id):
|
|
942
|
+
try:
|
|
943
|
+
if cit_id.startswith("PLAN-"):
|
|
944
|
+
num = int(cit_id.replace("PLAN-", ""))
|
|
945
|
+
return (0, 0, num)
|
|
946
|
+
# CIT-X-XX format
|
|
947
|
+
parts_list = cit_id.replace("CIT-", "").split("-")
|
|
948
|
+
if len(parts_list) == 2:
|
|
949
|
+
return (1, int(parts_list[0]), int(parts_list[1]))
|
|
950
|
+
except:
|
|
951
|
+
pass
|
|
952
|
+
return (999, 999, 999)
|
|
953
|
+
|
|
954
|
+
all_citations.sort(key=lambda x: extract_citation_number(x["citation_id"]))
|
|
955
|
+
|
|
956
|
+
# Generate numbered references in academic paper style
|
|
957
|
+
# Using simple ref-N anchor format for clickable inline citations
|
|
958
|
+
for idx, cit in enumerate(all_citations, 1):
|
|
959
|
+
trace = cit["trace"]
|
|
960
|
+
citation_id = cit["citation_id"]
|
|
961
|
+
|
|
962
|
+
# Use simple ref-N anchor format (consistent with _generate_references_from_manager)
|
|
963
|
+
anchor = f"ref-{idx}"
|
|
964
|
+
tool_type = trace.tool_type.lower() if trace.tool_type else ""
|
|
965
|
+
|
|
966
|
+
# Tool name display
|
|
967
|
+
tool_display = {
|
|
968
|
+
"rag_naive": "RAG Retrieval",
|
|
969
|
+
"rag_hybrid": "Hybrid RAG Retrieval",
|
|
970
|
+
"query_item": "Knowledge Base Query",
|
|
971
|
+
"paper_search": "Paper Search",
|
|
972
|
+
"web_search": "Web Search",
|
|
973
|
+
"run_code": "Code Execution",
|
|
974
|
+
}.get(tool_type, tool_type)
|
|
975
|
+
|
|
976
|
+
parts.append(f'<a id="{anchor}"></a>**[{idx}]** **{tool_display}**\n\n')
|
|
977
|
+
parts.append(f"- **Query**: {trace.query}\n")
|
|
978
|
+
if trace.summary:
|
|
979
|
+
summary_text = trace.summary[:500] + ("..." if len(trace.summary) > 500 else "")
|
|
980
|
+
parts.append(f"- **Summary**: {summary_text}\n")
|
|
981
|
+
parts.append("\n")
|
|
982
|
+
|
|
983
|
+
return "".join(parts)
|
|
984
|
+
|
|
985
|
+
    def _convert_citation_format(self, text: str) -> str:
        """
        Convert various citation formats to clickable [[N]](#ref-N) format.

        Handles:
        - [N] format (simple number in brackets)
        - [ref=N] format (from citation table)

        Args:
            text: Text with citations in various formats

        Returns:
            Text with [[N]](#ref-N) clickable citations
        """
        import re

        # Get valid ref_numbers from the citation map
        valid_refs = set()
        if hasattr(self, "_citation_map") and self._citation_map:
            valid_refs = set(self._citation_map.values())

        def replace_citation(match):
            # Get the number from the match
            ref_num = match.group(1)

            # Only convert if it's a valid ref_number
            try:
                num = int(ref_num)
                if num in valid_refs:
                    return f"[[{ref_num}]](#ref-{ref_num})"
            except ValueError:
                pass

            # Return unchanged if not a valid reference
            return match.group(0)

        # First, convert [ref=N] format to clickable format
        # Pattern: [ref=N] where N is a number
        ref_pattern = r"\[ref=(\d+)\]"
        text = re.sub(ref_pattern, replace_citation, text)

        # Then, convert simple [N] format (but NOT already converted [[N]])
        # Pattern to match [N] where N is a number, but NOT already in [[N]] format
        # Use negative lookbehind and lookahead to avoid matching [[N]] or [N](#ref-N)
        simple_pattern = r"(?<!\[)\[(\d+)\](?!\(#ref-)"
        text = re.sub(simple_pattern, replace_citation, text)

        return text

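The two substitution passes can be exercised on their own. The regex patterns below are the same ones used in _convert_citation_format; the sample text and the set of valid reference numbers are invented:

import re

valid_refs = {1, 2}  # invented; normally derived from self._citation_map.values()


def to_clickable(match: re.Match) -> str:
    n = match.group(1)
    return f"[[{n}]](#ref-{n})" if int(n) in valid_refs else match.group(0)


text = "Earlier work [ref=1] and a follow-up [2] agree; [7] has no entry in the map."
text = re.sub(r"\[ref=(\d+)\]", to_clickable, text)               # [ref=N] pass
text = re.sub(r"(?<!\[)\[(\d+)\](?!\(#ref-)", to_clickable, text)  # plain [N] pass
print(text)
# Earlier work [[1]](#ref-1) and a follow-up [[2]](#ref-2) agree; [7] has no entry in the map.
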
    def _validate_and_fix_citations(self, text: str) -> tuple[str, dict]:
        """
        Validate citations in text and fix invalid ones.

        Args:
            text: Text with citations

        Returns:
            Tuple of (fixed_text, validation_result)
        """
        import re

        # Get valid ref_numbers
        valid_refs = set()
        if hasattr(self, "_citation_map") and self._citation_map:
            valid_refs = set(self._citation_map.values())

        # Find all citations in [[N]](#ref-N) format
        pattern = r"\[\[(\d+)\]\]\(#ref-\d+\)"
        found_citations = re.findall(pattern, text)

        valid = []
        invalid = []

        for ref in found_citations:
            try:
                num = int(ref)
                if num in valid_refs:
                    valid.append(num)
                else:
                    invalid.append(num)
            except ValueError:
                invalid.append(ref)

        # Remove invalid citations
        if invalid:

            def remove_invalid(match):
                ref_num = match.group(1)
                try:
                    num = int(ref_num)
                    if num not in valid_refs:
                        return ""  # Remove invalid citation
                except ValueError:
                    return ""
                return match.group(0)

            text = re.sub(pattern, remove_invalid, text)

        validation_result = {
            "valid_citations": valid,
            "invalid_citations": invalid,
            "is_valid": len(invalid) == 0,
            "total_found": len(found_citations),
        }

        return text, validation_result

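A matching sketch of the validation pass: every [[N]](#ref-N) citation is collected, and any N outside the valid set is stripped from the text. The sample report and valid set are again invented:

import re

valid_refs = {1, 3}  # invented for illustration
pattern = r"\[\[(\d+)\]\]\(#ref-\d+\)"
report = "Claim A [[1]](#ref-1), claim B [[2]](#ref-2), claim C [[3]](#ref-3)."

found = [int(n) for n in re.findall(pattern, report)]
invalid = [n for n in found if n not in valid_refs]
cleaned = re.sub(
    pattern,
    lambda m: m.group(0) if int(m.group(1)) in valid_refs else "",
    report,
)
print(invalid)  # [2]
print(cleaned)  # Claim A [[1]](#ref-1), claim B , claim C [[3]](#ref-3).
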
    async def _write_report(
        self, topic: str, blocks: list[TopicBlock], outline: dict[str, Any]
    ) -> str:
        """Write complete report using step-by-step method with three-level heading support"""
        parts = []

        # Build citation number map before writing (for consistent ref_number in traces)
        if self.enable_inline_citations:
            self._citation_map = self._build_citation_number_map(blocks)
            print(f" Built citation map with {len(self._citation_map)} entries")
        else:
            self._citation_map = {}

        # 1. Add main title (from outline, or use topic if not available)
        title = outline.get("title", f"# {topic}")
        if not title.startswith("#"):
            title = f"# {title}"
        parts.append(f"{title}\n\n")

        # 2. Write introduction
        print(" Writing introduction...")
        self._notify_progress(
            getattr(self, "_progress_callback", None),
            "writing_section",
            current_section="Introduction",
            section_index=0,
            total_sections=len(outline.get("sections", [])) + 2,  # +2 for intro and conclusion
        )
        introduction = await self._write_introduction(topic, blocks, outline)
        # Get introduction title from outline, or use default if not available
        intro_title = outline.get("introduction", "## Introduction")
        if not intro_title.startswith("##"):
            intro_title = f"## {intro_title}"
        parts.append(f"{intro_title}\n\n")
        parts.append(introduction)
        parts.append("\n\n")

        # 3. Write each section with subsection support
        sections = outline.get("sections", [])
        for i, section in enumerate(sections, 1):
            block_id = section.get("block_id")
            block = next((b for b in blocks if b.block_id == block_id), None)
            if not block:
                print(
                    f" ⚠️ Warning: Cannot find topic block with block_id={block_id}, skipping this section"
                )
                continue

            section_title = section.get("title", block.sub_topic)
            # Clean section title for display (remove markdown markers)
            display_title = section_title.replace("##", "").strip()
            print(f" Writing section {i}/{len(sections)}: {section_title}...")
            self._notify_progress(
                getattr(self, "_progress_callback", None),
                "writing_section",
                current_section=display_title,
                section_index=i,  # 1-based, after introduction
                total_sections=len(sections) + 2,  # +2 for intro and conclusion
            )

            # Check if section has subsections defined in outline
            subsections = section.get("subsections", [])

            if subsections:
                # Write section with explicit subsection structure
                section_content = await self._write_section_with_subsections(
                    topic, block, section, subsections
                )
            else:
                # Write section normally (LLM will generate its own subsection structure)
                section_content = await self._write_section_body(topic, block, section)

            # Section content already includes ## level title, append directly
            parts.append(section_content)
            parts.append("\n\n")

        # 4. Write conclusion
        print(" Writing conclusion...")
        total_sections = len(sections) + 2
        self._notify_progress(
            getattr(self, "_progress_callback", None),
            "writing_section",
            current_section="Conclusion",
            section_index=total_sections - 1,  # Last section
            total_sections=total_sections,
        )
        conclusion = await self._write_conclusion(topic, blocks, outline)
        # Get conclusion title from outline, or use default if not available
        conclusion_title = outline.get("conclusion", "## Conclusion")
        if not conclusion_title.startswith("##"):
            conclusion_title = f"## {conclusion_title}"
        parts.append(f"{conclusion_title}\n\n")
        parts.append(conclusion)
        parts.append("\n\n")

        # 5. Generate References based on configuration
        if self.enable_citation_list:
            print(" Generating citation list...")
            references = self._generate_references(blocks)
            parts.append(references)
        else:
            print(" ℹ️ Citation list disabled, skipping generation")

        # Combine all parts
        report = "".join(parts)

        # 6. Post-process citations (convert [N] to [[N]](#ref-N) format)
        if self.enable_inline_citations:
            print(" Converting citation format...")
            report = self._convert_citation_format(report)

            # Validate and fix invalid citations
            print(" Validating citations...")
            report, validation = self._validate_and_fix_citations(report)

            if not validation["is_valid"]:
                print(
                    f" ⚠️ Removed {len(validation['invalid_citations'])} invalid citations: {validation['invalid_citations']}"
                )
            else:
                print(f" All {validation['total_found']} citations are valid")

        return report

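_write_report reports progress through _notify_progress with "writing_section" events carrying current_section, section_index, and total_sections. A minimal consumer for those events might look like the sketch below; how the callback is attached to the agent (here via the _progress_callback attribute that the getattr calls above read) is an assumption about the surrounding wiring:

# Minimal consumer for the "writing_section" events emitted above.
def print_progress(event: dict) -> None:
    if event.get("status") == "writing_section":
        idx = event.get("section_index", 0)
        total = event.get("total_sections", "?")
        print(f"[{idx + 1}/{total}] {event.get('current_section')}")


# Hypothetical wiring: agent._progress_callback = print_progress
print_progress(
    {"status": "writing_section", "current_section": "Introduction",
     "section_index": 0, "total_sections": 5}
)
# [1/5] Introduction
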
    async def _write_section_with_subsections(
        self,
        topic: str,
        block: TopicBlock,
        section: dict[str, Any],
        subsections: list[dict[str, Any]],
    ) -> str:
        """Write a section that has explicitly defined subsections in the outline

        This method writes the section as a whole, passing subsection structure to the LLM
        to guide the content organization while maintaining coherence.
        """
        import json as _json

        # Enhance section instruction with subsection information
        subsection_info = []
        for j, sub in enumerate(subsections, 1):
            subsection_info.append(
                {
                    "title": sub.get("title", f"### Subsection {j}"),
                    "instruction": sub.get("instruction", ""),
                }
            )

        # Create enhanced section data with subsection guidance
        enhanced_section = {
            "title": section.get("title", block.sub_topic),
            "instruction": section.get("instruction", ""),
            "subsection_structure": subsection_info,
        }

        # Prepare block data with subsection hints
        block_data = self._ser_block(block)
        block_data["expected_subsections"] = subsection_info

        system_prompt = self.get_prompt(
            "system",
            "role",
            "You are an academic writing expert specializing in writing comprehensive research report sections with structured subsections.",
        )
        tmpl = self.get_prompt("process", "write_section_body", "")
        if not tmpl:
            raise ValueError("Cannot get section writing prompt template, report generation failed")

        # Build enhanced instruction including subsection structure
        section_instruction = section.get("instruction", "")
        if subsection_info:
            subsection_guide = "\n\n**Expected subsection structure:**\n"
            for sub in subsection_info:
                subsection_guide += f"- {sub['title']}: {sub['instruction']}\n"
            section_instruction += subsection_guide

        # Dynamically build citation instructions based on configuration
        if self.enable_inline_citations:
            # Build clear citation reference table for this block
            citation_table = self._build_citation_table(block)

            citation_instruction_template = self.get_prompt("citation", "enabled_instruction")
            if citation_instruction_template:
                citation_instruction = citation_instruction_template.format(
                    citation_table=citation_table
                )
            else:
                # Fallback if YAML not configured
                citation_instruction = f"**Citation Reference Table**:\n{citation_table}"
            citation_output_hint = ", citations"
        else:
            citation_instruction = self.get_prompt("citation", "disabled_instruction") or ""
            citation_output_hint = ""

        # Use safe_format to avoid conflicts with LaTeX braces like {\rho}, {L}
        block_data_json = _json.dumps(block_data, ensure_ascii=False, indent=2)
        filled = self._safe_format(
            tmpl,
            topic=topic,
            section_title=section.get("title", block.sub_topic),
            section_instruction=section_instruction,
            block_data=block_data_json,
            min_section_length=self.reporting_config.get("min_section_length", 800),
            citation_instruction=citation_instruction,
            citation_output_hint=citation_output_hint,
        )

        # TODO Implement retry logic for LLM calls when JSON parsing or post-processing fails (e.g., malformed output, schema violations).
        resp = await self.call_llm(
            filled,
            system_prompt,
            stage="write_section_with_subsections",
            verbose=False,
        )
        data = extract_json_from_text(resp)

        try:
            obj = ensure_json_dict(data)
            ensure_keys(obj, ["section_content"])
            content = obj.get("section_content", "")
            if isinstance(content, str) and content.strip():
                return content
            raise ValueError("LLM returned empty or invalid section_content field")
        except Exception as e:
            raise ValueError(
                f"Unable to parse LLM returned section content: {e!s}. Report generation failed."
            )

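The comment about LaTeX braces in _write_section_with_subsections hints at why plain str.format cannot be used on these prompt templates: any literal {a} or {\rho} in the template is treated as a replacement field. _safe_format itself is presumably defined elsewhere in this file and is not shown here; the sketch below only illustrates the failure mode and one placeholder-only substitution that sidesteps it (the template text is invented):

import re

template = r"Explain {topic}; keep math such as $\frac{a}{b}$ or {\rho} verbatim."

try:
    template.format(topic="diffusion")
except KeyError as exc:
    print("str.format treats the literal brace group as a field:", exc)  # KeyError: 'a'


def safe_format(tmpl: str, **kwargs: str) -> str:
    # Substitute only {name} groups whose name was actually supplied.
    return re.sub(r"\{(\w+)\}", lambda m: kwargs.get(m.group(1), m.group(0)), tmpl)


print(safe_format(template, topic="diffusion"))
# Explain diffusion; keep math such as $\frac{a}{b}$ or {\rho} verbatim.
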
    def _notify_progress(
        self, callback: Callable[[dict[str, Any]], None] | None, status: str, **payload: Any
    ) -> None:
        if not callback:
            return
        event = {"status": status}
        event.update({k: v for k, v in payload.items() if v is not None})
        try:
            callback(event)
        except Exception:
            pass


__all__ = ["ReportingAgent"]