realtimex_deeptutor-0.5.0.post1-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
src/agents/research/utils/citation_manager.py
@@ -0,0 +1,799 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CitationManager - Citation management system
Responsible for extracting citation information from tool calls and managing citation JSON files
"""

import asyncio
from datetime import datetime
import json
from pathlib import Path
import sys
from typing import Any

project_root = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(project_root))


class CitationManager:
    """Citation manager with global ID management"""

    def __init__(self, research_id: str, cache_dir: Path | None = None):
        """
        Initialize citation manager

        Args:
            research_id: Research task ID
            cache_dir: Cache directory path, if None uses default path
        """
        self.research_id = research_id
        if cache_dir is None:
            cache_dir = Path("./cache") / research_id
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.citations_file = self.cache_dir / "citations.json"
        self._citations: dict[str, dict[str, Any]] = {}

        # Global citation ID counters
        self._plan_counter = 0  # For PLAN-XX format (planning stage)
        self._block_counters: dict[str, int] = {}  # For CIT-X-XX format (research stage)

        # Reference number mapping (citation_id -> ref_number for in-text citations)
        self._ref_number_map: dict[str, int] = {}

        # Lock for thread-safe operations in parallel mode
        self._lock = asyncio.Lock()

        self._load_citations()

    def generate_plan_citation_id(self) -> str:
        """
        Generate a new citation ID for planning stage (PLAN-XX format)

        Returns:
            Citation ID in PLAN-XX format
        """
        self._plan_counter += 1
        return f"PLAN-{self._plan_counter:02d}"

    def generate_research_citation_id(self, block_id: str) -> str:
        """
        Generate a new citation ID for research stage (CIT-X-XX format)

        Args:
            block_id: Block ID (e.g., "block_3")

        Returns:
            Citation ID in CIT-X-XX format
        """
        # Extract block number from block_id
        block_num = 0
        try:
            if block_id and "_" in block_id:
                block_num = int(block_id.split("_")[1])
        except (ValueError, IndexError):
            block_num = 0

        # Increment counter for this block
        block_key = str(block_num)
        if block_key not in self._block_counters:
            self._block_counters[block_key] = 0
        self._block_counters[block_key] += 1

        return f"CIT-{block_num}-{self._block_counters[block_key]:02d}"

    def get_next_citation_id(self, stage: str = "research", block_id: str = "") -> str:
        """
        Get the next available citation ID

        Args:
            stage: "planning" or "research"
            block_id: Block ID (required for research stage)

        Returns:
            Next available citation ID
        """
        if stage == "planning":
            return self.generate_plan_citation_id()
        return self.generate_research_citation_id(block_id)

    def citation_exists(self, citation_id: str) -> bool:
        """
        Check if a citation ID already exists

        Args:
            citation_id: Citation ID to check

        Returns:
            True if citation exists, False otherwise
        """
        return citation_id in self._citations

    def _load_citations(self):
        """Load citation information from JSON file and restore counters"""
        if self.citations_file.exists():
            try:
                with open(self.citations_file, encoding="utf-8") as f:
                    data = json.load(f)
                self._citations = data.get("citations", {})

                # Try to restore counters from saved state first
                counters = data.get("counters", {})
                if counters:
                    self._plan_counter = counters.get("plan_counter", 0)
                    self._block_counters = counters.get("block_counters", {})
                else:
                    # Fallback: restore counters from existing citations
                    self._restore_counters_from_citations()
            except Exception as e:
                print(f"⚠️ Failed to load citation file: {e}")
                self._citations = {}
        else:
            self._citations = {}

    def _restore_counters_from_citations(self):
        """Restore citation counters from existing citations to avoid ID conflicts"""
        for citation_id in self._citations.keys():
            if citation_id.startswith("PLAN-"):
                try:
                    num = int(citation_id.replace("PLAN-", ""))
                    self._plan_counter = max(self._plan_counter, num)
                except ValueError:
                    pass
            elif citation_id.startswith("CIT-"):
                try:
                    parts = citation_id.replace("CIT-", "").split("-")
                    if len(parts) == 2:
                        block_num = parts[0]
                        seq_num = int(parts[1])
                        if block_num not in self._block_counters:
                            self._block_counters[block_num] = 0
                        self._block_counters[block_num] = max(
                            self._block_counters[block_num], seq_num
                        )
                except (ValueError, IndexError):
                    pass

    def _save_citations(self):
        """Save citation information to JSON file"""
        try:
            data = {
                "research_id": self.research_id,
                "updated_at": datetime.now().isoformat(),
                "citations": self._citations,
                "counters": {
                    "plan_counter": self._plan_counter,
                    "block_counters": self._block_counters,
                },
            }
            with open(self.citations_file, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"⚠️ Failed to save citation file: {e}")

    def validate_citation_references(self, text: str) -> dict[str, Any]:
        """
        Validate citation references in text and identify invalid ones

        Args:
            text: Text containing citation references like [[CIT-X-XX]]

        Returns:
            Dictionary with validation results:
            {
                "valid_citations": [...],
                "invalid_citations": [...],
                "is_valid": bool
            }
        """
        import re

        # Find all citation references in the text
        pattern = r"\[\[([A-Z]+-\d+-?\d*)\]\]"
        found_refs = re.findall(pattern, text)

        valid = []
        invalid = []

        for ref in found_refs:
            if self.citation_exists(ref):
                valid.append(ref)
            else:
                invalid.append(ref)

        return {
            "valid_citations": valid,
            "invalid_citations": invalid,
            "is_valid": len(invalid) == 0,
            "total_found": len(found_refs),
        }

    def fix_invalid_citations(self, text: str) -> str:
        """
        Remove or mark invalid citation references in text

        Args:
            text: Text containing citation references

        Returns:
            Text with invalid citations removed or marked
        """
        import re

        pattern = r"\[\[([A-Z]+-\d+-?\d*)\]\]\(#ref-[a-z]+-\d+-?\d*\)"

        def replace_invalid(match):
            citation_id = match.group(1)
            if self.citation_exists(citation_id):
                return match.group(0)  # Keep valid citations
            return ""  # Remove invalid citations

        return re.sub(pattern, replace_invalid, text)

    def add_citation(
        self,
        citation_id: str,
        tool_type: str,
        tool_trace: Any,
        raw_answer: str,  # Raw answer JSON string
    ) -> bool:
        """
        Add citation information

        Args:
            citation_id: Citation ID
            tool_type: Tool type
            tool_trace: ToolTrace object
            raw_answer: Raw answer (JSON string)

        Returns:
            Whether addition was successful
        """
        try:
            tool_type_lower = tool_type.lower()

            if tool_type_lower in ("rag_naive", "rag_hybrid", "query_item"):
                citation_info = self._extract_rag_citation(
                    citation_id, tool_type, raw_answer, tool_trace
                )
            elif tool_type_lower == "web_search":
                citation_info = self._extract_web_citation(
                    citation_id, tool_type, raw_answer, tool_trace
                )
            elif tool_type_lower == "paper_search":
                citation_info = self._extract_paper_citation(
                    citation_id, tool_type, raw_answer, tool_trace
                )
            elif tool_type_lower == "run_code":
                citation_info = self._extract_code_citation(citation_id, tool_type, tool_trace)
            else:
                # Unknown tool type, use generic format
                citation_info = self._extract_generic_citation(citation_id, tool_type, tool_trace)

            if citation_info:
                self._citations[citation_id] = citation_info
                self._save_citations()
                return True
            return False
        except Exception as e:
            print(f"⚠️ Failed to add citation (citation_id={citation_id}): {e}")
            return False

    def _extract_rag_citation(
        self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
    ) -> dict[str, Any]:
        """Extract citation information for RAG retrieval with source documents"""
        citation_info = {
            "citation_id": citation_id,
            "tool_type": tool_type,
            "query": tool_trace.query,
            "summary": tool_trace.summary,
            "timestamp": tool_trace.timestamp,
            "sources": [],  # List of source documents
        }

        try:
            # Parse raw_answer to extract source information
            answer_data = json.loads(raw_answer)

            # Extract source documents if available
            # Common fields in RAG responses: chunks, documents, sources, context
            sources = []

            # Try different field names for source documents
            for field_name in ["chunks", "documents", "sources", "context", "retrieved_docs"]:
                if field_name in answer_data:
                    source_list = answer_data[field_name]
                    if isinstance(source_list, list):
                        for i, doc in enumerate(source_list[:5]):  # Limit to 5 sources
                            source_info = {}
                            if isinstance(doc, dict):
                                source_info["title"] = doc.get("title", doc.get("doc_title", ""))
                                source_info["content_preview"] = doc.get(
                                    "content", doc.get("text", "")
                                )[:200]
                                source_info["source_file"] = doc.get(
                                    "source", doc.get("file_path", doc.get("filename", ""))
                                )
                                source_info["page"] = doc.get("page", doc.get("page_number", ""))
                                source_info["chunk_id"] = doc.get("chunk_id", doc.get("id", i))
                                source_info["score"] = doc.get("score", doc.get("similarity", ""))
                            elif isinstance(doc, str):
                                source_info["content_preview"] = doc[:200]
                            if source_info:
                                sources.append(source_info)
                    break

            # Also extract kb_name if available
            citation_info["kb_name"] = answer_data.get("kb_name", "")
            citation_info["sources"] = sources
            citation_info["total_sources"] = len(sources)

        except (json.JSONDecodeError, Exception) as e:
            # If parsing fails, still return basic citation info
            print(f"⚠️ Failed to parse RAG source info: {e}")

        return citation_info

    def _extract_web_citation(
        self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
    ) -> dict[str, Any]:
        """Extract citation information for web search with URLs"""
        citation_info = {
            "citation_id": citation_id,
            "tool_type": tool_type,
            "query": tool_trace.query,
            "summary": tool_trace.summary,
            "timestamp": tool_trace.timestamp,
            "web_sources": [],  # List of web sources with URLs
        }

        try:
            # Parse raw_answer to extract web source information
            answer_data = json.loads(raw_answer)

            web_sources = []

            # Try different field names for web results
            for field_name in ["results", "web_results", "search_results", "urls"]:
                if field_name in answer_data:
                    result_list = answer_data[field_name]
                    if isinstance(result_list, list):
                        for result in result_list[:5]:  # Limit to 5 sources
                            if isinstance(result, dict):
                                web_source = {
                                    "title": result.get("title", ""),
                                    "url": result.get("url", result.get("link", "")),
                                    "snippet": result.get("snippet", result.get("description", ""))[
                                        :200
                                    ],
                                    "domain": result.get("domain", ""),
                                }
                                if web_source["url"]:  # Only add if URL exists
                                    web_sources.append(web_source)
                    break

            citation_info["web_sources"] = web_sources
            citation_info["total_sources"] = len(web_sources)

        except (json.JSONDecodeError, Exception) as e:
            # If parsing fails, still return basic citation info
            print(f"⚠️ Failed to parse web source info: {e}")

        return citation_info

    def _extract_paper_citation(
        self, citation_id: str, tool_type: str, raw_answer: str, tool_trace: Any
    ) -> dict[str, Any]:
        """Extract citation information for paper search - supports multiple papers"""
        citation_info = {
            "citation_id": citation_id,
            "tool_type": tool_type,
            "query": tool_trace.query,
            "summary": tool_trace.summary,
            "timestamp": tool_trace.timestamp,
            "papers": [],  # Store all papers, not just the first one
        }

        try:
            # Parse raw_answer JSON
            answer_data = json.loads(raw_answer)
            papers = answer_data.get("papers", [])

            if not papers:
                # If no papers, return basic info
                return citation_info

            # Process ALL papers (up to 5 for practicality)
            processed_papers = []
            for paper in papers[:5]:
                # Format authors
                authors = paper.get("authors", [])
                author_str = ", ".join(authors[:3])  # Display at most 3 authors
                if len(authors) > 3:
                    author_str += " et al."

                paper_info = {
                    "title": paper.get("title", ""),
                    "authors": author_str,
                    "authors_list": authors,
                    "year": paper.get("year", ""),
                    "url": paper.get("url", ""),
                    "arxiv_id": paper.get("arxiv_id", ""),
                    "abstract": paper.get("abstract", "")[:300],  # Truncate abstract
                    "doi": paper.get("doi", ""),
                    "venue": paper.get("venue", paper.get("journal", "")),
                }
                processed_papers.append(paper_info)

            citation_info["papers"] = processed_papers
            citation_info["total_papers"] = len(processed_papers)

            # Keep primary paper info at top level for backward compatibility
            if processed_papers:
                primary = processed_papers[0]
                citation_info["title"] = primary["title"]
                citation_info["authors"] = primary["authors"]
                citation_info["authors_list"] = primary["authors_list"]
                citation_info["year"] = primary["year"]
                citation_info["url"] = primary["url"]
                citation_info["arxiv_id"] = primary["arxiv_id"]

            return citation_info
        except Exception as e:
            print(f"⚠️ Failed to parse paper citation: {e}")
            # Still return the basic citation info
            return citation_info

    def _extract_code_citation(
        self, citation_id: str, tool_type: str, tool_trace: Any
    ) -> dict[str, Any]:
        """Extract citation information for code execution"""
        return {
            "citation_id": citation_id,
            "tool_type": tool_type,
            "query": tool_trace.query,  # Code content
            "summary": tool_trace.summary,
            "timestamp": tool_trace.timestamp,
        }

    def _extract_generic_citation(
        self, citation_id: str, tool_type: str, tool_trace: Any
    ) -> dict[str, Any]:
        """Extract generic citation information (unknown tool type)"""
        return {
            "citation_id": citation_id,
            "tool_type": tool_type,
            "query": tool_trace.query,
            "summary": tool_trace.summary,
            "timestamp": tool_trace.timestamp,
        }

    def get_citation(self, citation_id: str) -> dict[str, Any] | None:
        """Get citation information for specified citation ID"""
        return self._citations.get(citation_id)

    def get_all_citations(self) -> dict[str, dict[str, Any]]:
        """Get all citation information"""
        return self._citations.copy()

    def get_citations_file_path(self) -> Path:
        """Get citation JSON file path"""
        return self.citations_file

    def format_citation_for_report(self, citation_id: str) -> str | None:
        """
        Format citation information for report display

        Args:
            citation_id: Citation ID

        Returns:
            Formatted citation string, or None if not found
        """
        citation = self.get_citation(citation_id)
        if not citation:
            return None

        tool_type = citation.get("tool_type", "").lower()

        if tool_type == "paper_search":
            # Standard academic citation format
            title = citation.get("title", "")
            authors = citation.get("authors", "")
            year = citation.get("year", "")
            url = citation.get("url", "")
            arxiv_id = citation.get("arxiv_id", "")

            # Build citation string
            parts = []
            if authors:
                parts.append(authors)
            if year:
                parts.append(f"({year})")
            if title:
                parts.append(f'"{title}"')
            if arxiv_id:
                parts.append(f"arXiv:{arxiv_id}")
            if url:
                parts.append(f"<{url}>")

            # Add note about additional papers if available
            total_papers = citation.get("total_papers", 1)
            if total_papers > 1:
                parts.append(f"[+{total_papers - 1} more papers]")

            return " ".join(parts) if parts else None

        if tool_type in ("rag_naive", "rag_hybrid", "query_item"):
            # RAG citation with source info
            query = citation.get("query", "")
            kb_name = citation.get("kb_name", "")
            sources = citation.get("sources", [])

            tool_type_display = {
                "rag_naive": "RAG Retrieval",
                "rag_hybrid": "Hybrid RAG Retrieval",
                "query_item": "Knowledge Base Query",
            }.get(tool_type, tool_type)

            parts = [f"{tool_type_display}: {query}"]
            if kb_name:
                parts.append(f"[KB: {kb_name}]")
            if sources:
                source_titles = [s.get("title", s.get("source_file", "")) for s in sources[:3] if s]
                source_titles = [t for t in source_titles if t]
                if source_titles:
                    parts.append(f"[Sources: {', '.join(source_titles)}]")

            return " ".join(parts)

        if tool_type == "web_search":
            # Web search with URLs
            query = citation.get("query", "")
            web_sources = citation.get("web_sources", [])

            parts = [f"Web Search: {query}"]
            if web_sources:
                urls = [s.get("url", "") for s in web_sources[:3] if s.get("url")]
                if urls:
                    parts.append(f"[URLs: {', '.join(urls)}]")

            return " ".join(parts)

        # Other types of citation formats
        tool_type_display = {
            "run_code": "Code Execution",
        }.get(tool_type, tool_type)

        query = citation.get("query", "")
        return f"{tool_type_display}: {query}"

    # ========== Reference Number Mapping Methods ==========

    def _get_citation_dedup_key(self, citation: dict, paper: dict = None) -> str:
        """
        Generate unique key for citation deduplication

        Deduplication is ONLY applied to paper_search citations where the same paper
        (title + first author) is cited multiple times. All other citation types
        get unique ref_numbers based on their citation_id.

        Args:
            citation: The citation dict
            paper: Optional paper dict for paper_search citations

        Returns:
            Unique string key for deduplication
        """
        tool_type = citation.get("tool_type", "").lower()
        citation_id = citation.get("citation_id", "")

        if tool_type == "paper_search" and paper:
            # For papers: use title + first author (normalized) - allow dedup for same paper
            title = paper.get("title", "").lower().strip()
            authors = paper.get("authors", "").lower().strip()
            # Extract first author if multiple
            first_author = authors.split(",")[0].strip() if authors else ""
            if title:  # Only dedup if we have a title
                return f"paper:{title}|{first_author}"
            # No title? Use citation_id to ensure unique
            return f"unique:{citation_id}"
        elif tool_type == "paper_search":
            # Fallback for paper_search without paper dict
            title = citation.get("title", "").lower().strip()
            authors = citation.get("authors", "").lower().strip()
            first_author = authors.split(",")[0].strip() if authors else ""
            if title:  # Only dedup if we have a title
                return f"paper:{title}|{first_author}"
            return f"unique:{citation_id}"
        else:
            # For RAG/web_search/etc: each citation gets unique ref_number
            # Use citation_id to ensure each citation is unique
            return f"unique:{citation_id}"

    def _extract_citation_sort_key(self, citation_id: str) -> tuple:
        """
        Extract numeric sort key from citation ID for ordering

        Args:
            citation_id: Citation ID (e.g., "PLAN-01", "CIT-1-02")

        Returns:
            Tuple for sorting (stage, block_num, seq_num)
        """
        try:
            if citation_id.startswith("PLAN-"):
                # PLAN-XX format: put at the beginning
                num = int(citation_id.replace("PLAN-", ""))
                return (0, 0, num)
            # CIT-X-XX format
            parts = citation_id.replace("CIT-", "").split("-")
            if len(parts) == 2:
                return (1, int(parts[0]), int(parts[1]))
        except (ValueError, IndexError):
            pass
        return (999, 999, 999)

    def build_ref_number_map(self) -> dict[str, int]:
        """
        Build citation_id to reference number mapping with deduplication.
        This is the single source of truth for ref_number assignment.

        Returns:
            Dictionary mapping citation_id to reference number (1-based)
        """
        if not self._citations:
            self._ref_number_map = {}
            return self._ref_number_map

        # Sort all citation IDs by their numeric parts
        sorted_citation_ids = sorted(self._citations.keys(), key=self._extract_citation_sort_key)

        # Track seen dedup keys and their assigned ref_numbers
        seen_keys: dict[str, int] = {}
        ref_idx = 0
        ref_map: dict[str, int] = {}

        for citation_id in sorted_citation_ids:
            citation = self._citations.get(citation_id)
            if not citation:
                continue

            tool_type = citation.get("tool_type", "").lower()

            if tool_type == "paper_search":
                # paper_search may have multiple papers - each paper gets a separate ref_number
                papers = citation.get("papers", [])
                if papers:
                    for paper_idx, paper in enumerate(papers):
                        # Check for duplicate using dedup key
                        dedup_key = self._get_citation_dedup_key(citation, paper)

                        if dedup_key in seen_keys:
                            # Map to existing ref_number
                            existing_ref = seen_keys[dedup_key]
                            if paper_idx == 0:
                                ref_map[citation_id] = existing_ref
                            ref_map[f"{citation_id}-{paper_idx + 1}"] = existing_ref
                        else:
                            # New unique citation
                            ref_idx += 1
                            seen_keys[dedup_key] = ref_idx
                            if paper_idx == 0:
                                ref_map[citation_id] = ref_idx
                            ref_map[f"{citation_id}-{paper_idx + 1}"] = ref_idx
                else:
                    # Paper search without papers array
                    dedup_key = self._get_citation_dedup_key(citation)
                    if dedup_key in seen_keys:
                        ref_map[citation_id] = seen_keys[dedup_key]
                    else:
                        ref_idx += 1
                        seen_keys[dedup_key] = ref_idx
                        ref_map[citation_id] = ref_idx
            else:
                # Non-paper citations
                dedup_key = self._get_citation_dedup_key(citation)
                if dedup_key in seen_keys:
                    ref_map[citation_id] = seen_keys[dedup_key]
                else:
                    ref_idx += 1
                    seen_keys[dedup_key] = ref_idx
                    ref_map[citation_id] = ref_idx

        self._ref_number_map = ref_map
        return ref_map

    def get_ref_number(self, citation_id: str) -> int:
        """
        Get the reference number for a citation ID.
        If the map hasn't been built yet, build it first.

        Args:
            citation_id: Citation ID

        Returns:
            Reference number (1-based), or 0 if not found
        """
        if not self._ref_number_map:
            self.build_ref_number_map()
        return self._ref_number_map.get(citation_id, 0)

    def get_ref_number_map(self) -> dict[str, int]:
        """
        Get the full reference number map.
        If the map hasn't been built yet, build it first.

        Returns:
            Dictionary mapping citation_id to reference number
        """
        if not self._ref_number_map:
            self.build_ref_number_map()
        return self._ref_number_map.copy()

    # ========== Async thread-safe methods for parallel mode ==========

    async def generate_plan_citation_id_async(self) -> str:
        """
        Thread-safe async version of generate_plan_citation_id for parallel mode

        Returns:
            Citation ID in PLAN-XX format
        """
        async with self._lock:
            return self.generate_plan_citation_id()

    async def generate_research_citation_id_async(self, block_id: str) -> str:
        """
        Thread-safe async version of generate_research_citation_id for parallel mode

        Args:
            block_id: Block ID (e.g., "block_3")

        Returns:
            Citation ID in CIT-X-XX format
        """
        async with self._lock:
            return self.generate_research_citation_id(block_id)

    async def get_next_citation_id_async(self, stage: str = "research", block_id: str = "") -> str:
        """
        Thread-safe async version of get_next_citation_id for parallel mode

        Args:
            stage: "planning" or "research"
            block_id: Block ID (required for research stage)

        Returns:
            Next available citation ID
        """
        async with self._lock:
            return self.get_next_citation_id(stage, block_id)

    async def add_citation_async(
        self,
        citation_id: str,
        tool_type: str,
        tool_trace: Any,
        raw_answer: str,
    ) -> bool:
        """
        Thread-safe async version of add_citation for parallel mode

        Args:
            citation_id: Citation ID
            tool_type: Tool type
            tool_trace: ToolTrace object
            raw_answer: Raw answer (JSON string)

        Returns:
            Whether addition was successful
        """
        async with self._lock:
            return self.add_citation(citation_id, tool_type, tool_trace, raw_answer)


__all__ = ["CitationManager"]
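
For orientation, here is a minimal usage sketch of the CitationManager shown above. It is not part of the package: the FakeTrace dataclass is a stand-in for the ToolTrace object that add_citation() expects (only its .query, .summary, and .timestamp attributes are read), the raw_answer payload is invented, and the import path simply mirrors the src/ layout in the file listing.

# Minimal usage sketch (not from the package); assumes the wheel's src layout is importable.
import json
from dataclasses import dataclass
from datetime import datetime

from src.agents.research.utils.citation_manager import CitationManager


@dataclass
class FakeTrace:
    # Hypothetical stand-in for the ToolTrace object used by the research pipeline.
    query: str
    summary: str
    timestamp: str


manager = CitationManager(research_id="demo-research")  # persists to ./cache/demo-research/citations.json

# 1. IDs are stage-specific: PLAN-XX during planning, CIT-<block>-XX during research.
plan_id = manager.get_next_citation_id(stage="planning")                      # "PLAN-01"
cit_id = manager.get_next_citation_id(stage="research", block_id="block_3")   # "CIT-3-01"

# 2. Register a web_search result. raw_answer is the tool's JSON output; the field
#    names ("results", "title", "url", ...) are ones the extractor looks for, the
#    values here are made up.
raw_answer = json.dumps({
    "results": [
        {"title": "Example page", "url": "https://example.com",
         "snippet": "A short snippet", "domain": "example.com"}
    ]
})
trace = FakeTrace(query="what is X", summary="found one relevant page",
                  timestamp=datetime.now().isoformat())
manager.add_citation(cit_id, "web_search", trace, raw_answer)

# 3. Validate the [[...]] references emitted by the report writer.
report = f"X is defined as ... [[{cit_id}]] and also [[CIT-9-99]]."
print(manager.validate_citation_references(report))
# {'valid_citations': ['CIT-3-01'], 'invalid_citations': ['CIT-9-99'], 'is_valid': False, 'total_found': 2}

# 4. Build the bibliography numbering and a display string for the report.
print(manager.build_ref_number_map())                 # {'CIT-3-01': 1}
print(manager.format_citation_for_report(cit_id))     # "Web Search: what is X [URLs: https://example.com]"

The sketch also illustrates the deduplication policy documented in _get_citation_dedup_key(): only paper_search citations pointing at the same title and first author share a reference number; every RAG, web-search, or code-execution citation keeps its own.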