realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0

src/services/rag/components/parsers/text.py
@@ -0,0 +1,86 @@
+"""
+Text Parser
+===========
+
+Parser for plain text documents (.txt files).
+"""
+
+from pathlib import Path
+from typing import Union
+
+from ...types import Document
+from ..base import BaseComponent
+
+
+class TextParser(BaseComponent):
+    """
+    Plain text parser.
+
+    Parses text files (.txt) into Document objects.
+    Also handles common text-based formats.
+    """
+
+    name = "text_parser"
+
+    # Supported extensions
+    SUPPORTED_EXTENSIONS = {".txt", ".text", ".log", ".csv", ".tsv"}
+
+    async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
+        """
+        Parse a text file into a Document.
+
+        Args:
+            file_path: Path to the text file
+            **kwargs: Additional arguments
+
+        Returns:
+            Parsed Document
+        """
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"Text file not found: {file_path}")
+
+        self.logger.info(f"Parsing text file: {file_path.name}")
+
+        # Try different encodings
+        content = None
+        encodings = ["utf-8", "utf-8-sig", "gbk", "gb2312", "latin-1"]
+
+        for encoding in encodings:
+            try:
+                with open(file_path, "r", encoding=encoding) as f:
+                    content = f.read()
+                break
+            except UnicodeDecodeError:
+                continue
+
+        if content is None:
+            # Last resort: read as binary and decode with error handling
+            with open(file_path, "rb") as f:
+                content = f.read().decode("utf-8", errors="replace")
+
+        return Document(
+            content=content,
+            file_path=str(file_path),
+            metadata={
+                "filename": file_path.name,
+                "parser": self.name,
+                "extension": file_path.suffix.lower(),
+                "size_bytes": file_path.stat().st_size,
+            },
+        )
+
+    @classmethod
+    def can_parse(cls, file_path: Union[str, Path]) -> bool:
+        """
+        Check if this parser can handle the given file.
+
+        Args:
+            file_path: Path to check
+
+        Returns:
+            True if file can be parsed
+        """
+        suffix = Path(file_path).suffix.lower()
+        return suffix in cls.SUPPORTED_EXTENSIONS

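For orientation, a minimal driver sketch for TextParser (the process()/can_parse() behavior and the Document metadata fields come from the hunk above; the import path and asyncio wiring are assumptions):

    # Hypothetical driver -- a sketch, not code shipped in this package.
    import asyncio

    from src.services.rag.components.parsers.text import TextParser

    async def main() -> None:
        path = "notes.txt"
        if TextParser.can_parse(path):  # extension gate: .txt, .text, .log, .csv, .tsv
            # Tries utf-8/gbk/... in turn, then falls back to errors="replace"
            doc = await TextParser().process(path)
            print(doc.metadata["filename"], doc.metadata["size_bytes"])

    asyncio.run(main())
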
src/services/rag/components/retrievers/__init__.py
@@ -0,0 +1,18 @@
+"""
+Document Retrievers
+===================
+
+Retrievers for searching indexed documents.
+"""
+
+from .base import BaseRetriever
+from .dense import DenseRetriever
+from .hybrid import HybridRetriever
+from .lightrag import LightRAGRetriever
+
+__all__ = [
+    "BaseRetriever",
+    "DenseRetriever",
+    "HybridRetriever",
+    "LightRAGRetriever",
+]

src/services/rag/components/retrievers/base.py
@@ -0,0 +1,34 @@
+"""
+Base Retriever
+==============
+
+Base class for document retrievers.
+"""
+
+from typing import Any, Dict
+
+from ..base import BaseComponent
+
+
+class BaseRetriever(BaseComponent):
+    """
+    Base class for document retrievers.
+
+    Retrievers search indexed documents and return relevant results.
+    """
+
+    name = "base_retriever"
+
+    async def process(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
+        """
+        Search for documents matching a query.
+
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            **kwargs: Additional arguments
+
+        Returns:
+            Search results dictionary
+        """
+        raise NotImplementedError("Subclasses must implement process()")

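Since process() here only raises NotImplementedError, concrete retrievers subclass BaseRetriever and override it while keeping the result shape used throughout this diff. A minimal sketch (the class and its echo behavior are invented for illustration):

    # Hypothetical subclass -- illustrates the contract only, not package code.
    from typing import Any, Dict

    from src.services.rag.components.retrievers.base import BaseRetriever

    class EchoRetriever(BaseRetriever):
        name = "echo_retriever"

        async def process(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
            # A real retriever would search the knowledge base `kb_name`;
            # this one just echoes the query in the standard result shape.
            return {
                "query": query,
                "answer": f"(no search performed for: {query})",
                "content": "",
                "mode": "echo",
                "provider": "example",
                "results": [],
            }
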
src/services/rag/components/retrievers/dense.py
@@ -0,0 +1,200 @@
+"""
+Dense Retriever
+===============
+
+Dense vector-based retriever using FAISS or cosine similarity.
+"""
+
+import json
+from pathlib import Path
+import pickle
+from typing import Any, Dict, Optional
+
+import numpy as np
+
+from ..base import BaseComponent
+
+
+class DenseRetriever(BaseComponent):
+    """
+    Dense vector retriever.
+
+    Uses FAISS for fast similarity search or falls back to
+    cosine similarity if FAISS is unavailable.
+    """
+
+    name = "dense_retriever"
+
+    def __init__(self, kb_base_dir: Optional[str] = None, top_k: int = 5):
+        """
+        Initialize dense retriever.
+
+        Args:
+            kb_base_dir: Base directory for knowledge bases
+            top_k: Number of results to return
+        """
+        super().__init__()
+        self.kb_base_dir = kb_base_dir or str(
+            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+            / "data"
+            / "knowledge_bases"
+        )
+        self.top_k = top_k
+
+        # Try to import FAISS
+        self.use_faiss = False
+        try:
+            import faiss
+
+            self.faiss = faiss
+            self.use_faiss = True
+        except ImportError:
+            self.logger.warning("FAISS not available, using simple cosine similarity")
+
+    async def process(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
+        """
+        Search using dense embeddings with FAISS or cosine similarity.
+
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            **kwargs: Additional arguments (mode, top_k, etc.)
+
+        Returns:
+            Search results dictionary with answer and sources
+        """
+        top_k = kwargs.get("top_k", self.top_k)
+        self.logger.info(f"Dense search in {kb_name}: {query[:50]}... (top_k={top_k})")
+
+        from src.services.embedding import get_embedding_client
+
+        # Get query embedding
+        client = get_embedding_client()
+        query_embedding = np.array((await client.embed([query]))[0], dtype=np.float32)
+
+        # Load index
+        kb_dir = Path(self.kb_base_dir) / kb_name / "vector_store"
+        metadata_file = kb_dir / "metadata.json"
+        info_file = kb_dir / "info.json"
+
+        if not metadata_file.exists():
+            self.logger.warning(f"No vector index found at {kb_dir}")
+            return {
+                "query": query,
+                "answer": "No documents indexed. Please upload documents first.",
+                "content": "",
+                "mode": "dense",
+                "provider": "llamaindex",
+                "results": [],
+            }
+
+        # Load metadata and info (info.json is optional)
+        with open(metadata_file, "r", encoding="utf-8") as f:
+            metadata = json.load(f)
+
+        if info_file.exists():
+            with open(info_file, "r", encoding="utf-8") as f:
+                info = json.load(f)
+        else:
+            info = {"use_faiss": False}
+
+        use_faiss = info.get("use_faiss", False)
+
+        if use_faiss and self.use_faiss:
+            # Use FAISS for fast search
+            index_file = kb_dir / "index.faiss"
+            if not index_file.exists():
+                self.logger.error(f"FAISS index file not found: {index_file}")
+                return self._empty_response(query)
+
+            # Load FAISS index
+            index = self.faiss.read_index(str(index_file))
+
+            # Normalize query vector for cosine similarity without modifying original
+            norm = np.linalg.norm(query_embedding)
+            if norm > 0:
+                query_vec = (query_embedding / norm).reshape(1, -1)
+            else:
+                query_vec = query_embedding.reshape(1, -1)
+
+            # Search
+            distances, indices = index.search(query_vec, min(top_k, len(metadata)))
+
+            # Build results
+            results = []
+            for dist, idx in zip(distances[0], indices[0]):
+                if idx < len(metadata):  # Valid index
+                    score = 1.0 / (1.0 + dist)  # Convert distance to similarity score
+                    results.append((score, metadata[idx]))
+        else:
+            # Fallback: Load embeddings and use cosine similarity
+            embeddings_file = kb_dir / "embeddings.pkl"
+            if not embeddings_file.exists():
+                self.logger.error(f"Embeddings file not found: {embeddings_file}")
+                return self._empty_response(query)
+
+            with open(embeddings_file, "rb") as f:
+                embeddings = pickle.load(f)
+
+            # Normalize for cosine similarity (avoid division by zero)
+            query_norm = np.linalg.norm(query_embedding)
+            if query_norm > 0:
+                query_vec = query_embedding / query_norm
+            else:
+                query_vec = query_embedding  # Keep as is if zero norm
+
+            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+            # Replace zero norms with 1 to avoid division by zero
+            norms = np.where(norms == 0, 1, norms)
+            doc_vecs = embeddings / norms
+
+            # Compute similarities
+            similarities = np.dot(doc_vecs, query_vec)
+
+            # Get top-k results
+            top_indices = np.argsort(similarities)[::-1][:top_k]
+
+            results = []
+            for idx in top_indices:
+                score = float(similarities[idx])
+                results.append((score, metadata[idx]))
+
+        # Build response content
+        # Format chunks cleanly for LLM context (without score annotations)
+        content_parts = []
+        sources = []
+        for score, item in results:
+            content = item.get("content", "").strip()
+            if content:  # Only include non-empty chunks
+                # Add chunk without score prefix for clean LLM input
+                content_parts.append(content)
+                sources.append(
+                    {
+                        "content": content,
+                        "score": score,
+                        "metadata": item.get("metadata", {}),
+                    }
+                )
+
+        # Join chunks with clear separation
+        content = "\n\n".join(content_parts)
+
+        return {
+            "query": query,
+            "answer": content,  # Return clean context for LLM to use
+            "content": content,
+            "mode": "dense",
+            "provider": "llamaindex",
+            "results": sources,
+        }
+
+    def _empty_response(self, query: str) -> Dict[str, Any]:
+        """Return empty response when no results found."""
+        return {
+            "query": query,
+            "answer": "No relevant documents found.",
+            "content": "",
+            "mode": "dense",
+            "provider": "llamaindex",
+            "results": [],
+        }

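Note the two score conventions above: the FAISS branch maps an L2 distance d to 1 / (1 + d), while the fallback branch returns raw cosine similarity, so scores from the two paths are not on the same scale. A self-contained toy reproduction of the fallback math (vectors invented for illustration):

    # Toy reproduction of the cosine-similarity fallback -- illustration only.
    import numpy as np

    query_embedding = np.array([1.0, 0.0], dtype=np.float32)
    embeddings = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=np.float32)

    query_vec = query_embedding / np.linalg.norm(query_embedding)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1, norms)        # zero-norm guard, as in the diff
    doc_vecs = embeddings / norms

    similarities = np.dot(doc_vecs, query_vec)    # [1.0, 0.0, ~0.707]
    top_indices = np.argsort(similarities)[::-1][:2]
    print(top_indices, similarities[top_indices])  # rows 0 and 2 rank highest
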
src/services/rag/components/retrievers/hybrid.py
@@ -0,0 +1,164 @@
+"""
+Hybrid Retriever
+================
+
+Hybrid retriever combining multiple retrieval strategies.
+"""
+
+from pathlib import Path
+import sys
+from typing import Any, Dict, Optional
+
+from ..base import BaseComponent
+
+
+class HybridRetriever(BaseComponent):
+    """
+    Hybrid retriever combining graph and vector retrieval.
+
+    Uses LightRAG's hybrid mode for retrieval.
+    """
+
+    name = "hybrid_retriever"
+    _instances: Dict[str, Any] = {}
+
+    def __init__(self, kb_base_dir: Optional[str] = None):
+        """
+        Initialize hybrid retriever.
+
+        Args:
+            kb_base_dir: Base directory for knowledge bases
+        """
+        super().__init__()
+        self.kb_base_dir = kb_base_dir or str(
+            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+            / "data"
+            / "knowledge_bases"
+        )
+
+    def _get_rag_instance(self, kb_name: str):
+        """Get or create a RAGAnything instance."""
+        working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")
+
+        if working_dir in self._instances:
+            return self._instances[working_dir]
+
+        # Add RAG-Anything path
+        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+        if raganything_path.exists() and str(raganything_path) not in sys.path:
+            sys.path.insert(0, str(raganything_path))
+
+        try:
+            from openai import AsyncOpenAI
+            from raganything import RAGAnything, RAGAnythingConfig
+
+            from src.services.embedding import get_embedding_client
+            from src.services.llm import get_llm_client
+
+            llm_client = get_llm_client()
+            embed_client = get_embedding_client()
+
+            # Create AsyncOpenAI client directly
+            openai_client = AsyncOpenAI(
+                api_key=llm_client.config.api_key,
+                base_url=llm_client.config.base_url,
+            )
+
+            # LLM function using services (ASYNC - LightRAG expects async functions)
+            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
+                if history_messages is None:
+                    history_messages = []
+
+                # Build messages
+                messages = []
+                if system_prompt:
+                    messages.append({"role": "system", "content": system_prompt})
+                messages.extend(history_messages)
+                messages.append({"role": "user", "content": prompt})
+
+                # Whitelist only valid OpenAI parameters
+                valid_params = {
+                    "temperature",
+                    "top_p",
+                    "n",
+                    "stream",
+                    "stop",
+                    "max_tokens",
+                    "presence_penalty",
+                    "frequency_penalty",
+                    "logit_bias",
+                    "user",
+                    "seed",
+                }
+                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
+
+                # Call OpenAI API directly (async)
+                response = await openai_client.chat.completions.create(
+                    model=llm_client.config.model,
+                    messages=messages,
+                    **clean_kwargs,
+                )
+
+                return response.choices[0].message.content
+
+            config = RAGAnythingConfig(
+                working_dir=working_dir,
+                enable_image_processing=True,
+                enable_table_processing=True,
+                enable_equation_processing=True,
+            )
+
+            rag = RAGAnything(
+                config=config,
+                llm_model_func=llm_model_func,
+                embedding_func=embed_client.get_embedding_func(),
+            )
+
+            self._instances[working_dir] = rag
+            return rag
+
+        except ImportError as e:
+            self.logger.error(f"Failed to import RAG-Anything: {e}")
+            raise
+
+    async def process(
+        self,
+        query: str,
+        kb_name: str,
+        mode: str = "hybrid",
+        only_need_context: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Search using hybrid retrieval.
+
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            mode: Search mode (hybrid, local, global, naive)
+            only_need_context: Whether to only return context without answer
+            **kwargs: Additional arguments
+
+        Returns:
+            Search results dictionary
+        """
+        self.logger.info(f"Hybrid search ({mode}) in {kb_name}: {query[:50]}...")
+
+        from src.logging.adapters import LightRAGLogContext
+
+        with LightRAGLogContext(scene="rag_search"):
+            rag = self._get_rag_instance(kb_name)
+            await rag._ensure_lightrag_initialized()
+
+            answer = await rag.aquery(query, mode=mode, only_need_context=only_need_context)
+            answer_str = answer if isinstance(answer, str) else str(answer)
+
+            return {
+                "query": query,
+                "answer": answer_str,
+                "content": answer_str,
+                "mode": mode,
+                "provider": "hybrid",
+            }

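RAGAnything instances are cached per working_dir, so repeated queries against one knowledge base reuse a single object. A hedged call sketch (the knowledge-base name is a placeholder, and the LLM/embedding services are assumed to be configured elsewhere):

    # Hypothetical call sequence -- a sketch; "my_kb" and service setup are assumptions.
    import asyncio

    from src.services.rag.components.retrievers.hybrid import HybridRetriever

    async def main() -> None:
        retriever = HybridRetriever()
        result = await retriever.process(
            query="What does the paper conclude?",
            kb_name="my_kb",          # expects <kb_base_dir>/my_kb/rag_storage to exist
            mode="hybrid",            # or: local, global, naive
            only_need_context=False,  # True returns retrieved context without an answer
        )
        print(result["answer"])

    asyncio.run(main())
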
src/services/rag/components/retrievers/lightrag.py
@@ -0,0 +1,169 @@
+"""
+LightRAG Retriever
+==================
+
+Pure LightRAG retriever (text-only, no multimodal).
+"""
+
+from pathlib import Path
+import sys
+from typing import Any, ClassVar, Dict, Optional
+
+from ..base import BaseComponent
+
+
+class LightRAGRetriever(BaseComponent):
+    """
+    Pure LightRAG retriever using LightRAG.query() directly.
+
+    Uses LightRAG's native retrieval modes (naive, local, global, hybrid).
+    Text-only, no multimodal processing.
+    """
+
+    name = "lightrag_retriever"
+    _instances: ClassVar[Dict[str, Any]] = {}
+
+    def __init__(self, kb_base_dir: Optional[str] = None):
+        """
+        Initialize LightRAG retriever.
+
+        Args:
+            kb_base_dir: Base directory for knowledge bases
+        """
+        super().__init__()
+        self.kb_base_dir = kb_base_dir or str(
+            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+            / "data"
+            / "knowledge_bases"
+        )
+
+    def _get_lightrag_instance(self, kb_name: str):
+        """Get or create a pure LightRAG instance (text-only)."""
+        working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")
+
+        if working_dir in self._instances:
+            return self._instances[working_dir]
+
+        # Add LightRAG path
+        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
+        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+        if raganything_path.exists() and str(raganything_path) not in sys.path:
+            sys.path.insert(0, str(raganything_path))
+
+        try:
+            from lightrag import LightRAG
+            from openai import AsyncOpenAI
+
+            from src.services.embedding import get_embedding_client
+            from src.services.llm import get_llm_client
+
+            llm_client = get_llm_client()
+            embed_client = get_embedding_client()
+
+            # Create AsyncOpenAI client directly
+            openai_client = AsyncOpenAI(
+                api_key=llm_client.config.api_key,
+                base_url=llm_client.config.base_url,
+            )
+
+            # LLM function using services (ASYNC - LightRAG expects async functions)
+            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
+                if history_messages is None:
+                    history_messages = []
+
+                # Build messages
+                messages = []
+                if system_prompt:
+                    messages.append({"role": "system", "content": system_prompt})
+                messages.extend(history_messages)
+                messages.append({"role": "user", "content": prompt})
+
+                # Whitelist only valid OpenAI parameters
+                valid_params = {
+                    "temperature",
+                    "top_p",
+                    "n",
+                    "stream",
+                    "stop",
+                    "max_tokens",
+                    "presence_penalty",
+                    "frequency_penalty",
+                    "logit_bias",
+                    "user",
+                    "seed",
+                }
+                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
+
+                # Call OpenAI API directly (async)
+                response = await openai_client.chat.completions.create(
+                    model=llm_client.config.model,
+                    messages=messages,
+                    **clean_kwargs,
+                )
+
+                return response.choices[0].message.content
+
+            # Create pure LightRAG instance (no multimodal)
+            rag = LightRAG(
+                working_dir=working_dir,
+                llm_model_func=llm_model_func,
+                embedding_func=embed_client.get_embedding_func(),  # Use proper EmbeddingFunc object
+            )
+
+            self._instances[working_dir] = rag
+            return rag
+
+        except ImportError as e:
+            self.logger.error(f"Failed to import LightRAG: {e}")
+            raise
+
+    async def process(
+        self,
+        query: str,
+        kb_name: str,
+        mode: str = "hybrid",
+        only_need_context: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Search using pure LightRAG retrieval (text-only).
+
+        Args:
+            query: Search query
+            kb_name: Knowledge base name
+            mode: Search mode (hybrid, local, global, naive)
+            only_need_context: Whether to only return context without answer
+            **kwargs: Additional arguments
+
+        Returns:
+            Search results dictionary
+        """
+        self.logger.info(f"LightRAG search ({mode}) in {kb_name}: {query[:50]}...")
+
+        from src.logging.adapters import LightRAGLogContext
+
+        with LightRAGLogContext(scene="LightRAG-Search"):
+            rag = self._get_lightrag_instance(kb_name)
+
+            # Initialize storages if not already initialized
+            await rag.initialize_storages()
+            from lightrag.kg.shared_storage import initialize_pipeline_status
+
+            await initialize_pipeline_status()
+
+            # Import QueryParam for proper query parameter passing
+            from lightrag import QueryParam
+
+            # Use LightRAG's native query method with QueryParam object
+            query_param = QueryParam(mode=mode, only_need_context=only_need_context)
+            answer = await rag.aquery(query, param=query_param)
+            answer_str = answer if isinstance(answer, str) else str(answer)
+
+            return {
+                "query": query,
+                "answer": answer_str,
+                "content": answer_str,
+                "mode": mode,
+                "provider": "lightrag",
+            }

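Usage mirrors HybridRetriever, except the query goes through LightRAG.aquery() with an explicit QueryParam and storages are initialized on each call. A hedged sketch (knowledge-base name and prior service configuration are again assumptions):

    # Hypothetical text-only retrieval call -- a sketch, not package code.
    import asyncio

    from src.services.rag.components.retrievers.lightrag import LightRAGRetriever

    async def main() -> None:
        retriever = LightRAGRetriever()
        # Mode names per the docstring above; naive is plain chunk retrieval,
        # local/global draw on the entity graph, hybrid combines them.
        for mode in ("naive", "hybrid"):
            result = await retriever.process(query="Define entropy.", kb_name="my_kb", mode=mode)
            print(mode, "->", result["answer"][:80])

    asyncio.run(main())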