realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,86 @@
1
+ """
2
+ Text Parser
3
+ ===========
4
+
5
+ Parser for plain text documents (.txt files).
6
+ """
7
+
8
+ from pathlib import Path
9
+ from typing import Union
10
+
11
+ from ...types import Document
12
+ from ..base import BaseComponent
13
+
14
+
15
class TextParser(BaseComponent):
    """
    Plain text parser.

    Parses text files (.txt) into Document objects.
    Also handles the other text-based formats listed in
    ``SUPPORTED_EXTENSIONS``.
    """

    name = "text_parser"

    # Supported extensions
    SUPPORTED_EXTENSIONS = {".txt", ".text", ".log", ".csv", ".tsv"}

    # Encodings tried in order. "utf-8-sig" must come first: it decodes plain
    # UTF-8 as well, and additionally strips a leading byte-order mark. Trying
    # plain "utf-8" first would succeed on BOM-prefixed files and leave a stray
    # U+FEFF character at the start of the content. "latin-1" maps every byte
    # to a code point, so it never fails and acts as the catch-all.
    ENCODINGS = ("utf-8-sig", "gbk", "gb2312", "latin-1")

    async def process(self, file_path: Union[str, Path], **kwargs) -> Document:
        """
        Parse a text file into a Document.

        Args:
            file_path: Path to the text file
            **kwargs: Additional arguments (currently unused)

        Returns:
            Parsed Document holding the decoded text plus filename, parser
            name, lower-cased extension and on-disk size as metadata

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        self.logger.info(f"Parsing text file: {file_path.name}")

        # Try each candidate encoding until one decodes the whole file
        content = None
        for encoding in self.ENCODINGS:
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    content = f.read()
                break
            except UnicodeDecodeError:
                continue

        if content is None:
            # Last resort: read as binary and decode with error handling.
            # Only reachable when ENCODINGS has been overridden without a
            # never-failing codec such as latin-1.
            with open(file_path, "rb") as f:
                content = f.read().decode("utf-8", errors="replace")

        return Document(
            content=content,
            file_path=str(file_path),
            metadata={
                "filename": file_path.name,
                "parser": self.name,
                "extension": file_path.suffix.lower(),
                "size_bytes": file_path.stat().st_size,
            },
        )

    @classmethod
    def can_parse(cls, file_path: Union[str, Path]) -> bool:
        """
        Check if this parser can handle the given file.

        The decision is based on the file extension only (case-insensitive);
        the file is not opened.

        Args:
            file_path: Path to check

        Returns:
            True if file can be parsed
        """
        suffix = Path(file_path).suffix.lower()
        return suffix in cls.SUPPORTED_EXTENSIONS
@@ -0,0 +1,18 @@
1
+ """
2
+ Document Retrievers
3
+ ===================
4
+
5
+ Retrievers for searching indexed documents.
6
+ """
7
+
8
+ from .base import BaseRetriever
9
+ from .dense import DenseRetriever
10
+ from .hybrid import HybridRetriever
11
+ from .lightrag import LightRAGRetriever
12
+
13
+ __all__ = [
14
+ "BaseRetriever",
15
+ "DenseRetriever",
16
+ "HybridRetriever",
17
+ "LightRAGRetriever",
18
+ ]
@@ -0,0 +1,34 @@
1
+ """
2
+ Base Retriever
3
+ ==============
4
+
5
+ Base class for document retrievers.
6
+ """
7
+
8
+ from typing import Any, Dict
9
+
10
+ from ..base import BaseComponent
11
+
12
+
13
class BaseRetriever(BaseComponent):
    """
    Common parent for every document retriever.

    A retriever takes a query, looks it up in an indexed knowledge base
    and hands back the relevant results.
    """

    name = "base_retriever"

    async def process(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
        """
        Look up documents that match ``query`` in the given knowledge base.

        This base implementation is a placeholder and always raises.

        Args:
            query: Search query
            kb_name: Knowledge base name
            **kwargs: Additional arguments

        Returns:
            Search results dictionary

        Raises:
            NotImplementedError: Always; concrete retrievers override this
        """
        raise NotImplementedError("Subclasses must implement process()")
@@ -0,0 +1,200 @@
1
+ """
2
+ Dense Retriever
3
+ ===============
4
+
5
+ Dense vector-based retriever using FAISS or cosine similarity.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ import pickle
11
+ from typing import Any, Dict, Optional
12
+
13
+ import numpy as np
14
+
15
+ from ..base import BaseComponent
16
+
17
+
18
class DenseRetriever(BaseComponent):
    """
    Dense vector retriever.

    Uses FAISS for fast similarity search or falls back to
    cosine similarity if FAISS is unavailable.
    """

    name = "dense_retriever"

    def __init__(self, kb_base_dir: Optional[str] = None, top_k: int = 5):
        """
        Initialize dense retriever.

        Args:
            kb_base_dir: Base directory for knowledge bases. Defaults to
                ``data/knowledge_bases`` six directory levels above this file.
            top_k: Number of results to return
        """
        super().__init__()
        self.kb_base_dir = kb_base_dir or str(
            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
            / "data"
            / "knowledge_bases"
        )
        self.top_k = top_k

        # Try to import FAISS; it is an optional dependency
        self.use_faiss = False
        try:
            import faiss

            self.faiss = faiss
            self.use_faiss = True
        except ImportError:
            self.logger.warning("FAISS not available, using simple cosine similarity")

    async def process(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
        """
        Search using dense embeddings with FAISS or cosine similarity.

        Args:
            query: Search query
            kb_name: Knowledge base name
            **kwargs: Additional arguments. Only ``top_k`` is read; it
                overrides the instance default when given and not None.

        Returns:
            Search results dictionary with the keys ``query``, ``answer``,
            ``content``, ``mode``, ``provider`` and ``results``. ``answer``
            and ``content`` both hold the retrieved chunks joined by blank
            lines; ``results`` lists each chunk with its score and metadata.
        """
        top_k = kwargs.get("top_k")
        if top_k is None:
            top_k = self.top_k
        self.logger.info(f"Dense search in {kb_name}: {query[:50]}... (top_k={top_k})")

        # Locate the index first so that no embedding request is spent on a
        # knowledge base that has nothing indexed.
        kb_dir = Path(self.kb_base_dir) / kb_name / "vector_store"
        metadata_file = kb_dir / "metadata.json"
        info_file = kb_dir / "info.json"

        if not metadata_file.exists():
            self.logger.warning(f"No vector index found at {kb_dir}")
            return self._empty_response(
                query, answer="No documents indexed. Please upload documents first."
            )

        # Load metadata and info (info.json is optional)
        # NOTE(review): metadata is assumed to be a list of dicts aligned with
        # the stored vectors by position - confirm against the indexer.
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        if info_file.exists():
            with open(info_file, "r", encoding="utf-8") as f:
                info = json.load(f)
        else:
            info = {"use_faiss": False}

        # Nothing to search, or a nonsensical result count: answer early
        # instead of passing k <= 0 to FAISS or slicing with a negative bound.
        if not metadata or top_k <= 0:
            return self._empty_response(query)

        from src.services.embedding import get_embedding_client

        # Get query embedding
        client = get_embedding_client()
        query_embedding = np.array((await client.embed([query]))[0], dtype=np.float32)

        use_faiss = info.get("use_faiss", False)

        if use_faiss and self.use_faiss:
            # Use FAISS for fast search
            index_file = kb_dir / "index.faiss"
            if not index_file.exists():
                self.logger.error(f"FAISS index file not found: {index_file}")
                return self._empty_response(query)

            # Load FAISS index
            index = self.faiss.read_index(str(index_file))

            # Normalize query vector for cosine similarity without modifying original
            norm = np.linalg.norm(query_embedding)
            if norm > 0:
                query_vec = (query_embedding / norm).reshape(1, -1)
            else:
                query_vec = query_embedding.reshape(1, -1)

            # Search
            distances, indices = index.search(query_vec, min(top_k, len(metadata)))

            # Build results
            results = []
            for dist, idx in zip(distances[0], indices[0]):
                # FAISS fills unused result slots with label -1. Without the
                # lower bound, -1 would silently select metadata[-1].
                if 0 <= idx < len(metadata):
                    # Convert distance to similarity score; float() keeps the
                    # response free of numpy scalars (not JSON-serialisable).
                    # NOTE(review): this assumes an L2-style index where a
                    # smaller distance is better - confirm the index metric.
                    score = float(1.0 / (1.0 + dist))
                    results.append((score, metadata[int(idx)]))
        else:
            # Fallback: Load embeddings and use cosine similarity
            embeddings_file = kb_dir / "embeddings.pkl"
            if not embeddings_file.exists():
                self.logger.error(f"Embeddings file not found: {embeddings_file}")
                return self._empty_response(query)

            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. This is only safe while embeddings.pkl is produced
            # exclusively by this application's own indexer.
            with open(embeddings_file, "rb") as f:
                embeddings = np.asarray(pickle.load(f))

            # The similarity maths below needs a non-empty 2-D matrix
            if embeddings.ndim != 2 or embeddings.shape[0] == 0:
                self.logger.error(f"Embeddings file is empty or malformed: {embeddings_file}")
                return self._empty_response(query)

            # Normalize for cosine similarity (avoid division by zero)
            query_norm = np.linalg.norm(query_embedding)
            if query_norm > 0:
                query_vec = query_embedding / query_norm
            else:
                query_vec = query_embedding  # Keep as is if zero norm

            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            # Replace zero norms with 1 to avoid division by zero
            norms = np.where(norms == 0, 1, norms)
            doc_vecs = embeddings / norms

            # Compute similarities
            similarities = np.dot(doc_vecs, query_vec)

            # Get top-k results (highest similarity first)
            top_indices = np.argsort(similarities)[::-1][:top_k]

            results = []
            for idx in top_indices:
                # Skip vectors without a metadata entry in case the two files
                # have drifted out of sync.
                if idx < len(metadata):
                    score = float(similarities[idx])
                    results.append((score, metadata[int(idx)]))

        # Build response content
        # Format chunks cleanly for LLM context (without score annotations)
        content_parts = []
        sources = []
        for score, item in results:
            content = item.get("content", "").strip()
            if content:  # Only include non-empty chunks
                # Add chunk without score prefix for clean LLM input
                content_parts.append(content)
                sources.append(
                    {
                        "content": content,
                        "score": score,
                        "metadata": item.get("metadata", {}),
                    }
                )

        # Join chunks with clear separation
        content = "\n\n".join(content_parts)

        return {
            "query": query,
            "answer": content,  # Return clean context for LLM to use
            "content": content,
            "mode": "dense",
            "provider": "llamaindex",
            "results": sources,
        }

    def _empty_response(
        self, query: str, answer: str = "No relevant documents found."
    ) -> Dict[str, Any]:
        """
        Return empty response when no results found.

        Args:
            query: Search query echoed back in the response
            answer: Human-readable explanation placed in the ``answer`` field

        Returns:
            Response dictionary with empty ``content`` and ``results``
        """
        return {
            "query": query,
            "answer": answer,
            "content": "",
            "mode": "dense",
            "provider": "llamaindex",
            "results": [],
        }
@@ -0,0 +1,164 @@
1
+ """
2
+ Hybrid Retriever
3
+ ================
4
+
5
+ Hybrid retriever combining multiple retrieval strategies.
6
+ """
7
+
8
+ from pathlib import Path
9
+ import sys
10
+ from typing import Any, Dict, Optional
11
+
12
+ from ..base import BaseComponent
13
+
14
+
15
class HybridRetriever(BaseComponent):
    """
    Hybrid retriever combining graph and vector retrieval.

    Uses LightRAG's hybrid mode for retrieval, accessed through a
    RAGAnything instance.
    """

    name = "hybrid_retriever"
    # Cache of RAGAnything instances keyed by working directory. Defined on
    # the class, so it is shared by every HybridRetriever instance.
    _instances: Dict[str, Any] = {}

    def __init__(self, kb_base_dir: Optional[str] = None):
        """
        Initialize hybrid retriever.

        Args:
            kb_base_dir: Base directory for knowledge bases. Defaults to
                ``data/knowledge_bases`` six directory levels above this file.
        """
        super().__init__()
        self.kb_base_dir = kb_base_dir or str(
            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
            / "data"
            / "knowledge_bases"
        )

    def _get_rag_instance(self, kb_name: str):
        """
        Get or create a RAGAnything instance.

        Args:
            kb_name: Knowledge base name; its ``rag_storage`` directory is
                used as the working directory and as the cache key

        Returns:
            The cached or newly created RAGAnything instance

        Raises:
            ImportError: If ``openai``, ``raganything`` or the project
                service modules cannot be imported
        """
        working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")

        if working_dir in self._instances:
            return self._instances[working_dir]

        # Add RAG-Anything path (a sibling checkout next to the project root)
        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
        if raganything_path.exists() and str(raganything_path) not in sys.path:
            sys.path.insert(0, str(raganything_path))

        try:
            from openai import AsyncOpenAI
            from raganything import RAGAnything, RAGAnythingConfig

            from src.services.embedding import get_embedding_client
            from src.services.llm import get_llm_client

            llm_client = get_llm_client()
            embed_client = get_embedding_client()

            # Create AsyncOpenAI client directly
            openai_client = AsyncOpenAI(
                api_key=llm_client.config.api_key,
                base_url=llm_client.config.base_url,
            )

            # Whitelist only valid OpenAI parameters. "stream" is deliberately
            # not forwarded: llm_model_func returns
            # response.choices[0].message.content, which only exists on a
            # non-streaming response.
            valid_params = {
                "temperature",
                "top_p",
                "n",
                "stop",
                "max_tokens",
                "presence_penalty",
                "frequency_penalty",
                "logit_bias",
                "user",
                "seed",
            }

            # LLM function using services (ASYNC - LightRAG expects async functions)
            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
                if history_messages is None:
                    history_messages = []

                # Build messages
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.extend(history_messages)
                messages.append({"role": "user", "content": prompt})

                # Drop every keyword the chat-completions call would reject
                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}

                # Call OpenAI API directly (async)
                response = await openai_client.chat.completions.create(
                    model=llm_client.config.model,
                    messages=messages,
                    **clean_kwargs,
                )

                # The API may return no text content; always hand back a string
                return response.choices[0].message.content or ""

            config = RAGAnythingConfig(
                working_dir=working_dir,
                enable_image_processing=True,
                enable_table_processing=True,
                enable_equation_processing=True,
            )

            rag = RAGAnything(
                config=config,
                llm_model_func=llm_model_func,
                embedding_func=embed_client.get_embedding_func(),
            )

            self._instances[working_dir] = rag
            return rag

        except ImportError as e:
            self.logger.error(f"Failed to import RAG-Anything: {e}")
            raise

    async def process(
        self,
        query: str,
        kb_name: str,
        mode: str = "hybrid",
        only_need_context: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Search using hybrid retrieval.

        Args:
            query: Search query
            kb_name: Knowledge base name
            mode: Search mode (hybrid, local, global, naive)
            only_need_context: Whether to only return context without answer
            **kwargs: Additional arguments (accepted for interface
                compatibility; currently ignored)

        Returns:
            Search results dictionary with the keys ``query``, ``answer``,
            ``content``, ``mode`` and ``provider``. ``answer`` and
            ``content`` hold the same string.
        """
        self.logger.info(f"Hybrid search ({mode}) in {kb_name}: {query[:50]}...")

        from src.logging.adapters import LightRAGLogContext

        with LightRAGLogContext(scene="rag_search"):
            rag = self._get_rag_instance(kb_name)
            # NOTE(review): relies on a private RAGAnything method; it may
            # change between raganything releases.
            await rag._ensure_lightrag_initialized()

            answer = await rag.aquery(query, mode=mode, only_need_context=only_need_context)
            # Coerce any non-string result to str for the response
            answer_str = answer if isinstance(answer, str) else str(answer)

            return {
                "query": query,
                "answer": answer_str,
                "content": answer_str,
                "mode": mode,
                "provider": "hybrid",
            }
@@ -0,0 +1,169 @@
1
+ """
2
+ LightRAG Retriever
3
+ ==================
4
+
5
+ Pure LightRAG retriever (text-only, no multimodal).
6
+ """
7
+
8
+ from pathlib import Path
9
+ import sys
10
+ from typing import Any, ClassVar, Dict, Optional
11
+
12
+ from ..base import BaseComponent
13
+
14
+
15
class LightRAGRetriever(BaseComponent):
    """
    Pure LightRAG retriever using LightRAG's query API directly.

    Uses LightRAG's native retrieval modes (naive, local, global, hybrid).
    Text-only, no multimodal processing.
    """

    name = "lightrag_retriever"
    # Cache of LightRAG instances keyed by working directory, shared by every
    # LightRAGRetriever instance.
    _instances: ClassVar[Dict[str, Any]] = {}

    def __init__(self, kb_base_dir: Optional[str] = None):
        """
        Initialize LightRAG retriever.

        Args:
            kb_base_dir: Base directory for knowledge bases. Defaults to
                ``data/knowledge_bases`` six directory levels above this file.
        """
        super().__init__()
        self.kb_base_dir = kb_base_dir or str(
            Path(__file__).resolve().parent.parent.parent.parent.parent.parent
            / "data"
            / "knowledge_bases"
        )

    def _get_lightrag_instance(self, kb_name: str):
        """
        Get or create a pure LightRAG instance (text-only).

        Args:
            kb_name: Knowledge base name; its ``rag_storage`` directory is
                used as the working directory and as the cache key

        Returns:
            The cached or newly created LightRAG instance

        Raises:
            ImportError: If ``lightrag``, ``openai`` or the project service
                modules cannot be imported
        """
        working_dir = str(Path(self.kb_base_dir) / kb_name / "rag_storage")

        if working_dir in self._instances:
            return self._instances[working_dir]

        # Add LightRAG path (looked up inside a sibling RAG-Anything checkout)
        project_root = Path(__file__).resolve().parent.parent.parent.parent.parent.parent
        raganything_path = project_root.parent / "raganything" / "RAG-Anything"
        if raganything_path.exists() and str(raganything_path) not in sys.path:
            sys.path.insert(0, str(raganything_path))

        try:
            from lightrag import LightRAG
            from openai import AsyncOpenAI

            from src.services.embedding import get_embedding_client
            from src.services.llm import get_llm_client

            llm_client = get_llm_client()
            embed_client = get_embedding_client()

            # Create AsyncOpenAI client directly
            openai_client = AsyncOpenAI(
                api_key=llm_client.config.api_key,
                base_url=llm_client.config.base_url,
            )

            # Whitelist only valid OpenAI parameters. "stream" is deliberately
            # not forwarded: llm_model_func returns
            # response.choices[0].message.content, which only exists on a
            # non-streaming response.
            valid_params = {
                "temperature",
                "top_p",
                "n",
                "stop",
                "max_tokens",
                "presence_penalty",
                "frequency_penalty",
                "logit_bias",
                "user",
                "seed",
            }

            # LLM function using services (ASYNC - LightRAG expects async functions)
            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
                if history_messages is None:
                    history_messages = []

                # Build messages
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.extend(history_messages)
                messages.append({"role": "user", "content": prompt})

                # Drop every keyword the chat-completions call would reject
                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}

                # Call OpenAI API directly (async)
                response = await openai_client.chat.completions.create(
                    model=llm_client.config.model,
                    messages=messages,
                    **clean_kwargs,
                )

                # The API may return no text content; always hand back a string
                return response.choices[0].message.content or ""

            # Create pure LightRAG instance (no multimodal)
            rag = LightRAG(
                working_dir=working_dir,
                llm_model_func=llm_model_func,
                embedding_func=embed_client.get_embedding_func(),  # Use proper EmbeddingFunc object
            )

            self._instances[working_dir] = rag
            return rag

        except ImportError as e:
            self.logger.error(f"Failed to import LightRAG: {e}")
            raise

    async def process(
        self,
        query: str,
        kb_name: str,
        mode: str = "hybrid",
        only_need_context: bool = False,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Search using pure LightRAG retrieval (text-only).

        Args:
            query: Search query
            kb_name: Knowledge base name
            mode: Search mode (hybrid, local, global, naive)
            only_need_context: Whether to only return context without answer
            **kwargs: Additional arguments (accepted for interface
                compatibility; currently ignored)

        Returns:
            Search results dictionary with the keys ``query``, ``answer``,
            ``content``, ``mode`` and ``provider``. ``answer`` and
            ``content`` hold the same string.
        """
        self.logger.info(f"LightRAG search ({mode}) in {kb_name}: {query[:50]}...")

        from src.logging.adapters import LightRAGLogContext

        with LightRAGLogContext(scene="LightRAG-Search"):
            rag = self._get_lightrag_instance(kb_name)

            # Storage and pipeline-status initialisation run on every query,
            # including for cached instances.
            # NOTE(review): this presumes both calls are safe to repeat -
            # confirm against the installed LightRAG version.
            await rag.initialize_storages()
            from lightrag.kg.shared_storage import initialize_pipeline_status

            await initialize_pipeline_status()

            # Import QueryParam for proper query parameter passing
            from lightrag import QueryParam

            # Use LightRAG's native query method with QueryParam object
            query_param = QueryParam(mode=mode, only_need_context=only_need_context)
            answer = await rag.aquery(query, param=query_param)
            # Coerce any non-string result to str for the response
            answer_str = answer if isinstance(answer, str) else str(answer)

            return {
                "query": query,
                "answer": answer_str,
                "content": answer_str,
                "mode": mode,
                "provider": "lightrag",
            }