realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0

src/services/rag/pipeline.py
@@ -0,0 +1,215 @@
+ """
+ RAG Pipeline
+ ============
+
+ Composable RAG pipeline with fluent API.
+ """
+
+ import asyncio
+ import shutil
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from src.logging import get_logger
+
+ from .components.base import Component
+ from .components.routing import FileTypeRouter
+ from .types import Document
+
+ # Default knowledge base directory
+ DEFAULT_KB_BASE_DIR = str(
+     Path(__file__).resolve().parent.parent.parent.parent / "data" / "knowledge_bases"
+ )
+
+
+ class RAGPipeline:
+     """
+     Composable RAG pipeline.
+
+     Build custom RAG pipelines using a fluent API:
+
+         pipeline = (
+             RAGPipeline("custom", kb_base_dir="/path/to/kb")
+             .parser(PDFParser())
+             .chunker(SemanticChunker())
+             .embedder(OpenAIEmbedder())
+             .indexer(GraphIndexer())
+             .retriever(HybridRetriever())
+         )
+
+         await pipeline.initialize("kb_name", ["doc1.pdf"])
+         result = await pipeline.search("query", "kb_name")
+     """
+
+     def __init__(self, name: str = "default", kb_base_dir: Optional[str] = None):
+         """
+         Initialize RAG pipeline.
+
+         Args:
+             name: Pipeline name for logging
+             kb_base_dir: Base directory for knowledge bases
+         """
+         self.name = name
+         self.kb_base_dir = kb_base_dir or DEFAULT_KB_BASE_DIR
+         self.logger = get_logger(f"Pipeline:{name}")
+         self._parser: Optional[Component] = None
+         self._chunkers: List[Component] = []
+         self._embedder: Optional[Component] = None
+         self._indexers: List[Component] = []
+         self._retriever: Optional[Component] = None
+
+     # Fluent API methods
+     def parser(self, p: Component) -> "RAGPipeline":
+         """Set the document parser."""
+         self._parser = p
+         return self
+
+     def chunker(self, c: Component) -> "RAGPipeline":
+         """Add a chunker to the pipeline."""
+         self._chunkers.append(c)
+         return self
+
+     def embedder(self, e: Component) -> "RAGPipeline":
+         """Set the embedder."""
+         self._embedder = e
+         return self
+
+     def indexer(self, i: Component) -> "RAGPipeline":
+         """Add an indexer to the pipeline."""
+         self._indexers.append(i)
+         return self
+
+     def retriever(self, r: Component) -> "RAGPipeline":
+         """Set the retriever."""
+         self._retriever = r
+         return self
+
+     async def initialize(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
+         """
+         Run full initialization pipeline.
+
+         Uses FileTypeRouter to classify files and route them appropriately:
+         - PDF/complex files -> configured parser (e.g., PDFParser)
+         - Text files -> direct text reading (fast path)
+
+         Args:
+             kb_name: Knowledge base name
+             file_paths: List of file paths to process
+             **kwargs: Additional arguments passed to components
+
+         Returns:
+             True if successful
+         """
+         self.logger.info(f"Initializing KB '{kb_name}' with {len(file_paths)} files")
+
+         if not self._parser:
+             raise ValueError("No parser configured. Use .parser() to set one")
+
+         # Stage 1: Parse documents with file type routing
+         self.logger.info("Stage 1: Parsing documents...")
+
+         # Classify files by type
+         classification = FileTypeRouter.classify_files(file_paths)
+         self.logger.info(
+             f"File classification: {len(classification.needs_mineru)} complex, "
+             f"{len(classification.text_files)} text, "
+             f"{len(classification.unsupported)} unsupported"
+         )
+
+         documents = []
+
+         # Process complex files (PDF, etc.) with configured parser
+         for path in classification.needs_mineru:
+             self.logger.info(f"Parsing (parser): {Path(path).name}")
+             doc = await self._parser.process(path, **kwargs)
+             documents.append(doc)
+
+         # Process text files directly (fast path)
+         for path in classification.text_files:
+             self.logger.info(f"Parsing (direct text): {Path(path).name}")
+             content = await FileTypeRouter.read_text_file(path)
+             doc = Document(
+                 content=content,
+                 file_path=str(path),
+                 metadata={
+                     "filename": Path(path).name,
+                     "parser": "direct_text",
+                 },
+             )
+             documents.append(doc)
+
+         # Log unsupported files
+         for path in classification.unsupported:
+             self.logger.warning(f"Skipped unsupported file: {Path(path).name}")
+
+         # Stage 2: Chunk (sequential - later chunkers see earlier results)
+         if self._chunkers:
+             self.logger.info("Stage 2: Chunking...")
+             for chunker in self._chunkers:
+                 for doc in documents:
+                     new_chunks = await chunker.process(doc, **kwargs)
+                     doc.chunks.extend(new_chunks)
+
+         # Stage 3: Embed
+         if self._embedder:
+             self.logger.info("Stage 3: Embedding...")
+             for doc in documents:
+                 await self._embedder.process(doc, **kwargs)
+
+         # Stage 4: Index (can run in parallel)
+         if self._indexers:
+             self.logger.info("Stage 4: Indexing...")
+             await asyncio.gather(
+                 *[indexer.process(kb_name, documents, **kwargs) for indexer in self._indexers]
+             )
+
+         self.logger.info(f"KB '{kb_name}' initialized successfully")
+         return True
+
+     async def search(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
+         """
+         Search the knowledge base.
+
+         Args:
+             query: Search query
+             kb_name: Knowledge base name
+             **kwargs: Additional arguments passed to retriever
+
+         Returns:
+             Search results dictionary
+         """
+         if not self._retriever:
+             raise ValueError("No retriever configured. Use .retriever() to set one")
+
+         return await self._retriever.process(query, kb_name=kb_name, **kwargs)
+
+     async def delete(self, kb_name: str) -> bool:
+         """
+         Delete a knowledge base.
+
+         Args:
+             kb_name: Knowledge base name
+
+         Returns:
+             True if successful
+         """
+         # Validate kb_name to prevent path traversal
+         if not kb_name or kb_name in (".", "..") or "/" in kb_name or "\\" in kb_name:
+             raise ValueError(f"Invalid knowledge base name: {kb_name}")
+
+         self.logger.info(f"Deleting KB '{kb_name}'")
+
+         kb_dir = Path(self.kb_base_dir) / kb_name
+         # Ensure the resolved path is within the base directory
+         kb_dir = kb_dir.resolve()
+         base_dir = Path(self.kb_base_dir).resolve()
+         if not kb_dir.is_relative_to(base_dir):
+             raise ValueError(f"Knowledge base path outside allowed directory: {kb_name}")
+
+         if kb_dir.exists():
+             shutil.rmtree(kb_dir)
+             self.logger.info(f"Deleted KB directory: {kb_dir}")
+             return True
+
+         self.logger.warning(f"KB directory not found: {kb_dir}")
+         return False
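
For orientation, a minimal end-to-end sketch of the fluent API above. This is a sketch only: the kb_base_dir path and KB name are placeholders, and the component classes are the ones exported under src/services/rag/components (the same ones the pre-configured pipelines below import).

import asyncio

from src.services.rag.components.chunkers import SemanticChunker
from src.services.rag.components.embedders import OpenAIEmbedder
from src.services.rag.components.indexers import GraphIndexer
from src.services.rag.components.parsers import PDFParser
from src.services.rag.components.retrievers import HybridRetriever
from src.services.rag.pipeline import RAGPipeline


async def main() -> None:
    # Build the pipeline with the fluent setters, then index and query a KB.
    pipeline = (
        RAGPipeline("custom", kb_base_dir="/path/to/kb")
        .parser(PDFParser())
        .chunker(SemanticChunker())
        .embedder(OpenAIEmbedder())
        .indexer(GraphIndexer(kb_base_dir="/path/to/kb"))
        .retriever(HybridRetriever(kb_base_dir="/path/to/kb"))
    )
    await pipeline.initialize("my_kb", ["doc1.pdf", "notes.md"])
    result = await pipeline.search("What does chapter 2 cover?", "my_kb")
    print(result)


asyncio.run(main())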

src/services/rag/pipelines/__init__.py
@@ -0,0 +1,32 @@
+ """
+ Pre-configured Pipelines
+ ========================
+
+ Ready-to-use RAG pipelines for common use cases.
+
+ LightRAG and Academic pipelines are always available.
+ LlamaIndex and RAGAnything require optional dependencies.
+ """
+
+ # Always available pipelines
+ from .academic import AcademicPipeline
+ from .lightrag import LightRAGPipeline
+
+ __all__ = [
+     "LightRAGPipeline",
+     "AcademicPipeline",
+ ]
+
+ # Optional pipelines - import only if dependencies are available
+ try:
+     from .llamaindex import LlamaIndexPipeline
+     __all__.append("LlamaIndexPipeline")
+ except ImportError:
+     LlamaIndexPipeline = None  # type: ignore
+
+ try:
+     from .raganything import RAGAnythingPipeline
+     __all__.append("RAGAnythingPipeline")
+ except ImportError:
+     RAGAnythingPipeline = None  # type: ignore
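
Callers can pick one of these factories and fall back when an optional dependency is missing; a minimal sketch (the kb path is a placeholder):

from src.services.rag.pipelines import LightRAGPipeline, LlamaIndexPipeline

# LlamaIndexPipeline is None when llama-index is not installed (see the
# try/except ImportError above), so check before calling it.
if LlamaIndexPipeline is not None:
    pipeline = LlamaIndexPipeline(kb_base_dir="/path/to/kb")
else:
    pipeline = LightRAGPipeline(kb_base_dir="/path/to/kb")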

src/services/rag/pipelines/academic.py
@@ -0,0 +1,44 @@
+ """
+ Academic Pipeline
+ =================
+
+ Pipeline optimized for academic documents with numbered item extraction.
+ """
+
+ from typing import Optional
+
+ from ..components.chunkers import NumberedItemExtractor, SemanticChunker
+ from ..components.embedders import OpenAIEmbedder
+ from ..components.indexers import GraphIndexer
+ from ..components.parsers import TextParser
+ from ..components.retrievers import HybridRetriever
+ from ..pipeline import RAGPipeline
+
+
+ def AcademicPipeline(kb_base_dir: Optional[str] = None) -> RAGPipeline:
+     """
+     Create an academic document pipeline.
+
+     This pipeline uses:
+     - TextParser for document parsing (supports txt, md files)
+     - SemanticChunker for text chunking
+     - NumberedItemExtractor for extracting definitions, theorems, etc.
+     - OpenAIEmbedder for embedding generation
+     - GraphIndexer for knowledge graph indexing
+     - HybridRetriever for hybrid retrieval
+
+     Args:
+         kb_base_dir: Base directory for knowledge bases
+
+     Returns:
+         Configured RAGPipeline
+     """
+     return (
+         RAGPipeline("academic", kb_base_dir=kb_base_dir)
+         .parser(TextParser())
+         .chunker(SemanticChunker())
+         .chunker(NumberedItemExtractor())
+         .embedder(OpenAIEmbedder())
+         .indexer(GraphIndexer(kb_base_dir=kb_base_dir))
+         .retriever(HybridRetriever(kb_base_dir=kb_base_dir))
+     )
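
Because RAGPipeline.chunker() appends and Stage 2 of initialize() runs chunkers sequentially over each document, this factory stacks SemanticChunker and NumberedItemExtractor: extracted definitions and theorems are added alongside the semantic chunks. Usage follows the same RAGPipeline interface sketched earlier:

from src.services.rag.pipelines import AcademicPipeline

pipeline = AcademicPipeline(kb_base_dir="/path/to/kb")  # placeholder path
# then: await pipeline.initialize(...), await pipeline.search(...)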

src/services/rag/pipelines/lightrag.py
@@ -0,0 +1,43 @@
+ """
+ LightRAG Pipeline
+ =================
+
+ Pure LightRAG pipeline (text-only, no multimodal processing).
+ Faster than RAGAnything for text-heavy documents.
+ """
+
+ from typing import Optional
+
+ from ..components.indexers import LightRAGIndexer
+ from ..components.parsers import PDFParser
+ from ..components.retrievers import LightRAGRetriever
+ from ..pipeline import RAGPipeline
+
+
+ def LightRAGPipeline(kb_base_dir: Optional[str] = None) -> RAGPipeline:
+     """
+     Create a pure LightRAG pipeline (text-only, no multimodal).
+
+     This pipeline uses:
+     - PDFParser for document parsing (extracts raw text from PDF/txt/md)
+     - LightRAGIndexer for knowledge graph indexing (text-only, fast)
+       * LightRAG handles chunking, entity extraction, and embedding internally
+       * No separate chunker/embedder needed - LightRAG does it all
+     - LightRAGRetriever for retrieval (uses LightRAG.aquery() directly)
+
+     Performance: Medium speed (~10-15s per document)
+     Use for: Business docs, text-heavy PDFs, when you need a knowledge graph
+
+     Args:
+         kb_base_dir: Base directory for knowledge bases
+
+     Returns:
+         Configured RAGPipeline
+     """
+     return (
+         RAGPipeline("lightrag", kb_base_dir=kb_base_dir)
+         .parser(PDFParser())
+         # No chunker/embedder - LightRAG does everything internally
+         .indexer(LightRAGIndexer(kb_base_dir=kb_base_dir))
+         .retriever(LightRAGRetriever(kb_base_dir=kb_base_dir))
+     )

src/services/rag/pipelines/llamaindex.py
@@ -0,0 +1,313 @@
+ """
+ LlamaIndex Pipeline
+ ===================
+
+ True LlamaIndex integration using the official llama-index library.
+ """
+
+ import asyncio
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from llama_index.core import (
+     Document,
+     Settings,
+     StorageContext,
+     VectorStoreIndex,
+     load_index_from_storage,
+ )
+ from llama_index.core.base.embeddings.base import BaseEmbedding
+ from llama_index.core.bridge.pydantic import PrivateAttr
+
+ from src.logging import get_logger
+ from src.services.embedding import get_embedding_client, get_embedding_config
+
+ # Default knowledge base directory
+ DEFAULT_KB_BASE_DIR = str(
+     Path(__file__).resolve().parent.parent.parent.parent.parent / "data" / "knowledge_bases"
+ )
+
+
+ class CustomEmbedding(BaseEmbedding):
+     """
+     Custom embedding adapter for OpenAI-compatible APIs.
+
+     Works with any OpenAI-compatible endpoint including:
+     - Google Gemini (text-embedding-004)
+     - OpenAI (text-embedding-ada-002, text-embedding-3-*)
+     - Azure OpenAI
+     - Local models with OpenAI-compatible API
+     """
+
+     _client: Any = PrivateAttr()
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self._client = get_embedding_client()
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "custom_embedding"
+
+     async def _aget_query_embedding(self, query: str) -> List[float]:
+         """Get embedding for a query."""
+         embeddings = await self._client.embed([query])
+         return embeddings[0]
+
+     async def _aget_text_embedding(self, text: str) -> List[float]:
+         """Get embedding for a text."""
+         embeddings = await self._client.embed([text])
+         return embeddings[0]
+
+     def _get_query_embedding(self, query: str) -> List[float]:
+         """Sync version - called by LlamaIndex sync API."""
+         # Use nest_asyncio to allow nested event loops
+         import nest_asyncio
+
+         nest_asyncio.apply()
+         return asyncio.run(self._aget_query_embedding(query))
+
+     def _get_text_embedding(self, text: str) -> List[float]:
+         """Sync version - called by LlamaIndex sync API."""
+         # Use nest_asyncio to allow nested event loops
+         import nest_asyncio
+
+         nest_asyncio.apply()
+         return asyncio.run(self._aget_text_embedding(text))
+
+     async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
+         """Get embeddings for multiple texts."""
+         return await self._client.embed(texts)
+
+
+ class LlamaIndexPipeline:
+     """
+     True LlamaIndex pipeline using the official llama-index library.
+
+     Uses LlamaIndex's native components:
+     - VectorStoreIndex for indexing
+     - CustomEmbedding for OpenAI-compatible embeddings
+     - SentenceSplitter for chunking
+     - StorageContext for persistence
+     """
+
+     def __init__(self, kb_base_dir: Optional[str] = None):
+         """
+         Initialize LlamaIndex pipeline.
+
+         Args:
+             kb_base_dir: Base directory for knowledge bases
+         """
+         self.logger = get_logger("LlamaIndexPipeline")
+         self.kb_base_dir = kb_base_dir or DEFAULT_KB_BASE_DIR
+         self._configure_settings()
+
+     def _configure_settings(self):
+         """Configure LlamaIndex global settings."""
+         # Get embedding config
+         embedding_cfg = get_embedding_config()
+
+         # Configure custom embedding that works with any OpenAI-compatible API
+         Settings.embed_model = CustomEmbedding()
+
+         # Configure chunking
+         Settings.chunk_size = 512
+         Settings.chunk_overlap = 50
+
+         self.logger.info(
+             f"LlamaIndex configured: embedding={embedding_cfg.model} "
+             f"({embedding_cfg.dim}D, {embedding_cfg.binding}), chunk_size=512"
+         )
+
+     async def initialize(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
+         """
+         Initialize KB using real LlamaIndex components.
+
+         Args:
+             kb_name: Knowledge base name
+             file_paths: List of file paths to process
+             **kwargs: Additional arguments
+
+         Returns:
+             True if successful
+         """
+         self.logger.info(
+             f"Initializing KB '{kb_name}' with {len(file_paths)} files using LlamaIndex"
+         )
+
+         kb_dir = Path(self.kb_base_dir) / kb_name
+         storage_dir = kb_dir / "llamaindex_storage"
+         storage_dir.mkdir(parents=True, exist_ok=True)
+
+         try:
+             # Parse documents
+             documents = []
+             for file_path in file_paths:
+                 file_path = Path(file_path)
+                 self.logger.info(f"Parsing: {file_path.name}")
+
+                 # Extract text based on file type
+                 if file_path.suffix.lower() == ".pdf":
+                     text = self._extract_pdf_text(file_path)
+                 else:
+                     with open(file_path, "r", encoding="utf-8") as f:
+                         text = f.read()
+
+                 if text.strip():
+                     doc = Document(
+                         text=text,
+                         metadata={
+                             "file_name": file_path.name,
+                             "file_path": str(file_path),
+                         },
+                     )
+                     documents.append(doc)
+                     self.logger.info(f"Loaded: {file_path.name} ({len(text)} chars)")
+                 else:
+                     self.logger.warning(f"Skipped empty document: {file_path.name}")
+
+             if not documents:
+                 self.logger.error("No valid documents found")
+                 return False
+
+             # Create index with LlamaIndex (run sync code in thread pool to avoid blocking)
+             self.logger.info(f"Creating VectorStoreIndex with {len(documents)} documents...")
+
+             # Run sync LlamaIndex code in thread pool to avoid blocking async event loop
+             loop = asyncio.get_event_loop()
+             index = await loop.run_in_executor(
+                 None,  # Use default ThreadPoolExecutor
+                 lambda: VectorStoreIndex.from_documents(documents, show_progress=True),
+             )
+
+             # Persist index
+             index.storage_context.persist(persist_dir=str(storage_dir))
+             self.logger.info(f"Index persisted to {storage_dir}")
+
+             self.logger.info(f"KB '{kb_name}' initialized successfully with LlamaIndex")
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Failed to initialize KB: {e}")
+             import traceback
+
+             self.logger.error(traceback.format_exc())
+             return False
+
+     def _extract_pdf_text(self, file_path: Path) -> str:
+         """Extract text from PDF using PyMuPDF."""
+         try:
+             import fitz  # PyMuPDF
+
+             doc = fitz.open(file_path)
+             texts = []
+             for page in doc:
+                 texts.append(page.get_text())
+             doc.close()
+             return "\n\n".join(texts)
+         except ImportError:
+             self.logger.warning("PyMuPDF not installed. Cannot extract PDF text.")
+             return ""
+         except Exception as e:
+             self.logger.error(f"Failed to extract PDF text: {e}")
+             return ""
+
+     async def search(
+         self,
+         query: str,
+         kb_name: str,
+         mode: str = "hybrid",
+         **kwargs,
+     ) -> Dict[str, Any]:
+         """
+         Search using LlamaIndex query engine.
+
+         Args:
+             query: Search query
+             kb_name: Knowledge base name
+             mode: Search mode (ignored, LlamaIndex uses similarity)
+             **kwargs: Additional arguments (top_k, etc.)
+
+         Returns:
+             Search results dictionary
+         """
+         self.logger.info(f"Searching KB '{kb_name}' with query: {query[:50]}...")
+
+         kb_dir = Path(self.kb_base_dir) / kb_name
+         storage_dir = kb_dir / "llamaindex_storage"
+
+         if not storage_dir.exists():
+             self.logger.warning(f"No LlamaIndex storage found at {storage_dir}")
+             return {
+                 "query": query,
+                 "answer": "No documents indexed. Please upload documents first.",
+                 "content": "",
+                 "mode": mode,
+                 "provider": "llamaindex",
+             }
+
+         try:
+             # Load index from storage (run in thread pool)
+             loop = asyncio.get_event_loop()
+
+             def load_and_retrieve():
+                 storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
+                 index = load_index_from_storage(storage_context)
+                 top_k = kwargs.get("top_k", 5)
+
+                 # Use retriever instead of query_engine to avoid LLM requirement
+                 retriever = index.as_retriever(similarity_top_k=top_k)
+                 nodes = retriever.retrieve(query)
+                 return nodes
+
+             # Execute retrieval in thread pool to avoid blocking
+             nodes = await loop.run_in_executor(None, load_and_retrieve)
+
+             # Extract text from retrieved nodes
+             context_parts = []
+             for node in nodes:
+                 context_parts.append(node.node.text)
+
+             content = "\n\n".join(context_parts) if context_parts else ""
+
+             return {
+                 "query": query,
+                 "answer": content,  # Return context for ChatAgent to use
+                 "content": content,
+                 "mode": mode,
+                 "provider": "llamaindex",
+             }
+
+         except Exception as e:
+             self.logger.error(f"Search failed: {e}")
+             import traceback
+
+             self.logger.error(traceback.format_exc())
+             return {
+                 "query": query,
+                 "answer": f"Search failed: {str(e)}",
+                 "content": "",
+                 "mode": mode,
+                 "provider": "llamaindex",
+             }
+
+     async def delete(self, kb_name: str) -> bool:
+         """
+         Delete knowledge base.
+
+         Args:
+             kb_name: Knowledge base name
+
+         Returns:
+             True if successful
+         """
+         import shutil
+
+         kb_dir = Path(self.kb_base_dir) / kb_name
+
+         if kb_dir.exists():
+             shutil.rmtree(kb_dir)
+             self.logger.info(f"Deleted KB '{kb_name}'")
+             return True
+
+         return False
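
A minimal usage sketch of LlamaIndexPipeline above, assuming the optional llama-index dependency is installed and embedding credentials are already configured through the package's embedding service (the get_embedding_client() used by CustomEmbedding); the paths and KB name are placeholders:

import asyncio

from src.services.rag.pipelines.llamaindex import LlamaIndexPipeline


async def main() -> None:
    pipeline = LlamaIndexPipeline(kb_base_dir="/path/to/kb")

    # Builds a VectorStoreIndex and persists it under <kb>/llamaindex_storage.
    ok = await pipeline.initialize("papers", ["intro.pdf", "notes.md"])
    if not ok:
        raise SystemExit("indexing failed")

    # Retrieval-only search: returns concatenated node text, no LLM call needed.
    result = await pipeline.search("key assumptions of the method", "papers", top_k=3)
    print(result["content"])


asyncio.run(main())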