realtimex-deeptutor 0.5.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realtimex_deeptutor/__init__.py +67 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
- src/__init__.py +40 -0
- src/agents/__init__.py +24 -0
- src/agents/base_agent.py +657 -0
- src/agents/chat/__init__.py +24 -0
- src/agents/chat/chat_agent.py +435 -0
- src/agents/chat/prompts/en/chat_agent.yaml +35 -0
- src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
- src/agents/chat/session_manager.py +311 -0
- src/agents/co_writer/__init__.py +0 -0
- src/agents/co_writer/edit_agent.py +260 -0
- src/agents/co_writer/narrator_agent.py +423 -0
- src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
- src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
- src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
- src/agents/guide/__init__.py +16 -0
- src/agents/guide/agents/__init__.py +11 -0
- src/agents/guide/agents/chat_agent.py +104 -0
- src/agents/guide/agents/interactive_agent.py +223 -0
- src/agents/guide/agents/locate_agent.py +149 -0
- src/agents/guide/agents/summary_agent.py +150 -0
- src/agents/guide/guide_manager.py +500 -0
- src/agents/guide/prompts/en/chat_agent.yaml +41 -0
- src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
- src/agents/guide/prompts/en/locate_agent.yaml +68 -0
- src/agents/guide/prompts/en/summary_agent.yaml +157 -0
- src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
- src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
- src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
- src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
- src/agents/ideagen/__init__.py +12 -0
- src/agents/ideagen/idea_generation_workflow.py +426 -0
- src/agents/ideagen/material_organizer_agent.py +173 -0
- src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
- src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
- src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
- src/agents/question/__init__.py +24 -0
- src/agents/question/agents/__init__.py +18 -0
- src/agents/question/agents/generate_agent.py +381 -0
- src/agents/question/agents/relevance_analyzer.py +207 -0
- src/agents/question/agents/retrieve_agent.py +239 -0
- src/agents/question/coordinator.py +718 -0
- src/agents/question/example.py +109 -0
- src/agents/question/prompts/en/coordinator.yaml +75 -0
- src/agents/question/prompts/en/generate_agent.yaml +77 -0
- src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
- src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
- src/agents/question/prompts/zh/coordinator.yaml +75 -0
- src/agents/question/prompts/zh/generate_agent.yaml +77 -0
- src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
- src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
- src/agents/research/agents/__init__.py +23 -0
- src/agents/research/agents/decompose_agent.py +507 -0
- src/agents/research/agents/manager_agent.py +228 -0
- src/agents/research/agents/note_agent.py +180 -0
- src/agents/research/agents/rephrase_agent.py +263 -0
- src/agents/research/agents/reporting_agent.py +1333 -0
- src/agents/research/agents/research_agent.py +714 -0
- src/agents/research/data_structures.py +451 -0
- src/agents/research/main.py +188 -0
- src/agents/research/prompts/en/decompose_agent.yaml +89 -0
- src/agents/research/prompts/en/manager_agent.yaml +24 -0
- src/agents/research/prompts/en/note_agent.yaml +121 -0
- src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/en/reporting_agent.yaml +380 -0
- src/agents/research/prompts/en/research_agent.yaml +173 -0
- src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
- src/agents/research/prompts/zh/manager_agent.yaml +24 -0
- src/agents/research/prompts/zh/note_agent.yaml +121 -0
- src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
- src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
- src/agents/research/prompts/zh/research_agent.yaml +173 -0
- src/agents/research/research_pipeline.py +1309 -0
- src/agents/research/utils/__init__.py +60 -0
- src/agents/research/utils/citation_manager.py +799 -0
- src/agents/research/utils/json_utils.py +98 -0
- src/agents/research/utils/token_tracker.py +297 -0
- src/agents/solve/__init__.py +80 -0
- src/agents/solve/analysis_loop/__init__.py +14 -0
- src/agents/solve/analysis_loop/investigate_agent.py +414 -0
- src/agents/solve/analysis_loop/note_agent.py +190 -0
- src/agents/solve/main_solver.py +862 -0
- src/agents/solve/memory/__init__.py +34 -0
- src/agents/solve/memory/citation_memory.py +353 -0
- src/agents/solve/memory/investigate_memory.py +226 -0
- src/agents/solve/memory/solve_memory.py +340 -0
- src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
- src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
- src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
- src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
- src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
- src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
- src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
- src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
- src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
- src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
- src/agents/solve/solve_loop/__init__.py +22 -0
- src/agents/solve/solve_loop/citation_manager.py +74 -0
- src/agents/solve/solve_loop/manager_agent.py +274 -0
- src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
- src/agents/solve/solve_loop/response_agent.py +301 -0
- src/agents/solve/solve_loop/solve_agent.py +325 -0
- src/agents/solve/solve_loop/tool_agent.py +470 -0
- src/agents/solve/utils/__init__.py +64 -0
- src/agents/solve/utils/config_validator.py +313 -0
- src/agents/solve/utils/display_manager.py +223 -0
- src/agents/solve/utils/error_handler.py +363 -0
- src/agents/solve/utils/json_utils.py +98 -0
- src/agents/solve/utils/performance_monitor.py +407 -0
- src/agents/solve/utils/token_tracker.py +541 -0
- src/api/__init__.py +0 -0
- src/api/main.py +240 -0
- src/api/routers/__init__.py +1 -0
- src/api/routers/agent_config.py +69 -0
- src/api/routers/chat.py +296 -0
- src/api/routers/co_writer.py +337 -0
- src/api/routers/config.py +627 -0
- src/api/routers/dashboard.py +18 -0
- src/api/routers/guide.py +337 -0
- src/api/routers/ideagen.py +436 -0
- src/api/routers/knowledge.py +821 -0
- src/api/routers/notebook.py +247 -0
- src/api/routers/question.py +537 -0
- src/api/routers/research.py +394 -0
- src/api/routers/settings.py +164 -0
- src/api/routers/solve.py +305 -0
- src/api/routers/system.py +252 -0
- src/api/run_server.py +61 -0
- src/api/utils/history.py +172 -0
- src/api/utils/log_interceptor.py +21 -0
- src/api/utils/notebook_manager.py +415 -0
- src/api/utils/progress_broadcaster.py +72 -0
- src/api/utils/task_id_manager.py +100 -0
- src/config/__init__.py +0 -0
- src/config/accessors.py +18 -0
- src/config/constants.py +34 -0
- src/config/defaults.py +18 -0
- src/config/schema.py +38 -0
- src/config/settings.py +50 -0
- src/core/errors.py +62 -0
- src/knowledge/__init__.py +23 -0
- src/knowledge/add_documents.py +606 -0
- src/knowledge/config.py +65 -0
- src/knowledge/example_add_documents.py +236 -0
- src/knowledge/extract_numbered_items.py +1039 -0
- src/knowledge/initializer.py +621 -0
- src/knowledge/kb.py +22 -0
- src/knowledge/manager.py +782 -0
- src/knowledge/progress_tracker.py +182 -0
- src/knowledge/start_kb.py +535 -0
- src/logging/__init__.py +103 -0
- src/logging/adapters/__init__.py +17 -0
- src/logging/adapters/lightrag.py +184 -0
- src/logging/adapters/llamaindex.py +141 -0
- src/logging/config.py +80 -0
- src/logging/handlers/__init__.py +20 -0
- src/logging/handlers/console.py +75 -0
- src/logging/handlers/file.py +201 -0
- src/logging/handlers/websocket.py +127 -0
- src/logging/logger.py +709 -0
- src/logging/stats/__init__.py +16 -0
- src/logging/stats/llm_stats.py +179 -0
- src/services/__init__.py +56 -0
- src/services/config/__init__.py +61 -0
- src/services/config/knowledge_base_config.py +210 -0
- src/services/config/loader.py +260 -0
- src/services/config/unified_config.py +603 -0
- src/services/embedding/__init__.py +45 -0
- src/services/embedding/adapters/__init__.py +22 -0
- src/services/embedding/adapters/base.py +106 -0
- src/services/embedding/adapters/cohere.py +127 -0
- src/services/embedding/adapters/jina.py +99 -0
- src/services/embedding/adapters/ollama.py +116 -0
- src/services/embedding/adapters/openai_compatible.py +96 -0
- src/services/embedding/client.py +159 -0
- src/services/embedding/config.py +156 -0
- src/services/embedding/provider.py +119 -0
- src/services/llm/__init__.py +152 -0
- src/services/llm/capabilities.py +313 -0
- src/services/llm/client.py +302 -0
- src/services/llm/cloud_provider.py +530 -0
- src/services/llm/config.py +200 -0
- src/services/llm/error_mapping.py +103 -0
- src/services/llm/exceptions.py +152 -0
- src/services/llm/factory.py +450 -0
- src/services/llm/local_provider.py +347 -0
- src/services/llm/providers/anthropic.py +95 -0
- src/services/llm/providers/base_provider.py +93 -0
- src/services/llm/providers/open_ai.py +83 -0
- src/services/llm/registry.py +71 -0
- src/services/llm/telemetry.py +40 -0
- src/services/llm/types.py +27 -0
- src/services/llm/utils.py +333 -0
- src/services/prompt/__init__.py +25 -0
- src/services/prompt/manager.py +206 -0
- src/services/rag/__init__.py +64 -0
- src/services/rag/components/__init__.py +29 -0
- src/services/rag/components/base.py +59 -0
- src/services/rag/components/chunkers/__init__.py +18 -0
- src/services/rag/components/chunkers/base.py +34 -0
- src/services/rag/components/chunkers/fixed.py +71 -0
- src/services/rag/components/chunkers/numbered_item.py +94 -0
- src/services/rag/components/chunkers/semantic.py +97 -0
- src/services/rag/components/embedders/__init__.py +14 -0
- src/services/rag/components/embedders/base.py +32 -0
- src/services/rag/components/embedders/openai.py +63 -0
- src/services/rag/components/indexers/__init__.py +18 -0
- src/services/rag/components/indexers/base.py +35 -0
- src/services/rag/components/indexers/graph.py +172 -0
- src/services/rag/components/indexers/lightrag.py +156 -0
- src/services/rag/components/indexers/vector.py +146 -0
- src/services/rag/components/parsers/__init__.py +18 -0
- src/services/rag/components/parsers/base.py +35 -0
- src/services/rag/components/parsers/markdown.py +52 -0
- src/services/rag/components/parsers/pdf.py +115 -0
- src/services/rag/components/parsers/text.py +86 -0
- src/services/rag/components/retrievers/__init__.py +18 -0
- src/services/rag/components/retrievers/base.py +34 -0
- src/services/rag/components/retrievers/dense.py +200 -0
- src/services/rag/components/retrievers/hybrid.py +164 -0
- src/services/rag/components/retrievers/lightrag.py +169 -0
- src/services/rag/components/routing.py +286 -0
- src/services/rag/factory.py +234 -0
- src/services/rag/pipeline.py +215 -0
- src/services/rag/pipelines/__init__.py +32 -0
- src/services/rag/pipelines/academic.py +44 -0
- src/services/rag/pipelines/lightrag.py +43 -0
- src/services/rag/pipelines/llamaindex.py +313 -0
- src/services/rag/pipelines/raganything.py +384 -0
- src/services/rag/service.py +244 -0
- src/services/rag/types.py +73 -0
- src/services/search/__init__.py +284 -0
- src/services/search/base.py +87 -0
- src/services/search/consolidation.py +398 -0
- src/services/search/providers/__init__.py +128 -0
- src/services/search/providers/baidu.py +188 -0
- src/services/search/providers/exa.py +194 -0
- src/services/search/providers/jina.py +161 -0
- src/services/search/providers/perplexity.py +153 -0
- src/services/search/providers/serper.py +209 -0
- src/services/search/providers/tavily.py +161 -0
- src/services/search/types.py +114 -0
- src/services/setup/__init__.py +34 -0
- src/services/setup/init.py +285 -0
- src/services/tts/__init__.py +16 -0
- src/services/tts/config.py +99 -0
- src/tools/__init__.py +91 -0
- src/tools/code_executor.py +536 -0
- src/tools/paper_search_tool.py +171 -0
- src/tools/query_item_tool.py +310 -0
- src/tools/question/__init__.py +15 -0
- src/tools/question/exam_mimic.py +616 -0
- src/tools/question/pdf_parser.py +211 -0
- src/tools/question/question_extractor.py +397 -0
- src/tools/rag_tool.py +173 -0
- src/tools/tex_chunker.py +339 -0
- src/tools/tex_downloader.py +253 -0
- src/tools/web_search.py +71 -0
- src/utils/config_manager.py +206 -0
- src/utils/document_validator.py +168 -0
- src/utils/error_rate_tracker.py +111 -0
- src/utils/error_utils.py +82 -0
- src/utils/json_parser.py +110 -0
- src/utils/network/circuit_breaker.py +79 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RAG Pipeline
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
Composable RAG pipeline with fluent API.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import shutil
|
|
11
|
+
from typing import Any, Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
from src.logging import get_logger
|
|
14
|
+
|
|
15
|
+
from .components.base import Component
|
|
16
|
+
from .components.routing import FileTypeRouter
|
|
17
|
+
from .types import Document
|
|
18
|
+
|
|
19
|
+
# Default knowledge base directory: <repo_root>/data/knowledge_bases.
# Resolved relative to this file — four `.parent` hops climb out of
# src/services/rag/ to the repository root. NOTE(review): this assumes the
# package layout never changes depth; verify if this file is moved.
DEFAULT_KB_BASE_DIR = str(
    Path(__file__).resolve().parent.parent.parent.parent / "data" / "knowledge_bases"
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RAGPipeline:
    """
    Composable RAG pipeline.

    Build custom RAG pipelines using a fluent API:

        pipeline = (
            RAGPipeline("custom", kb_base_dir="/path/to/kb")
            .parser(PDFParser())
            .chunker(SemanticChunker())
            .embedder(OpenAIEmbedder())
            .indexer(GraphIndexer())
            .retriever(HybridRetriever())
        )

        await pipeline.initialize("kb_name", ["doc1.pdf"])
        result = await pipeline.search("query", "kb_name")
    """

    def __init__(self, name: str = "default", kb_base_dir: Optional[str] = None):
        """
        Initialize RAG pipeline.

        Args:
            name: Pipeline name, used for logger naming only.
            kb_base_dir: Base directory for knowledge bases; falls back to
                DEFAULT_KB_BASE_DIR when falsy.
        """
        self.name = name
        self.kb_base_dir = kb_base_dir or DEFAULT_KB_BASE_DIR
        self.logger = get_logger(f"Pipeline:{name}")
        # Single parser/embedder/retriever; chunkers and indexers may stack.
        self._parser: Optional[Component] = None
        self._chunkers: List[Component] = []
        self._embedder: Optional[Component] = None
        self._indexers: List[Component] = []
        self._retriever: Optional[Component] = None

    # Fluent API methods — each returns self so calls can be chained.
    def parser(self, p: Component) -> "RAGPipeline":
        """Set the document parser."""
        self._parser = p
        return self

    def chunker(self, c: Component) -> "RAGPipeline":
        """Add a chunker to the pipeline (chunkers run in insertion order)."""
        self._chunkers.append(c)
        return self

    def embedder(self, e: Component) -> "RAGPipeline":
        """Set the embedder."""
        self._embedder = e
        return self

    def indexer(self, i: Component) -> "RAGPipeline":
        """Add an indexer to the pipeline (indexers run concurrently)."""
        self._indexers.append(i)
        return self

    def retriever(self, r: Component) -> "RAGPipeline":
        """Set the retriever."""
        self._retriever = r
        return self

    async def initialize(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
        """
        Run the full initialization pipeline: parse -> chunk -> embed -> index.

        Uses FileTypeRouter to classify files and route them appropriately:
        - PDF/complex files -> configured parser (e.g., PDFParser)
        - Text files -> direct text reading (fast path)

        Args:
            kb_name: Knowledge base name
            file_paths: List of file paths to process
            **kwargs: Additional arguments passed to components

        Returns:
            True if successful

        Raises:
            ValueError: If complex files are present but no parser is set.
        """
        self.logger.info(f"Initializing KB '{kb_name}' with {len(file_paths)} files")

        # Stage 1: Parse documents with file type routing
        self.logger.info("Stage 1: Parsing documents...")

        # Classify files by type
        classification = FileTypeRouter.classify_files(file_paths)
        self.logger.info(
            f"File classification: {len(classification.needs_mineru)} complex, "
            f"{len(classification.text_files)} text, "
            f"{len(classification.unsupported)} unsupported"
        )

        # FIX: a parser is only needed for complex files. The previous check
        # rejected text-only batches unconditionally, even though the fast
        # path below never touches the parser.
        if classification.needs_mineru and not self._parser:
            raise ValueError("No parser configured. Use .parser() to set one")

        documents = []

        # Process complex files (PDF, etc.) with configured parser
        for path in classification.needs_mineru:
            self.logger.info(f"Parsing (parser): {Path(path).name}")
            doc = await self._parser.process(path, **kwargs)
            documents.append(doc)

        # Process text files directly (fast path)
        for path in classification.text_files:
            self.logger.info(f"Parsing (direct text): {Path(path).name}")
            content = await FileTypeRouter.read_text_file(path)
            doc = Document(
                content=content,
                file_path=str(path),
                metadata={
                    "filename": Path(path).name,
                    "parser": "direct_text",
                },
            )
            documents.append(doc)

        # Log unsupported files
        for path in classification.unsupported:
            self.logger.warning(f"Skipped unsupported file: {Path(path).name}")

        # Stage 2: Chunk (sequential - later chunkers see earlier results)
        if self._chunkers:
            self.logger.info("Stage 2: Chunking...")
            for chunker in self._chunkers:
                for doc in documents:
                    new_chunks = await chunker.process(doc, **kwargs)
                    doc.chunks.extend(new_chunks)

        # Stage 3: Embed
        if self._embedder:
            self.logger.info("Stage 3: Embedding...")
            for doc in documents:
                await self._embedder.process(doc, **kwargs)

        # Stage 4: Index (indexers are independent, so run them in parallel)
        if self._indexers:
            self.logger.info("Stage 4: Indexing...")
            await asyncio.gather(
                *[indexer.process(kb_name, documents, **kwargs) for indexer in self._indexers]
            )

        self.logger.info(f"KB '{kb_name}' initialized successfully")
        return True

    async def search(self, query: str, kb_name: str, **kwargs) -> Dict[str, Any]:
        """
        Search the knowledge base.

        Args:
            query: Search query
            kb_name: Knowledge base name
            **kwargs: Additional arguments passed to retriever

        Returns:
            Search results dictionary

        Raises:
            ValueError: If no retriever has been configured.
        """
        if not self._retriever:
            raise ValueError("No retriever configured. Use .retriever() to set one")

        return await self._retriever.process(query, kb_name=kb_name, **kwargs)

    async def delete(self, kb_name: str) -> bool:
        """
        Delete a knowledge base directory from disk.

        Args:
            kb_name: Knowledge base name

        Returns:
            True if the directory existed and was removed, False otherwise.

        Raises:
            ValueError: If kb_name is empty, a dot path, contains path
                separators, or resolves outside the KB base directory.
        """
        # Validate kb_name to prevent path traversal
        if not kb_name or kb_name in (".", "..") or "/" in kb_name or "\\" in kb_name:
            raise ValueError(f"Invalid knowledge base name: {kb_name}")

        self.logger.info(f"Deleting KB '{kb_name}'")

        kb_dir = Path(self.kb_base_dir) / kb_name
        # Defense in depth: even after the name check, ensure the resolved
        # path (symlinks included) stays inside the base directory.
        kb_dir = kb_dir.resolve()
        base_dir = Path(self.kb_base_dir).resolve()
        if not kb_dir.is_relative_to(base_dir):
            raise ValueError(f"Knowledge base path outside allowed directory: {kb_name}")

        if kb_dir.exists():
            shutil.rmtree(kb_dir)
            self.logger.info(f"Deleted KB directory: {kb_dir}")
            return True

        self.logger.warning(f"KB directory not found: {kb_dir}")
        return False
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
Pre-configured Pipelines
========================

Ready-to-use RAG pipelines for common use cases.

LightRAG and Academic pipelines are always available.
LlamaIndex and RAGAnything require optional dependencies.
"""

# Always available pipelines
from .academic import AcademicPipeline
from .lightrag import LightRAGPipeline

# Public API; optional pipelines are appended below only when their
# dependencies import cleanly.
__all__ = [
    "LightRAGPipeline",
    "AcademicPipeline",
]

# Optional pipelines - import only if dependencies are available.
# On failure the name is bound to None (rather than left undefined) so
# callers can feature-detect with `pipelines.LlamaIndexPipeline is None`.
try:
    from .llamaindex import LlamaIndexPipeline

    __all__.append("LlamaIndexPipeline")
except ImportError:
    LlamaIndexPipeline = None  # type: ignore

try:
    from .raganything import RAGAnythingPipeline

    __all__.append("RAGAnythingPipeline")
except ImportError:
    RAGAnythingPipeline = None  # type: ignore
|
|
32
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Academic Pipeline
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Pipeline optimized for academic documents with numbered item extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from ..components.chunkers import NumberedItemExtractor, SemanticChunker
|
|
11
|
+
from ..components.embedders import OpenAIEmbedder
|
|
12
|
+
from ..components.indexers import GraphIndexer
|
|
13
|
+
from ..components.parsers import TextParser
|
|
14
|
+
from ..components.retrievers import HybridRetriever
|
|
15
|
+
from ..pipeline import RAGPipeline
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def AcademicPipeline(kb_base_dir: Optional[str] = None) -> RAGPipeline:
    """
    Build a pipeline tuned for academic documents.

    Components wired in:
    - TextParser for document parsing (supports txt, md files)
    - SemanticChunker for text chunking
    - NumberedItemExtractor for extracting definitions, theorems, etc.
    - OpenAIEmbedder for embedding generation
    - GraphIndexer for knowledge graph indexing
    - HybridRetriever for hybrid retrieval

    Args:
        kb_base_dir: Base directory for knowledge bases

    Returns:
        Configured RAGPipeline
    """
    # The fluent setters mutate and return the same pipeline object, so
    # sequential calls are equivalent to the chained form.
    academic = RAGPipeline("academic", kb_base_dir=kb_base_dir)
    academic.parser(TextParser())
    academic.chunker(SemanticChunker())
    academic.chunker(NumberedItemExtractor())
    academic.embedder(OpenAIEmbedder())
    academic.indexer(GraphIndexer(kb_base_dir=kb_base_dir))
    academic.retriever(HybridRetriever(kb_base_dir=kb_base_dir))
    return academic
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LightRAG Pipeline
|
|
3
|
+
=================
|
|
4
|
+
|
|
5
|
+
Pure LightRAG pipeline (text-only, no multimodal processing).
|
|
6
|
+
Faster than RAGAnything for text-heavy documents.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from ..components.indexers import LightRAGIndexer
|
|
12
|
+
from ..components.parsers import PDFParser
|
|
13
|
+
from ..components.retrievers import LightRAGRetriever
|
|
14
|
+
from ..pipeline import RAGPipeline
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def LightRAGPipeline(kb_base_dir: Optional[str] = None) -> RAGPipeline:
    """
    Build a pure LightRAG pipeline (text-only, no multimodal processing).

    Components wired in:
    - PDFParser for document parsing (extracts raw text from PDF/txt/md)
    - LightRAGIndexer for knowledge graph indexing (text-only, fast);
      LightRAG handles chunking, entity extraction, and embedding
      internally, so no separate chunker/embedder is registered.
    - LightRAGRetriever for retrieval (uses LightRAG.aquery() directly)

    Performance: Medium speed (~10-15s per document)
    Use for: Business docs, text-heavy PDFs, when you need knowledge graph

    Args:
        kb_base_dir: Base directory for knowledge bases

    Returns:
        Configured RAGPipeline
    """
    # Fluent setters mutate in place and return the pipeline, so the
    # step-by-step form below matches the chained original exactly.
    pipeline = RAGPipeline("lightrag", kb_base_dir=kb_base_dir)
    pipeline.parser(PDFParser())
    pipeline.indexer(LightRAGIndexer(kb_base_dir=kb_base_dir))
    pipeline.retriever(LightRAGRetriever(kb_base_dir=kb_base_dir))
    return pipeline
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LlamaIndex Pipeline
|
|
3
|
+
===================
|
|
4
|
+
|
|
5
|
+
True LlamaIndex integration using official llama-index library.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from llama_index.core import (
|
|
13
|
+
Document,
|
|
14
|
+
Settings,
|
|
15
|
+
StorageContext,
|
|
16
|
+
VectorStoreIndex,
|
|
17
|
+
load_index_from_storage,
|
|
18
|
+
)
|
|
19
|
+
from llama_index.core.base.embeddings.base import BaseEmbedding
|
|
20
|
+
from llama_index.core.bridge.pydantic import PrivateAttr
|
|
21
|
+
|
|
22
|
+
from src.logging import get_logger
|
|
23
|
+
from src.services.embedding import get_embedding_client, get_embedding_config
|
|
24
|
+
|
|
25
|
+
# Default knowledge base directory: <repo_root>/data/knowledge_bases.
# Five `.parent` hops climb from src/services/rag/pipelines/ to the repo
# root (one deeper than the copy in pipeline.py, since this file sits in
# the pipelines/ subpackage).
DEFAULT_KB_BASE_DIR = str(
    Path(__file__).resolve().parent.parent.parent.parent.parent / "data" / "knowledge_bases"
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CustomEmbedding(BaseEmbedding):
    """
    Custom embedding adapter for OpenAI-compatible APIs.

    Works with any OpenAI-compatible endpoint including:
    - Google Gemini (text-embedding-004)
    - OpenAI (text-embedding-ada-002, text-embedding-3-*)
    - Azure OpenAI
    - Local models with OpenAI-compatible API
    """

    # Project embedding client; PrivateAttr keeps it out of the pydantic
    # model fields that BaseEmbedding declares.
    _client: Any = PrivateAttr()

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._client = get_embedding_client()

    @classmethod
    def class_name(cls) -> str:
        return "custom_embedding"

    @staticmethod
    def _run_sync(coro):
        """Drive *coro* to completion from synchronous code.

        LlamaIndex's sync API may invoke us while an asyncio event loop is
        already running; nest_asyncio patches the loop so asyncio.run()
        still works in that case. Extracted here to remove the boilerplate
        previously duplicated in both sync wrappers.
        """
        # Use nest_asyncio to allow nested event loops
        import nest_asyncio

        nest_asyncio.apply()
        return asyncio.run(coro)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Get embedding for a query."""
        embeddings = await self._client.embed([query])
        return embeddings[0]

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Get embedding for a text."""
        embeddings = await self._client.embed([text])
        return embeddings[0]

    def _get_query_embedding(self, query: str) -> List[float]:
        """Sync version - called by LlamaIndex sync API."""
        return self._run_sync(self._aget_query_embedding(query))

    def _get_text_embedding(self, text: str) -> List[float]:
        """Sync version - called by LlamaIndex sync API."""
        return self._run_sync(self._aget_text_embedding(text))

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for multiple texts in one batched client call."""
        return await self._client.embed(texts)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class LlamaIndexPipeline:
|
|
84
|
+
"""
|
|
85
|
+
True LlamaIndex pipeline using official llama-index library.
|
|
86
|
+
|
|
87
|
+
Uses LlamaIndex's native components:
|
|
88
|
+
- VectorStoreIndex for indexing
|
|
89
|
+
- CustomEmbedding for OpenAI-compatible embeddings
|
|
90
|
+
- SentenceSplitter for chunking
|
|
91
|
+
- StorageContext for persistence
|
|
92
|
+
"""
|
|
93
|
+
|
|
94
|
+
def __init__(self, kb_base_dir: Optional[str] = None):
|
|
95
|
+
"""
|
|
96
|
+
Initialize LlamaIndex pipeline.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
kb_base_dir: Base directory for knowledge bases
|
|
100
|
+
"""
|
|
101
|
+
self.logger = get_logger("LlamaIndexPipeline")
|
|
102
|
+
self.kb_base_dir = kb_base_dir or DEFAULT_KB_BASE_DIR
|
|
103
|
+
self._configure_settings()
|
|
104
|
+
|
|
105
|
+
    def _configure_settings(self):
        """Configure LlamaIndex global settings.

        Mutates the process-wide `Settings` singleton (embed model and
        chunking parameters), so every LlamaIndex index/query in this
        process is affected — not just this pipeline instance.
        """
        # Get embedding config (used below only for the log message).
        embedding_cfg = get_embedding_config()

        # Configure custom embedding that works with any OpenAI-compatible API
        Settings.embed_model = CustomEmbedding()

        # Configure chunking
        Settings.chunk_size = 512
        Settings.chunk_overlap = 50

        self.logger.info(
            f"LlamaIndex configured: embedding={embedding_cfg.model} "
            f"({embedding_cfg.dim}D, {embedding_cfg.binding}), chunk_size=512"
        )
|
|
121
|
+
|
|
122
|
+
async def initialize(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
|
|
123
|
+
"""
|
|
124
|
+
Initialize KB using real LlamaIndex components.
|
|
125
|
+
|
|
126
|
+
Args:
|
|
127
|
+
kb_name: Knowledge base name
|
|
128
|
+
file_paths: List of file paths to process
|
|
129
|
+
**kwargs: Additional arguments
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
True if successful
|
|
133
|
+
"""
|
|
134
|
+
self.logger.info(
|
|
135
|
+
f"Initializing KB '{kb_name}' with {len(file_paths)} files using LlamaIndex"
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
139
|
+
storage_dir = kb_dir / "llamaindex_storage"
|
|
140
|
+
storage_dir.mkdir(parents=True, exist_ok=True)
|
|
141
|
+
|
|
142
|
+
try:
|
|
143
|
+
# Parse documents
|
|
144
|
+
documents = []
|
|
145
|
+
for file_path in file_paths:
|
|
146
|
+
file_path = Path(file_path)
|
|
147
|
+
self.logger.info(f"Parsing: {file_path.name}")
|
|
148
|
+
|
|
149
|
+
# Extract text based on file type
|
|
150
|
+
if file_path.suffix.lower() == ".pdf":
|
|
151
|
+
text = self._extract_pdf_text(file_path)
|
|
152
|
+
else:
|
|
153
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
154
|
+
text = f.read()
|
|
155
|
+
|
|
156
|
+
if text.strip():
|
|
157
|
+
doc = Document(
|
|
158
|
+
text=text,
|
|
159
|
+
metadata={
|
|
160
|
+
"file_name": file_path.name,
|
|
161
|
+
"file_path": str(file_path),
|
|
162
|
+
},
|
|
163
|
+
)
|
|
164
|
+
documents.append(doc)
|
|
165
|
+
self.logger.info(f"Loaded: {file_path.name} ({len(text)} chars)")
|
|
166
|
+
else:
|
|
167
|
+
self.logger.warning(f"Skipped empty document: {file_path.name}")
|
|
168
|
+
|
|
169
|
+
if not documents:
|
|
170
|
+
self.logger.error("No valid documents found")
|
|
171
|
+
return False
|
|
172
|
+
|
|
173
|
+
# Create index with LlamaIndex (run sync code in thread pool to avoid blocking)
|
|
174
|
+
self.logger.info(f"Creating VectorStoreIndex with {len(documents)} documents...")
|
|
175
|
+
|
|
176
|
+
# Run sync LlamaIndex code in thread pool to avoid blocking async event loop
|
|
177
|
+
loop = asyncio.get_event_loop()
|
|
178
|
+
index = await loop.run_in_executor(
|
|
179
|
+
None, # Use default ThreadPoolExecutor
|
|
180
|
+
lambda: VectorStoreIndex.from_documents(documents, show_progress=True),
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Persist index
|
|
184
|
+
index.storage_context.persist(persist_dir=str(storage_dir))
|
|
185
|
+
self.logger.info(f"Index persisted to {storage_dir}")
|
|
186
|
+
|
|
187
|
+
self.logger.info(f"KB '{kb_name}' initialized successfully with LlamaIndex")
|
|
188
|
+
return True
|
|
189
|
+
|
|
190
|
+
except Exception as e:
|
|
191
|
+
self.logger.error(f"Failed to initialize KB: {e}")
|
|
192
|
+
import traceback
|
|
193
|
+
|
|
194
|
+
self.logger.error(traceback.format_exc())
|
|
195
|
+
return False
|
|
196
|
+
|
|
197
|
+
def _extract_pdf_text(self, file_path: Path) -> str:
|
|
198
|
+
"""Extract text from PDF using PyMuPDF."""
|
|
199
|
+
try:
|
|
200
|
+
import fitz # PyMuPDF
|
|
201
|
+
|
|
202
|
+
doc = fitz.open(file_path)
|
|
203
|
+
texts = []
|
|
204
|
+
for page in doc:
|
|
205
|
+
texts.append(page.get_text())
|
|
206
|
+
doc.close()
|
|
207
|
+
return "\n\n".join(texts)
|
|
208
|
+
except ImportError:
|
|
209
|
+
self.logger.warning("PyMuPDF not installed. Cannot extract PDF text.")
|
|
210
|
+
return ""
|
|
211
|
+
except Exception as e:
|
|
212
|
+
self.logger.error(f"Failed to extract PDF text: {e}")
|
|
213
|
+
return ""
|
|
214
|
+
|
|
215
|
+
async def search(
|
|
216
|
+
self,
|
|
217
|
+
query: str,
|
|
218
|
+
kb_name: str,
|
|
219
|
+
mode: str = "hybrid",
|
|
220
|
+
**kwargs,
|
|
221
|
+
) -> Dict[str, Any]:
|
|
222
|
+
"""
|
|
223
|
+
Search using LlamaIndex query engine.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
query: Search query
|
|
227
|
+
kb_name: Knowledge base name
|
|
228
|
+
mode: Search mode (ignored, LlamaIndex uses similarity)
|
|
229
|
+
**kwargs: Additional arguments (top_k, etc.)
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
Search results dictionary
|
|
233
|
+
"""
|
|
234
|
+
self.logger.info(f"Searching KB '{kb_name}' with query: {query[:50]}...")
|
|
235
|
+
|
|
236
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
237
|
+
storage_dir = kb_dir / "llamaindex_storage"
|
|
238
|
+
|
|
239
|
+
if not storage_dir.exists():
|
|
240
|
+
self.logger.warning(f"No LlamaIndex storage found at {storage_dir}")
|
|
241
|
+
return {
|
|
242
|
+
"query": query,
|
|
243
|
+
"answer": "No documents indexed. Please upload documents first.",
|
|
244
|
+
"content": "",
|
|
245
|
+
"mode": mode,
|
|
246
|
+
"provider": "llamaindex",
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
try:
|
|
250
|
+
# Load index from storage (run in thread pool)
|
|
251
|
+
loop = asyncio.get_event_loop()
|
|
252
|
+
|
|
253
|
+
def load_and_retrieve():
|
|
254
|
+
storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
|
|
255
|
+
index = load_index_from_storage(storage_context)
|
|
256
|
+
top_k = kwargs.get("top_k", 5)
|
|
257
|
+
|
|
258
|
+
# Use retriever instead of query_engine to avoid LLM requirement
|
|
259
|
+
retriever = index.as_retriever(similarity_top_k=top_k)
|
|
260
|
+
nodes = retriever.retrieve(query)
|
|
261
|
+
return nodes
|
|
262
|
+
|
|
263
|
+
# Execute retrieval in thread pool to avoid blocking
|
|
264
|
+
nodes = await loop.run_in_executor(None, load_and_retrieve)
|
|
265
|
+
|
|
266
|
+
# Extract text from retrieved nodes
|
|
267
|
+
context_parts = []
|
|
268
|
+
for node in nodes:
|
|
269
|
+
context_parts.append(node.node.text)
|
|
270
|
+
|
|
271
|
+
content = "\n\n".join(context_parts) if context_parts else ""
|
|
272
|
+
|
|
273
|
+
return {
|
|
274
|
+
"query": query,
|
|
275
|
+
"answer": content, # Return context for ChatAgent to use
|
|
276
|
+
"content": content,
|
|
277
|
+
"mode": mode,
|
|
278
|
+
"provider": "llamaindex",
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
except Exception as e:
|
|
282
|
+
self.logger.error(f"Search failed: {e}")
|
|
283
|
+
import traceback
|
|
284
|
+
|
|
285
|
+
self.logger.error(traceback.format_exc())
|
|
286
|
+
return {
|
|
287
|
+
"query": query,
|
|
288
|
+
"answer": f"Search failed: {str(e)}",
|
|
289
|
+
"content": "",
|
|
290
|
+
"mode": mode,
|
|
291
|
+
"provider": "llamaindex",
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
async def delete(self, kb_name: str) -> bool:
|
|
295
|
+
"""
|
|
296
|
+
Delete knowledge base.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
kb_name: Knowledge base name
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
True if successful
|
|
303
|
+
"""
|
|
304
|
+
import shutil
|
|
305
|
+
|
|
306
|
+
kb_dir = Path(self.kb_base_dir) / kb_name
|
|
307
|
+
|
|
308
|
+
if kb_dir.exists():
|
|
309
|
+
shutil.rmtree(kb_dir)
|
|
310
|
+
self.logger.info(f"Deleted KB '{kb_name}'")
|
|
311
|
+
return True
|
|
312
|
+
|
|
313
|
+
return False
|