realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
- realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/__init__.py +2 -2
- src/agents/ideagen/material_organizer_agent.py +2 -0
- src/agents/solve/__init__.py +6 -0
- src/agents/solve/main_solver.py +9 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
- src/agents/solve/session_manager.py +345 -0
- src/api/main.py +14 -0
- src/api/routers/chat.py +3 -3
- src/api/routers/co_writer.py +12 -7
- src/api/routers/config.py +1 -0
- src/api/routers/guide.py +3 -1
- src/api/routers/ideagen.py +7 -0
- src/api/routers/knowledge.py +64 -12
- src/api/routers/question.py +2 -0
- src/api/routers/realtimex.py +137 -0
- src/api/routers/research.py +9 -0
- src/api/routers/solve.py +120 -2
- src/cli/__init__.py +13 -0
- src/cli/start.py +209 -0
- src/config/constants.py +11 -9
- src/knowledge/add_documents.py +453 -213
- src/knowledge/extract_numbered_items.py +9 -10
- src/knowledge/initializer.py +102 -101
- src/knowledge/manager.py +251 -74
- src/knowledge/progress_tracker.py +43 -2
- src/knowledge/start_kb.py +11 -2
- src/logging/__init__.py +5 -0
- src/logging/adapters/__init__.py +1 -0
- src/logging/adapters/lightrag.py +25 -18
- src/logging/adapters/llamaindex.py +1 -0
- src/logging/config.py +30 -27
- src/logging/handlers/__init__.py +1 -0
- src/logging/handlers/console.py +7 -50
- src/logging/handlers/file.py +5 -20
- src/logging/handlers/websocket.py +23 -19
- src/logging/logger.py +161 -126
- src/logging/stats/__init__.py +1 -0
- src/logging/stats/llm_stats.py +37 -17
- src/services/__init__.py +17 -1
- src/services/config/__init__.py +1 -0
- src/services/config/knowledge_base_config.py +1 -0
- src/services/config/loader.py +1 -1
- src/services/config/unified_config.py +211 -4
- src/services/embedding/__init__.py +1 -0
- src/services/embedding/adapters/__init__.py +3 -0
- src/services/embedding/adapters/base.py +1 -0
- src/services/embedding/adapters/cohere.py +1 -0
- src/services/embedding/adapters/jina.py +1 -0
- src/services/embedding/adapters/ollama.py +1 -0
- src/services/embedding/adapters/openai_compatible.py +1 -0
- src/services/embedding/adapters/realtimex.py +125 -0
- src/services/embedding/client.py +27 -0
- src/services/embedding/config.py +3 -0
- src/services/embedding/provider.py +1 -0
- src/services/llm/__init__.py +17 -3
- src/services/llm/capabilities.py +47 -0
- src/services/llm/client.py +32 -0
- src/services/llm/cloud_provider.py +21 -4
- src/services/llm/config.py +36 -2
- src/services/llm/error_mapping.py +1 -0
- src/services/llm/exceptions.py +30 -0
- src/services/llm/factory.py +55 -16
- src/services/llm/local_provider.py +1 -0
- src/services/llm/providers/anthropic.py +1 -0
- src/services/llm/providers/base_provider.py +1 -0
- src/services/llm/providers/open_ai.py +1 -0
- src/services/llm/realtimex_provider.py +240 -0
- src/services/llm/registry.py +1 -0
- src/services/llm/telemetry.py +1 -0
- src/services/llm/types.py +1 -0
- src/services/llm/utils.py +1 -0
- src/services/prompt/__init__.py +1 -0
- src/services/prompt/manager.py +3 -2
- src/services/rag/__init__.py +27 -5
- src/services/rag/components/__init__.py +1 -0
- src/services/rag/components/base.py +1 -0
- src/services/rag/components/chunkers/__init__.py +1 -0
- src/services/rag/components/chunkers/base.py +1 -0
- src/services/rag/components/chunkers/fixed.py +1 -0
- src/services/rag/components/chunkers/numbered_item.py +1 -0
- src/services/rag/components/chunkers/semantic.py +1 -0
- src/services/rag/components/embedders/__init__.py +1 -0
- src/services/rag/components/embedders/base.py +1 -0
- src/services/rag/components/embedders/openai.py +1 -0
- src/services/rag/components/indexers/__init__.py +1 -0
- src/services/rag/components/indexers/base.py +1 -0
- src/services/rag/components/indexers/graph.py +5 -44
- src/services/rag/components/indexers/lightrag.py +5 -44
- src/services/rag/components/indexers/vector.py +1 -0
- src/services/rag/components/parsers/__init__.py +1 -0
- src/services/rag/components/parsers/base.py +1 -0
- src/services/rag/components/parsers/markdown.py +1 -0
- src/services/rag/components/parsers/pdf.py +1 -0
- src/services/rag/components/parsers/text.py +1 -0
- src/services/rag/components/retrievers/__init__.py +1 -0
- src/services/rag/components/retrievers/base.py +1 -0
- src/services/rag/components/retrievers/dense.py +1 -0
- src/services/rag/components/retrievers/hybrid.py +5 -44
- src/services/rag/components/retrievers/lightrag.py +5 -44
- src/services/rag/components/routing.py +48 -0
- src/services/rag/factory.py +112 -46
- src/services/rag/pipeline.py +1 -0
- src/services/rag/pipelines/__init__.py +27 -18
- src/services/rag/pipelines/lightrag.py +1 -0
- src/services/rag/pipelines/llamaindex.py +99 -0
- src/services/rag/pipelines/raganything.py +67 -100
- src/services/rag/pipelines/raganything_docling.py +368 -0
- src/services/rag/service.py +5 -12
- src/services/rag/types.py +1 -0
- src/services/rag/utils/__init__.py +17 -0
- src/services/rag/utils/image_migration.py +279 -0
- src/services/search/__init__.py +1 -0
- src/services/search/base.py +1 -0
- src/services/search/consolidation.py +1 -0
- src/services/search/providers/__init__.py +1 -0
- src/services/search/providers/baidu.py +1 -0
- src/services/search/providers/exa.py +1 -0
- src/services/search/providers/jina.py +1 -0
- src/services/search/providers/perplexity.py +1 -0
- src/services/search/providers/serper.py +1 -0
- src/services/search/providers/tavily.py +1 -0
- src/services/search/types.py +1 -0
- src/services/settings/__init__.py +1 -0
- src/services/settings/interface_settings.py +78 -0
- src/services/setup/__init__.py +1 -0
- src/services/tts/__init__.py +1 -0
- src/services/tts/config.py +1 -0
- src/utils/realtimex.py +284 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
- src/services/rag/pipelines/academic.py +0 -44
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0

src/services/rag/components/retrievers/lightrag.py
CHANGED

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 LightRAG Retriever
 ==================
@@ -52,57 +53,17 @@ class LightRAGRetriever(BaseComponent):
 
         try:
             from lightrag import LightRAG
-            from openai import AsyncOpenAI
 
             from src.services.embedding import get_embedding_client
             from src.services.llm import get_llm_client
 
+            # Use unified LLM client from src/services/llm
             llm_client = get_llm_client()
             embed_client = get_embedding_client()
 
-            #
-
-
-                base_url=llm_client.config.base_url,
-            )
-
-            # LLM function using services (ASYNC - LightRAG expects async functions)
-            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
-                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
-                if history_messages is None:
-                    history_messages = []
-
-                # Build messages
-                messages = []
-                if system_prompt:
-                    messages.append({"role": "system", "content": system_prompt})
-                messages.extend(history_messages)
-                messages.append({"role": "user", "content": prompt})
-
-                # Whitelist only valid OpenAI parameters
-                valid_params = {
-                    "temperature",
-                    "top_p",
-                    "n",
-                    "stream",
-                    "stop",
-                    "max_tokens",
-                    "presence_penalty",
-                    "frequency_penalty",
-                    "logit_bias",
-                    "user",
-                    "seed",
-                }
-                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
-
-                # Call OpenAI API directly (async)
-                response = await openai_client.chat.completions.create(
-                    model=llm_client.config.model,
-                    messages=messages,
-                    **clean_kwargs,
-                )
-
-                return response.choices[0].message.content
+            # Get model function from unified LLM client
+            # This handles all provider differences and env var setup for LightRAG
+            llm_model_func = llm_client.get_model_func()
 
             # Create pure LightRAG instance (no multimodal)
             rag = LightRAG(
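
The retriever now hands model calls to the unified client instead of building its own OpenAI wrapper. A minimal sketch of the new wiring, assuming LightRAG's usual `working_dir`/`llm_model_func` constructor arguments and an illustrative knowledge-base path:

    from lightrag import LightRAG

    from src.services.embedding import get_embedding_client
    from src.services.llm import get_llm_client

    llm_client = get_llm_client()
    embed_client = get_embedding_client()

    rag = LightRAG(
        working_dir="./knowledge_bases/demo",  # illustrative path
        # get_model_func() supplies the async callable LightRAG expects:
        # (prompt, system_prompt=None, history_messages=None, **kwargs) -> str
        llm_model_func=llm_client.get_model_func(),
        # embedding wiring via embed_client is provider-specific and omitted here
    )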

src/services/rag/components/routing.py
CHANGED

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 File Type Router
 ================
@@ -284,3 +285,50 @@ class FileTypeRouter:
         """
         doc_type = cls.get_document_type(file_path)
         return doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN)
+
+    @classmethod
+    def get_extensions_for_provider(cls, provider: str) -> set[str]:
+        """
+        Get supported file extensions for a specific RAG provider.
+
+        Args:
+            provider: RAG provider name (llamaindex, lightrag, raganything, raganything_docling)
+
+        Returns:
+            Set of supported file extensions (with leading dot, e.g., {".pdf", ".txt"})
+        """
+        # Base text extensions supported by all providers
+        text_extensions = cls.TEXT_EXTENSIONS.copy()
+
+        if provider == "llamaindex":
+            # LlamaIndex: PDF + all text files (reads any text file directly)
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+        elif provider == "lightrag":
+            # LightRAG: PDF + all text files (uses FileTypeRouter)
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+        elif provider in ("raganything", "raganything_docling"):
+            # RAGAnything: PDF + Word + Images + all text files (full multimodal via MinerU)
+            return (
+                cls.MINERU_EXTENSIONS | cls.DOCX_EXTENSIONS | cls.IMAGE_EXTENSIONS | text_extensions
+            )
+
+        else:
+            # Default: same as llamaindex (most conservative)
+            logger.warning(f"Unknown provider '{provider}', using default extensions")
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+    @classmethod
+    def get_glob_patterns_for_provider(cls, provider: str) -> list[str]:
+        """
+        Get glob patterns for file searching based on RAG provider.
+
+        Args:
+            provider: RAG provider name (llamaindex, lightrag, raganything, raganything_docling)
+
+        Returns:
+            List of glob patterns (e.g., ["*.pdf", "*.txt", "*.md"])
+        """
+        extensions = cls.get_extensions_for_provider(provider)
+        return [f"*{ext}" for ext in sorted(extensions)]
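
A brief usage sketch for the new router helpers; the provider strings and return shapes come from the diff above, while the call site itself is illustrative:

    from src.services.rag.components.routing import FileTypeRouter

    # Extensions a provider can ingest, e.g. {".pdf", ".md", ".txt", ...}
    extensions = FileTypeRouter.get_extensions_for_provider("raganything")

    # Glob patterns for scanning an upload directory, e.g. ["*.md", "*.pdf", "*.txt"]
    patterns = FileTypeRouter.get_glob_patterns_for_provider("llamaindex")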
src/services/rag/factory.py
CHANGED

@@ -1,25 +1,77 @@
+# -*- coding: utf-8 -*-
 """
 Pipeline Factory
 ================
 
 Factory for creating and managing RAG pipelines.
 
-
-
+Note: Pipeline imports are lazy to avoid importing heavy dependencies (lightrag, llama_index, etc.)
+at module load time. This allows the core services to be imported without RAG dependencies.
 """
 
 import logging
 from typing import Callable, Dict, List, Optional
-
-from .pipelines import lightrag
+import warnings
 
 logger = logging.getLogger(__name__)
 
-# Pipeline registry -
-_PIPELINES: Dict[str, Callable] = {
-
-
-
+# Pipeline registry - populated lazily
+_PIPELINES: Dict[str, Callable] = {}
+_PIPELINES_INITIALIZED = False
+
+
+def _init_pipelines():
+    """Lazily initialize pipeline registry.
+
+    Important:
+    - Do NOT import optional heavy dependencies (e.g. llama_index) here.
+    - Pipelines must be imported inside their factory callables, so users can
+      use other providers without installing every optional dependency.
+    """
+    global _PIPELINES, _PIPELINES_INITIALIZED
+    if _PIPELINES_INITIALIZED:
+        return
+
+    def _build_raganything(**kwargs):
+        from .pipelines.raganything import RAGAnythingPipeline
+
+        return RAGAnythingPipeline(**kwargs)
+
+    def _build_raganything_docling(**kwargs):
+        from .pipelines.raganything_docling import RAGAnythingDoclingPipeline
+
+        return RAGAnythingDoclingPipeline(**kwargs)
+
+    def _build_lightrag(kb_base_dir: Optional[str] = None, **kwargs):
+        # LightRAGPipeline is a factory function returning a composed RAGPipeline
+        from .pipelines.lightrag import LightRAGPipeline
+
+        return LightRAGPipeline(kb_base_dir=kb_base_dir)
+
+    def _build_realtimex(kb_base_dir: Optional[str] = None, **kwargs):
+        # RealTimeX is an alias for LightRAG with RealTimeX branding
+        from .pipelines.lightrag import LightRAGPipeline
+
+        return LightRAGPipeline(kb_base_dir=kb_base_dir)
+
+    def _build_llamaindex(**kwargs):
+        # LlamaIndexPipeline depends on optional `llama_index` package.
+        # Import it only when explicitly requested.
+        from .pipelines.llamaindex import LlamaIndexPipeline
+
+        return LlamaIndexPipeline(**kwargs)
+
+    _PIPELINES.update(
+        {
+            "raganything": _build_raganything,  # Full multimodal: MinerU parser, deep analysis (slow, thorough)
+            "raganything_docling": _build_raganything_docling,  # Docling parser: Office/HTML friendly, easier setup
+            "lightrag": _build_lightrag,  # Knowledge graph: PDFParser, fast text-only (medium speed)
+            "realtimex": _build_realtimex,  # RealTimeX AI powered knowledge retrieval (recommended, uses LightRAG)
+            "llamaindex": _build_llamaindex,  # Vector-only: Simple chunking, fast (fastest)
+        }
+    )
+    _PIPELINES_INITIALIZED = True
+
 
 # Pipeline metadata for list_pipelines()
 _PIPELINE_INFO: Dict[str, Dict[str, str]] = {
@@ -37,14 +89,16 @@ _PIPELINE_INFO: Dict[str, Dict[str, str]] = {
     },
 }
 
+
 # Try to register optional pipelines
 def _register_optional_pipelines():
     """Register pipelines that have optional dependencies."""
     global _PIPELINES, _PIPELINE_INFO
-
+
     # Try RAGAnything (requires raganything package)
     try:
         from .pipelines.raganything import RAGAnythingPipeline
+
         _PIPELINES["raganything"] = RAGAnythingPipeline
         _PIPELINE_INFO["raganything"] = {
            "id": "raganything",
@@ -61,10 +115,11 @@ def _register_optional_pipelines():
             "available": False,
         }
         logger.debug(f"RAGAnything not available: {e}")
-
+
     # Try LlamaIndex (requires llama-index package)
     try:
         from .pipelines import llamaindex
+
         _PIPELINES["llamaindex"] = llamaindex.LlamaIndexPipeline
         _PIPELINE_INFO["llamaindex"] = {
             "id": "llamaindex",
@@ -87,13 +142,13 @@ def _register_optional_pipelines():
 _register_optional_pipelines()
 
 
-def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kwargs):
+def get_pipeline(name: str = "realtimex", kb_base_dir: Optional[str] = None, **kwargs):
     """
     Get a pre-configured pipeline by name.
 
     Args:
-        name: Pipeline name (lightrag,
-            Default is '
+        name: Pipeline name (raganything, raganything_docling, lightrag, realtimex, llamaindex)
+            Default is 'realtimex' (recommended, always available).
         kb_base_dir: Base directory for knowledge bases (passed to all pipelines)
         **kwargs: Additional arguments passed to pipeline constructor
 
@@ -103,6 +158,7 @@ def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kwargs):
     Raises:
         ValueError: If pipeline name is not found or not available
     """
+    _init_pipelines()
     if name not in _PIPELINES:
         available = list(_PIPELINES.keys())
         # Check if it's a known but unavailable pipeline
@@ -116,20 +172,22 @@
 
     factory = _PIPELINES[name]
 
-
-
-
-
-
-
-
-    # LlamaIndexPipeline and RAGAnythingPipeline are classes
+    try:
+        # Handle different pipeline types:
+        # - lightrag, realtimex: callable that accepts kb_base_dir and returns a composed RAGPipeline
+        # - llamaindex, raganything, raganything_docling: callables that instantiate class-based pipelines
+        if name in ("lightrag", "realtimex"):
+            return factory(kb_base_dir=kb_base_dir, **kwargs)
+
         if kb_base_dir:
             kwargs["kb_base_dir"] = kb_base_dir
         return factory(**kwargs)
-
-    #
-
+    except ImportError as e:
+        # Common case: user didn't install optional RAG backend deps (e.g. llama_index).
+        raise ValueError(
+            f"Pipeline '{name}' is not available because an optional dependency is missing: {e}. "
+            f"Please install the required dependency for '{name}', or switch provider to 'realtimex'/'lightrag'."
+        ) from e
 
 
 def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
@@ -142,21 +200,33 @@ def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
     Returns:
         List of pipeline info dictionaries
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    return [
+        {
+            "id": "realtimex",
+            "name": "RealTimeX",
+            "description": "RealTimeX AI powered knowledge retrieval (recommended).",
+        },
+        {
+            "id": "lightrag",
+            "name": "LightRAG",
+            "description": "Lightweight knowledge graph retrieval, fast processing of text documents.",
+        },
+        {
+            "id": "raganything",
+            "name": "RAG-Anything (MinerU)",
+            "description": "Multimodal document processing with MinerU parser. Best for academic PDFs with complex equations and formulas.",
+        },
+        {
+            "id": "raganything_docling",
+            "name": "RAG-Anything (Docling)",
+            "description": "Multimodal document processing with Docling parser. Better for Office documents (.docx, .pptx) and HTML. Easier to install.",
+        },
+        {
+            "id": "llamaindex",
+            "name": "LlamaIndex",
+            "description": "Pure vector retrieval, fastest processing speed.",
+        },
+    ]
 
 
 def register_pipeline(name: str, factory: Callable):
@@ -167,6 +237,7 @@ def register_pipeline(name: str, factory: Callable):
         name: Pipeline name
         factory: Factory function or class that creates the pipeline
     """
+    _init_pipelines()
     _PIPELINES[name] = factory
 
 
@@ -180,6 +251,7 @@ def has_pipeline(name: str) -> bool:
     Returns:
         True if pipeline exists
     """
+    _init_pipelines()
     return name in _PIPELINES
 
 
@@ -190,8 +262,6 @@ def get_plugin(name: str) -> Dict[str, Callable]:
 
     Get a plugin by name (maps to pipeline API).
     """
-    import warnings
-
     warnings.warn(
         "get_plugin() is deprecated, use get_pipeline() instead",
         DeprecationWarning,
@@ -210,8 +280,6 @@ def list_plugins() -> List[Dict[str, str]]:
     """
     DEPRECATED: Use list_pipelines() instead.
     """
-    import warnings
-
     warnings.warn(
         "list_plugins() is deprecated, use list_pipelines() instead",
         DeprecationWarning,
@@ -224,8 +292,6 @@ def has_plugin(name: str) -> bool:
     """
     DEPRECATED: Use has_pipeline() instead.
     """
-    import warnings
-
     warnings.warn(
         "has_plugin() is deprecated, use has_pipeline() instead",
         DeprecationWarning,
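
A usage sketch for the lazily initialized factory; the `kb_base_dir` value is illustrative, while the function names and provider ids come from the code above:

    from src.services.rag.factory import get_pipeline, has_pipeline, list_pipelines

    for info in list_pipelines():
        print(info["id"], "-", info["description"])

    # The default provider is now 'realtimex' (a LightRAG-backed pipeline).
    pipeline = get_pipeline("realtimex", kb_base_dir="./knowledge_bases")

    # Requesting an optional backend without its dependency installed now raises
    # ValueError with an install hint instead of failing at import time.
    if has_pipeline("llamaindex"):
        try:
            get_pipeline("llamaindex", kb_base_dir="./knowledge_bases")
        except ValueError as exc:
            print(exc)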
src/services/rag/pipelines/__init__.py
CHANGED

@@ -1,32 +1,41 @@
+# -*- coding: utf-8 -*-
 """
 Pre-configured Pipelines
 ========================
 
 Ready-to-use RAG pipelines for common use cases.
-
-LightRAG and Academic pipelines are always available.
-LlamaIndex and RAGAnything require optional dependencies.
 """
 
-
-from .academic import AcademicPipeline
-from .lightrag import LightRAGPipeline
+from typing import Any
 
 __all__ = [
+    "RAGAnythingPipeline",
+    "RAGAnythingDoclingPipeline",
     "LightRAGPipeline",
-    "AcademicPipeline",
 ]
 
-#
-
-
-
-
-
+# NOTE:
+# - Do NOT import heavy/optional backends at module import time.
+# - Users may want `llamaindex` without `raganything`, or vice versa.
+# - Accessing an attribute triggers a targeted import via __getattr__.
+
+
+def __getattr__(name: str) -> Any:
+    if name == "LightRAGPipeline":
+        from .lightrag import LightRAGPipeline
+
+        return LightRAGPipeline
+    if name == "RAGAnythingPipeline":
+        from .raganything import RAGAnythingPipeline
+
+        return RAGAnythingPipeline
+    if name == "RAGAnythingDoclingPipeline":
+        from .raganything_docling import RAGAnythingDoclingPipeline
 
-
-
-
-
-    RAGAnythingPipeline = None  # type: ignore
+        return RAGAnythingDoclingPipeline
+    if name == "LlamaIndexPipeline":
+        # Optional dependency: llama_index
+        from .llamaindex import LlamaIndexPipeline
 
+        return LlamaIndexPipeline
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
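
The module-level `__getattr__` hook (PEP 562) keeps the optional backends out of the package import path; a short sketch of the resulting behaviour:

    import src.services.rag.pipelines as pipelines

    # Importing the package pulls in neither llama_index nor raganything.
    LightRAGPipeline = pipelines.LightRAGPipeline  # triggers a targeted import of .lightrag

    try:
        LlamaIndexPipeline = pipelines.LlamaIndexPipeline  # imports .llamaindex only now
    except ImportError:
        # Raised only when the optional llama_index dependency is missing.
        LlamaIndexPipeline = None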
src/services/rag/pipelines/llamaindex.py
CHANGED

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 LlamaIndex Pipeline
 ===================
@@ -291,6 +292,104 @@ class LlamaIndexPipeline:
             "provider": "llamaindex",
         }
 
+    async def add_documents(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
+        """
+        Incrementally add documents to an existing LlamaIndex KB.
+
+        If the storage directory exists, loads the existing index and inserts
+        new documents. Otherwise, creates a new index.
+
+        Args:
+            kb_name: Knowledge base name
+            file_paths: List of file paths to add
+            **kwargs: Additional arguments
+
+        Returns:
+            True if successful
+        """
+        self.logger.info(f"Adding {len(file_paths)} documents to KB '{kb_name}' using LlamaIndex")
+
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        storage_dir = kb_dir / "llamaindex_storage"
+
+        try:
+            # Parse new documents
+            documents = []
+            for file_path in file_paths:
+                file_path = Path(file_path)
+                self.logger.info(f"Parsing: {file_path.name}")
+
+                # Extract text based on file type
+                if file_path.suffix.lower() == ".pdf":
+                    text = self._extract_pdf_text(file_path)
+                else:
+                    try:
+                        with open(file_path, "r", encoding="utf-8") as f:
+                            text = f.read()
+                    except UnicodeDecodeError:
+                        with open(file_path, "r", encoding="latin-1") as f:
+                            text = f.read()
+
+                if text.strip():
+                    doc = Document(
+                        text=text,
+                        metadata={
+                            "file_name": file_path.name,
+                            "file_path": str(file_path),
+                        },
+                    )
+                    documents.append(doc)
+                    self.logger.info(f"Loaded: {file_path.name} ({len(text)} chars)")
+                else:
+                    self.logger.warning(f"Skipped empty document: {file_path.name}")
+
+            if not documents:
+                self.logger.warning("No valid documents to add")
+                return False
+
+            loop = asyncio.get_event_loop()
+
+            if storage_dir.exists():
+                # Load existing index and insert new documents
+                self.logger.info(f"Loading existing index from {storage_dir}...")
+
+                def load_and_insert():
+                    storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
+                    index = load_index_from_storage(storage_context)
+
+                    # Insert new documents
+                    for doc in documents:
+                        index.insert(doc)
+
+                    # Persist updated index
+                    index.storage_context.persist(persist_dir=str(storage_dir))
+                    return len(documents)
+
+                num_added = await loop.run_in_executor(None, load_and_insert)
+                self.logger.info(f"Added {num_added} documents to existing index")
+            else:
+                # Create new index (first time)
+                self.logger.info(f"Creating new index with {len(documents)} documents...")
+                storage_dir.mkdir(parents=True, exist_ok=True)
+
+                def create_index():
+                    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+                    index.storage_context.persist(persist_dir=str(storage_dir))
+                    return len(documents)
+
+                num_added = await loop.run_in_executor(None, create_index)
+                self.logger.info(f"Created new index with {num_added} documents")
+
+            self.logger.info(f"Successfully added documents to KB '{kb_name}'")
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Failed to add documents: {e}")
+            import traceback
+
+            self.logger.error(traceback.format_exc())
+            return False
+
     async def delete(self, kb_name: str) -> bool:
         """
         Delete knowledge base.
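
A call-site sketch for the new incremental ingestion path; the knowledge-base name and file paths are illustrative:

    import asyncio

    from src.services.rag.factory import get_pipeline

    async def main() -> None:
        pipeline = get_pipeline("llamaindex", kb_base_dir="./knowledge_bases")
        ok = await pipeline.add_documents(
            kb_name="demo_kb",
            file_paths=["./docs/notes.md", "./docs/paper.pdf"],
        )
        print("documents added" if ok else "ingestion failed")

    asyncio.run(main())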