realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
src/services/rag/components/retrievers/lightrag.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 LightRAG Retriever
 ==================
@@ -52,57 +53,17 @@ class LightRAGRetriever(BaseComponent):

         try:
             from lightrag import LightRAG
-            from openai import AsyncOpenAI

             from src.services.embedding import get_embedding_client
             from src.services.llm import get_llm_client

+            # Use unified LLM client from src/services/llm
             llm_client = get_llm_client()
             embed_client = get_embedding_client()

-            # Create AsyncOpenAI client directly
-            openai_client = AsyncOpenAI(
-                api_key=llm_client.config.api_key,
-                base_url=llm_client.config.base_url,
-            )
-
-            # LLM function using services (ASYNC - LightRAG expects async functions)
-            async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
-                """Custom async LLM function that bypasses LightRAG's openai_complete_if_cache."""
-                if history_messages is None:
-                    history_messages = []
-
-                # Build messages
-                messages = []
-                if system_prompt:
-                    messages.append({"role": "system", "content": system_prompt})
-                messages.extend(history_messages)
-                messages.append({"role": "user", "content": prompt})
-
-                # Whitelist only valid OpenAI parameters
-                valid_params = {
-                    "temperature",
-                    "top_p",
-                    "n",
-                    "stream",
-                    "stop",
-                    "max_tokens",
-                    "presence_penalty",
-                    "frequency_penalty",
-                    "logit_bias",
-                    "user",
-                    "seed",
-                }
-                clean_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
-
-                # Call OpenAI API directly (async)
-                response = await openai_client.chat.completions.create(
-                    model=llm_client.config.model,
-                    messages=messages,
-                    **clean_kwargs,
-                )
-
-                return response.choices[0].message.content
+            # Get model function from unified LLM client
+            # This handles all provider differences and env var setup for LightRAG
+            llm_model_func = llm_client.get_model_func()

             # Create pure LightRAG instance (no multimodal)
             rag = LightRAG(
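
The hunk above drops the inline AsyncOpenAI wrapper in favour of a model function obtained from the unified LLM client. For reference, LightRAG still receives an async callable with the same shape as the removed one; a minimal sketch of that shape follows (the body is illustrative — in 0.5.0.post3 it is produced by llm_client.get_model_func(), not written by hand):

# Sketch only: the callable signature LightRAG expects, mirroring the removed code above.
async def example_model_func(prompt, system_prompt=None, history_messages=None, **kwargs) -> str:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages or [])
    messages.append({"role": "user", "content": prompt})
    # Forward `messages` to whichever provider the unified client is configured for
    # (this is the part get_model_func() now encapsulates).
    return "model response text"  # placeholder return value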
src/services/rag/components/routing.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 File Type Router
 ================
@@ -284,3 +285,50 @@ class FileTypeRouter:
         """
         doc_type = cls.get_document_type(file_path)
         return doc_type in (DocumentType.TEXT, DocumentType.MARKDOWN)
+
+    @classmethod
+    def get_extensions_for_provider(cls, provider: str) -> set[str]:
+        """
+        Get supported file extensions for a specific RAG provider.
+
+        Args:
+            provider: RAG provider name (llamaindex, lightrag, raganything, raganything_docling)
+
+        Returns:
+            Set of supported file extensions (with leading dot, e.g., {".pdf", ".txt"})
+        """
+        # Base text extensions supported by all providers
+        text_extensions = cls.TEXT_EXTENSIONS.copy()
+
+        if provider == "llamaindex":
+            # LlamaIndex: PDF + all text files (reads any text file directly)
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+        elif provider == "lightrag":
+            # LightRAG: PDF + all text files (uses FileTypeRouter)
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+        elif provider in ("raganything", "raganything_docling"):
+            # RAGAnything: PDF + Word + Images + all text files (full multimodal via MinerU)
+            return (
+                cls.MINERU_EXTENSIONS | cls.DOCX_EXTENSIONS | cls.IMAGE_EXTENSIONS | text_extensions
+            )
+
+        else:
+            # Default: same as llamaindex (most conservative)
+            logger.warning(f"Unknown provider '{provider}', using default extensions")
+            return cls.MINERU_EXTENSIONS | text_extensions
+
+    @classmethod
+    def get_glob_patterns_for_provider(cls, provider: str) -> list[str]:
+        """
+        Get glob patterns for file searching based on RAG provider.
+
+        Args:
+            provider: RAG provider name (llamaindex, lightrag, raganything, raganything_docling)
+
+        Returns:
+            List of glob patterns (e.g., ["*.pdf", "*.txt", "*.md"])
+        """
+        extensions = cls.get_extensions_for_provider(provider)
+        return [f"*{ext}" for ext in sorted(extensions)]
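
A short usage sketch of the two new classmethods (the provider names come from the docstrings above; the call sites and example results are illustrative):

from src.services.rag.components.routing import FileTypeRouter

# Extensions a provider can ingest, e.g. {".pdf", ".txt", ".md", ...}
extensions = FileTypeRouter.get_extensions_for_provider("raganything")

# Glob patterns derived from those extensions, e.g. ["*.md", "*.pdf", "*.txt", ...]
patterns = FileTypeRouter.get_glob_patterns_for_provider("llamaindex")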
src/services/rag/factory.py

@@ -1,25 +1,77 @@
+# -*- coding: utf-8 -*-
 """
 Pipeline Factory
 ================

 Factory for creating and managing RAG pipelines.

-LightRAG is the default pipeline (always available).
-RAGAnything and LlamaIndex are optional (require extra dependencies).
+Note: Pipeline imports are lazy to avoid importing heavy dependencies (lightrag, llama_index, etc.)
+at module load time. This allows the core services to be imported without RAG dependencies.
 """

 import logging
 from typing import Callable, Dict, List, Optional
-
-from .pipelines import lightrag
+import warnings

 logger = logging.getLogger(__name__)

-# Pipeline registry - start with always-available pipelines
-_PIPELINES: Dict[str, Callable] = {
-    "lightrag": lightrag.LightRAGPipeline,  # Knowledge graph: PDFParser, fast text-only (default)
-    "realtimex": lightrag.LightRAGPipeline,  # Alias: RealTimeX (uses LightRAG with RealTimeX AI config)
-}
+# Pipeline registry - populated lazily
+_PIPELINES: Dict[str, Callable] = {}
+_PIPELINES_INITIALIZED = False
+
+
+def _init_pipelines():
+    """Lazily initialize pipeline registry.
+
+    Important:
+    - Do NOT import optional heavy dependencies (e.g. llama_index) here.
+    - Pipelines must be imported inside their factory callables, so users can
+      use other providers without installing every optional dependency.
+    """
+    global _PIPELINES, _PIPELINES_INITIALIZED
+    if _PIPELINES_INITIALIZED:
+        return
+
+    def _build_raganything(**kwargs):
+        from .pipelines.raganything import RAGAnythingPipeline
+
+        return RAGAnythingPipeline(**kwargs)
+
+    def _build_raganything_docling(**kwargs):
+        from .pipelines.raganything_docling import RAGAnythingDoclingPipeline
+
+        return RAGAnythingDoclingPipeline(**kwargs)
+
+    def _build_lightrag(kb_base_dir: Optional[str] = None, **kwargs):
+        # LightRAGPipeline is a factory function returning a composed RAGPipeline
+        from .pipelines.lightrag import LightRAGPipeline
+
+        return LightRAGPipeline(kb_base_dir=kb_base_dir)
+
+    def _build_realtimex(kb_base_dir: Optional[str] = None, **kwargs):
+        # RealTimeX is an alias for LightRAG with RealTimeX branding
+        from .pipelines.lightrag import LightRAGPipeline
+
+        return LightRAGPipeline(kb_base_dir=kb_base_dir)
+
+    def _build_llamaindex(**kwargs):
+        # LlamaIndexPipeline depends on optional `llama_index` package.
+        # Import it only when explicitly requested.
+        from .pipelines.llamaindex import LlamaIndexPipeline
+
+        return LlamaIndexPipeline(**kwargs)
+
+    _PIPELINES.update(
+        {
+            "raganything": _build_raganything,  # Full multimodal: MinerU parser, deep analysis (slow, thorough)
+            "raganything_docling": _build_raganything_docling,  # Docling parser: Office/HTML friendly, easier setup
+            "lightrag": _build_lightrag,  # Knowledge graph: PDFParser, fast text-only (medium speed)
+            "realtimex": _build_realtimex,  # RealTimeX AI powered knowledge retrieval (recommended, uses LightRAG)
+            "llamaindex": _build_llamaindex,  # Vector-only: Simple chunking, fast (fastest)
+        }
+    )
+    _PIPELINES_INITIALIZED = True
+

 # Pipeline metadata for list_pipelines()
 _PIPELINE_INFO: Dict[str, Dict[str, str]] = {
@@ -37,14 +89,16 @@ _PIPELINE_INFO: Dict[str, Dict[str, str]] = {
     },
 }

+
 # Try to register optional pipelines
 def _register_optional_pipelines():
     """Register pipelines that have optional dependencies."""
     global _PIPELINES, _PIPELINE_INFO
-
+
     # Try RAGAnything (requires raganything package)
     try:
         from .pipelines.raganything import RAGAnythingPipeline
+
         _PIPELINES["raganything"] = RAGAnythingPipeline
         _PIPELINE_INFO["raganything"] = {
             "id": "raganything",
@@ -61,10 +115,11 @@ def _register_optional_pipelines():
             "available": False,
         }
         logger.debug(f"RAGAnything not available: {e}")
-
+
     # Try LlamaIndex (requires llama-index package)
     try:
         from .pipelines import llamaindex
+
         _PIPELINES["llamaindex"] = llamaindex.LlamaIndexPipeline
         _PIPELINE_INFO["llamaindex"] = {
             "id": "llamaindex",
@@ -87,13 +142,13 @@ def _register_optional_pipelines():
 _register_optional_pipelines()


-def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kwargs):
+def get_pipeline(name: str = "realtimex", kb_base_dir: Optional[str] = None, **kwargs):
     """
     Get a pre-configured pipeline by name.

     Args:
-        name: Pipeline name (lightrag, raganything, llamaindex)
-            Default is 'lightrag' (always available).
+        name: Pipeline name (raganything, raganything_docling, lightrag, realtimex, llamaindex)
+            Default is 'realtimex' (recommended, always available).
         kb_base_dir: Base directory for knowledge bases (passed to all pipelines)
         **kwargs: Additional arguments passed to pipeline constructor

@@ -103,6 +158,7 @@ def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kw
     Raises:
         ValueError: If pipeline name is not found or not available
     """
+    _init_pipelines()
     if name not in _PIPELINES:
         available = list(_PIPELINES.keys())
         # Check if it's a known but unavailable pipeline
@@ -116,20 +172,22 @@ def get_pipeline(name: str = "lightrag", kb_base_dir: Optional[str] = None, **kw

     factory = _PIPELINES[name]

-    # Handle different pipeline types:
-    # - lightrag, realtimex, academic: functions that return RAGPipeline
-    # - llamaindex, raganything: classes that need instantiation
-    if name in ("lightrag", "realtimex", "academic"):
-        # LightRAGPipeline and AcademicPipeline are factory functions
-        return factory(kb_base_dir=kb_base_dir)
-    elif name in ("llamaindex", "raganything"):
-        # LlamaIndexPipeline and RAGAnythingPipeline are classes
+    try:
+        # Handle different pipeline types:
+        # - lightrag, realtimex: callable that accepts kb_base_dir and returns a composed RAGPipeline
+        # - llamaindex, raganything, raganything_docling: callables that instantiate class-based pipelines
+        if name in ("lightrag", "realtimex"):
+            return factory(kb_base_dir=kb_base_dir, **kwargs)
+
         if kb_base_dir:
             kwargs["kb_base_dir"] = kb_base_dir
         return factory(**kwargs)
-    else:
-        # Default: try calling with kb_base_dir
-        return factory(kb_base_dir=kb_base_dir)
+    except ImportError as e:
+        # Common case: user didn't install optional RAG backend deps (e.g. llama_index).
+        raise ValueError(
+            f"Pipeline '{name}' is not available because an optional dependency is missing: {e}. "
+            f"Please install the required dependency for '{name}', or switch provider to 'realtimex'/'lightrag'."
+        ) from e


 def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
@@ -142,21 +200,33 @@ def list_pipelines(include_unavailable: bool = False) -> List[Dict[str, str]]:
     Returns:
         List of pipeline info dictionaries
     """
-    result = []
-    # Order: realtimex first (recommended), then others
-    order = ["realtimex", "lightrag", "raganything", "llamaindex"]
-
-    for pipeline_id in order:
-        if pipeline_id in _PIPELINE_INFO:
-            info = _PIPELINE_INFO[pipeline_id]
-            if include_unavailable or info.get("available", False):
-                result.append({
-                    "id": info["id"],
-                    "name": info["name"],
-                    "description": info["description"],
-                })
-
-    return result
+    return [
+        {
+            "id": "realtimex",
+            "name": "RealTimeX",
+            "description": "RealTimeX AI powered knowledge retrieval (recommended).",
+        },
+        {
+            "id": "lightrag",
+            "name": "LightRAG",
+            "description": "Lightweight knowledge graph retrieval, fast processing of text documents.",
+        },
+        {
+            "id": "raganything",
+            "name": "RAG-Anything (MinerU)",
+            "description": "Multimodal document processing with MinerU parser. Best for academic PDFs with complex equations and formulas.",
+        },
+        {
+            "id": "raganything_docling",
+            "name": "RAG-Anything (Docling)",
+            "description": "Multimodal document processing with Docling parser. Better for Office documents (.docx, .pptx) and HTML. Easier to install.",
+        },
+        {
+            "id": "llamaindex",
+            "name": "LlamaIndex",
+            "description": "Pure vector retrieval, fastest processing speed.",
+        },
+    ]


 def register_pipeline(name: str, factory: Callable):
@@ -167,6 +237,7 @@ def register_pipeline(name: str, factory: Callable):
         name: Pipeline name
         factory: Factory function or class that creates the pipeline
     """
+    _init_pipelines()
     _PIPELINES[name] = factory


@@ -180,6 +251,7 @@ def has_pipeline(name: str) -> bool:
     Returns:
         True if pipeline exists
     """
+    _init_pipelines()
     return name in _PIPELINES


@@ -190,8 +262,6 @@ get_plugin(name: str) -> Dict[str, Callable]:

     Get a plugin by name (maps to pipeline API).
     """
-    import warnings
-
     warnings.warn(
         "get_plugin() is deprecated, use get_pipeline() instead",
         DeprecationWarning,
@@ -210,8 +280,6 @@ def list_plugins() -> List[Dict[str, str]]:
     """
     DEPRECATED: Use list_pipelines() instead.
     """
-    import warnings
-
     warnings.warn(
         "list_plugins() is deprecated, use list_pipelines() instead",
         DeprecationWarning,
@@ -224,8 +292,6 @@ def has_plugin(name: str) -> bool:
     """
     DEPRECATED: Use has_pipeline() instead.
     """
-    import warnings
-
     warnings.warn(
         "has_plugin() is deprecated, use has_pipeline() instead",
         DeprecationWarning,
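
Taken together, the factory now resolves pipelines lazily and by name. A hedged usage sketch of the public helpers shown above (the knowledge-base directory and KB layout are illustrative):

from src.services.rag.factory import get_pipeline, has_pipeline, list_pipelines

# The default pipeline is now the RealTimeX alias of LightRAG.
pipeline = get_pipeline(kb_base_dir="./knowledge_bases")  # illustrative directory

# Optional backends surface missing dependencies as ValueError instead of a bare ImportError.
if has_pipeline("llamaindex"):
    try:
        vector_pipeline = get_pipeline("llamaindex", kb_base_dir="./knowledge_bases")
    except ValueError as err:
        print(err)  # suggests installing the dependency or switching to 'realtimex'/'lightrag'

for info in list_pipelines():
    print(f"{info['id']}: {info['description']}")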
src/services/rag/pipeline.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 RAG Pipeline
 ============
src/services/rag/pipelines/__init__.py

@@ -1,32 +1,41 @@
+# -*- coding: utf-8 -*-
 """
 Pre-configured Pipelines
 ========================

 Ready-to-use RAG pipelines for common use cases.
-
-LightRAG and Academic pipelines are always available.
-LlamaIndex and RAGAnything require optional dependencies.
 """

-# Always available pipelines
-from .academic import AcademicPipeline
-from .lightrag import LightRAGPipeline
+from typing import Any

 __all__ = [
+    "RAGAnythingPipeline",
+    "RAGAnythingDoclingPipeline",
     "LightRAGPipeline",
-    "AcademicPipeline",
 ]

-# Optional pipelines - import only if dependencies are available
-try:
-    from .llamaindex import LlamaIndexPipeline
-    __all__.append("LlamaIndexPipeline")
-except ImportError:
-    LlamaIndexPipeline = None  # type: ignore
+# NOTE:
+# - Do NOT import heavy/optional backends at module import time.
+# - Users may want `llamaindex` without `raganything`, or vice versa.
+# - Accessing an attribute triggers a targeted import via __getattr__.
+
+
+def __getattr__(name: str) -> Any:
+    if name == "LightRAGPipeline":
+        from .lightrag import LightRAGPipeline
+
+        return LightRAGPipeline
+    if name == "RAGAnythingPipeline":
+        from .raganything import RAGAnythingPipeline
+
+        return RAGAnythingPipeline
+    if name == "RAGAnythingDoclingPipeline":
+        from .raganything_docling import RAGAnythingDoclingPipeline

-try:
-    from .raganything import RAGAnythingPipeline
-    __all__.append("RAGAnythingPipeline")
-except ImportError:
-    RAGAnythingPipeline = None  # type: ignore
+        return RAGAnythingDoclingPipeline
+    if name == "LlamaIndexPipeline":
+        # Optional dependency: llama_index
+        from .llamaindex import LlamaIndexPipeline

+        return LlamaIndexPipeline
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
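
The __getattr__ hook above is the PEP 562 module-level lazy-import pattern: importing the package stays cheap, and each backend module is imported only when its pipeline class is first referenced. A small illustration of the resulting behaviour:

# Importing the package no longer pulls in llama_index or raganything.
import src.services.rag.pipelines as pipelines

# The targeted import happens on first attribute access.
LightRAGPipeline = pipelines.LightRAGPipeline        # triggers `from .lightrag import ...`
LlamaIndexPipeline = pipelines.LlamaIndexPipeline    # triggers the optional llama_index import

# Unknown names still raise AttributeError, as the last line of __getattr__ shows.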
src/services/rag/pipelines/lightrag.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 LightRAG Pipeline
 =================
src/services/rag/pipelines/llamaindex.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 LlamaIndex Pipeline
 ===================
@@ -291,6 +292,104 @@ class LlamaIndexPipeline:
             "provider": "llamaindex",
         }

+    async def add_documents(self, kb_name: str, file_paths: List[str], **kwargs) -> bool:
+        """
+        Incrementally add documents to an existing LlamaIndex KB.
+
+        If the storage directory exists, loads the existing index and inserts
+        new documents. Otherwise, creates a new index.
+
+        Args:
+            kb_name: Knowledge base name
+            file_paths: List of file paths to add
+            **kwargs: Additional arguments
+
+        Returns:
+            True if successful
+        """
+        self.logger.info(f"Adding {len(file_paths)} documents to KB '{kb_name}' using LlamaIndex")
+
+        kb_dir = Path(self.kb_base_dir) / kb_name
+        storage_dir = kb_dir / "llamaindex_storage"
+
+        try:
+            # Parse new documents
+            documents = []
+            for file_path in file_paths:
+                file_path = Path(file_path)
+                self.logger.info(f"Parsing: {file_path.name}")
+
+                # Extract text based on file type
+                if file_path.suffix.lower() == ".pdf":
+                    text = self._extract_pdf_text(file_path)
+                else:
+                    try:
+                        with open(file_path, "r", encoding="utf-8") as f:
+                            text = f.read()
+                    except UnicodeDecodeError:
+                        with open(file_path, "r", encoding="latin-1") as f:
+                            text = f.read()
+
+                if text.strip():
+                    doc = Document(
+                        text=text,
+                        metadata={
+                            "file_name": file_path.name,
+                            "file_path": str(file_path),
+                        },
+                    )
+                    documents.append(doc)
+                    self.logger.info(f"Loaded: {file_path.name} ({len(text)} chars)")
+                else:
+                    self.logger.warning(f"Skipped empty document: {file_path.name}")
+
+            if not documents:
+                self.logger.warning("No valid documents to add")
+                return False
+
+            loop = asyncio.get_event_loop()
+
+            if storage_dir.exists():
+                # Load existing index and insert new documents
+                self.logger.info(f"Loading existing index from {storage_dir}...")
+
+                def load_and_insert():
+                    storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
+                    index = load_index_from_storage(storage_context)
+
+                    # Insert new documents
+                    for doc in documents:
+                        index.insert(doc)
+
+                    # Persist updated index
+                    index.storage_context.persist(persist_dir=str(storage_dir))
+                    return len(documents)
+
+                num_added = await loop.run_in_executor(None, load_and_insert)
+                self.logger.info(f"Added {num_added} documents to existing index")
+            else:
+                # Create new index (first time)
+                self.logger.info(f"Creating new index with {len(documents)} documents...")
+                storage_dir.mkdir(parents=True, exist_ok=True)
+
+                def create_index():
+                    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+                    index.storage_context.persist(persist_dir=str(storage_dir))
+                    return len(documents)
+
+                num_added = await loop.run_in_executor(None, create_index)
+                self.logger.info(f"Created new index with {num_added} documents")
+
+            self.logger.info(f"Successfully added documents to KB '{kb_name}'")
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Failed to add documents: {e}")
+            import traceback
+
+            self.logger.error(traceback.format_exc())
+            return False
+
     async def delete(self, kb_name: str) -> bool:
         """
         Delete knowledge base.
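
A hedged sketch of calling the new incremental method; only add_documents(kb_name, file_paths) and the kb_base_dir keyword come from this diff, while the KB name and file paths are illustrative:

import asyncio

from src.services.rag.pipelines.llamaindex import LlamaIndexPipeline


async def main():
    pipeline = LlamaIndexPipeline(kb_base_dir="./knowledge_bases")  # illustrative base dir
    ok = await pipeline.add_documents(
        kb_name="demo_kb",
        file_paths=["./docs/paper.pdf", "./docs/notes.md"],  # illustrative files
    )
    print("added" if ok else "failed")


asyncio.run(main())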