realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
  2. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
  3. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
  4. realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
  5. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
  6. scripts/__init__.py +1 -0
  7. scripts/audit_prompts.py +179 -0
  8. scripts/check_install.py +460 -0
  9. scripts/generate_roster.py +327 -0
  10. scripts/install_all.py +653 -0
  11. scripts/migrate_kb.py +655 -0
  12. scripts/start.py +807 -0
  13. scripts/start_web.py +632 -0
  14. scripts/sync_prompts_from_en.py +147 -0
  15. src/__init__.py +2 -2
  16. src/agents/ideagen/material_organizer_agent.py +2 -0
  17. src/agents/solve/__init__.py +6 -0
  18. src/agents/solve/main_solver.py +9 -0
  19. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
  20. src/agents/solve/session_manager.py +345 -0
  21. src/api/main.py +14 -0
  22. src/api/routers/chat.py +3 -3
  23. src/api/routers/co_writer.py +12 -7
  24. src/api/routers/config.py +1 -0
  25. src/api/routers/guide.py +3 -1
  26. src/api/routers/ideagen.py +7 -0
  27. src/api/routers/knowledge.py +64 -12
  28. src/api/routers/question.py +2 -0
  29. src/api/routers/realtimex.py +137 -0
  30. src/api/routers/research.py +9 -0
  31. src/api/routers/solve.py +120 -2
  32. src/cli/__init__.py +13 -0
  33. src/cli/start.py +209 -0
  34. src/config/constants.py +11 -9
  35. src/knowledge/add_documents.py +453 -213
  36. src/knowledge/extract_numbered_items.py +9 -10
  37. src/knowledge/initializer.py +102 -101
  38. src/knowledge/manager.py +251 -74
  39. src/knowledge/progress_tracker.py +43 -2
  40. src/knowledge/start_kb.py +11 -2
  41. src/logging/__init__.py +5 -0
  42. src/logging/adapters/__init__.py +1 -0
  43. src/logging/adapters/lightrag.py +25 -18
  44. src/logging/adapters/llamaindex.py +1 -0
  45. src/logging/config.py +30 -27
  46. src/logging/handlers/__init__.py +1 -0
  47. src/logging/handlers/console.py +7 -50
  48. src/logging/handlers/file.py +5 -20
  49. src/logging/handlers/websocket.py +23 -19
  50. src/logging/logger.py +161 -126
  51. src/logging/stats/__init__.py +1 -0
  52. src/logging/stats/llm_stats.py +37 -17
  53. src/services/__init__.py +17 -1
  54. src/services/config/__init__.py +1 -0
  55. src/services/config/knowledge_base_config.py +1 -0
  56. src/services/config/loader.py +1 -1
  57. src/services/config/unified_config.py +211 -4
  58. src/services/embedding/__init__.py +1 -0
  59. src/services/embedding/adapters/__init__.py +3 -0
  60. src/services/embedding/adapters/base.py +1 -0
  61. src/services/embedding/adapters/cohere.py +1 -0
  62. src/services/embedding/adapters/jina.py +1 -0
  63. src/services/embedding/adapters/ollama.py +1 -0
  64. src/services/embedding/adapters/openai_compatible.py +1 -0
  65. src/services/embedding/adapters/realtimex.py +125 -0
  66. src/services/embedding/client.py +27 -0
  67. src/services/embedding/config.py +3 -0
  68. src/services/embedding/provider.py +1 -0
  69. src/services/llm/__init__.py +17 -3
  70. src/services/llm/capabilities.py +47 -0
  71. src/services/llm/client.py +32 -0
  72. src/services/llm/cloud_provider.py +21 -4
  73. src/services/llm/config.py +36 -2
  74. src/services/llm/error_mapping.py +1 -0
  75. src/services/llm/exceptions.py +30 -0
  76. src/services/llm/factory.py +55 -16
  77. src/services/llm/local_provider.py +1 -0
  78. src/services/llm/providers/anthropic.py +1 -0
  79. src/services/llm/providers/base_provider.py +1 -0
  80. src/services/llm/providers/open_ai.py +1 -0
  81. src/services/llm/realtimex_provider.py +240 -0
  82. src/services/llm/registry.py +1 -0
  83. src/services/llm/telemetry.py +1 -0
  84. src/services/llm/types.py +1 -0
  85. src/services/llm/utils.py +1 -0
  86. src/services/prompt/__init__.py +1 -0
  87. src/services/prompt/manager.py +3 -2
  88. src/services/rag/__init__.py +27 -5
  89. src/services/rag/components/__init__.py +1 -0
  90. src/services/rag/components/base.py +1 -0
  91. src/services/rag/components/chunkers/__init__.py +1 -0
  92. src/services/rag/components/chunkers/base.py +1 -0
  93. src/services/rag/components/chunkers/fixed.py +1 -0
  94. src/services/rag/components/chunkers/numbered_item.py +1 -0
  95. src/services/rag/components/chunkers/semantic.py +1 -0
  96. src/services/rag/components/embedders/__init__.py +1 -0
  97. src/services/rag/components/embedders/base.py +1 -0
  98. src/services/rag/components/embedders/openai.py +1 -0
  99. src/services/rag/components/indexers/__init__.py +1 -0
  100. src/services/rag/components/indexers/base.py +1 -0
  101. src/services/rag/components/indexers/graph.py +5 -44
  102. src/services/rag/components/indexers/lightrag.py +5 -44
  103. src/services/rag/components/indexers/vector.py +1 -0
  104. src/services/rag/components/parsers/__init__.py +1 -0
  105. src/services/rag/components/parsers/base.py +1 -0
  106. src/services/rag/components/parsers/markdown.py +1 -0
  107. src/services/rag/components/parsers/pdf.py +1 -0
  108. src/services/rag/components/parsers/text.py +1 -0
  109. src/services/rag/components/retrievers/__init__.py +1 -0
  110. src/services/rag/components/retrievers/base.py +1 -0
  111. src/services/rag/components/retrievers/dense.py +1 -0
  112. src/services/rag/components/retrievers/hybrid.py +5 -44
  113. src/services/rag/components/retrievers/lightrag.py +5 -44
  114. src/services/rag/components/routing.py +48 -0
  115. src/services/rag/factory.py +112 -46
  116. src/services/rag/pipeline.py +1 -0
  117. src/services/rag/pipelines/__init__.py +27 -18
  118. src/services/rag/pipelines/lightrag.py +1 -0
  119. src/services/rag/pipelines/llamaindex.py +99 -0
  120. src/services/rag/pipelines/raganything.py +67 -100
  121. src/services/rag/pipelines/raganything_docling.py +368 -0
  122. src/services/rag/service.py +5 -12
  123. src/services/rag/types.py +1 -0
  124. src/services/rag/utils/__init__.py +17 -0
  125. src/services/rag/utils/image_migration.py +279 -0
  126. src/services/search/__init__.py +1 -0
  127. src/services/search/base.py +1 -0
  128. src/services/search/consolidation.py +1 -0
  129. src/services/search/providers/__init__.py +1 -0
  130. src/services/search/providers/baidu.py +1 -0
  131. src/services/search/providers/exa.py +1 -0
  132. src/services/search/providers/jina.py +1 -0
  133. src/services/search/providers/perplexity.py +1 -0
  134. src/services/search/providers/serper.py +1 -0
  135. src/services/search/providers/tavily.py +1 -0
  136. src/services/search/types.py +1 -0
  137. src/services/settings/__init__.py +1 -0
  138. src/services/settings/interface_settings.py +78 -0
  139. src/services/setup/__init__.py +1 -0
  140. src/services/tts/__init__.py +1 -0
  141. src/services/tts/config.py +1 -0
  142. src/utils/realtimex.py +284 -0
  143. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
  144. src/services/rag/pipelines/academic.py +0 -44
  145. {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
@@ -4,6 +4,12 @@
4
4
  Incrementally add documents to existing knowledge base.
5
5
  Improved version with Hash-based duplicate checking, robust error handling,
6
6
  and architectural improvements for data integrity and vision support.
7
+
8
+ Supports multiple RAG providers with lazy loading:
9
+ - llamaindex: Pure vector retrieval (load_index + insert + persist)
10
+ - lightrag: Knowledge graph (LightRAG.ainsert, text-only)
11
+ - raganything: Multimodal with MinerU parser
12
+ - raganything_docling: Multimodal with Docling parser
7
13
  """
8
14
 
9
15
  import argparse
@@ -17,64 +23,18 @@ from pathlib import Path
17
23
  import shutil
18
24
  import sys
19
25
  import tempfile
20
- from typing import TYPE_CHECKING, Any, Dict, List
26
+ from typing import Any, Dict, List
21
27
 
22
28
  from dotenv import load_dotenv
23
29
 
24
- # Attempt imports for dynamic dependencies
25
- try:
26
- from lightrag.llm.openai import openai_complete_if_cache
27
- from lightrag.utils import EmbeddingFunc
28
- except ImportError:
29
- # These will be caught during runtime if needed
30
- openai_complete_if_cache = None
31
- EmbeddingFunc = None
32
-
33
- # Type hinting support for dynamic imports
34
- if TYPE_CHECKING:
35
- try:
36
- from raganything import RAGAnything
37
- from raganything import RAGAnythingConfig as RAGAnythingConfigType
38
- except ImportError:
39
- RAGAnything = Any
40
- RAGAnythingConfigType = Any
41
- else:
42
- RAGAnything = None
43
- RAGAnythingConfigType = None
44
-
45
- # Placeholder for runtime classes
46
- raganything_cls = None
47
- RAGAnythingConfig = None
48
-
49
-
50
- def load_dynamic_imports(project_root: Path):
51
- """Handle the path injections and dynamic imports safely."""
52
- global raganything_cls, RAGAnythingConfig
53
-
54
- sys.path.insert(0, str(project_root))
55
- raganything_path = project_root.parent / "raganything" / "RAG-Anything"
56
- if raganything_path.exists():
57
- sys.path.insert(0, str(raganything_path))
58
-
59
- try:
60
- from raganything import RAGAnything as RA
61
- from raganything import RAGAnythingConfig as RAC
62
-
63
- raganything_cls = RA
64
- RAGAnythingConfig = RAC
65
- except ImportError:
66
- pass
67
-
68
-
69
30
  from src.knowledge.extract_numbered_items import process_content_list
70
31
  from src.logging import LightRAGLogContext, get_logger
71
- from src.services.embedding import (
72
- get_embedding_client,
73
- get_embedding_config,
74
- reset_embedding_client,
75
- )
76
32
  from src.services.llm import get_llm_config
77
33
 
34
+ # Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
35
+ # This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
36
+ from src.services.llm.config import get_llm_config as _early_config_load # noqa: F401
37
+
78
38
  logger = get_logger("KnowledgeInit")
79
39
 
80
40
  # Default base directory for knowledge bases
@@ -82,7 +42,14 @@ DEFAULT_BASE_DIR = "./data/knowledge_bases"
82
42
 
83
43
 
84
44
  class DocumentAdder:
85
- """Add documents to existing knowledge base with Hash-validation"""
45
+ """Add documents to existing knowledge base with Hash-validation.
46
+
47
+ Supports multiple RAG providers with lazy loading to avoid unnecessary imports:
48
+ - llamaindex: Only imports llama_index modules
49
+ - lightrag: Only imports lightrag modules (no raganything)
50
+ - raganything: Imports raganything with MinerU parser
51
+ - raganything_docling: Imports raganything with Docling parser
52
+ """
86
53
 
87
54
  def __init__(
88
55
  self,
@@ -106,15 +73,61 @@ class DocumentAdder:
106
73
  self.content_list_dir = self.kb_dir / "content_list"
107
74
  self.metadata_file = self.kb_dir / "metadata.json"
108
75
 
109
- if not self.rag_storage_dir.exists():
110
- raise ValueError(f"Knowledge base not initialized: {kb_name}")
76
+ # For llamaindex, check llamaindex_storage instead of rag_storage
77
+ provider = self._get_provider_from_metadata()
78
+ if provider == "llamaindex":
79
+ llamaindex_storage = self.kb_dir / "llamaindex_storage"
80
+ if not llamaindex_storage.exists():
81
+ raise ValueError(f"Knowledge base not initialized (llamaindex): {kb_name}")
82
+ else:
83
+ if not self.rag_storage_dir.exists():
84
+ raise ValueError(f"Knowledge base not initialized: {kb_name}")
111
85
 
112
86
  self.api_key = api_key
113
87
  self.base_url = base_url
114
88
  self.progress_tracker = progress_tracker
115
- self.rag_provider = rag_provider
89
+
90
+ # IMPORTANT: rag_provider parameter is IGNORED for incremental add
91
+ # We always use the provider from KB metadata to ensure consistency
92
+ # This prevents mixing different index formats in the same KB
93
+ self._resolved_provider = provider
94
+ if rag_provider and rag_provider != provider:
95
+ logger.warning(
96
+ f"Requested provider '{rag_provider}' ignored. "
97
+ f"Using KB's existing provider '{provider}' for consistency."
98
+ )
99
+ logger.info(f"Incremental add will use provider: {provider} (from KB metadata)")
116
100
  self._ensure_working_directories()
117
101
 
102
+ def _get_provider_from_metadata(self) -> str:
103
+ """
104
+ Get the RAG provider from KB metadata.
105
+
106
+ This is the ONLY source of truth for incremental adds - we must use
107
+ the same provider that was used during initialization to ensure
108
+ data consistency and correct storage format.
109
+
110
+ Returns:
111
+ Provider name (llamaindex, lightrag, raganything, raganything_docling)
112
+ """
113
+ if self.metadata_file.exists():
114
+ try:
115
+ with open(self.metadata_file, "r", encoding="utf-8") as f:
116
+ metadata = json.load(f)
117
+ provider = metadata.get("rag_provider")
118
+ if provider:
119
+ return provider
120
+ except Exception as e:
121
+ logger.warning(f"Failed to read provider from metadata: {e}")
122
+
123
+ # Fallback: detect from storage structure
124
+ llamaindex_storage = self.kb_dir / "llamaindex_storage"
125
+ if llamaindex_storage.exists():
126
+ return "llamaindex"
127
+
128
+ # Default to raganything for backward compatibility
129
+ return "raganything"
130
+
118
131
  def _ensure_working_directories(self):
119
132
  for directory in [self.raw_dir, self.images_dir, self.content_list_dir]:
120
133
  directory.mkdir(parents=True, exist_ok=True)
@@ -202,118 +215,271 @@ class DocumentAdder:
202
215
  """
203
216
  Async phase: Ingests files into the RAG system.
204
217
 
205
- Uses FileTypeRouter to classify files and route them appropriately:
206
- - PDF/DOCX/images -> MinerU parser (full document analysis)
207
- - Text/Markdown -> Direct read + LightRAG insert (fast)
218
+ Uses lazy loading to only import dependencies for the actual provider:
219
+ - llamaindex: Only imports llama_index
220
+ - lightrag: Only imports lightrag (no raganything)
221
+ - raganything: Imports raganything with MinerU
222
+ - raganything_docling: Imports raganything with Docling
208
223
  """
209
224
  if not new_files:
210
225
  return None
211
226
 
212
- if raganything_cls is None:
213
- raise ImportError("RAGAnything module not found.")
227
+ provider = self._resolved_provider
228
+ logger.info(f"Processing {len(new_files)} files with provider: {provider}")
229
+
230
+ # Dispatch to provider-specific implementation
231
+ if provider == "llamaindex":
232
+ return await self._process_llamaindex(new_files)
233
+ elif provider == "lightrag":
234
+ return await self._process_lightrag(new_files)
235
+ elif provider == "raganything":
236
+ return await self._process_raganything(new_files, parser="mineru")
237
+ elif provider == "raganything_docling":
238
+ return await self._process_raganything(new_files, parser="docling")
239
+ else:
240
+ raise ValueError(f"Unknown RAG provider: {provider}")
241
+
242
+ async def _process_llamaindex(self, new_files: List[Path]) -> List[Path]:
243
+ """
244
+ Incremental add for LlamaIndex pipeline.
245
+ Lazy imports llama_index only when needed.
246
+ """
247
+ logger.info("Using LlamaIndex incremental add...")
248
+
249
+ # Lazy import llama_index
250
+ try:
251
+ from src.services.rag.pipelines.llamaindex import LlamaIndexPipeline
252
+ except ImportError as e:
253
+ raise ImportError(
254
+ f"LlamaIndex dependencies not installed. "
255
+ f"Install with: pip install llama-index llama-index-core. Error: {e}"
256
+ ) from e
257
+
258
+ # Pre-import progress stage if needed
259
+ ProgressStage: Any = None
260
+ if self.progress_tracker:
261
+ from src.knowledge.progress_tracker import ProgressStage
262
+
263
+ pipeline = LlamaIndexPipeline(kb_base_dir=str(self.base_dir))
264
+ file_paths = [str(f) for f in new_files]
265
+
266
+ # Use the new add_documents method for incremental add
267
+ processed_files = []
268
+ total_files = len(file_paths)
269
+
270
+ for idx, file_path in enumerate(file_paths, 1):
271
+ doc_file = Path(file_path)
272
+ try:
273
+ if self.progress_tracker and ProgressStage:
274
+ self.progress_tracker.update(
275
+ ProgressStage.PROCESSING_FILE,
276
+ f"Indexing (LlamaIndex) {doc_file.name}",
277
+ current=idx,
278
+ total=total_files,
279
+ )
280
+
281
+ # Use add_documents for incremental add
282
+ success = await pipeline.add_documents(self.kb_name, [file_path])
283
+ if success:
284
+ processed_files.append(doc_file)
285
+ self._record_successful_hash(doc_file)
286
+ logger.info(f" ✓ Processed (LlamaIndex): {doc_file.name}")
287
+ else:
288
+ logger.error(f" ✗ Failed to index: {doc_file.name}")
289
+ except Exception as e:
290
+ logger.exception(f" ✗ Failed {doc_file.name}: {e}")
291
+
292
+ return processed_files
293
+
294
+ async def _process_lightrag(self, new_files: List[Path]) -> List[Path]:
295
+ """
296
+ Incremental add for LightRAG pipeline (text-only).
297
+ Lazy imports lightrag only when needed - does NOT require raganything.
298
+ """
299
+ logger.info("Using LightRAG incremental add (text-only)...")
214
300
 
301
+ # Lazy import lightrag
302
+ try:
303
+ from lightrag import LightRAG
304
+ from lightrag.utils import EmbeddingFunc
305
+ except ImportError as e:
306
+ raise ImportError(
307
+ f"LightRAG dependencies not installed. "
308
+ f"Install with: pip install lightrag. Error: {e}"
309
+ ) from e
310
+
311
+ from src.services.embedding import (
312
+ get_embedding_client,
313
+ get_embedding_config,
314
+ reset_embedding_client,
315
+ )
316
+ from src.services.llm import get_llm_client
215
317
  from src.services.rag.components.routing import FileTypeRouter
216
318
 
217
- # Pre-import progress stage if needed to avoid overhead in loop
319
+ # Pre-import progress stage if needed
218
320
  ProgressStage: Any = None
219
321
  if self.progress_tracker:
220
322
  from src.knowledge.progress_tracker import ProgressStage
221
323
 
222
- self.llm_cfg = get_llm_config()
223
- model = self.llm_cfg.model
224
- api_key = self.api_key or self.llm_cfg.api_key
225
- base_url = self.base_url or self.llm_cfg.base_url
226
-
227
- # LLM Function Wrapper
228
- def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
229
- if history_messages is None:
230
- history_messages = []
231
- return openai_complete_if_cache(
232
- model,
233
- prompt,
234
- system_prompt=system_prompt,
235
- history_messages=history_messages,
236
- api_key=api_key,
237
- base_url=base_url,
238
- **kwargs,
324
+ # Setup LLM and embedding
325
+ llm_client = get_llm_client()
326
+ self.llm_cfg = llm_client.config
327
+ llm_model_func = llm_client.get_model_func()
328
+
329
+ reset_embedding_client()
330
+ embedding_cfg = get_embedding_config()
331
+ embedding_client = get_embedding_client()
332
+
333
+ async def unified_embed_func(texts):
334
+ return await embedding_client.embed(texts)
335
+
336
+ embedding_func = EmbeddingFunc(
337
+ embedding_dim=embedding_cfg.dim,
338
+ max_token_size=embedding_cfg.max_tokens,
339
+ func=unified_embed_func,
340
+ )
341
+
342
+ # Create LightRAG instance (text-only, no raganything)
343
+ with LightRAGLogContext(scene="knowledge_incremental"):
344
+ rag = LightRAG(
345
+ working_dir=str(self.rag_storage_dir),
346
+ llm_model_func=llm_model_func,
347
+ embedding_func=embedding_func,
239
348
  )
349
+ await rag.initialize_storages()
240
350
 
241
- # Vision Function Wrapper - Robust history handling
242
- def vision_model_func(
243
- prompt,
244
- system_prompt=None,
245
- history_messages=None,
246
- image_data=None,
247
- messages=None,
248
- **kwargs,
249
- ):
250
- if history_messages is None:
251
- history_messages = []
252
- # If pre-formatted messages are provided, sanitize them
253
- if messages:
254
- safe_messages = self._filter_valid_messages(messages)
255
- return openai_complete_if_cache(
256
- model,
257
- prompt="",
258
- messages=safe_messages,
259
- api_key=api_key,
260
- base_url=base_url,
261
- **kwargs,
262
- )
351
+ from lightrag.kg.shared_storage import initialize_pipeline_status
263
352
 
264
- # --- Construct Message History ---
265
- current_messages = []
266
-
267
- # 1. Add System Prompt (if provided)
268
- if system_prompt:
269
- current_messages.append({"role": "system", "content": system_prompt})
270
-
271
- # 2. Add History (Filtering out conflicting system prompts)
272
- if history_messages:
273
- # Filter out system messages from history to avoid duplicates/conflicts with the new system_prompt
274
- filtered_history = [
275
- msg
276
- for msg in history_messages
277
- if isinstance(msg, dict) and msg.get("role") != "system"
278
- ]
279
- current_messages.extend(filtered_history)
280
-
281
- # 3. Construct New User Message
282
- user_content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
283
- if image_data:
284
- user_content.append(
285
- {
286
- "type": "image_url",
287
- "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
288
- }
289
- )
353
+ await initialize_pipeline_status()
290
354
 
291
- # 4. Merge Logic: Avoid back-to-back user messages
292
- if current_messages and current_messages[-1].get("role") == "user":
293
- last_msg = current_messages[-1]
294
- # If last content is string, convert to list format first
295
- if isinstance(last_msg["content"], str):
296
- last_msg["content"] = [{"type": "text", "text": last_msg["content"]}]
355
+ # Classify files
356
+ file_paths_str = [str(f) for f in new_files]
357
+ classification = FileTypeRouter.classify_files(file_paths_str)
297
358
 
298
- # Append new content blocks
299
- if isinstance(last_msg["content"], list):
300
- last_msg["content"].extend(user_content)
359
+ logger.info(
360
+ f"File classification: {len(classification.needs_mineru)} need parsing, "
361
+ f"{len(classification.text_files)} text files, "
362
+ f"{len(classification.unsupported)} unsupported"
363
+ )
364
+
365
+ processed_files = []
366
+ total_files = len(classification.needs_mineru) + len(classification.text_files)
367
+ idx = 0
368
+
369
+ # For LightRAG (text-only), use basic PDF text extraction for PDFs
370
+ for doc_file_str in classification.needs_mineru:
371
+ doc_file = Path(doc_file_str)
372
+ idx += 1
373
+ try:
374
+ if self.progress_tracker and ProgressStage:
375
+ self.progress_tracker.update(
376
+ ProgressStage.PROCESSING_FILE,
377
+ f"Extracting text from {doc_file.name}",
378
+ current=idx,
379
+ total=total_files,
380
+ )
381
+
382
+ if not doc_file.exists():
383
+ logger.error(f" ✗ Failed: File missing {doc_file.name}")
384
+ continue
385
+
386
+ # Basic text extraction
387
+ content = await self._extract_text_basic(doc_file)
388
+ if content.strip():
389
+ await rag.ainsert(content)
390
+ processed_files.append(doc_file)
391
+ self._record_successful_hash(doc_file)
392
+ logger.info(f" ✓ Processed (LightRAG): {doc_file.name}")
301
393
  else:
302
- # Fallback if structure is unexpected, just append new message
303
- current_messages.append({"role": "user", "content": user_content})
304
- else:
305
- current_messages.append({"role": "user", "content": user_content})
306
-
307
- return openai_complete_if_cache(
308
- model,
309
- prompt="",
310
- messages=current_messages,
311
- api_key=api_key,
312
- base_url=base_url,
313
- **kwargs,
314
- )
394
+ logger.warning(f" ⚠ No text extracted: {doc_file.name}")
395
+ except Exception as e:
396
+ logger.exception(f" ✗ Failed {doc_file.name}: {e}")
397
+
398
+ # Process text files directly
399
+ for doc_file_str in classification.text_files:
400
+ doc_file = Path(doc_file_str)
401
+ idx += 1
402
+ try:
403
+ if self.progress_tracker and ProgressStage:
404
+ self.progress_tracker.update(
405
+ ProgressStage.PROCESSING_FILE,
406
+ f"Ingesting (text) {doc_file.name}",
407
+ current=idx,
408
+ total=total_files,
409
+ )
410
+
411
+ if not doc_file.exists():
412
+ logger.error(f" ✗ Failed: File missing {doc_file.name}")
413
+ continue
414
+
415
+ content = await FileTypeRouter.read_text_file(str(doc_file))
416
+ if content.strip():
417
+ await rag.ainsert(content)
418
+ processed_files.append(doc_file)
419
+ self._record_successful_hash(doc_file)
420
+ logger.info(f" ✓ Processed (text): {doc_file.name}")
421
+ else:
422
+ logger.warning(f" ⚠ Skipped empty file: {doc_file.name}")
423
+ except Exception as e:
424
+ logger.exception(f" ✗ Failed {doc_file.name}: {e}")
425
+
426
+ for doc_file_str in classification.unsupported:
427
+ logger.warning(f" ⚠ Skipped unsupported file: {Path(doc_file_str).name}")
428
+
429
+ return processed_files
430
+
431
+ async def _process_raganything(
432
+ self, new_files: List[Path], parser: str = "mineru"
433
+ ) -> List[Path]:
434
+ """
435
+ Incremental add for RAGAnything pipeline (multimodal).
436
+ Lazy imports raganything only when needed.
437
+
438
+ Args:
439
+ parser: "mineru" for RAGAnything, "docling" for RAGAnything Docling
440
+ """
441
+ parser_name = "MinerU" if parser == "mineru" else "Docling"
442
+ logger.info(f"Using RAGAnything incremental add with {parser_name} parser...")
443
+
444
+ # Lazy import raganything
445
+ try:
446
+ # Add RAG-Anything to path if needed
447
+ project_root = Path(__file__).resolve().parent.parent.parent
448
+ raganything_path = project_root.parent / "raganything" / "RAG-Anything"
449
+ if raganything_path.exists() and str(raganything_path) not in sys.path:
450
+ sys.path.insert(0, str(raganything_path))
451
+
452
+ from lightrag.utils import EmbeddingFunc
453
+ from raganything import RAGAnything, RAGAnythingConfig
454
+ except ImportError as e:
455
+ raise ImportError(
456
+ f"RAGAnything dependencies not installed. "
457
+ f"Please install raganything package. Error: {e}"
458
+ ) from e
459
+
460
+ from src.services.embedding import (
461
+ get_embedding_client,
462
+ get_embedding_config,
463
+ reset_embedding_client,
464
+ )
465
+ from src.services.llm import get_llm_client
466
+ from src.services.rag.components.routing import FileTypeRouter
467
+ from src.services.rag.utils.image_migration import (
468
+ cleanup_parser_output_dirs,
469
+ migrate_images_and_update_paths,
470
+ )
471
+
472
+ # Pre-import progress stage if needed
473
+ ProgressStage: Any = None
474
+ if self.progress_tracker:
475
+ from src.knowledge.progress_tracker import ProgressStage
476
+
477
+ # Setup LLM and embedding
478
+ llm_client = get_llm_client()
479
+ self.llm_cfg = llm_client.config
480
+ llm_model_func = llm_client.get_model_func()
481
+ vision_model_func = llm_client.get_vision_model_func()
315
482
 
316
- # Embedding Setup
317
483
  reset_embedding_client()
318
484
  embedding_cfg = get_embedding_config()
319
485
  embedding_client = get_embedding_client()
@@ -327,16 +493,17 @@ class DocumentAdder:
327
493
  func=unified_embed_func,
328
494
  )
329
495
 
496
+ # Configure RAGAnything with the appropriate parser
330
497
  config = RAGAnythingConfig(
331
498
  working_dir=str(self.rag_storage_dir),
332
- parser="mineru",
499
+ parser=parser,
333
500
  enable_image_processing=True,
334
501
  enable_table_processing=True,
335
502
  enable_equation_processing=True,
336
503
  )
337
504
 
338
- with LightRAGLogContext(scene="knowledge_init"):
339
- rag = raganything_cls(
505
+ with LightRAGLogContext(scene="knowledge_incremental"):
506
+ rag = RAGAnything(
340
507
  config=config,
341
508
  llm_model_func=llm_model_func,
342
509
  vision_model_func=vision_model_func,
@@ -345,12 +512,12 @@ class DocumentAdder:
345
512
  if hasattr(rag, "_ensure_lightrag_initialized"):
346
513
  await rag._ensure_lightrag_initialized()
347
514
 
348
- # Classify files by type
515
+ # Classify files
349
516
  file_paths_str = [str(f) for f in new_files]
350
517
  classification = FileTypeRouter.classify_files(file_paths_str)
351
518
 
352
519
  logger.info(
353
- f"File classification: {len(classification.needs_mineru)} need MinerU, "
520
+ f"File classification: {len(classification.needs_mineru)} need {parser_name}, "
354
521
  f"{len(classification.text_files)} text files, "
355
522
  f"{len(classification.unsupported)} unsupported"
356
523
  )
@@ -358,8 +525,9 @@ class DocumentAdder:
358
525
  processed_files = []
359
526
  total_files = len(classification.needs_mineru) + len(classification.text_files)
360
527
  idx = 0
528
+ total_images_migrated = 0
361
529
 
362
- # Process files requiring MinerU (PDF, DOCX, images)
530
+ # Process files requiring parser (PDF, DOCX, images)
363
531
  for doc_file_str in classification.needs_mineru:
364
532
  doc_file = Path(doc_file_str)
365
533
  idx += 1
@@ -367,32 +535,53 @@ class DocumentAdder:
367
535
  if self.progress_tracker and ProgressStage:
368
536
  self.progress_tracker.update(
369
537
  ProgressStage.PROCESSING_FILE,
370
- f"Ingesting (MinerU) {doc_file.name}",
538
+ f"Ingesting ({parser_name}) {doc_file.name}",
371
539
  current=idx,
372
540
  total=total_files,
373
541
  )
374
542
 
375
- # Verify file still exists in raw/ (it should, as we staged it)
376
543
  if not doc_file.exists():
377
- logger.error(f" ✗ Failed: Staged file missing {doc_file.name}")
544
+ logger.error(f" ✗ Failed: File missing {doc_file.name}")
378
545
  continue
379
546
 
380
- await asyncio.wait_for(
381
- rag.process_document_complete(
382
- file_path=str(doc_file),
383
- output_dir=str(self.content_list_dir),
384
- parse_method="auto",
385
- ),
386
- timeout=600.0,
547
+ # Step 1: Parse document
548
+ logger.info(f" Step 1/3: Parsing {doc_file.name}...")
549
+ content_list, doc_id = await rag.parse_document(
550
+ file_path=str(doc_file),
551
+ output_dir=str(self.content_list_dir),
552
+ parse_method="auto",
387
553
  )
554
+
555
+ # Step 2: Migrate images
556
+ logger.info(" Step 2/3: Migrating images...")
557
+ updated_content_list, num_migrated = await migrate_images_and_update_paths(
558
+ content_list=content_list,
559
+ source_base_dir=self.content_list_dir,
560
+ target_images_dir=self.images_dir,
561
+ batch_size=50,
562
+ )
563
+ total_images_migrated += num_migrated
564
+
565
+ # Save content_list
566
+ content_list_file = self.content_list_dir / f"{doc_file.stem}.json"
567
+ with open(content_list_file, "w", encoding="utf-8") as f:
568
+ json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
569
+
570
+ # Step 3: Insert into RAG
571
+ logger.info(" Step 3/3: Inserting into knowledge graph...")
572
+ await rag.insert_content_list(
573
+ content_list=updated_content_list,
574
+ file_path=str(doc_file),
575
+ doc_id=doc_id,
576
+ )
577
+
388
578
  processed_files.append(doc_file)
389
- # Store hash on success - "Canonizing" the file
390
579
  self._record_successful_hash(doc_file)
391
- logger.info(f" ✓ Processed (MinerU): {doc_file.name}")
580
+ logger.info(f" ✓ Processed ({parser_name}): {doc_file.name}")
392
581
  except Exception as e:
393
582
  logger.exception(f" ✗ Failed {doc_file.name}: {e}")
394
583
 
395
- # Process text files directly (fast path - no MinerU)
584
+ # Process text files directly
396
585
  for doc_file_str in classification.text_files:
397
586
  doc_file = Path(doc_file_str)
398
587
  idx += 1
@@ -405,15 +594,12 @@ class DocumentAdder:
405
594
  total=total_files,
406
595
  )
407
596
 
408
- # Verify file still exists
409
597
  if not doc_file.exists():
410
- logger.error(f" ✗ Failed: Staged file missing {doc_file.name}")
598
+ logger.error(f" ✗ Failed: File missing {doc_file.name}")
411
599
  continue
412
600
 
413
- # Read text file directly
414
601
  content = await FileTypeRouter.read_text_file(str(doc_file))
415
602
  if content.strip():
416
- # Insert directly into LightRAG, bypassing MinerU
417
603
  await rag.lightrag.ainsert(content)
418
604
  processed_files.append(doc_file)
419
605
  self._record_successful_hash(doc_file)
@@ -423,13 +609,50 @@ class DocumentAdder:
423
609
  except Exception as e:
424
610
  logger.exception(f" ✗ Failed {doc_file.name}: {e}")
425
611
 
426
- # Log unsupported files
427
612
  for doc_file_str in classification.unsupported:
428
613
  logger.warning(f" ⚠ Skipped unsupported file: {Path(doc_file_str).name}")
429
614
 
615
+ # Cleanup parser output directories
616
+ if total_images_migrated > 0:
617
+ logger.info("Cleaning up temporary parser output directories...")
618
+ await cleanup_parser_output_dirs(self.content_list_dir)
619
+
430
620
  await self.fix_structure()
431
621
  return processed_files
432
622
 
623
+ async def _extract_text_basic(self, file_path: Path) -> str:
624
+ """Basic text extraction for LightRAG (text-only pipeline)."""
625
+ suffix = file_path.suffix.lower()
626
+
627
+ if suffix == ".pdf":
628
+ try:
629
+ import fitz # PyMuPDF
630
+
631
+ doc = fitz.open(file_path)
632
+ texts = []
633
+ for page in doc:
634
+ texts.append(page.get_text())
635
+ doc.close()
636
+ return "\n\n".join(texts)
637
+ except ImportError:
638
+ logger.warning("PyMuPDF not installed. Cannot extract PDF text.")
639
+ return ""
640
+ except Exception as e:
641
+ logger.error(f"Failed to extract PDF text: {e}")
642
+ return ""
643
+ else:
644
+ # Try to read as text
645
+ try:
646
+ with open(file_path, "r", encoding="utf-8") as f:
647
+ return f.read()
648
+ except Exception:
649
+ try:
650
+ with open(file_path, "r", encoding="latin-1") as f:
651
+ return f.read()
652
+ except Exception as e:
653
+ logger.error(f"Failed to read file as text: {e}")
654
+ return ""
655
+
433
656
  def _record_successful_hash(self, file_path: Path):
434
657
  """Update metadata with the hash of a successfully processed file."""
435
658
  file_hash = self._get_file_hash(file_path)
@@ -464,36 +687,50 @@ class DocumentAdder:
464
687
  ]
465
688
 
466
689
  async def fix_structure(self):
467
- """Robustly moves nested outputs and cleans up."""
468
- logger.info("Organizing storage structure...")
690
+ """
691
+ Clean up parser output directories after image migration.
692
+
693
+ NOTE: Image migration and path updates are now handled by the RAG pipeline
694
+ (raganything.py / raganything_docling.py) BEFORE RAG insertion. This ensures
695
+ RAG stores the correct canonical image paths (kb/images/) from the start.
696
+
697
+ This method now only cleans up empty temporary parser output directories.
698
+ """
699
+ logger.info("Checking for leftover parser output directories...")
469
700
 
470
- # 1. Identify moves
471
- moves = []
472
- for doc_dir in self.content_list_dir.glob("*"):
701
+ # Support both 'auto' (MinerU) and 'docling' parser output directories
702
+ parser_subdirs = ["auto", "docling"]
703
+ cleaned_count = 0
704
+
705
+ for doc_dir in list(self.content_list_dir.glob("*")):
473
706
  if not doc_dir.is_dir():
474
707
  continue
475
708
 
476
- # Content List
477
- json_src = next(doc_dir.glob("auto/*_content_list.json"), None)
478
- if json_src:
479
- moves.append((json_src, self.content_list_dir / f"{doc_dir.name}.json"))
709
+ for parser_subdir in parser_subdirs:
710
+ subdir = doc_dir / parser_subdir
711
+ if subdir.exists():
712
+ try:
713
+ # Check if directory is empty or only contains empty subdirs
714
+ has_content = any(
715
+ f.is_file() or (f.is_dir() and any(f.iterdir()))
716
+ for f in subdir.iterdir()
717
+ )
480
718
 
481
- # Images
482
- for img in doc_dir.glob("auto/images/*"):
483
- moves.append((img, self.images_dir / img.name))
719
+ if not has_content:
720
+ await self._run_in_executor(shutil.rmtree, subdir)
721
+ cleaned_count += 1
722
+ except Exception as e:
723
+ logger.debug(f"Could not clean up {subdir}: {e}")
484
724
 
485
- # 2. Execute moves
486
- for src, dest in moves:
487
- if src.exists():
488
- await self._run_in_executor(shutil.copy2, src, dest)
725
+ # Remove doc_dir if it's now empty
726
+ try:
727
+ if doc_dir.exists() and not any(doc_dir.iterdir()):
728
+ doc_dir.rmdir()
729
+ except Exception:
730
+ pass
489
731
 
490
- # 3. Safe Cleanup: Only delete directories we actually processed
491
- for doc_dir in self.content_list_dir.glob("*"):
492
- if doc_dir.is_dir():
493
- # Safety check: only delete if it looks like a parser output (has 'auto' subdir)
494
- # This prevents wiping manual user folders in content_list_dir
495
- if (doc_dir / "auto").exists():
496
- await self._run_in_executor(shutil.rmtree, doc_dir, ignore_errors=True)
732
+ if cleaned_count > 0:
733
+ logger.info(f"Cleaned up {cleaned_count} empty parser directories")
497
734
 
498
735
  def extract_numbered_items_for_new_docs(self, processed_files, batch_size=20):
499
736
  if not processed_files:
@@ -519,6 +756,11 @@ class DocumentAdder:
519
756
  )
520
757
 
521
758
  def update_metadata(self, added_count: int):
759
+ """Update metadata after incremental add.
760
+
761
+ Note: We do NOT update rag_provider here - incremental adds must use
762
+ the same provider as the original initialization for consistency.
763
+ """
522
764
  if not self.metadata_file.exists():
523
765
  return
524
766
  try:
@@ -527,18 +769,9 @@ class DocumentAdder:
527
769
 
528
770
  metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
529
771
 
530
- # Update RAG provider if specified
531
- if self.rag_provider:
532
- metadata["rag_provider"] = self.rag_provider
533
-
534
- # Also save to centralized config file
535
- try:
536
- from src.services.config import get_kb_config_service
537
-
538
- kb_config_service = get_kb_config_service()
539
- kb_config_service.set_rag_provider(self.kb_name, self.rag_provider)
540
- except Exception as config_err:
541
- logger.warning(f"Failed to save to centralized config: {config_err}")
772
+ # Record the provider used (should match what's already in metadata)
773
+ if "rag_provider" not in metadata and self._resolved_provider:
774
+ metadata["rag_provider"] = self._resolved_provider
542
775
 
543
776
  history = metadata.get("update_history", [])
544
777
  history.append(
@@ -546,6 +779,7 @@ class DocumentAdder:
546
779
  "timestamp": metadata["last_updated"],
547
780
  "action": "incremental_add",
548
781
  "count": added_count,
782
+ "provider": self._resolved_provider,
549
783
  }
550
784
  )
551
785
  metadata["update_history"] = history
@@ -557,7 +791,17 @@ class DocumentAdder:
557
791
 
558
792
 
559
793
  async def main():
560
- parser = argparse.ArgumentParser(description="Incrementally add documents to RAG KB")
794
+ parser = argparse.ArgumentParser(
795
+ description="Incrementally add documents to RAG KB",
796
+ epilog="""
797
+ Example usage:
798
+ # Add documents to existing KB (uses provider from KB metadata)
799
+ python -m src.knowledge.add_documents my_kb --docs doc1.pdf doc2.txt
800
+
801
+ # Add all documents from a directory
802
+ python -m src.knowledge.add_documents my_kb --docs-dir ./documents/
803
+ """,
804
+ )
561
805
  parser.add_argument("kb_name", help="KB Name")
562
806
  parser.add_argument("--docs", nargs="+", help="Files")
563
807
  parser.add_argument("--docs-dir", help="Directory")
@@ -568,10 +812,6 @@ async def main():
568
812
 
569
813
  args = parser.parse_args()
570
814
 
571
- # Initialize dynamic paths
572
- project_root = Path(__file__).parent.parent.parent
573
- load_dynamic_imports(project_root)
574
-
575
815
  load_dotenv()
576
816
 
577
817
  doc_files = []