realtimex-deeptutor 0.5.0.post1__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +24 -17
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +143 -123
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +1 -1
- realtimex_deeptutor-0.5.0.post3.dist-info/entry_points.txt +4 -0
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/__init__.py +2 -2
- src/agents/ideagen/material_organizer_agent.py +2 -0
- src/agents/solve/__init__.py +6 -0
- src/agents/solve/main_solver.py +9 -0
- src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +9 -7
- src/agents/solve/session_manager.py +345 -0
- src/api/main.py +14 -0
- src/api/routers/chat.py +3 -3
- src/api/routers/co_writer.py +12 -7
- src/api/routers/config.py +1 -0
- src/api/routers/guide.py +3 -1
- src/api/routers/ideagen.py +7 -0
- src/api/routers/knowledge.py +64 -12
- src/api/routers/question.py +2 -0
- src/api/routers/realtimex.py +137 -0
- src/api/routers/research.py +9 -0
- src/api/routers/solve.py +120 -2
- src/cli/__init__.py +13 -0
- src/cli/start.py +209 -0
- src/config/constants.py +11 -9
- src/knowledge/add_documents.py +453 -213
- src/knowledge/extract_numbered_items.py +9 -10
- src/knowledge/initializer.py +102 -101
- src/knowledge/manager.py +251 -74
- src/knowledge/progress_tracker.py +43 -2
- src/knowledge/start_kb.py +11 -2
- src/logging/__init__.py +5 -0
- src/logging/adapters/__init__.py +1 -0
- src/logging/adapters/lightrag.py +25 -18
- src/logging/adapters/llamaindex.py +1 -0
- src/logging/config.py +30 -27
- src/logging/handlers/__init__.py +1 -0
- src/logging/handlers/console.py +7 -50
- src/logging/handlers/file.py +5 -20
- src/logging/handlers/websocket.py +23 -19
- src/logging/logger.py +161 -126
- src/logging/stats/__init__.py +1 -0
- src/logging/stats/llm_stats.py +37 -17
- src/services/__init__.py +17 -1
- src/services/config/__init__.py +1 -0
- src/services/config/knowledge_base_config.py +1 -0
- src/services/config/loader.py +1 -1
- src/services/config/unified_config.py +211 -4
- src/services/embedding/__init__.py +1 -0
- src/services/embedding/adapters/__init__.py +3 -0
- src/services/embedding/adapters/base.py +1 -0
- src/services/embedding/adapters/cohere.py +1 -0
- src/services/embedding/adapters/jina.py +1 -0
- src/services/embedding/adapters/ollama.py +1 -0
- src/services/embedding/adapters/openai_compatible.py +1 -0
- src/services/embedding/adapters/realtimex.py +125 -0
- src/services/embedding/client.py +27 -0
- src/services/embedding/config.py +3 -0
- src/services/embedding/provider.py +1 -0
- src/services/llm/__init__.py +17 -3
- src/services/llm/capabilities.py +47 -0
- src/services/llm/client.py +32 -0
- src/services/llm/cloud_provider.py +21 -4
- src/services/llm/config.py +36 -2
- src/services/llm/error_mapping.py +1 -0
- src/services/llm/exceptions.py +30 -0
- src/services/llm/factory.py +55 -16
- src/services/llm/local_provider.py +1 -0
- src/services/llm/providers/anthropic.py +1 -0
- src/services/llm/providers/base_provider.py +1 -0
- src/services/llm/providers/open_ai.py +1 -0
- src/services/llm/realtimex_provider.py +240 -0
- src/services/llm/registry.py +1 -0
- src/services/llm/telemetry.py +1 -0
- src/services/llm/types.py +1 -0
- src/services/llm/utils.py +1 -0
- src/services/prompt/__init__.py +1 -0
- src/services/prompt/manager.py +3 -2
- src/services/rag/__init__.py +27 -5
- src/services/rag/components/__init__.py +1 -0
- src/services/rag/components/base.py +1 -0
- src/services/rag/components/chunkers/__init__.py +1 -0
- src/services/rag/components/chunkers/base.py +1 -0
- src/services/rag/components/chunkers/fixed.py +1 -0
- src/services/rag/components/chunkers/numbered_item.py +1 -0
- src/services/rag/components/chunkers/semantic.py +1 -0
- src/services/rag/components/embedders/__init__.py +1 -0
- src/services/rag/components/embedders/base.py +1 -0
- src/services/rag/components/embedders/openai.py +1 -0
- src/services/rag/components/indexers/__init__.py +1 -0
- src/services/rag/components/indexers/base.py +1 -0
- src/services/rag/components/indexers/graph.py +5 -44
- src/services/rag/components/indexers/lightrag.py +5 -44
- src/services/rag/components/indexers/vector.py +1 -0
- src/services/rag/components/parsers/__init__.py +1 -0
- src/services/rag/components/parsers/base.py +1 -0
- src/services/rag/components/parsers/markdown.py +1 -0
- src/services/rag/components/parsers/pdf.py +1 -0
- src/services/rag/components/parsers/text.py +1 -0
- src/services/rag/components/retrievers/__init__.py +1 -0
- src/services/rag/components/retrievers/base.py +1 -0
- src/services/rag/components/retrievers/dense.py +1 -0
- src/services/rag/components/retrievers/hybrid.py +5 -44
- src/services/rag/components/retrievers/lightrag.py +5 -44
- src/services/rag/components/routing.py +48 -0
- src/services/rag/factory.py +112 -46
- src/services/rag/pipeline.py +1 -0
- src/services/rag/pipelines/__init__.py +27 -18
- src/services/rag/pipelines/lightrag.py +1 -0
- src/services/rag/pipelines/llamaindex.py +99 -0
- src/services/rag/pipelines/raganything.py +67 -100
- src/services/rag/pipelines/raganything_docling.py +368 -0
- src/services/rag/service.py +5 -12
- src/services/rag/types.py +1 -0
- src/services/rag/utils/__init__.py +17 -0
- src/services/rag/utils/image_migration.py +279 -0
- src/services/search/__init__.py +1 -0
- src/services/search/base.py +1 -0
- src/services/search/consolidation.py +1 -0
- src/services/search/providers/__init__.py +1 -0
- src/services/search/providers/baidu.py +1 -0
- src/services/search/providers/exa.py +1 -0
- src/services/search/providers/jina.py +1 -0
- src/services/search/providers/perplexity.py +1 -0
- src/services/search/providers/serper.py +1 -0
- src/services/search/providers/tavily.py +1 -0
- src/services/search/types.py +1 -0
- src/services/settings/__init__.py +1 -0
- src/services/settings/interface_settings.py +78 -0
- src/services/setup/__init__.py +1 -0
- src/services/tts/__init__.py +1 -0
- src/services/tts/config.py +1 -0
- src/utils/realtimex.py +284 -0
- realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +0 -2
- src/services/rag/pipelines/academic.py +0 -44
- {realtimex_deeptutor-0.5.0.post1.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
src/knowledge/add_documents.py
CHANGED
```diff
@@ -4,6 +4,12 @@
 Incrementally add documents to existing knowledge base.
 Improved version with Hash-based duplicate checking, robust error handling,
 and architectural improvements for data integrity and vision support.
+
+Supports multiple RAG providers with lazy loading:
+- llamaindex: Pure vector retrieval (load_index + insert + persist)
+- lightrag: Knowledge graph (LightRAG.ainsert, text-only)
+- raganything: Multimodal with MinerU parser
+- raganything_docling: Multimodal with Docling parser
 """
 
 import argparse
@@ -17,64 +23,18 @@ from pathlib import Path
 import shutil
 import sys
 import tempfile
-from typing import
+from typing import Any, Dict, List
 
 from dotenv import load_dotenv
 
-# Attempt imports for dynamic dependencies
-try:
-    from lightrag.llm.openai import openai_complete_if_cache
-    from lightrag.utils import EmbeddingFunc
-except ImportError:
-    # These will be caught during runtime if needed
-    openai_complete_if_cache = None
-    EmbeddingFunc = None
-
-# Type hinting support for dynamic imports
-if TYPE_CHECKING:
-    try:
-        from raganything import RAGAnything
-        from raganything import RAGAnythingConfig as RAGAnythingConfigType
-    except ImportError:
-        RAGAnything = Any
-        RAGAnythingConfigType = Any
-else:
-    RAGAnything = None
-    RAGAnythingConfigType = None
-
-# Placeholder for runtime classes
-raganything_cls = None
-RAGAnythingConfig = None
-
-
-def load_dynamic_imports(project_root: Path):
-    """Handle the path injections and dynamic imports safely."""
-    global raganything_cls, RAGAnythingConfig
-
-    sys.path.insert(0, str(project_root))
-    raganything_path = project_root.parent / "raganything" / "RAG-Anything"
-    if raganything_path.exists():
-        sys.path.insert(0, str(raganything_path))
-
-    try:
-        from raganything import RAGAnything as RA
-        from raganything import RAGAnythingConfig as RAC
-
-        raganything_cls = RA
-        RAGAnythingConfig = RAC
-    except ImportError:
-        pass
-
-
 from src.knowledge.extract_numbered_items import process_content_list
 from src.logging import LightRAGLogContext, get_logger
-from src.services.embedding import (
-    get_embedding_client,
-    get_embedding_config,
-    reset_embedding_client,
-)
 from src.services.llm import get_llm_config
 
+# Load LLM config early to ensure OPENAI_API_KEY env var is set before LightRAG imports
+# This is critical because LightRAG reads os.environ["OPENAI_API_KEY"] directly
+from src.services.llm.config import get_llm_config as _early_config_load  # noqa: F401
+
 logger = get_logger("KnowledgeInit")
 
 # Default base directory for knowledge bases
@@ -82,7 +42,14 @@ DEFAULT_BASE_DIR = "./data/knowledge_bases"
 
 
 class DocumentAdder:
-    """Add documents to existing knowledge base with Hash-validation
+    """Add documents to existing knowledge base with Hash-validation.
+
+    Supports multiple RAG providers with lazy loading to avoid unnecessary imports:
+    - llamaindex: Only imports llama_index modules
+    - lightrag: Only imports lightrag modules (no raganything)
+    - raganything: Imports raganything with MinerU parser
+    - raganything_docling: Imports raganything with Docling parser
+    """
 
     def __init__(
         self,
@@ -106,15 +73,61 @@ class DocumentAdder:
         self.content_list_dir = self.kb_dir / "content_list"
         self.metadata_file = self.kb_dir / "metadata.json"
 
-
-
+        # For llamaindex, check llamaindex_storage instead of rag_storage
+        provider = self._get_provider_from_metadata()
+        if provider == "llamaindex":
+            llamaindex_storage = self.kb_dir / "llamaindex_storage"
+            if not llamaindex_storage.exists():
+                raise ValueError(f"Knowledge base not initialized (llamaindex): {kb_name}")
+        else:
+            if not self.rag_storage_dir.exists():
+                raise ValueError(f"Knowledge base not initialized: {kb_name}")
 
         self.api_key = api_key
         self.base_url = base_url
         self.progress_tracker = progress_tracker
-
+
+        # IMPORTANT: rag_provider parameter is IGNORED for incremental add
+        # We always use the provider from KB metadata to ensure consistency
+        # This prevents mixing different index formats in the same KB
+        self._resolved_provider = provider
+        if rag_provider and rag_provider != provider:
+            logger.warning(
+                f"Requested provider '{rag_provider}' ignored. "
+                f"Using KB's existing provider '{provider}' for consistency."
+            )
+        logger.info(f"Incremental add will use provider: {provider} (from KB metadata)")
        self._ensure_working_directories()
 
+    def _get_provider_from_metadata(self) -> str:
+        """
+        Get the RAG provider from KB metadata.
+
+        This is the ONLY source of truth for incremental adds - we must use
+        the same provider that was used during initialization to ensure
+        data consistency and correct storage format.
+
+        Returns:
+            Provider name (llamaindex, lightrag, raganything, raganything_docling)
+        """
+        if self.metadata_file.exists():
+            try:
+                with open(self.metadata_file, "r", encoding="utf-8") as f:
+                    metadata = json.load(f)
+                provider = metadata.get("rag_provider")
+                if provider:
+                    return provider
+            except Exception as e:
+                logger.warning(f"Failed to read provider from metadata: {e}")
+
+        # Fallback: detect from storage structure
+        llamaindex_storage = self.kb_dir / "llamaindex_storage"
+        if llamaindex_storage.exists():
+            return "llamaindex"
+
+        # Default to raganything for backward compatibility
+        return "raganything"
+
     def _ensure_working_directories(self):
         for directory in [self.raw_dir, self.images_dir, self.content_list_dir]:
             directory.mkdir(parents=True, exist_ok=True)
```
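The constructor change above makes the KB's recorded provider the single source of truth for incremental adds. A minimal standalone sketch of the same fallback chain, for readers skimming the diff (the `metadata.json` key and directory names are taken from this diff; the helper itself is illustrative):

```python
import json
from pathlib import Path


def resolve_provider(kb_dir: Path) -> str:
    """Mirror the fallback chain in _get_provider_from_metadata."""
    metadata_file = kb_dir / "metadata.json"
    if metadata_file.exists():
        try:
            # Prefer the provider recorded when the KB was initialized
            provider = json.loads(metadata_file.read_text(encoding="utf-8")).get("rag_provider")
            if provider:
                return provider
        except (OSError, json.JSONDecodeError):
            pass
    # Fall back to detecting the storage layout on disk
    if (kb_dir / "llamaindex_storage").exists():
        return "llamaindex"
    # Older KBs predate the metadata field; default to raganything
    return "raganything"
```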
```diff
@@ -202,118 +215,271 @@ class DocumentAdder:
         """
         Async phase: Ingests files into the RAG system.
 
-        Uses
-        -
-        -
+        Uses lazy loading to only import dependencies for the actual provider:
+        - llamaindex: Only imports llama_index
+        - lightrag: Only imports lightrag (no raganything)
+        - raganything: Imports raganything with MinerU
+        - raganything_docling: Imports raganything with Docling
         """
         if not new_files:
             return None
 
-
-
+        provider = self._resolved_provider
+        logger.info(f"Processing {len(new_files)} files with provider: {provider}")
+
+        # Dispatch to provider-specific implementation
+        if provider == "llamaindex":
+            return await self._process_llamaindex(new_files)
+        elif provider == "lightrag":
+            return await self._process_lightrag(new_files)
+        elif provider == "raganything":
+            return await self._process_raganything(new_files, parser="mineru")
+        elif provider == "raganything_docling":
+            return await self._process_raganything(new_files, parser="docling")
+        else:
+            raise ValueError(f"Unknown RAG provider: {provider}")
+
+    async def _process_llamaindex(self, new_files: List[Path]) -> List[Path]:
+        """
+        Incremental add for LlamaIndex pipeline.
+        Lazy imports llama_index only when needed.
+        """
+        logger.info("Using LlamaIndex incremental add...")
+
+        # Lazy import llama_index
+        try:
+            from src.services.rag.pipelines.llamaindex import LlamaIndexPipeline
+        except ImportError as e:
+            raise ImportError(
+                f"LlamaIndex dependencies not installed. "
+                f"Install with: pip install llama-index llama-index-core. Error: {e}"
+            ) from e
+
+        # Pre-import progress stage if needed
+        ProgressStage: Any = None
+        if self.progress_tracker:
+            from src.knowledge.progress_tracker import ProgressStage
+
+        pipeline = LlamaIndexPipeline(kb_base_dir=str(self.base_dir))
+        file_paths = [str(f) for f in new_files]
+
+        # Use the new add_documents method for incremental add
+        processed_files = []
+        total_files = len(file_paths)
+
+        for idx, file_path in enumerate(file_paths, 1):
+            doc_file = Path(file_path)
+            try:
+                if self.progress_tracker and ProgressStage:
+                    self.progress_tracker.update(
+                        ProgressStage.PROCESSING_FILE,
+                        f"Indexing (LlamaIndex) {doc_file.name}",
+                        current=idx,
+                        total=total_files,
+                    )
+
+                # Use add_documents for incremental add
+                success = await pipeline.add_documents(self.kb_name, [file_path])
+                if success:
+                    processed_files.append(doc_file)
+                    self._record_successful_hash(doc_file)
+                    logger.info(f"  ✓ Processed (LlamaIndex): {doc_file.name}")
+                else:
+                    logger.error(f"  ✗ Failed to index: {doc_file.name}")
+            except Exception as e:
+                logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
+
+        return processed_files
+
+    async def _process_lightrag(self, new_files: List[Path]) -> List[Path]:
+        """
+        Incremental add for LightRAG pipeline (text-only).
+        Lazy imports lightrag only when needed - does NOT require raganything.
+        """
+        logger.info("Using LightRAG incremental add (text-only)...")
 
+        # Lazy import lightrag
+        try:
+            from lightrag import LightRAG
+            from lightrag.utils import EmbeddingFunc
+        except ImportError as e:
+            raise ImportError(
+                f"LightRAG dependencies not installed. "
+                f"Install with: pip install lightrag. Error: {e}"
+            ) from e
+
+        from src.services.embedding import (
+            get_embedding_client,
+            get_embedding_config,
+            reset_embedding_client,
+        )
+        from src.services.llm import get_llm_client
         from src.services.rag.components.routing import FileTypeRouter
 
-        # Pre-import progress stage if needed
+        # Pre-import progress stage if needed
         ProgressStage: Any = None
         if self.progress_tracker:
             from src.knowledge.progress_tracker import ProgressStage
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Setup LLM and embedding
+        llm_client = get_llm_client()
+        self.llm_cfg = llm_client.config
+        llm_model_func = llm_client.get_model_func()
+
+        reset_embedding_client()
+        embedding_cfg = get_embedding_config()
+        embedding_client = get_embedding_client()
+
+        async def unified_embed_func(texts):
+            return await embedding_client.embed(texts)
+
+        embedding_func = EmbeddingFunc(
+            embedding_dim=embedding_cfg.dim,
+            max_token_size=embedding_cfg.max_tokens,
+            func=unified_embed_func,
+        )
+
+        # Create LightRAG instance (text-only, no raganything)
+        with LightRAGLogContext(scene="knowledge_incremental"):
+            rag = LightRAG(
+                working_dir=str(self.rag_storage_dir),
+                llm_model_func=llm_model_func,
+                embedding_func=embedding_func,
             )
+        await rag.initialize_storages()
 
-
-        def vision_model_func(
-            prompt,
-            system_prompt=None,
-            history_messages=None,
-            image_data=None,
-            messages=None,
-            **kwargs,
-        ):
-            if history_messages is None:
-                history_messages = []
-            # If pre-formatted messages are provided, sanitize them
-            if messages:
-                safe_messages = self._filter_valid_messages(messages)
-                return openai_complete_if_cache(
-                    model,
-                    prompt="",
-                    messages=safe_messages,
-                    api_key=api_key,
-                    base_url=base_url,
-                    **kwargs,
-                )
+        from lightrag.kg.shared_storage import initialize_pipeline_status
 
-
-            current_messages = []
-
-            # 1. Add System Prompt (if provided)
-            if system_prompt:
-                current_messages.append({"role": "system", "content": system_prompt})
-
-            # 2. Add History (Filtering out conflicting system prompts)
-            if history_messages:
-                # Filter out system messages from history to avoid duplicates/conflicts with the new system_prompt
-                filtered_history = [
-                    msg
-                    for msg in history_messages
-                    if isinstance(msg, dict) and msg.get("role") != "system"
-                ]
-                current_messages.extend(filtered_history)
-
-            # 3. Construct New User Message
-            user_content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
-            if image_data:
-                user_content.append(
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
-                    }
-                )
+        await initialize_pipeline_status()
 
-
-
-
-            # If last content is string, convert to list format first
-            if isinstance(last_msg["content"], str):
-                last_msg["content"] = [{"type": "text", "text": last_msg["content"]}]
+        # Classify files
+        file_paths_str = [str(f) for f in new_files]
+        classification = FileTypeRouter.classify_files(file_paths_str)
 
-
-
-
+        logger.info(
+            f"File classification: {len(classification.needs_mineru)} need parsing, "
+            f"{len(classification.text_files)} text files, "
+            f"{len(classification.unsupported)} unsupported"
+        )
+
+        processed_files = []
+        total_files = len(classification.needs_mineru) + len(classification.text_files)
+        idx = 0
+
+        # For LightRAG (text-only), use basic PDF text extraction for PDFs
+        for doc_file_str in classification.needs_mineru:
+            doc_file = Path(doc_file_str)
+            idx += 1
+            try:
+                if self.progress_tracker and ProgressStage:
+                    self.progress_tracker.update(
+                        ProgressStage.PROCESSING_FILE,
+                        f"Extracting text from {doc_file.name}",
+                        current=idx,
+                        total=total_files,
+                    )
+
+                if not doc_file.exists():
+                    logger.error(f"  ✗ Failed: File missing {doc_file.name}")
+                    continue
+
+                # Basic text extraction
+                content = await self._extract_text_basic(doc_file)
+                if content.strip():
+                    await rag.ainsert(content)
+                    processed_files.append(doc_file)
+                    self._record_successful_hash(doc_file)
+                    logger.info(f"  ✓ Processed (LightRAG): {doc_file.name}")
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    logger.warning(f"  ⚠ No text extracted: {doc_file.name}")
+            except Exception as e:
+                logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
+
+        # Process text files directly
+        for doc_file_str in classification.text_files:
+            doc_file = Path(doc_file_str)
+            idx += 1
+            try:
+                if self.progress_tracker and ProgressStage:
+                    self.progress_tracker.update(
+                        ProgressStage.PROCESSING_FILE,
+                        f"Ingesting (text) {doc_file.name}",
+                        current=idx,
+                        total=total_files,
+                    )
+
+                if not doc_file.exists():
+                    logger.error(f"  ✗ Failed: File missing {doc_file.name}")
+                    continue
+
+                content = await FileTypeRouter.read_text_file(str(doc_file))
+                if content.strip():
+                    await rag.ainsert(content)
+                    processed_files.append(doc_file)
+                    self._record_successful_hash(doc_file)
+                    logger.info(f"  ✓ Processed (text): {doc_file.name}")
+                else:
+                    logger.warning(f"  ⚠ Skipped empty file: {doc_file.name}")
+            except Exception as e:
+                logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
+
+        for doc_file_str in classification.unsupported:
+            logger.warning(f"  ⚠ Skipped unsupported file: {Path(doc_file_str).name}")
+
+        return processed_files
+
+    async def _process_raganything(
+        self, new_files: List[Path], parser: str = "mineru"
+    ) -> List[Path]:
+        """
+        Incremental add for RAGAnything pipeline (multimodal).
+        Lazy imports raganything only when needed.
+
+        Args:
+            parser: "mineru" for RAGAnything, "docling" for RAGAnything Docling
+        """
+        parser_name = "MinerU" if parser == "mineru" else "Docling"
+        logger.info(f"Using RAGAnything incremental add with {parser_name} parser...")
+
+        # Lazy import raganything
+        try:
+            # Add RAG-Anything to path if needed
+            project_root = Path(__file__).resolve().parent.parent.parent
+            raganything_path = project_root.parent / "raganything" / "RAG-Anything"
+            if raganything_path.exists() and str(raganything_path) not in sys.path:
+                sys.path.insert(0, str(raganything_path))
+
+            from lightrag.utils import EmbeddingFunc
+            from raganything import RAGAnything, RAGAnythingConfig
+        except ImportError as e:
+            raise ImportError(
+                f"RAGAnything dependencies not installed. "
+                f"Please install raganything package. Error: {e}"
+            ) from e
+
+        from src.services.embedding import (
+            get_embedding_client,
+            get_embedding_config,
+            reset_embedding_client,
+        )
+        from src.services.llm import get_llm_client
+        from src.services.rag.components.routing import FileTypeRouter
+        from src.services.rag.utils.image_migration import (
+            cleanup_parser_output_dirs,
+            migrate_images_and_update_paths,
+        )
+
+        # Pre-import progress stage if needed
+        ProgressStage: Any = None
+        if self.progress_tracker:
+            from src.knowledge.progress_tracker import ProgressStage
+
+        # Setup LLM and embedding
+        llm_client = get_llm_client()
+        self.llm_cfg = llm_client.config
+        llm_model_func = llm_client.get_model_func()
+        vision_model_func = llm_client.get_vision_model_func()
 
-        # Embedding Setup
         reset_embedding_client()
         embedding_cfg = get_embedding_config()
         embedding_client = get_embedding_client()
```
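Both the LightRAG and RAGAnything paths above adapt the project's unified embedding client to LightRAG's `EmbeddingFunc`, which expects an async callable from a list of texts to vectors. A runnable sketch of that adapter shape, with a stub standing in for `get_embedding_client()` (the stub class and its dimension are illustrative, not the project's API):

```python
import asyncio


class StubEmbeddingClient:
    """Stand-in for the project's embedding client (illustrative only)."""

    dim = 4

    async def embed(self, texts):
        # A real client would call an embedding API here
        return [[0.0] * self.dim for _ in texts]


client = StubEmbeddingClient()


async def unified_embed_func(texts):
    # The shape EmbeddingFunc consumes: list[str] -> list of vectors
    return await client.embed(texts)


print(asyncio.run(unified_embed_func(["hello", "world"])))
```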
```diff
@@ -327,16 +493,17 @@ class DocumentAdder:
             func=unified_embed_func,
         )
 
+        # Configure RAGAnything with the appropriate parser
         config = RAGAnythingConfig(
             working_dir=str(self.rag_storage_dir),
-            parser=
+            parser=parser,
             enable_image_processing=True,
             enable_table_processing=True,
             enable_equation_processing=True,
         )
 
-        with LightRAGLogContext(scene="
-            rag =
+        with LightRAGLogContext(scene="knowledge_incremental"):
+            rag = RAGAnything(
                 config=config,
                 llm_model_func=llm_model_func,
                 vision_model_func=vision_model_func,
@@ -345,12 +512,12 @@ class DocumentAdder:
         if hasattr(rag, "_ensure_lightrag_initialized"):
             await rag._ensure_lightrag_initialized()
 
-        # Classify files
+        # Classify files
         file_paths_str = [str(f) for f in new_files]
         classification = FileTypeRouter.classify_files(file_paths_str)
 
         logger.info(
-            f"File classification: {len(classification.needs_mineru)} need
+            f"File classification: {len(classification.needs_mineru)} need {parser_name}, "
             f"{len(classification.text_files)} text files, "
             f"{len(classification.unsupported)} unsupported"
         )
@@ -358,8 +525,9 @@ class DocumentAdder:
         processed_files = []
         total_files = len(classification.needs_mineru) + len(classification.text_files)
         idx = 0
+        total_images_migrated = 0
 
-        # Process files requiring
+        # Process files requiring parser (PDF, DOCX, images)
         for doc_file_str in classification.needs_mineru:
             doc_file = Path(doc_file_str)
             idx += 1
@@ -367,32 +535,53 @@ class DocumentAdder:
                 if self.progress_tracker and ProgressStage:
                     self.progress_tracker.update(
                         ProgressStage.PROCESSING_FILE,
-                        f"Ingesting (
+                        f"Ingesting ({parser_name}) {doc_file.name}",
                         current=idx,
                         total=total_files,
                     )
 
-                # Verify file still exists in raw/ (it should, as we staged it)
                 if not doc_file.exists():
-                    logger.error(f"  ✗ Failed:
+                    logger.error(f"  ✗ Failed: File missing {doc_file.name}")
                     continue
 
-
-
-
-
-
-
-                    timeout=600.0,
+                # Step 1: Parse document
+                logger.info(f"  Step 1/3: Parsing {doc_file.name}...")
+                content_list, doc_id = await rag.parse_document(
+                    file_path=str(doc_file),
+                    output_dir=str(self.content_list_dir),
+                    parse_method="auto",
                 )
+
+                # Step 2: Migrate images
+                logger.info("  Step 2/3: Migrating images...")
+                updated_content_list, num_migrated = await migrate_images_and_update_paths(
+                    content_list=content_list,
+                    source_base_dir=self.content_list_dir,
+                    target_images_dir=self.images_dir,
+                    batch_size=50,
+                )
+                total_images_migrated += num_migrated
+
+                # Save content_list
+                content_list_file = self.content_list_dir / f"{doc_file.stem}.json"
+                with open(content_list_file, "w", encoding="utf-8") as f:
+                    json.dump(updated_content_list, f, ensure_ascii=False, indent=2)
+
+                # Step 3: Insert into RAG
+                logger.info("  Step 3/3: Inserting into knowledge graph...")
+                await rag.insert_content_list(
+                    content_list=updated_content_list,
+                    file_path=str(doc_file),
+                    doc_id=doc_id,
+                )
+
                 processed_files.append(doc_file)
-                # Store hash on success - "Canonizing" the file
                 self._record_successful_hash(doc_file)
-                logger.info(f"  ✓ Processed (
+                logger.info(f"  ✓ Processed ({parser_name}): {doc_file.name}")
             except Exception as e:
                 logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
 
-        # Process text files directly
+        # Process text files directly
         for doc_file_str in classification.text_files:
             doc_file = Path(doc_file_str)
             idx += 1
@@ -405,15 +594,12 @@ class DocumentAdder:
                         total=total_files,
                     )
 
-                # Verify file still exists
                 if not doc_file.exists():
-                    logger.error(f"  ✗ Failed:
+                    logger.error(f"  ✗ Failed: File missing {doc_file.name}")
                     continue
 
-                # Read text file directly
                 content = await FileTypeRouter.read_text_file(str(doc_file))
                 if content.strip():
-                    # Insert directly into LightRAG, bypassing MinerU
                     await rag.lightrag.ainsert(content)
                     processed_files.append(doc_file)
                     self._record_successful_hash(doc_file)
@@ -423,13 +609,50 @@ class DocumentAdder:
             except Exception as e:
                 logger.exception(f"  ✗ Failed {doc_file.name}: {e}")
 
-        # Log unsupported files
         for doc_file_str in classification.unsupported:
             logger.warning(f"  ⚠ Skipped unsupported file: {Path(doc_file_str).name}")
 
+        # Cleanup parser output directories
+        if total_images_migrated > 0:
+            logger.info("Cleaning up temporary parser output directories...")
+            await cleanup_parser_output_dirs(self.content_list_dir)
+
         await self.fix_structure()
         return processed_files
 
+    async def _extract_text_basic(self, file_path: Path) -> str:
+        """Basic text extraction for LightRAG (text-only pipeline)."""
+        suffix = file_path.suffix.lower()
+
+        if suffix == ".pdf":
+            try:
+                import fitz  # PyMuPDF
+
+                doc = fitz.open(file_path)
+                texts = []
+                for page in doc:
+                    texts.append(page.get_text())
+                doc.close()
+                return "\n\n".join(texts)
+            except ImportError:
+                logger.warning("PyMuPDF not installed. Cannot extract PDF text.")
+                return ""
+            except Exception as e:
+                logger.error(f"Failed to extract PDF text: {e}")
+                return ""
+        else:
+            # Try to read as text
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    return f.read()
+            except Exception:
+                try:
+                    with open(file_path, "r", encoding="latin-1") as f:
+                        return f.read()
+                except Exception as e:
+                    logger.error(f"Failed to read file as text: {e}")
+                    return ""
+
     def _record_successful_hash(self, file_path: Path):
         """Update metadata with the hash of a successfully processed file."""
         file_hash = self._get_file_hash(file_path)
```
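The PDF branch of the new `_extract_text_basic` is plain PyMuPDF page iteration. The same extraction as a standalone helper, assuming PyMuPDF is installed via `pip install pymupdf` (the path in the commented call is illustrative):

```python
import fitz  # PyMuPDF


def pdf_to_text(path: str) -> str:
    """Concatenate per-page text, as _extract_text_basic does for .pdf files."""
    with fitz.open(path) as doc:  # fitz.Document supports the context-manager protocol
        return "\n\n".join(page.get_text() for page in doc)


# print(pdf_to_text("example.pdf"))
```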
```diff
@@ -464,36 +687,50 @@ class DocumentAdder:
         ]
 
     async def fix_structure(self):
-        """
-
+        """
+        Clean up parser output directories after image migration.
+
+        NOTE: Image migration and path updates are now handled by the RAG pipeline
+        (raganything.py / raganything_docling.py) BEFORE RAG insertion. This ensures
+        RAG stores the correct canonical image paths (kb/images/) from the start.
+
+        This method now only cleans up empty temporary parser output directories.
+        """
+        logger.info("Checking for leftover parser output directories...")
 
-        #
-
-
+        # Support both 'auto' (MinerU) and 'docling' parser output directories
+        parser_subdirs = ["auto", "docling"]
+        cleaned_count = 0
+
+        for doc_dir in list(self.content_list_dir.glob("*")):
             if not doc_dir.is_dir():
                 continue
 
-
-
-
-
+            for parser_subdir in parser_subdirs:
+                subdir = doc_dir / parser_subdir
+                if subdir.exists():
+                    try:
+                        # Check if directory is empty or only contains empty subdirs
+                        has_content = any(
+                            f.is_file() or (f.is_dir() and any(f.iterdir()))
+                            for f in subdir.iterdir()
+                        )
 
-
-
-
+                        if not has_content:
+                            await self._run_in_executor(shutil.rmtree, subdir)
+                            cleaned_count += 1
+                    except Exception as e:
+                        logger.debug(f"Could not clean up {subdir}: {e}")
 
-
-
-
-
+            # Remove doc_dir if it's now empty
+            try:
+                if doc_dir.exists() and not any(doc_dir.iterdir()):
+                    doc_dir.rmdir()
+            except Exception:
+                pass
 
-
-
-        if doc_dir.is_dir():
-            # Safety check: only delete if it looks like a parser output (has 'auto' subdir)
-            # This prevents wiping manual user folders in content_list_dir
-            if (doc_dir / "auto").exists():
-                await self._run_in_executor(shutil.rmtree, doc_dir, ignore_errors=True)
+        if cleaned_count > 0:
+            logger.info(f"Cleaned up {cleaned_count} empty parser directories")
 
     def extract_numbered_items_for_new_docs(self, processed_files, batch_size=20):
         if not processed_files:
```
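The rewritten `fix_structure` only deletes a parser output directory when it holds no files and no non-empty subdirectories. That test, isolated as a predicate (the helper name is ours):

```python
from pathlib import Path


def dir_has_content(d: Path) -> bool:
    """True if d contains any file, or any subdirectory that is itself non-empty."""
    return any(f.is_file() or (f.is_dir() and any(f.iterdir())) for f in d.iterdir())
```

This is deliberately more conservative than the removed code, which deleted any `doc_dir` containing an `auto/` subdirectory wholesale with `ignore_errors=True`.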
```diff
@@ -519,6 +756,11 @@ class DocumentAdder:
         )
 
     def update_metadata(self, added_count: int):
+        """Update metadata after incremental add.
+
+        Note: We do NOT update rag_provider here - incremental adds must use
+        the same provider as the original initialization for consistency.
+        """
         if not self.metadata_file.exists():
             return
         try:
@@ -527,18 +769,9 @@ class DocumentAdder:
 
             metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
-            #
-            if self.
-                metadata["rag_provider"] = self.
-
-            # Also save to centralized config file
-            try:
-                from src.services.config import get_kb_config_service
-
-                kb_config_service = get_kb_config_service()
-                kb_config_service.set_rag_provider(self.kb_name, self.rag_provider)
-            except Exception as config_err:
-                logger.warning(f"Failed to save to centralized config: {config_err}")
+            # Record the provider used (should match what's already in metadata)
+            if "rag_provider" not in metadata and self._resolved_provider:
+                metadata["rag_provider"] = self._resolved_provider
 
             history = metadata.get("update_history", [])
             history.append(
@@ -546,6 +779,7 @@ class DocumentAdder:
                     "timestamp": metadata["last_updated"],
                     "action": "incremental_add",
                     "count": added_count,
+                    "provider": self._resolved_provider,
                 }
             )
             metadata["update_history"] = history
```
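With the added `provider` field, an `update_history` entry written by `update_metadata` now carries the provider alongside the document count. An illustrative entry (field names are from the hunk above; the values are invented):

```python
entry = {
    "timestamp": "2025-01-01 12:00:00",  # illustrative value
    "action": "incremental_add",
    "count": 3,                          # number of documents added in this run
    "provider": "lightrag",              # resolved from KB metadata, not caller input
}
```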
```diff
@@ -557,7 +791,17 @@
 
 
 async def main():
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        description="Incrementally add documents to RAG KB",
+        epilog="""
+Example usage:
+  # Add documents to existing KB (uses provider from KB metadata)
+  python -m src.knowledge.add_documents my_kb --docs doc1.pdf doc2.txt
+
+  # Add all documents from a directory
+  python -m src.knowledge.add_documents my_kb --docs-dir ./documents/
+        """,
+    )
     parser.add_argument("kb_name", help="KB Name")
     parser.add_argument("--docs", nargs="+", help="Files")
     parser.add_argument("--docs-dir", help="Directory")
@@ -568,10 +812,6 @@ async def main():
 
     args = parser.parse_args()
 
-    # Initialize dynamic paths
-    project_root = Path(__file__).parent.parent.parent
-    load_dynamic_imports(project_root)
-
     load_dotenv()
 
     doc_files = []
```