realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,606 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Incrementally add documents to existing knowledge base.
5
+ Improved version with Hash-based duplicate checking, robust error handling,
6
+ and architectural improvements for data integrity and vision support.
7
+ """
8
+
9
+ import argparse
10
+ import asyncio
11
+ from datetime import datetime
12
+ from functools import partial
13
+ import hashlib
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+ import shutil
18
+ import sys
19
+ import tempfile
20
+ from typing import TYPE_CHECKING, Any, Dict, List
21
+
22
+ from dotenv import load_dotenv
23
+
24
+ # Attempt imports for dynamic dependencies
25
+ try:
26
+ from lightrag.llm.openai import openai_complete_if_cache
27
+ from lightrag.utils import EmbeddingFunc
28
+ except ImportError:
29
+ # These will be caught during runtime if needed
30
+ openai_complete_if_cache = None
31
+ EmbeddingFunc = None
32
+
33
+ # Type hinting support for dynamic imports
34
# Static type checkers evaluate the first branch and see the real classes
# (or ``Any`` when raganything is not installed). At runtime TYPE_CHECKING is
# False, so the names are bound to None without importing anything.
if TYPE_CHECKING:
    try:
        from raganything import RAGAnything
        from raganything import RAGAnythingConfig as RAGAnythingConfigType
    except ImportError:
        RAGAnything = Any
        RAGAnythingConfigType = Any
else:
    RAGAnything = None
    RAGAnythingConfigType = None

# Placeholder for runtime classes.
# Both stay None until load_dynamic_imports() manages to import raganything.
raganything_cls = None
RAGAnythingConfig = None
48
+
49
+
50
def load_dynamic_imports(project_root: Path):
    """Inject import paths and bind the optional RAG-Anything classes.

    Puts ``project_root`` and, when it exists on disk, the sibling
    ``raganything/RAG-Anything`` directory at the front of ``sys.path``, then
    tries to import ``RAGAnything`` and ``RAGAnythingConfig`` and stores them
    in the module globals ``raganything_cls`` and ``RAGAnythingConfig``.

    Args:
        project_root: Directory that should be importable as the project root.

    The function is safe to call repeatedly: a path that is already present in
    ``sys.path`` is not inserted a second time.
    """
    global raganything_cls, RAGAnythingConfig

    search_paths = [project_root]
    raganything_path = project_root.parent / "raganything" / "RAG-Anything"
    if raganything_path.exists():
        search_paths.append(raganything_path)

    # Each insert goes to index 0, so the RAG-Anything checkout (inserted last)
    # ends up ahead of the project root.
    for search_path in search_paths:
        entry = str(search_path)
        if entry not in sys.path:
            sys.path.insert(0, entry)

    try:
        from raganything import RAGAnything as RA
        from raganything import RAGAnythingConfig as RAC
    except ImportError:
        # Deliberate best effort: the globals stay None and code that needs
        # the classes checks for that before using them.
        return

    raganything_cls = RA
    RAGAnythingConfig = RAC
67
+
68
+
69
+ from src.knowledge.extract_numbered_items import process_content_list
70
+ from src.logging import LightRAGLogContext, get_logger
71
+ from src.services.embedding import (
72
+ get_embedding_client,
73
+ get_embedding_config,
74
+ reset_embedding_client,
75
+ )
76
+ from src.services.llm import get_llm_config
77
+
78
# Module-level logger, obtained from the project logging helper under the
# name "KnowledgeInit".
logger = get_logger("KnowledgeInit")

# Default base directory for knowledge bases.
# This is a relative path, so it resolves against the current working directory.
DEFAULT_BASE_DIR = "./data/knowledge_bases"
82
+
83
+
84
+ class DocumentAdder:
85
+ """Add documents to existing knowledge base with Hash-validation"""
86
+
87
def __init__(
    self,
    kb_name: str,
    base_dir=DEFAULT_BASE_DIR,
    api_key: str | None = None,
    base_url: str | None = None,
    progress_tracker=None,
    rag_provider: str | None = None,
):
    """Bind the adder to an existing, already initialized knowledge base.

    Args:
        kb_name: Name of the knowledge base (a directory under ``base_dir``).
        base_dir: Directory that contains all knowledge bases.
        api_key: Optional LLM API key override.
        base_url: Optional LLM base URL override.
        progress_tracker: Optional object whose ``update`` method receives
            per-file progress.
        rag_provider: Optional provider name written to metadata on update.

    Raises:
        ValueError: If the knowledge-base directory or its ``rag_storage``
            subdirectory does not exist.
    """
    root = Path(base_dir)
    kb_dir = root / kb_name
    self.kb_name, self.base_dir, self.kb_dir = kb_name, root, kb_dir

    if not kb_dir.exists():
        raise ValueError(f"Knowledge base does not exist: {kb_name}")

    # Fixed on-disk layout of a knowledge base.
    self.raw_dir = kb_dir / "raw"
    self.images_dir = kb_dir / "images"
    self.rag_storage_dir = kb_dir / "rag_storage"
    self.content_list_dir = kb_dir / "content_list"
    self.metadata_file = kb_dir / "metadata.json"

    # rag_storage is never created here; its absence means the knowledge
    # base has not been initialized.
    if not self.rag_storage_dir.exists():
        raise ValueError(f"Knowledge base not initialized: {kb_name}")

    self.api_key, self.base_url = api_key, base_url
    self.progress_tracker = progress_tracker
    self.rag_provider = rag_provider
    self._ensure_working_directories()
117
+
118
def _ensure_working_directories(self):
    """Create the raw, images and content-list directories when missing."""
    targets = (self.raw_dir, self.images_dir, self.content_list_dir)
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
121
+
122
def _get_file_hash(self, file_path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is read in 64 KiB pieces, so it is never loaded into memory
    as a whole.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while piece := stream.read(65536):  # 64KB
            digest.update(piece)
    return digest.hexdigest()
133
+
134
def get_ingested_hashes(self) -> Dict[str, str]:
    """Return the ``filename -> hash`` map stored in the metadata file.

    Returns:
        The ``"file_hashes"`` mapping from ``metadata.json``. An empty dict is
        returned when the file is missing, cannot be read or parsed, is not a
        JSON object, or holds a ``"file_hashes"`` value that is not a mapping
        (for example ``null``). Callers can therefore always use ``.values()``
        on the result.
    """
    if not self.metadata_file.exists():
        return {}

    try:
        with open(self.metadata_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        # Best effort: unreadable metadata is treated as "nothing ingested".
        return {}

    hashes = data.get("file_hashes") if isinstance(data, dict) else None
    return hashes if isinstance(hashes, dict) else {}
144
+
145
async def _run_in_executor(self, func, *args, **kwargs):
    """Run a blocking callable in the loop's default executor and await it.

    Positional and keyword arguments are bound to ``func`` up front because
    ``run_in_executor`` only forwards positional arguments.
    """
    bound_call = partial(func, *args, **kwargs)
    return await asyncio.get_running_loop().run_in_executor(None, bound_call)
148
+
149
def add_documents(self, source_files: List[str], allow_duplicates: bool = False) -> List[Path]:
    """
    Synchronous phase: Validates hashes and prepares files.
    Treats 'raw/' as a Write-Ahead Log: files exist there before being canonized in metadata.

    Args:
        source_files: Paths of the documents to add.
        allow_duplicates: When True, content that is already indexed is staged
            again and a raw file with the same name but different content is
            overwritten instead of skipped.

    Returns:
        The staged paths inside ``raw/`` that still need ingestion. Each path
        appears at most once, even if several sources map to the same name.
    """
    logger.info(f"Validating documents for '{self.kb_name}'...")

    # Build the lookup set once instead of scanning the values per file.
    known_hashes = set(self.get_ingested_hashes().values())

    files_to_process: List[Path] = []
    staged_paths = set()
    for source in source_files:
        source_path = Path(source)
        if not source_path.exists():
            logger.warning(f" ⚠ Missing: {source}")
            continue
        if not source_path.is_file():
            # Directories and other non-regular entries cannot be hashed.
            logger.warning(f" ⚠ Not a regular file: {source}")
            continue

        current_hash = self._get_file_hash(source_path)

        # 1. Check if content is already fully ingested (Canon Check)
        if current_hash in known_hashes and not allow_duplicates:
            logger.info(f" → Skipped (content already indexed): {source_path.name}")
            continue

        # 2. Prepare file in raw/ (Write-Ahead Log)
        dest_path = self.raw_dir / source_path.name

        should_copy = True
        if dest_path.exists():
            # If file exists in raw, check if it's the same content
            if self._get_file_hash(dest_path) == current_hash:
                if dest_path in staged_paths:
                    # The same document was listed twice in this batch.
                    logger.info(f" → Skipped (already staged in this run): {source_path.name}")
                    continue
                should_copy = False
                logger.info(f" ⚠ Recovering staged file (interrupted run): {source_path.name}")
            elif not allow_duplicates:
                # Name collision with different content
                logger.info(
                    f" → Skipped (filename collision with different content): {source_path.name}"
                )
                continue
            else:
                logger.info(f" → Overwriting existing raw file: {source_path.name}")

        if should_copy:
            shutil.copy2(source_path, dest_path)
            logger.info(f" ✓ Staged to raw: {source_path.name}")

        # Queue every destination only once so it is not ingested twice.
        if dest_path not in staged_paths:
            staged_paths.add(dest_path)
            files_to_process.append(dest_path)

    return files_to_process
200
+
201
async def process_new_documents(self, new_files: List[Path]):
    """
    Async phase: Ingests files into the RAG system.

    Uses FileTypeRouter to classify files and route them appropriately:
    - PDF/DOCX/images -> MinerU parser (full document analysis)
    - Text/Markdown -> Direct read + LightRAG insert (fast)

    Args:
        new_files: Staged paths (inside ``raw/``) to ingest.

    Returns:
        ``None`` when ``new_files`` is empty, otherwise the list of files that
        were ingested successfully. A failure on one file is logged and does
        not stop the remaining files.

    Raises:
        ImportError: If the RAGAnything or LightRAG imports were unavailable.
    """
    if not new_files:
        return None

    if raganything_cls is None:
        raise ImportError("RAGAnything module not found.")
    # The LightRAG names are set to None when their import fails; report that
    # clearly instead of failing later with "'NoneType' object is not callable".
    if openai_complete_if_cache is None or EmbeddingFunc is None:
        raise ImportError("LightRAG module not found.")

    from src.services.rag.components.routing import FileTypeRouter

    # Pre-import progress stage if needed to avoid overhead in loop
    ProgressStage: Any = None
    if self.progress_tracker:
        from src.knowledge.progress_tracker import ProgressStage

    self.llm_cfg = get_llm_config()
    model = self.llm_cfg.model
    api_key = self.api_key or self.llm_cfg.api_key
    base_url = self.base_url or self.llm_cfg.base_url

    # LLM Function Wrapper
    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        if history_messages is None:
            history_messages = []
        return openai_complete_if_cache(
            model,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Vision Function Wrapper - Robust history handling
    def vision_model_func(
        prompt,
        system_prompt=None,
        history_messages=None,
        image_data=None,
        messages=None,
        **kwargs,
    ):
        # If pre-formatted messages are provided, sanitize them
        if messages:
            safe_messages = self._filter_valid_messages(messages)
            return openai_complete_if_cache(
                model,
                prompt="",
                messages=safe_messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )

        # --- Construct Message History ---
        current_messages = []

        # 1. Add System Prompt (if provided)
        if system_prompt:
            current_messages.append({"role": "system", "content": system_prompt})

        # 2. Add History (Filtering out conflicting system prompts).
        # Each entry is shallow-copied so the merge in step 4 never modifies
        # the caller's history objects.
        if history_messages:
            current_messages.extend(
                dict(msg)
                for msg in history_messages
                if isinstance(msg, dict) and msg.get("role") != "system"
            )

        # 3. Construct New User Message
        user_content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
        if image_data:
            user_content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                }
            )

        # 4. Merge Logic: Avoid back-to-back user messages
        last_msg = current_messages[-1] if current_messages else None
        if last_msg is not None and last_msg.get("role") == "user":
            previous = last_msg.get("content")
            # If last content is string, convert to list format first
            if isinstance(previous, str):
                previous = [{"type": "text", "text": previous}]

            if isinstance(previous, list):
                # Build a new list rather than extending the caller's one.
                last_msg["content"] = [*previous, *user_content]
            else:
                # Fallback if structure is unexpected, just append new message
                current_messages.append({"role": "user", "content": user_content})
        else:
            current_messages.append({"role": "user", "content": user_content})

        return openai_complete_if_cache(
            model,
            prompt="",
            messages=current_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Embedding Setup
    reset_embedding_client()
    embedding_cfg = get_embedding_config()
    embedding_client = get_embedding_client()

    async def unified_embed_func(texts):
        return await embedding_client.embed(texts)

    embedding_func = EmbeddingFunc(
        embedding_dim=embedding_cfg.dim,
        max_token_size=embedding_cfg.max_tokens,
        func=unified_embed_func,
    )

    config = RAGAnythingConfig(
        working_dir=str(self.rag_storage_dir),
        parser="mineru",
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # NOTE(review): the original indentation was lost in this rendering; the
    # ingestion loops are assumed to run inside the log context - confirm.
    with LightRAGLogContext(scene="knowledge_init"):
        rag = raganything_cls(
            config=config,
            llm_model_func=llm_model_func,
            vision_model_func=vision_model_func,
            embedding_func=embedding_func,
        )
        if hasattr(rag, "_ensure_lightrag_initialized"):
            await rag._ensure_lightrag_initialized()

        # Classify files by type
        file_paths_str = [str(f) for f in new_files]
        classification = FileTypeRouter.classify_files(file_paths_str)

        logger.info(
            f"File classification: {len(classification.needs_mineru)} need MinerU, "
            f"{len(classification.text_files)} text files, "
            f"{len(classification.unsupported)} unsupported"
        )

        processed_files = []
        total_files = len(classification.needs_mineru) + len(classification.text_files)
        idx = 0

        def report_progress(label: str, doc_file: Path, position: int) -> None:
            """Forward per-file progress to the tracker, if one is configured."""
            if self.progress_tracker and ProgressStage:
                self.progress_tracker.update(
                    ProgressStage.PROCESSING_FILE,
                    f"Ingesting ({label}) {doc_file.name}",
                    current=position,
                    total=total_files,
                )

        # Process files requiring MinerU (PDF, DOCX, images)
        for doc_file_str in classification.needs_mineru:
            doc_file = Path(doc_file_str)
            idx += 1
            try:
                report_progress("MinerU", doc_file, idx)

                # Verify file still exists in raw/ (it should, as we staged it)
                if not doc_file.exists():
                    logger.error(f" ✗ Failed: Staged file missing {doc_file.name}")
                    continue

                # Bound each document so one stuck parse cannot block the batch.
                await asyncio.wait_for(
                    rag.process_document_complete(
                        file_path=str(doc_file),
                        output_dir=str(self.content_list_dir),
                        parse_method="auto",
                    ),
                    timeout=600.0,
                )
                processed_files.append(doc_file)
                # Store hash on success - "Canonizing" the file
                self._record_successful_hash(doc_file)
                logger.info(f" ✓ Processed (MinerU): {doc_file.name}")
            except Exception as e:
                logger.exception(f" ✗ Failed {doc_file.name}: {e}")

        # Process text files directly (fast path - no MinerU)
        for doc_file_str in classification.text_files:
            doc_file = Path(doc_file_str)
            idx += 1
            try:
                report_progress("text", doc_file, idx)

                # Verify file still exists
                if not doc_file.exists():
                    logger.error(f" ✗ Failed: Staged file missing {doc_file.name}")
                    continue

                # Read text file directly
                content = await FileTypeRouter.read_text_file(str(doc_file))
                if content.strip():
                    # Insert directly into LightRAG, bypassing MinerU
                    await rag.lightrag.ainsert(content)
                    processed_files.append(doc_file)
                    self._record_successful_hash(doc_file)
                    logger.info(f" ✓ Processed (text): {doc_file.name}")
                else:
                    logger.warning(f" ⚠ Skipped empty file: {doc_file.name}")
            except Exception as e:
                logger.exception(f" ✗ Failed {doc_file.name}: {e}")

        # Log unsupported files
        for doc_file_str in classification.unsupported:
            logger.warning(f" ⚠ Skipped unsupported file: {Path(doc_file_str).name}")

    await self.fix_structure()
    return processed_files
432
+
433
def _record_successful_hash(self, file_path: Path):
    """Store the hash of a successfully ingested file in the metadata file.

    The entry is written under ``"file_hashes"``, keyed by the file name.
    Failures while reading or writing the metadata are logged as warnings
    and not raised.
    """
    digest = self._get_file_hash(file_path)
    try:
        if self.metadata_file.exists():
            with open(self.metadata_file, "r", encoding="utf-8") as source:
                record = json.load(source)
        else:
            record = {}

        if "file_hashes" not in record:
            record["file_hashes"] = {}
        record["file_hashes"][file_path.name] = digest

        # Atomic write: dump to a temp file in the same directory, then
        # swap it into place.
        handle, scratch = tempfile.mkstemp(dir=self.kb_dir, suffix=".json")
        try:
            with os.fdopen(handle, "w", encoding="utf-8") as sink:
                json.dump(record, sink, indent=2, ensure_ascii=False)
            os.replace(scratch, self.metadata_file)
        except Exception:
            # Do not leave the temp file behind when the swap did not happen.
            os.unlink(scratch)
            raise
    except Exception as e:
        logger.warning(f"Could not update hash metadata: {e}")
457
+
458
+ @staticmethod
459
+ def _filter_valid_messages(messages):
460
+ return [
461
+ m
462
+ for m in messages
463
+ if isinstance(m, dict) and m.get("role") is not None and m.get("content") is not None
464
+ ]
465
+
466
async def fix_structure(self):
    """Flatten parser output into the knowledge-base layout, then clean up.

    For every subdirectory of the content-list directory, the first
    ``auto/*_content_list.json`` is copied to ``<subdir name>.json`` next to
    it and every entry of ``auto/images/`` is copied into the images
    directory. Afterwards each subdirectory that contains an ``auto`` entry
    is removed.
    """
    logger.info("Organizing storage structure...")

    # Phase 1: plan every copy before touching the filesystem.
    planned = []
    parser_dirs = (entry for entry in self.content_list_dir.glob("*") if entry.is_dir())
    for parser_dir in parser_dirs:
        content_json = next(parser_dir.glob("auto/*_content_list.json"), None)
        if content_json:
            planned.append((content_json, self.content_list_dir / f"{parser_dir.name}.json"))

        planned.extend(
            (image, self.images_dir / image.name) for image in parser_dir.glob("auto/images/*")
        )

    # Phase 2: run the copies off the event loop.
    for origin, target in planned:
        if not origin.exists():
            continue
        await self._run_in_executor(shutil.copy2, origin, target)

    # Phase 3: remove only directories that look like parser output (they
    # have an 'auto' entry), so unrelated folders placed here are left alone.
    for candidate in self.content_list_dir.glob("*"):
        if candidate.is_dir() and (candidate / "auto").exists():
            await self._run_in_executor(shutil.rmtree, candidate, ignore_errors=True)
497
+
498
def extract_numbered_items_for_new_docs(self, processed_files, batch_size=20):
    """Run numbered-item extraction for each newly processed document.

    For every file in ``processed_files`` whose ``<stem>.json`` exists in the
    content-list directory, ``process_content_list`` is called with
    ``numbered_items.json`` in the knowledge-base directory as output.
    Results are merged into that file once it exists.

    Args:
        processed_files: Paths of the documents that were ingested.
        batch_size: Passed through to ``process_content_list``.
    """
    if not processed_files:
        return

    # Reuse the LLM config cached on the instance when there is one.
    settings = getattr(self, "llm_cfg", None)
    if settings is None:
        settings = get_llm_config()
    key = self.api_key or settings.api_key
    endpoint = self.base_url or settings.base_url
    items_path = self.kb_dir / "numbered_items.json"

    for document in processed_files:
        listing = self.content_list_dir / f"{document.stem}.json"
        if not listing.exists():
            continue
        process_content_list(
            content_list_file=listing,
            output_file=items_path,
            api_key=key,
            base_url=endpoint,
            batch_size=batch_size,
            merge=items_path.exists(),
        )
520
+
521
def update_metadata(self, added_count: int):
    """Record an incremental-add event in the knowledge base metadata.

    Sets ``last_updated``, stores ``rag_provider`` when one was given, and
    appends an entry to ``update_history``. Does nothing when the metadata
    file does not exist. Any failure is logged as a warning and not raised.

    The file is rewritten atomically (temp file in the same directory, then
    ``os.replace``), so an interrupted write cannot truncate the existing
    metadata, which also holds the ingested file hashes.

    Args:
        added_count: Number of documents added in this run.
    """
    if not self.metadata_file.exists():
        return
    try:
        with open(self.metadata_file, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        metadata["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Update RAG provider if specified
        # NOTE(review): the original nesting was lost in this rendering; the
        # centralized-config save is assumed to belong to this branch - confirm.
        if self.rag_provider:
            metadata["rag_provider"] = self.rag_provider

            # Also save to centralized config file
            try:
                from src.services.config import get_kb_config_service

                kb_config_service = get_kb_config_service()
                kb_config_service.set_rag_provider(self.kb_name, self.rag_provider)
            except Exception as config_err:
                logger.warning(f"Failed to save to centralized config: {config_err}")

        history = metadata.get("update_history", [])
        history.append(
            {
                "timestamp": metadata["last_updated"],
                "action": "incremental_add",
                "count": added_count,
            }
        )
        metadata["update_history"] = history

        # Atomic write: dump to a temp file next to the target, then swap.
        fd, tmp_path = tempfile.mkstemp(dir=self.metadata_file.parent, suffix=".json")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, self.metadata_file)
        except Exception:
            os.unlink(tmp_path)
            raise
    except Exception as e:
        logger.warning(f"Metadata update failed: {e}")
557
+
558
+
559
async def main():
    """Command-line entry point: stage, ingest and register new documents.

    Collects documents from ``--docs`` and/or ``--docs-dir``, stages the ones
    that are new, ingests them, then runs numbered-item extraction and
    updates the metadata for whatever was processed successfully.
    """
    # Load .env first: the --api-key / --base-url defaults below are read
    # from the environment at the moment add_argument() is called.
    load_dotenv()

    parser = argparse.ArgumentParser(description="Incrementally add documents to RAG KB")
    parser.add_argument("kb_name", help="KB Name")
    parser.add_argument("--docs", nargs="+", help="Files")
    parser.add_argument("--docs-dir", help="Directory")
    parser.add_argument("--base-dir", default=DEFAULT_BASE_DIR)
    parser.add_argument("--api-key", default=os.getenv("LLM_API_KEY"))
    parser.add_argument("--base-url", default=os.getenv("LLM_HOST"))
    parser.add_argument("--allow-duplicates", action="store_true")

    args = parser.parse_args()

    # Initialize dynamic paths
    project_root = Path(__file__).parent.parent.parent
    load_dynamic_imports(project_root)

    doc_files = []
    if args.docs:
        doc_files.extend(args.docs)
    if args.docs_dir:
        docs_dir = Path(args.docs_dir)
        for pattern in ["*.pdf", "*.docx", "*.txt", "*.md"]:
            doc_files.extend(str(found) for found in docs_dir.glob(pattern))

    if not doc_files:
        logger.error("No documents provided.")
        return

    adder = DocumentAdder(args.kb_name, args.base_dir, args.api_key, args.base_url)

    # 1. Sync Phase (Validate and Stage)
    new_files = adder.add_documents(doc_files, allow_duplicates=args.allow_duplicates)
    if not new_files:
        logger.info("No new unique documents to add.")
        return

    # 2. Async Ingestion (Process and Canonize)
    processed = await adder.process_new_documents(new_files)
    if not processed:
        # Every staged file failed or was skipped; say so instead of exiting silently.
        logger.warning("No documents were successfully processed.")
        return

    adder.extract_numbered_items_for_new_docs(processed)
    adder.update_metadata(len(processed))
    logger.info(f"Done! Successfully added {len(processed)} documents.")
603
+
604
+
605
# Script entry point: run the async command-line driver when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Knowledge Base Path Configuration Module - Unified management of all paths
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+
9
# Project root directory (DeepTutor/): the directory three levels above this
# file, resolved to an absolute path.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()

# Knowledge base base directory
KNOWLEDGE_BASES_DIR = PROJECT_ROOT / "data" / "knowledge_bases"

# raganything module path: a "raganything/RAG-Anything" directory that sits
# next to the project root.
RAGANYTHING_PATH = PROJECT_ROOT.parent / "raganything" / "RAG-Anything"
17
+
18
+
19
+ # Ensure raganything path existence check
20
def check_raganything() -> bool:
    """Return True when ``RAGANYTHING_PATH`` exists on disk."""
    is_present = RAGANYTHING_PATH.exists()
    return is_present
23
+
24
+
25
+ # Environment variable configuration
26
def get_env_config():
    """Return the LLM credentials as a dict with ``api_key`` and ``base_url``.

    The values come from the project's LLM configuration. If loading it fails
    for any reason, they are read from the ``LLM_API_KEY`` and ``LLM_HOST``
    environment variables instead.
    """
    try:
        from src.services.llm import get_llm_config

        llm_settings = get_llm_config()
        resolved = {
            "api_key": llm_settings.api_key,
            "base_url": llm_settings.base_url,
        }
    except Exception:
        # Compatibility fallback: read the environment variables directly.
        resolved = {
            "api_key": os.getenv("LLM_API_KEY"),
            "base_url": os.getenv("LLM_HOST"),
        }
    return resolved
42
+
43
+
44
+ # Add necessary paths to sys.path
45
def setup_paths():
    """Put the project root and the RAG-Anything path at the front of ``sys.path``.

    Each path is inserted only when it is not already present; the
    RAG-Anything path is additionally skipped when it does not exist.
    """
    import sys

    root_entry = str(PROJECT_ROOT)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)

    # Optional dependency: only made importable when the directory exists.
    if not check_raganything():
        return
    rag_entry = str(RAGANYTHING_PATH)
    if rag_entry not in sys.path:
        sys.path.insert(0, rag_entry)
56
+
57
+
58
# Public names of this module: the path constants and the helper functions.
__all__ = [
    "KNOWLEDGE_BASES_DIR",
    "PROJECT_ROOT",
    "RAGANYTHING_PATH",
    "check_raganything",
    "get_env_config",
    "setup_paths",
]