realtimex-deeptutor 0.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276):
  1. realtimex_deeptutor/__init__.py +67 -0
  2. realtimex_deeptutor-0.5.0.post1.dist-info/METADATA +1612 -0
  3. realtimex_deeptutor-0.5.0.post1.dist-info/RECORD +276 -0
  4. realtimex_deeptutor-0.5.0.post1.dist-info/WHEEL +5 -0
  5. realtimex_deeptutor-0.5.0.post1.dist-info/entry_points.txt +2 -0
  6. realtimex_deeptutor-0.5.0.post1.dist-info/licenses/LICENSE +661 -0
  7. realtimex_deeptutor-0.5.0.post1.dist-info/top_level.txt +2 -0
  8. src/__init__.py +40 -0
  9. src/agents/__init__.py +24 -0
  10. src/agents/base_agent.py +657 -0
  11. src/agents/chat/__init__.py +24 -0
  12. src/agents/chat/chat_agent.py +435 -0
  13. src/agents/chat/prompts/en/chat_agent.yaml +35 -0
  14. src/agents/chat/prompts/zh/chat_agent.yaml +35 -0
  15. src/agents/chat/session_manager.py +311 -0
  16. src/agents/co_writer/__init__.py +0 -0
  17. src/agents/co_writer/edit_agent.py +260 -0
  18. src/agents/co_writer/narrator_agent.py +423 -0
  19. src/agents/co_writer/prompts/en/edit_agent.yaml +113 -0
  20. src/agents/co_writer/prompts/en/narrator_agent.yaml +88 -0
  21. src/agents/co_writer/prompts/zh/edit_agent.yaml +113 -0
  22. src/agents/co_writer/prompts/zh/narrator_agent.yaml +88 -0
  23. src/agents/guide/__init__.py +16 -0
  24. src/agents/guide/agents/__init__.py +11 -0
  25. src/agents/guide/agents/chat_agent.py +104 -0
  26. src/agents/guide/agents/interactive_agent.py +223 -0
  27. src/agents/guide/agents/locate_agent.py +149 -0
  28. src/agents/guide/agents/summary_agent.py +150 -0
  29. src/agents/guide/guide_manager.py +500 -0
  30. src/agents/guide/prompts/en/chat_agent.yaml +41 -0
  31. src/agents/guide/prompts/en/interactive_agent.yaml +202 -0
  32. src/agents/guide/prompts/en/locate_agent.yaml +68 -0
  33. src/agents/guide/prompts/en/summary_agent.yaml +157 -0
  34. src/agents/guide/prompts/zh/chat_agent.yaml +41 -0
  35. src/agents/guide/prompts/zh/interactive_agent.yaml +626 -0
  36. src/agents/guide/prompts/zh/locate_agent.yaml +68 -0
  37. src/agents/guide/prompts/zh/summary_agent.yaml +157 -0
  38. src/agents/ideagen/__init__.py +12 -0
  39. src/agents/ideagen/idea_generation_workflow.py +426 -0
  40. src/agents/ideagen/material_organizer_agent.py +173 -0
  41. src/agents/ideagen/prompts/en/idea_generation.yaml +187 -0
  42. src/agents/ideagen/prompts/en/material_organizer.yaml +69 -0
  43. src/agents/ideagen/prompts/zh/idea_generation.yaml +187 -0
  44. src/agents/ideagen/prompts/zh/material_organizer.yaml +69 -0
  45. src/agents/question/__init__.py +24 -0
  46. src/agents/question/agents/__init__.py +18 -0
  47. src/agents/question/agents/generate_agent.py +381 -0
  48. src/agents/question/agents/relevance_analyzer.py +207 -0
  49. src/agents/question/agents/retrieve_agent.py +239 -0
  50. src/agents/question/coordinator.py +718 -0
  51. src/agents/question/example.py +109 -0
  52. src/agents/question/prompts/en/coordinator.yaml +75 -0
  53. src/agents/question/prompts/en/generate_agent.yaml +77 -0
  54. src/agents/question/prompts/en/relevance_analyzer.yaml +41 -0
  55. src/agents/question/prompts/en/retrieve_agent.yaml +32 -0
  56. src/agents/question/prompts/zh/coordinator.yaml +75 -0
  57. src/agents/question/prompts/zh/generate_agent.yaml +77 -0
  58. src/agents/question/prompts/zh/relevance_analyzer.yaml +39 -0
  59. src/agents/question/prompts/zh/retrieve_agent.yaml +30 -0
  60. src/agents/research/agents/__init__.py +23 -0
  61. src/agents/research/agents/decompose_agent.py +507 -0
  62. src/agents/research/agents/manager_agent.py +228 -0
  63. src/agents/research/agents/note_agent.py +180 -0
  64. src/agents/research/agents/rephrase_agent.py +263 -0
  65. src/agents/research/agents/reporting_agent.py +1333 -0
  66. src/agents/research/agents/research_agent.py +714 -0
  67. src/agents/research/data_structures.py +451 -0
  68. src/agents/research/main.py +188 -0
  69. src/agents/research/prompts/en/decompose_agent.yaml +89 -0
  70. src/agents/research/prompts/en/manager_agent.yaml +24 -0
  71. src/agents/research/prompts/en/note_agent.yaml +121 -0
  72. src/agents/research/prompts/en/rephrase_agent.yaml +58 -0
  73. src/agents/research/prompts/en/reporting_agent.yaml +380 -0
  74. src/agents/research/prompts/en/research_agent.yaml +173 -0
  75. src/agents/research/prompts/zh/decompose_agent.yaml +89 -0
  76. src/agents/research/prompts/zh/manager_agent.yaml +24 -0
  77. src/agents/research/prompts/zh/note_agent.yaml +121 -0
  78. src/agents/research/prompts/zh/rephrase_agent.yaml +58 -0
  79. src/agents/research/prompts/zh/reporting_agent.yaml +380 -0
  80. src/agents/research/prompts/zh/research_agent.yaml +173 -0
  81. src/agents/research/research_pipeline.py +1309 -0
  82. src/agents/research/utils/__init__.py +60 -0
  83. src/agents/research/utils/citation_manager.py +799 -0
  84. src/agents/research/utils/json_utils.py +98 -0
  85. src/agents/research/utils/token_tracker.py +297 -0
  86. src/agents/solve/__init__.py +80 -0
  87. src/agents/solve/analysis_loop/__init__.py +14 -0
  88. src/agents/solve/analysis_loop/investigate_agent.py +414 -0
  89. src/agents/solve/analysis_loop/note_agent.py +190 -0
  90. src/agents/solve/main_solver.py +862 -0
  91. src/agents/solve/memory/__init__.py +34 -0
  92. src/agents/solve/memory/citation_memory.py +353 -0
  93. src/agents/solve/memory/investigate_memory.py +226 -0
  94. src/agents/solve/memory/solve_memory.py +340 -0
  95. src/agents/solve/prompts/en/analysis_loop/investigate_agent.yaml +55 -0
  96. src/agents/solve/prompts/en/analysis_loop/note_agent.yaml +54 -0
  97. src/agents/solve/prompts/en/solve_loop/manager_agent.yaml +67 -0
  98. src/agents/solve/prompts/en/solve_loop/precision_answer_agent.yaml +62 -0
  99. src/agents/solve/prompts/en/solve_loop/response_agent.yaml +90 -0
  100. src/agents/solve/prompts/en/solve_loop/solve_agent.yaml +75 -0
  101. src/agents/solve/prompts/en/solve_loop/tool_agent.yaml +38 -0
  102. src/agents/solve/prompts/zh/analysis_loop/investigate_agent.yaml +53 -0
  103. src/agents/solve/prompts/zh/analysis_loop/note_agent.yaml +54 -0
  104. src/agents/solve/prompts/zh/solve_loop/manager_agent.yaml +66 -0
  105. src/agents/solve/prompts/zh/solve_loop/precision_answer_agent.yaml +62 -0
  106. src/agents/solve/prompts/zh/solve_loop/response_agent.yaml +90 -0
  107. src/agents/solve/prompts/zh/solve_loop/solve_agent.yaml +76 -0
  108. src/agents/solve/prompts/zh/solve_loop/tool_agent.yaml +41 -0
  109. src/agents/solve/solve_loop/__init__.py +22 -0
  110. src/agents/solve/solve_loop/citation_manager.py +74 -0
  111. src/agents/solve/solve_loop/manager_agent.py +274 -0
  112. src/agents/solve/solve_loop/precision_answer_agent.py +96 -0
  113. src/agents/solve/solve_loop/response_agent.py +301 -0
  114. src/agents/solve/solve_loop/solve_agent.py +325 -0
  115. src/agents/solve/solve_loop/tool_agent.py +470 -0
  116. src/agents/solve/utils/__init__.py +64 -0
  117. src/agents/solve/utils/config_validator.py +313 -0
  118. src/agents/solve/utils/display_manager.py +223 -0
  119. src/agents/solve/utils/error_handler.py +363 -0
  120. src/agents/solve/utils/json_utils.py +98 -0
  121. src/agents/solve/utils/performance_monitor.py +407 -0
  122. src/agents/solve/utils/token_tracker.py +541 -0
  123. src/api/__init__.py +0 -0
  124. src/api/main.py +240 -0
  125. src/api/routers/__init__.py +1 -0
  126. src/api/routers/agent_config.py +69 -0
  127. src/api/routers/chat.py +296 -0
  128. src/api/routers/co_writer.py +337 -0
  129. src/api/routers/config.py +627 -0
  130. src/api/routers/dashboard.py +18 -0
  131. src/api/routers/guide.py +337 -0
  132. src/api/routers/ideagen.py +436 -0
  133. src/api/routers/knowledge.py +821 -0
  134. src/api/routers/notebook.py +247 -0
  135. src/api/routers/question.py +537 -0
  136. src/api/routers/research.py +394 -0
  137. src/api/routers/settings.py +164 -0
  138. src/api/routers/solve.py +305 -0
  139. src/api/routers/system.py +252 -0
  140. src/api/run_server.py +61 -0
  141. src/api/utils/history.py +172 -0
  142. src/api/utils/log_interceptor.py +21 -0
  143. src/api/utils/notebook_manager.py +415 -0
  144. src/api/utils/progress_broadcaster.py +72 -0
  145. src/api/utils/task_id_manager.py +100 -0
  146. src/config/__init__.py +0 -0
  147. src/config/accessors.py +18 -0
  148. src/config/constants.py +34 -0
  149. src/config/defaults.py +18 -0
  150. src/config/schema.py +38 -0
  151. src/config/settings.py +50 -0
  152. src/core/errors.py +62 -0
  153. src/knowledge/__init__.py +23 -0
  154. src/knowledge/add_documents.py +606 -0
  155. src/knowledge/config.py +65 -0
  156. src/knowledge/example_add_documents.py +236 -0
  157. src/knowledge/extract_numbered_items.py +1039 -0
  158. src/knowledge/initializer.py +621 -0
  159. src/knowledge/kb.py +22 -0
  160. src/knowledge/manager.py +782 -0
  161. src/knowledge/progress_tracker.py +182 -0
  162. src/knowledge/start_kb.py +535 -0
  163. src/logging/__init__.py +103 -0
  164. src/logging/adapters/__init__.py +17 -0
  165. src/logging/adapters/lightrag.py +184 -0
  166. src/logging/adapters/llamaindex.py +141 -0
  167. src/logging/config.py +80 -0
  168. src/logging/handlers/__init__.py +20 -0
  169. src/logging/handlers/console.py +75 -0
  170. src/logging/handlers/file.py +201 -0
  171. src/logging/handlers/websocket.py +127 -0
  172. src/logging/logger.py +709 -0
  173. src/logging/stats/__init__.py +16 -0
  174. src/logging/stats/llm_stats.py +179 -0
  175. src/services/__init__.py +56 -0
  176. src/services/config/__init__.py +61 -0
  177. src/services/config/knowledge_base_config.py +210 -0
  178. src/services/config/loader.py +260 -0
  179. src/services/config/unified_config.py +603 -0
  180. src/services/embedding/__init__.py +45 -0
  181. src/services/embedding/adapters/__init__.py +22 -0
  182. src/services/embedding/adapters/base.py +106 -0
  183. src/services/embedding/adapters/cohere.py +127 -0
  184. src/services/embedding/adapters/jina.py +99 -0
  185. src/services/embedding/adapters/ollama.py +116 -0
  186. src/services/embedding/adapters/openai_compatible.py +96 -0
  187. src/services/embedding/client.py +159 -0
  188. src/services/embedding/config.py +156 -0
  189. src/services/embedding/provider.py +119 -0
  190. src/services/llm/__init__.py +152 -0
  191. src/services/llm/capabilities.py +313 -0
  192. src/services/llm/client.py +302 -0
  193. src/services/llm/cloud_provider.py +530 -0
  194. src/services/llm/config.py +200 -0
  195. src/services/llm/error_mapping.py +103 -0
  196. src/services/llm/exceptions.py +152 -0
  197. src/services/llm/factory.py +450 -0
  198. src/services/llm/local_provider.py +347 -0
  199. src/services/llm/providers/anthropic.py +95 -0
  200. src/services/llm/providers/base_provider.py +93 -0
  201. src/services/llm/providers/open_ai.py +83 -0
  202. src/services/llm/registry.py +71 -0
  203. src/services/llm/telemetry.py +40 -0
  204. src/services/llm/types.py +27 -0
  205. src/services/llm/utils.py +333 -0
  206. src/services/prompt/__init__.py +25 -0
  207. src/services/prompt/manager.py +206 -0
  208. src/services/rag/__init__.py +64 -0
  209. src/services/rag/components/__init__.py +29 -0
  210. src/services/rag/components/base.py +59 -0
  211. src/services/rag/components/chunkers/__init__.py +18 -0
  212. src/services/rag/components/chunkers/base.py +34 -0
  213. src/services/rag/components/chunkers/fixed.py +71 -0
  214. src/services/rag/components/chunkers/numbered_item.py +94 -0
  215. src/services/rag/components/chunkers/semantic.py +97 -0
  216. src/services/rag/components/embedders/__init__.py +14 -0
  217. src/services/rag/components/embedders/base.py +32 -0
  218. src/services/rag/components/embedders/openai.py +63 -0
  219. src/services/rag/components/indexers/__init__.py +18 -0
  220. src/services/rag/components/indexers/base.py +35 -0
  221. src/services/rag/components/indexers/graph.py +172 -0
  222. src/services/rag/components/indexers/lightrag.py +156 -0
  223. src/services/rag/components/indexers/vector.py +146 -0
  224. src/services/rag/components/parsers/__init__.py +18 -0
  225. src/services/rag/components/parsers/base.py +35 -0
  226. src/services/rag/components/parsers/markdown.py +52 -0
  227. src/services/rag/components/parsers/pdf.py +115 -0
  228. src/services/rag/components/parsers/text.py +86 -0
  229. src/services/rag/components/retrievers/__init__.py +18 -0
  230. src/services/rag/components/retrievers/base.py +34 -0
  231. src/services/rag/components/retrievers/dense.py +200 -0
  232. src/services/rag/components/retrievers/hybrid.py +164 -0
  233. src/services/rag/components/retrievers/lightrag.py +169 -0
  234. src/services/rag/components/routing.py +286 -0
  235. src/services/rag/factory.py +234 -0
  236. src/services/rag/pipeline.py +215 -0
  237. src/services/rag/pipelines/__init__.py +32 -0
  238. src/services/rag/pipelines/academic.py +44 -0
  239. src/services/rag/pipelines/lightrag.py +43 -0
  240. src/services/rag/pipelines/llamaindex.py +313 -0
  241. src/services/rag/pipelines/raganything.py +384 -0
  242. src/services/rag/service.py +244 -0
  243. src/services/rag/types.py +73 -0
  244. src/services/search/__init__.py +284 -0
  245. src/services/search/base.py +87 -0
  246. src/services/search/consolidation.py +398 -0
  247. src/services/search/providers/__init__.py +128 -0
  248. src/services/search/providers/baidu.py +188 -0
  249. src/services/search/providers/exa.py +194 -0
  250. src/services/search/providers/jina.py +161 -0
  251. src/services/search/providers/perplexity.py +153 -0
  252. src/services/search/providers/serper.py +209 -0
  253. src/services/search/providers/tavily.py +161 -0
  254. src/services/search/types.py +114 -0
  255. src/services/setup/__init__.py +34 -0
  256. src/services/setup/init.py +285 -0
  257. src/services/tts/__init__.py +16 -0
  258. src/services/tts/config.py +99 -0
  259. src/tools/__init__.py +91 -0
  260. src/tools/code_executor.py +536 -0
  261. src/tools/paper_search_tool.py +171 -0
  262. src/tools/query_item_tool.py +310 -0
  263. src/tools/question/__init__.py +15 -0
  264. src/tools/question/exam_mimic.py +616 -0
  265. src/tools/question/pdf_parser.py +211 -0
  266. src/tools/question/question_extractor.py +397 -0
  267. src/tools/rag_tool.py +173 -0
  268. src/tools/tex_chunker.py +339 -0
  269. src/tools/tex_downloader.py +253 -0
  270. src/tools/web_search.py +71 -0
  271. src/utils/config_manager.py +206 -0
  272. src/utils/document_validator.py +168 -0
  273. src/utils/error_rate_tracker.py +111 -0
  274. src/utils/error_utils.py +82 -0
  275. src/utils/json_parser.py +110 -0
  276. src/utils/network/circuit_breaker.py +79 -0
@@ -0,0 +1,782 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Knowledge Base Manager
5
+
6
+ Manages multiple knowledge bases and provides utilities for accessing them.
7
+ """
8
+
9
+ from datetime import datetime
10
+ import hashlib
11
+ import json
12
+ from pathlib import Path
13
+ import shutil
14
+
15
+
16
+ class KnowledgeBaseManager:
17
+ """Manager for knowledge bases"""
18
+
19
def __init__(self, base_dir="./data/knowledge_bases"):
    """Set up the manager rooted at ``base_dir``.

    The root directory is created when missing, and the registry stored in
    ``kb_config.json`` directly under it is loaded into ``self.config``.

    Args:
        base_dir: Directory holding one sub-directory per knowledge base.
    """
    root = Path(base_dir)
    self.base_dir = root
    root.mkdir(parents=True, exist_ok=True)

    # The registry of known knowledge bases lives next to the KB directories.
    self.config_file = root / "kb_config.json"
    self.config = self._load_config()
26
+
27
def _load_config(self) -> dict:
    """Read ``kb_config.json``, which only stores the list of knowledge bases.

    A legacy top-level ``"default"`` entry is removed from the loaded data
    and the cleaned file is written back on a best-effort basis.

    Returns:
        The parsed configuration, or ``{"knowledge_bases": {}}`` when the
        config file does not exist.
    """
    if not self.config_file.exists():
        return {"knowledge_bases": {}}

    with open(self.config_file, encoding="utf-8") as reader:
        loaded = json.load(reader)

    if "default" not in loaded:
        return loaded

    # Migration: the default KB is no longer tracked in this file.
    del loaded["default"]
    try:
        with open(self.config_file, "w", encoding="utf-8") as writer:
            json.dump(loaded, writer, indent=2, ensure_ascii=False)
    except Exception:
        # Persisting the cleaned file is optional; the in-memory copy is
        # already migrated.
        pass
    return loaded
43
+
44
def _save_config(self):
    """Write ``self.config`` to ``kb_config.json`` as indented UTF-8 JSON."""
    with open(self.config_file, "w", encoding="utf-8") as sink:
        sink.write(json.dumps(self.config, indent=2, ensure_ascii=False))
48
+
49
def list_knowledge_bases(self) -> list[str]:
    """Return the sorted names of all usable knowledge bases.

    Names registered in ``kb_config.json`` are authoritative, but only those
    whose directory exists are returned; a warning is printed for the rest.
    When that yields nothing, the base directory is scanned for
    sub-directories containing ``metadata.json`` (backward compatibility).
    """
    registered = self.config.get("knowledge_bases", {})

    kb_list = []
    for kb_name in registered:
        kb_dir = self.base_dir / kb_name
        if kb_dir.is_dir():
            kb_list.append(kb_name)
            continue
        # Registered but missing on disk: report it and leave it out.
        print(
            f"Warning: Knowledge base '{kb_name}' is in config but directory does not exist: {kb_dir}"
        )

    # Nothing usable in the config: fall back to what is present on disk.
    if not kb_list and self.base_dir.exists():
        kb_list = [
            entry.name
            for entry in self.base_dir.iterdir()
            if entry.is_dir()
            and entry.name != "__pycache__"
            and (entry / "metadata.json").exists()
        ]

    kb_list.sort()
    return kb_list
76
+
77
def register_knowledge_base(self, name: str, description: str = "", set_default: bool = False):
    """Record an existing knowledge base directory in the registry.

    Args:
        name: Directory name of the knowledge base under the base directory.
        description: Free-text description stored alongside the entry.
        set_default: When true, also make this knowledge base the default.

    Raises:
        ValueError: If the knowledge base directory does not exist.
    """
    kb_dir = self.base_dir / name
    if not kb_dir.exists():
        raise ValueError(f"Knowledge base directory does not exist: {kb_dir}")

    entry = {"path": name, "description": description}
    self.config.setdefault("knowledge_bases", {})[name] = entry

    # The default is changed only on explicit request.
    if set_default:
        self.set_default(name)

    self._save_config()
93
+
94
+ def get_knowledge_base_path(self, name: str | None = None) -> Path:
95
+ """Get path to a knowledge base"""
96
+ if name is None:
97
+ name = self.config.get("default")
98
+ if name is None:
99
+ raise ValueError("No default knowledge base set")
100
+
101
+ kb_dir = self.base_dir / name
102
+ if not kb_dir.exists():
103
+ raise ValueError(f"Knowledge base not found: {name}")
104
+
105
+ return kb_dir
106
+
107
+ def get_rag_storage_path(self, name: str | None = None) -> Path:
108
+ """Get RAG storage path for a knowledge base"""
109
+ kb_dir = self.get_knowledge_base_path(name)
110
+ rag_storage = kb_dir / "rag_storage"
111
+ if not rag_storage.exists():
112
+ raise ValueError(f"RAG storage not found for knowledge base: {name or 'default'}")
113
+ return rag_storage
114
+
115
+ def get_images_path(self, name: str | None = None) -> Path:
116
+ """Get images path for a knowledge base"""
117
+ kb_dir = self.get_knowledge_base_path(name)
118
+ return kb_dir / "images"
119
+
120
+ def get_content_list_path(self, name: str | None = None) -> Path:
121
+ """Get content list path for a knowledge base"""
122
+ kb_dir = self.get_knowledge_base_path(name)
123
+ return kb_dir / "content_list"
124
+
125
+ def get_raw_path(self, name: str | None = None) -> Path:
126
+ """Get raw documents path for a knowledge base"""
127
+ kb_dir = self.get_knowledge_base_path(name)
128
+ return kb_dir / "raw"
129
+
130
def set_default(self, name: str):
    """Mark ``name`` as the default knowledge base.

    The choice is handed to the centralized KB config service; it is not
    written to ``kb_config.json``. A failure to reach or update that service
    is reported with a printed warning instead of an exception.

    Raises:
        ValueError: If ``name`` is not among ``list_knowledge_bases()``.
    """
    known = self.list_knowledge_bases()
    if name not in known:
        raise ValueError(f"Knowledge base not found: {name}")

    try:
        # Imported lazily, inside the guarded block, so an unavailable
        # service only produces the warning below.
        from src.services.config import get_kb_config_service

        get_kb_config_service().set_default_kb(name)
    except Exception as e:
        print(f"Warning: Failed to save default to centralized config: {e}")
143
+
144
+ def get_default(self) -> str | None:
145
+ """
146
+ Get default knowledge base name.
147
+
148
+ Priority:
149
+ 1. Centralized config service (knowledge_base_configs.json)
150
+ 2. First knowledge base in the list (auto-fallback)
151
+ """
152
+ # Try centralized config first
153
+ try:
154
+ from src.services.config import get_kb_config_service
155
+
156
+ kb_config_service = get_kb_config_service()
157
+ default_kb = kb_config_service.get_default_kb()
158
+ if default_kb and default_kb in self.list_knowledge_bases():
159
+ return default_kb
160
+ except Exception:
161
+ pass
162
+
163
+ # Fallback to first knowledge base in sorted list
164
+ kb_list = self.list_knowledge_bases()
165
+ if kb_list:
166
+ return kb_list[0]
167
+
168
+ return None
169
+
170
+ def get_metadata(self, name: str | None = None) -> dict:
171
+ """Get knowledge base metadata"""
172
+ kb_dir = self.get_knowledge_base_path(name)
173
+ metadata_file = kb_dir / "metadata.json"
174
+
175
+ if metadata_file.exists():
176
+ with open(metadata_file, encoding="utf-8") as f:
177
+ return json.load(f)
178
+
179
+ return {}
180
+
181
+ def get_info(self, name: str | None = None) -> dict:
182
+ """Get detailed information about a knowledge base.
183
+
184
+ This method:
185
+ 1. Gets the KB name (from parameter or default)
186
+ 2. Reads metadata.json from the KB directory
187
+ 3. Collects statistics about files and RAG status
188
+ """
189
+ kb_name = name or self.get_default()
190
+ if kb_name is None:
191
+ raise ValueError("No knowledge base name provided and no default set")
192
+
193
+ # Get knowledge base path
194
+ kb_dir = self.base_dir / kb_name
195
+ if not kb_dir.exists():
196
+ raise ValueError(f"Knowledge base directory does not exist: {kb_dir}")
197
+
198
+ # Verify knowledge base is in config (if not, give warning but don't block)
199
+ if kb_name not in self.config.get("knowledge_bases", {}):
200
+ print(
201
+ f"Warning: Knowledge base '{kb_name}' is not in kb_config.json, but directory exists"
202
+ )
203
+
204
+ info = {
205
+ "name": kb_name,
206
+ "path": str(kb_dir),
207
+ "is_default": kb_name == self.get_default(),
208
+ "metadata": {},
209
+ }
210
+
211
+ # Read metadata.json (if exists)
212
+ metadata_file = kb_dir / "metadata.json"
213
+ if metadata_file.exists():
214
+ try:
215
+ with open(metadata_file, encoding="utf-8") as f:
216
+ info["metadata"] = json.load(f)
217
+ except Exception as e:
218
+ print(f"Warning: Failed to read metadata.json for KB '{kb_name}': {e}")
219
+ info["metadata"] = {}
220
+ else:
221
+ # metadata.json doesn't exist, use empty dict
222
+ info["metadata"] = {}
223
+
224
+ # Count files - handle errors gracefully
225
+ raw_dir = kb_dir / "raw"
226
+ images_dir = kb_dir / "images"
227
+ content_list_dir = kb_dir / "content_list"
228
+ rag_storage_dir = kb_dir / "rag_storage"
229
+
230
+ try:
231
+ raw_count = (
232
+ len([f for f in raw_dir.iterdir() if f.is_file()]) if raw_dir.exists() else 0
233
+ )
234
+ except Exception:
235
+ raw_count = 0
236
+
237
+ try:
238
+ images_count = (
239
+ len([f for f in images_dir.iterdir() if f.is_file()]) if images_dir.exists() else 0
240
+ )
241
+ except Exception:
242
+ images_count = 0
243
+
244
+ try:
245
+ content_lists_count = (
246
+ len(list(content_list_dir.glob("*.json"))) if content_list_dir.exists() else 0
247
+ )
248
+ except Exception:
249
+ content_lists_count = 0
250
+
251
+ metadata = info["metadata"]
252
+ rag_provider = metadata.get("rag_provider") if isinstance(metadata, dict) else None
253
+ info["statistics"] = {
254
+ "raw_documents": raw_count,
255
+ "images": images_count,
256
+ "content_lists": content_lists_count,
257
+ "rag_initialized": rag_storage_dir.exists() and rag_storage_dir.is_dir(),
258
+ "rag_provider": rag_provider, # Add RAG provider info
259
+ }
260
+
261
+ # Try to get RAG statistics
262
+ if rag_storage_dir.exists() and rag_storage_dir.is_dir():
263
+ try:
264
+ entities_file = rag_storage_dir / "kv_store_full_entities.json"
265
+ relations_file = rag_storage_dir / "kv_store_full_relations.json"
266
+ chunks_file = rag_storage_dir / "kv_store_text_chunks.json"
267
+
268
+ rag_stats = {}
269
+ if entities_file.exists():
270
+ try:
271
+ with open(entities_file, encoding="utf-8") as f:
272
+ entities_data = json.load(f)
273
+ rag_stats["entities"] = (
274
+ len(entities_data) if isinstance(entities_data, (list, dict)) else 0
275
+ )
276
+ except Exception:
277
+ pass
278
+
279
+ if relations_file.exists():
280
+ try:
281
+ with open(relations_file, encoding="utf-8") as f:
282
+ relations_data = json.load(f)
283
+ rag_stats["relations"] = (
284
+ len(relations_data)
285
+ if isinstance(relations_data, (list, dict))
286
+ else 0
287
+ )
288
+ except Exception:
289
+ pass
290
+
291
+ if chunks_file.exists():
292
+ try:
293
+ with open(chunks_file, encoding="utf-8") as f:
294
+ chunks_data = json.load(f)
295
+ rag_stats["chunks"] = (
296
+ len(chunks_data) if isinstance(chunks_data, (list, dict)) else 0
297
+ )
298
+ except Exception:
299
+ pass
300
+
301
+ if rag_stats:
302
+ statistics = info["statistics"]
303
+ if isinstance(statistics, dict):
304
+ statistics["rag"] = rag_stats
305
+ except Exception:
306
+ pass
307
+
308
+ return info
309
+
310
def delete_knowledge_base(self, name: str, confirm: bool = False) -> bool:
    """Permanently remove a knowledge base directory and its registry entry.

    Args:
        name: Knowledge base name.
        confirm: When true, delete without asking. Otherwise the user is
            prompted on the console and must type ``yes``.

    Returns:
        True when the knowledge base was deleted, False when the prompt was
        not confirmed.

    Raises:
        ValueError: If ``name`` is not among ``list_knowledge_bases()``.
    """
    if name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {name}")

    kb_dir = self.get_knowledge_base_path(name)

    if not confirm:
        # Interactive safety net for command-line use.
        print(f"⚠️ Warning: This will permanently delete the knowledge base '{name}'")
        print(f" Path: {kb_dir}")
        answer = input("Are you sure? Type 'yes' to confirm: ")
        if answer.lower() != "yes":
            print("Deletion cancelled.")
            return False

    shutil.rmtree(kb_dir)

    # Drop the registry entry, if there is one.
    self.config.get("knowledge_bases", {}).pop(name, None)

    # If the in-memory config still carries a "default" entry pointing at
    # the deleted KB, move it to the first remaining KB (or clear it).
    if self.config.get("default") == name:
        remaining = self.list_knowledge_bases()
        self.config["default"] = remaining[0] if remaining else None

    self._save_config()
    return True
349
+
350
+ def clean_rag_storage(self, name: str | None = None, backup: bool = True) -> bool:
351
+ """
352
+ Clean (delete) RAG storage for a knowledge base
353
+ Useful when RAG data is corrupted
354
+
355
+ Args:
356
+ name: Knowledge base name (default if not specified)
357
+ backup: If True, backup the RAG storage before deleting
358
+
359
+ Returns:
360
+ True if cleaned successfully
361
+ """
362
+ kb_name = name or self.get_default()
363
+ kb_dir = self.get_knowledge_base_path(kb_name)
364
+ rag_storage_dir = kb_dir / "rag_storage"
365
+
366
+ if not rag_storage_dir.exists():
367
+ print(f"RAG storage does not exist for '{kb_name}'")
368
+ return False
369
+
370
+ # Backup if requested
371
+ if backup:
372
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
373
+ backup_dir = kb_dir / f"rag_storage_backup_{timestamp}"
374
+ shutil.copytree(rag_storage_dir, backup_dir)
375
+ print(f"✓ Backed up to: {backup_dir}")
376
+
377
+ # Delete RAG storage
378
+ shutil.rmtree(rag_storage_dir)
379
+ rag_storage_dir.mkdir(parents=True, exist_ok=True)
380
+
381
+ print(f"✓ RAG storage cleaned for '{kb_name}'")
382
+ return True
383
+
384
def link_folder(self, kb_name: str, folder_path: str) -> dict:
    """Link a local folder to a knowledge base.

    The link is recorded under ``linked_folders`` in the knowledge base's
    ``metadata.json``. Linking is idempotent: when the folder is already
    linked, the stored entry is returned unchanged and nothing is written.

    Args:
        kb_name: Knowledge base name.
        folder_path: Path to a local folder (``~`` and relative paths are
            expanded and resolved).

    Returns:
        Dict with the folder ``id``, resolved ``path``, ``added_at``
        timestamp (ISO format) and ``file_count``.

    Raises:
        ValueError: If the knowledge base is unknown, or the folder does
            not exist or is not a directory.
    """
    if kb_name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {kb_name}")

    # Normalize path (cross-platform: handles ~, relative paths, etc.)
    folder = Path(folder_path).expanduser().resolve()

    if not folder.exists():
        raise ValueError(f"Folder does not exist: {folder}")
    if not folder.is_dir():
        raise ValueError(f"Path is not a directory: {folder}")

    # Short, stable identifier derived from the resolved path. MD5 is used
    # purely as a fingerprint here, not for security.
    folder_id = hashlib.md5(  # noqa: S324
        str(folder).encode(), usedforsecurity=False
    ).hexdigest()[:8]

    metadata_file = self.base_dir / kb_name / "metadata.json"
    metadata: dict = {}
    if metadata_file.exists():
        try:
            with open(metadata_file, encoding="utf-8") as fp:
                metadata = json.load(fp)
        except (OSError, ValueError) as e:
            # Best effort: continue with empty metadata, but say so, because
            # the file is rewritten below when a new link is added.
            print(f"Warning: Failed to read metadata.json for KB '{kb_name}': {e}")
            metadata = {}

    linked_folders = metadata.setdefault("linked_folders", [])

    # Already linked: return the stored entry. Entries without an "id" are
    # skipped instead of raising KeyError.
    for item in linked_folders:
        if item.get("id") == folder_id:
            return item

    # Count files with the same scanner used for syncing, and only when a
    # new link is actually created, so the stored count always agrees with
    # scan_linked_folder().
    folder_info = {
        "id": folder_id,
        "path": str(folder),
        "added_at": datetime.now().isoformat(),
        "file_count": len(self.scan_linked_folder(str(folder))),
    }
    linked_folders.append(folder_info)

    with open(metadata_file, "w", encoding="utf-8") as fp:
        json.dump(metadata, fp, indent=2, ensure_ascii=False)

    return folder_info
460
+
461
def get_linked_folders(self, kb_name: str) -> list[dict]:
    """Return the folders linked to a knowledge base.

    Args:
        kb_name: Knowledge base name.

    Returns:
        The ``linked_folders`` list from the knowledge base's
        ``metadata.json``; an empty list when the file is missing, has no
        such entry, or cannot be read.

    Raises:
        ValueError: If ``kb_name`` is not among ``list_knowledge_bases()``.
    """
    if kb_name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {kb_name}")

    metadata_file = self.base_dir / kb_name / "metadata.json"
    if not metadata_file.exists():
        return []

    try:
        with open(metadata_file, encoding="utf-8") as handle:
            return json.load(handle).get("linked_folders", [])
    except Exception:
        # Unreadable or unexpected metadata is treated as "nothing linked".
        return []
486
+
487
def unlink_folder(self, kb_name: str, folder_id: str) -> bool:
    """Remove a linked folder entry from a knowledge base.

    Args:
        kb_name: Knowledge base name.
        folder_id: ID of the linked folder to remove.

    Returns:
        True when an entry was removed and ``metadata.json`` rewritten;
        False when the metadata file is missing or unreadable, or no entry
        has that ID.

    Raises:
        ValueError: If ``kb_name`` is not among ``list_knowledge_bases()``.
    """
    if kb_name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {kb_name}")

    metadata_file = self.base_dir / kb_name / "metadata.json"
    if not metadata_file.exists():
        return False

    try:
        with open(metadata_file, encoding="utf-8") as source:
            metadata = json.load(source)
    except Exception:
        return False

    current = metadata.get("linked_folders", [])
    kept = [entry for entry in current if entry["id"] != folder_id]

    # Same length means no entry carried this ID, so nothing is written.
    if len(kept) == len(current):
        return False

    metadata["linked_folders"] = kept
    with open(metadata_file, "w", encoding="utf-8") as target:
        json.dump(metadata, target, indent=2, ensure_ascii=False)

    return True
525
+
526
def scan_linked_folder(self, folder_path: str) -> list[str]:
    """
    Scan a linked folder and return list of supported file paths.

    The folder is searched recursively. Only regular files are returned:
    a directory whose name happens to end in a supported extension
    (e.g. "archive.md/") is skipped rather than reported as a document.

    Args:
        folder_path: Path to folder ("~" is expanded, then resolved)

    Returns:
        Sorted list of file paths (as strings); empty if the path does not
        exist or is not a directory
    """
    folder = Path(folder_path).expanduser().resolve()

    # is_dir() is already False for a non-existent path.
    if not folder.is_dir():
        return []

    supported_extensions = (".pdf", ".docx", ".doc", ".txt", ".md", ".markdown")
    files = []

    for ext in supported_extensions:
        for candidate in folder.glob(f"**/*{ext}"):
            # The glob matches on name only, so filter out directories.
            if candidate.is_file():
                files.append(str(candidate))

    return sorted(files)
549
+
550
def detect_folder_changes(self, kb_name: str, folder_id: str) -> dict:
    """
    Detect new and modified files in a linked folder since last sync.

    This enables automatic sync of changes from local folders that may
    be synced with cloud services like SharePoint, Google Drive, etc.

    A file is "new" when its path is absent from the folder's recorded
    "synced_files" mapping, and "modified" when its current modification
    time is later than the recorded one (or the recorded value cannot be
    parsed or compared). Directories matching a supported extension and
    entries that cannot be stat'ed (broken symlink, file removed while
    scanning) are skipped instead of aborting the scan.

    Args:
        kb_name: Knowledge base name
        folder_id: Folder ID to check for changes

    Returns:
        Dict with 'new_files', 'modified_files', 'has_changes',
        'new_count' and 'modified_count' keys

    Raises:
        ValueError: If the knowledge base or the linked folder is unknown
    """
    if kb_name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {kb_name}")

    # Get folder info
    folders = self.get_linked_folders(kb_name)
    folder_info = next((f for f in folders if f["id"] == folder_id), None)

    if not folder_info:
        raise ValueError(f"Linked folder not found: {folder_id}")

    folder_path = Path(folder_info["path"]).expanduser().resolve()
    # Tolerate a missing or null mapping (folder never synced).
    synced_files = folder_info.get("synced_files") or {}

    # Scan current files
    supported_extensions = (".pdf", ".docx", ".doc", ".txt", ".md", ".markdown")
    new_files = []
    modified_files = []

    for ext in supported_extensions:
        for file_path in folder_path.glob(f"**/*{ext}"):
            try:
                if not file_path.is_file():
                    continue
                file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
            except OSError:
                # Entry disappeared or is unreadable; nothing to sync.
                continue

            file_str = str(file_path)

            if file_str not in synced_files:
                # New file (not in synced files)
                new_files.append(file_str)
                continue

            try:
                changed = file_mtime > datetime.fromisoformat(synced_files[file_str])
            except (TypeError, ValueError):
                # Unusable recorded timestamp: re-sync to be safe.
                changed = True

            if changed:
                modified_files.append(file_str)

    return {
        "new_files": sorted(new_files),
        "modified_files": sorted(modified_files),
        "has_changes": bool(new_files or modified_files),
        "new_count": len(new_files),
        "modified_count": len(modified_files),
    }
616
+
617
def update_folder_sync_state(self, kb_name: str, folder_id: str, synced_files: list[str]) -> None:
    """
    Update the sync state for a linked folder after successful sync.

    Records which files were synced and their modification times,
    enabling future change detection. The updated record is written back
    to the knowledge base's metadata.json; without that write the state
    would be lost as soon as this call returns.

    Nothing is written when the metadata file is missing or unreadable,
    or when no linked folder carries ``folder_id``.

    Args:
        kb_name: Knowledge base name
        folder_id: Folder ID
        synced_files: List of file paths that were successfully synced

    Raises:
        ValueError: If the knowledge base is not among the known ones
    """
    if kb_name not in self.list_knowledge_bases():
        raise ValueError(f"Knowledge base not found: {kb_name}")

    kb_dir = self.base_dir / kb_name
    metadata_file = kb_dir / "metadata.json"

    if not metadata_file.exists():
        return

    try:
        with open(metadata_file, encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError):
        # Unreadable or malformed metadata (JSON/Unicode decode errors are
        # ValueError subclasses): leave the file untouched.
        return

    linked = metadata.get("linked_folders", [])
    folder = next((entry for entry in linked if entry["id"] == folder_id), None)
    if folder is None:
        return

    # Record sync timestamp
    folder["last_sync"] = datetime.now().isoformat()

    # Record file modification times, merging into any earlier state.
    file_states = folder.get("synced_files") or {}
    for file_path in synced_files:
        try:
            mtime = datetime.fromtimestamp(Path(file_path).stat().st_mtime)
        except (OSError, TypeError, ValueError):
            # File vanished or the path is unusable: skip it so the next
            # change detection still reports it.
            continue
        file_states[file_path] = mtime.isoformat()

    folder["synced_files"] = file_states
    folder["file_count"] = len(file_states)

    # Persist the updated state.
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
665
+
666
+
667
def main():
    """Command-line interface for knowledge base manager"""
    import argparse

    def _yes_no(flag):
        # Render a truthy/falsy value the way the reports print it.
        return "Yes" if flag else "No"

    def _report(error):
        # Every guarded command reports failures in the same format.
        print(f"Error: {error!s}")

    optional_name_help = "Knowledge base name (default if not specified)"
    rule = "=" * 60

    # ---- Argument parsing -------------------------------------------------
    cli = argparse.ArgumentParser(description="Knowledge Base Manager")
    cli.add_argument(
        "--base-dir",
        default="./knowledge_bases",
        help="Base directory for knowledge bases",
    )
    commands = cli.add_subparsers(dest="command", help="Commands")

    # list
    commands.add_parser("list", help="List all knowledge bases")

    # info [name]
    info_cmd = commands.add_parser("info", help="Show knowledge base information")
    info_cmd.add_argument("name", nargs="?", help=optional_name_help)

    # set-default name
    set_default_cmd = commands.add_parser("set-default", help="Set default knowledge base")
    set_default_cmd.add_argument("name", help="Knowledge base name")

    # delete name [--force]
    delete_cmd = commands.add_parser("delete", help="Delete a knowledge base")
    delete_cmd.add_argument("name", help="Knowledge base name")
    delete_cmd.add_argument("--force", action="store_true", help="Skip confirmation")

    # clean-rag [name] [--no-backup]
    clean_cmd = commands.add_parser(
        "clean-rag", help="Clean RAG storage (useful for corrupted data)"
    )
    clean_cmd.add_argument("name", nargs="?", help=optional_name_help)
    clean_cmd.add_argument(
        "--no-backup", action="store_true", help="Don't backup before cleaning"
    )

    args = cli.parse_args()
    manager = KnowledgeBaseManager(args.base_dir)
    command = args.command

    # ---- Dispatch ---------------------------------------------------------
    if command == "list":
        names = manager.list_knowledge_bases()
        default_name = manager.get_default()

        print("\nAvailable Knowledge Bases:")
        print(rule)
        if names:
            for name in names:
                suffix = " (default)" if name == default_name else ""
                print(f" • {name}{suffix}")
        else:
            print("No knowledge bases found")
        print()
        return

    if command == "info":
        try:
            details = manager.get_info(args.name)

            print("\nKnowledge Base Information:")
            print(rule)
            print(f"Name: {details['name']}")
            print(f"Path: {details['path']}")
            print(f"Default: {_yes_no(details['is_default'])}")

            if details.get("metadata"):
                print("\nMetadata:")
                for field, content in details["metadata"].items():
                    print(f" {field}: {content}")

            print("\nStatistics:")
            figures = details["statistics"]
            print(f" Raw documents: {figures['raw_documents']}")
            print(f" Images: {figures['images']}")
            print(f" Content lists: {figures['content_lists']}")
            print(f" RAG initialized: {_yes_no(figures['rag_initialized'])}")

            if "rag" in figures:
                print("\n RAG Statistics:")
                for field, content in figures["rag"].items():
                    print(f" {field}: {content}")

            print()
        except Exception as exc:
            _report(exc)
        return

    if command == "set-default":
        try:
            manager.set_default(args.name)
            print(f"✓ Set '{args.name}' as default knowledge base")
        except Exception as exc:
            _report(exc)
        return

    if command == "delete":
        try:
            if manager.delete_knowledge_base(args.name, confirm=args.force):
                print(f"✓ Deleted knowledge base '{args.name}'")
        except Exception as exc:
            _report(exc)
        return

    if command == "clean-rag":
        try:
            manager.clean_rag_storage(args.name, backup=not args.no_backup)
        except Exception as exc:
            _report(exc)
        return

    # No (or unknown) sub-command: show usage.
    cli.print_help()
780
+
781
# Run the command-line interface only when this module is executed as a
# script; importing the module does not call main().
if __name__ == "__main__":
    main()