jarvis-ai-assistant 0.1.222__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +1143 -245
  3. jarvis/jarvis_agent/agent_manager.py +97 -0
  4. jarvis/jarvis_agent/builtin_input_handler.py +12 -10
  5. jarvis/jarvis_agent/config_editor.py +57 -0
  6. jarvis/jarvis_agent/edit_file_handler.py +392 -99
  7. jarvis/jarvis_agent/event_bus.py +48 -0
  8. jarvis/jarvis_agent/events.py +157 -0
  9. jarvis/jarvis_agent/file_context_handler.py +79 -0
  10. jarvis/jarvis_agent/file_methodology_manager.py +117 -0
  11. jarvis/jarvis_agent/jarvis.py +1117 -147
  12. jarvis/jarvis_agent/main.py +78 -34
  13. jarvis/jarvis_agent/memory_manager.py +195 -0
  14. jarvis/jarvis_agent/methodology_share_manager.py +174 -0
  15. jarvis/jarvis_agent/prompt_manager.py +82 -0
  16. jarvis/jarvis_agent/prompts.py +46 -9
  17. jarvis/jarvis_agent/protocols.py +4 -1
  18. jarvis/jarvis_agent/rewrite_file_handler.py +141 -0
  19. jarvis/jarvis_agent/run_loop.py +146 -0
  20. jarvis/jarvis_agent/session_manager.py +9 -9
  21. jarvis/jarvis_agent/share_manager.py +228 -0
  22. jarvis/jarvis_agent/shell_input_handler.py +23 -3
  23. jarvis/jarvis_agent/stdio_redirect.py +295 -0
  24. jarvis/jarvis_agent/task_analyzer.py +212 -0
  25. jarvis/jarvis_agent/task_manager.py +154 -0
  26. jarvis/jarvis_agent/task_planner.py +496 -0
  27. jarvis/jarvis_agent/tool_executor.py +8 -4
  28. jarvis/jarvis_agent/tool_share_manager.py +139 -0
  29. jarvis/jarvis_agent/user_interaction.py +42 -0
  30. jarvis/jarvis_agent/utils.py +54 -0
  31. jarvis/jarvis_agent/web_bridge.py +189 -0
  32. jarvis/jarvis_agent/web_output_sink.py +53 -0
  33. jarvis/jarvis_agent/web_server.py +751 -0
  34. jarvis/jarvis_c2rust/__init__.py +26 -0
  35. jarvis/jarvis_c2rust/cli.py +613 -0
  36. jarvis/jarvis_c2rust/collector.py +258 -0
  37. jarvis/jarvis_c2rust/library_replacer.py +1122 -0
  38. jarvis/jarvis_c2rust/llm_module_agent.py +1300 -0
  39. jarvis/jarvis_c2rust/optimizer.py +960 -0
  40. jarvis/jarvis_c2rust/scanner.py +1681 -0
  41. jarvis/jarvis_c2rust/transpiler.py +2325 -0
  42. jarvis/jarvis_code_agent/build_validation_config.py +133 -0
  43. jarvis/jarvis_code_agent/code_agent.py +1605 -178
  44. jarvis/jarvis_code_agent/code_analyzer/__init__.py +62 -0
  45. jarvis/jarvis_code_agent/code_analyzer/base_language.py +74 -0
  46. jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +44 -0
  47. jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +102 -0
  48. jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +59 -0
  49. jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +125 -0
  50. jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +69 -0
  51. jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +38 -0
  52. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +44 -0
  53. jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +38 -0
  54. jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +50 -0
  55. jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +93 -0
  56. jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +129 -0
  57. jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +54 -0
  58. jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +154 -0
  59. jarvis/jarvis_code_agent/code_analyzer/build_validator.py +43 -0
  60. jarvis/jarvis_code_agent/code_analyzer/context_manager.py +363 -0
  61. jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +18 -0
  62. jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +132 -0
  63. jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +330 -0
  64. jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +781 -0
  65. jarvis/jarvis_code_agent/code_analyzer/language_registry.py +185 -0
  66. jarvis/jarvis_code_agent/code_analyzer/language_support.py +89 -0
  67. jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +31 -0
  68. jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +231 -0
  69. jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +183 -0
  70. jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +219 -0
  71. jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +209 -0
  72. jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +451 -0
  73. jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +77 -0
  74. jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +48 -0
  75. jarvis/jarvis_code_agent/lint.py +275 -13
  76. jarvis/jarvis_code_agent/utils.py +142 -0
  77. jarvis/jarvis_code_analysis/checklists/loader.py +20 -6
  78. jarvis/jarvis_code_analysis/code_review.py +583 -548
  79. jarvis/jarvis_data/config_schema.json +339 -28
  80. jarvis/jarvis_git_squash/main.py +22 -13
  81. jarvis/jarvis_git_utils/git_commiter.py +171 -55
  82. jarvis/jarvis_mcp/sse_mcp_client.py +22 -15
  83. jarvis/jarvis_mcp/stdio_mcp_client.py +4 -4
  84. jarvis/jarvis_mcp/streamable_mcp_client.py +36 -16
  85. jarvis/jarvis_memory_organizer/memory_organizer.py +753 -0
  86. jarvis/jarvis_methodology/main.py +48 -63
  87. jarvis/jarvis_multi_agent/__init__.py +302 -43
  88. jarvis/jarvis_multi_agent/main.py +70 -24
  89. jarvis/jarvis_platform/ai8.py +40 -23
  90. jarvis/jarvis_platform/base.py +210 -49
  91. jarvis/jarvis_platform/human.py +11 -1
  92. jarvis/jarvis_platform/kimi.py +82 -76
  93. jarvis/jarvis_platform/openai.py +73 -1
  94. jarvis/jarvis_platform/registry.py +8 -15
  95. jarvis/jarvis_platform/tongyi.py +115 -101
  96. jarvis/jarvis_platform/yuanbao.py +89 -63
  97. jarvis/jarvis_platform_manager/main.py +194 -132
  98. jarvis/jarvis_platform_manager/service.py +122 -86
  99. jarvis/jarvis_rag/cli.py +156 -53
  100. jarvis/jarvis_rag/embedding_manager.py +155 -12
  101. jarvis/jarvis_rag/llm_interface.py +10 -13
  102. jarvis/jarvis_rag/query_rewriter.py +63 -12
  103. jarvis/jarvis_rag/rag_pipeline.py +222 -40
  104. jarvis/jarvis_rag/reranker.py +26 -3
  105. jarvis/jarvis_rag/retriever.py +270 -14
  106. jarvis/jarvis_sec/__init__.py +3605 -0
  107. jarvis/jarvis_sec/checkers/__init__.py +32 -0
  108. jarvis/jarvis_sec/checkers/c_checker.py +2680 -0
  109. jarvis/jarvis_sec/checkers/rust_checker.py +1108 -0
  110. jarvis/jarvis_sec/cli.py +116 -0
  111. jarvis/jarvis_sec/report.py +257 -0
  112. jarvis/jarvis_sec/status.py +264 -0
  113. jarvis/jarvis_sec/types.py +20 -0
  114. jarvis/jarvis_sec/workflow.py +219 -0
  115. jarvis/jarvis_smart_shell/main.py +405 -137
  116. jarvis/jarvis_stats/__init__.py +13 -0
  117. jarvis/jarvis_stats/cli.py +387 -0
  118. jarvis/jarvis_stats/stats.py +711 -0
  119. jarvis/jarvis_stats/storage.py +612 -0
  120. jarvis/jarvis_stats/visualizer.py +282 -0
  121. jarvis/jarvis_tools/ask_user.py +1 -0
  122. jarvis/jarvis_tools/base.py +18 -2
  123. jarvis/jarvis_tools/clear_memory.py +239 -0
  124. jarvis/jarvis_tools/cli/main.py +220 -144
  125. jarvis/jarvis_tools/execute_script.py +52 -12
  126. jarvis/jarvis_tools/file_analyzer.py +17 -12
  127. jarvis/jarvis_tools/generate_new_tool.py +46 -24
  128. jarvis/jarvis_tools/read_code.py +277 -18
  129. jarvis/jarvis_tools/read_symbols.py +141 -0
  130. jarvis/jarvis_tools/read_webpage.py +86 -13
  131. jarvis/jarvis_tools/registry.py +294 -90
  132. jarvis/jarvis_tools/retrieve_memory.py +227 -0
  133. jarvis/jarvis_tools/save_memory.py +194 -0
  134. jarvis/jarvis_tools/search_web.py +62 -28
  135. jarvis/jarvis_tools/sub_agent.py +205 -0
  136. jarvis/jarvis_tools/sub_code_agent.py +217 -0
  137. jarvis/jarvis_tools/virtual_tty.py +330 -62
  138. jarvis/jarvis_utils/builtin_replace_map.py +4 -5
  139. jarvis/jarvis_utils/clipboard.py +90 -0
  140. jarvis/jarvis_utils/config.py +607 -50
  141. jarvis/jarvis_utils/embedding.py +3 -0
  142. jarvis/jarvis_utils/fzf.py +57 -0
  143. jarvis/jarvis_utils/git_utils.py +251 -29
  144. jarvis/jarvis_utils/globals.py +174 -17
  145. jarvis/jarvis_utils/http.py +58 -79
  146. jarvis/jarvis_utils/input.py +899 -153
  147. jarvis/jarvis_utils/methodology.py +210 -83
  148. jarvis/jarvis_utils/output.py +220 -137
  149. jarvis/jarvis_utils/utils.py +1906 -135
  150. jarvis_ai_assistant-0.7.0.dist-info/METADATA +465 -0
  151. jarvis_ai_assistant-0.7.0.dist-info/RECORD +192 -0
  152. {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/entry_points.txt +8 -2
  153. jarvis/jarvis_git_details/main.py +0 -265
  154. jarvis/jarvis_platform/oyi.py +0 -357
  155. jarvis/jarvis_tools/edit_file.py +0 -255
  156. jarvis/jarvis_tools/rewrite_file.py +0 -195
  157. jarvis_ai_assistant-0.1.222.dist-info/METADATA +0 -767
  158. jarvis_ai_assistant-0.1.222.dist-info/RECORD +0 -110
  159. /jarvis/{jarvis_git_details → jarvis_memory_organizer}/__init__.py +0 -0
  160. {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/WHEEL +0 -0
  161. {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/licenses/LICENSE +0 -0
  162. {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
1
1
  import os
2
2
  import pickle
3
- from typing import Any, Dict, List, cast
3
+ import json
4
+ import hashlib
5
+ from typing import Any, Dict, List, Optional, cast
4
6
 
5
7
  import chromadb
6
8
  from langchain.docstore.document import Document
@@ -8,6 +10,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
10
  from rank_bm25 import BM25Okapi # type: ignore
9
11
 
10
12
  from .embedding_manager import EmbeddingManager
13
+ from jarvis.jarvis_utils.output import OutputType, PrettyOutput
11
14
 
12
15
 
13
16
  class ChromaRetriever:
@@ -39,35 +42,271 @@ class ChromaRetriever:
39
42
  self.collection = self.client.get_or_create_collection(
40
43
  name=self.collection_name
41
44
  )
42
- print(
43
- f"ChromaDB 客户端已在 '{db_path}' 初始化,集合为 '{collection_name}'。"
45
+ PrettyOutput.print(
46
+ f"ChromaDB 客户端已在 '{db_path}' 初始化,集合为 '{collection_name}'。",
47
+ OutputType.SUCCESS,
44
48
  )
45
49
 
46
50
  # BM25索引设置
47
51
  self.bm25_index_path = os.path.join(self.db_path, f"{collection_name}_bm25.pkl")
48
52
  self._load_or_initialize_bm25()
53
+ # 清单文件用于检测源文件的变更/删除
54
+ self.manifest_path = os.path.join(
55
+ self.db_path, f"{collection_name}_manifest.json"
56
+ )
49
57
 
50
58
  def _load_or_initialize_bm25(self):
51
59
  """从磁盘加载BM25索引或初始化一个新索引。"""
52
60
  if os.path.exists(self.bm25_index_path):
53
- print("🔍 正在加载现有的 BM25 索引...")
61
+ PrettyOutput.print("正在加载现有的 BM25 索引...", OutputType.INFO)
54
62
  with open(self.bm25_index_path, "rb") as f:
55
63
  data = pickle.load(f)
56
64
  self.bm25_corpus = data["corpus"]
57
65
  self.bm25_index = BM25Okapi(self.bm25_corpus)
58
- print("BM25 索引加载成功。")
66
+ PrettyOutput.print("BM25 索引加载成功。", OutputType.SUCCESS)
59
67
  else:
60
- print("⚠️ 未找到 BM25 索引,将初始化一个新的。")
68
+ PrettyOutput.print(
69
+ "未找到 BM25 索引,将初始化一个新的。", OutputType.WARNING
70
+ )
61
71
  self.bm25_corpus = []
62
72
  self.bm25_index = None
63
73
 
64
74
  def _save_bm25_index(self):
65
75
  """将BM25索引保存到磁盘。"""
66
76
  if self.bm25_index:
67
- print("💾 正在保存 BM25 索引...")
77
+ PrettyOutput.print("正在保存 BM25 索引...", OutputType.INFO)
68
78
  with open(self.bm25_index_path, "wb") as f:
69
79
  pickle.dump({"corpus": self.bm25_corpus, "index": self.bm25_index}, f)
70
- print("BM25 索引保存成功。")
80
+ PrettyOutput.print("BM25 索引保存成功。", OutputType.SUCCESS)
81
+
82
+ def _load_manifest(self) -> Dict[str, Dict[str, Any]]:
83
+ """加载已索引文件清单,用于变更检测。"""
84
+ if os.path.exists(self.manifest_path):
85
+ try:
86
+ with open(self.manifest_path, "r", encoding="utf-8") as f:
87
+ data = json.load(f)
88
+ if isinstance(data, dict):
89
+ return data # type: ignore[return-value]
90
+ except Exception:
91
+ pass
92
+ return {}
93
+
94
+ def _save_manifest(self, manifest: Dict[str, Dict[str, Any]]) -> None:
95
+ """保存已索引文件清单。"""
96
+ try:
97
+ with open(self.manifest_path, "w", encoding="utf-8") as f:
98
+ json.dump(manifest, f, ensure_ascii=False, indent=2)
99
+ except Exception as e:
100
+ PrettyOutput.print(f"保存索引清单失败: {e}", OutputType.WARNING)
101
+
102
+ def _compute_md5(
103
+ self, file_path: str, chunk_size: int = 1024 * 1024
104
+ ) -> Optional[str]:
105
+ """流式计算文件的MD5,避免占用过多内存。失败时返回None。"""
106
+ try:
107
+ md5 = hashlib.md5()
108
+ with open(file_path, "rb") as f:
109
+ while True:
110
+ data = f.read(chunk_size)
111
+ if not data:
112
+ break
113
+ md5.update(data)
114
+ return md5.hexdigest()
115
+ except Exception:
116
+ return None
117
+
118
+ def _update_manifest_with_sources(self, sources: List[str]) -> None:
119
+ """根据本次新增文档的来源,更新索引清单(记录mtime与size)。"""
120
+ manifest = self._load_manifest()
121
+ updated = 0
122
+ for src in set(sources):
123
+ try:
124
+ if isinstance(src, str) and os.path.exists(src):
125
+ st = os.stat(src)
126
+ entry: Dict[str, Any] = {
127
+ "mtime": float(st.st_mtime),
128
+ "size": int(st.st_size),
129
+ }
130
+ md5sum = self._compute_md5(src)
131
+ if md5sum:
132
+ entry["md5"] = md5sum
133
+ manifest[src] = entry # type: ignore[dict-item]
134
+ updated += 1
135
+ except Exception:
136
+ continue
137
+ if updated > 0:
138
+ self._save_manifest(manifest)
139
+ PrettyOutput.print(
140
+ f"已更新索引清单,记录 {updated} 个源文件状态。", OutputType.INFO
141
+ )
142
+
143
+ def _detect_changed_or_deleted(self) -> Dict[str, List[str]]:
144
+ """检测已记录的源文件是否发生变化或被删除。"""
145
+ manifest = self._load_manifest()
146
+ changed: List[str] = []
147
+ deleted: List[str] = []
148
+ for src, info in manifest.items():
149
+ try:
150
+ if not os.path.exists(src):
151
+ deleted.append(src)
152
+ continue
153
+ st = os.stat(src)
154
+ size_changed = int(info.get("size", -1)) != int(st.st_size)
155
+ if size_changed:
156
+ changed.append(src)
157
+ continue
158
+ md5_old = info.get("md5")
159
+ if md5_old:
160
+ # 仅在mtime变化时计算md5以降低开销
161
+ mtime_changed = (
162
+ abs(float(info.get("mtime", 0.0)) - float(st.st_mtime)) >= 1e-6
163
+ )
164
+ if mtime_changed:
165
+ md5_new = self._compute_md5(src)
166
+ if not md5_new or md5_new != md5_old:
167
+ changed.append(src)
168
+ else:
169
+ # 没有记录md5,回退使用mtime判断
170
+ mtime_changed = (
171
+ abs(float(info.get("mtime", 0.0)) - float(st.st_mtime)) >= 1e-6
172
+ )
173
+ if mtime_changed:
174
+ changed.append(src)
175
+ except Exception:
176
+ # 无法读取文件状态,视为发生变化
177
+ changed.append(src)
178
+ return {"changed": changed, "deleted": deleted}
179
+
180
+ def _warn_if_sources_changed(self) -> None:
181
+ """如发现已索引文件变化或删除,给出提醒。"""
182
+ result = self._detect_changed_or_deleted()
183
+ changed = result["changed"]
184
+ deleted = result["deleted"]
185
+ if not changed and not deleted:
186
+ return
187
+ # 为避免在循环中逐条打印,先拼接后统一打印
188
+ lines: List[str] = []
189
+ if changed:
190
+ lines.append(
191
+ f"检测到 {len(changed)} 个已索引文件发生变化,建议重新索引以保证检索准确性。"
192
+ )
193
+ lines.extend([f" 变更: {p}" for p in changed[:5]])
194
+ if len(changed) > 5:
195
+ lines.append(f" ... 以及另外 {len(changed) - 5} 个文件")
196
+ if deleted:
197
+ lines.append(
198
+ f"检测到 {len(deleted)} 个已索引文件已被删除,建议清理并重新索引。"
199
+ )
200
+ lines.extend([f" 删除: {p}" for p in deleted[:5]])
201
+ if len(deleted) > 5:
202
+ lines.append(f" ... 以及另外 {len(deleted) - 5} 个文件")
203
+ lines.append(
204
+ "提示:请使用 'jarvis-rag add <路径>' 重新索引相关文件,以更新向量库与BM25索引。"
205
+ )
206
+ PrettyOutput.print("\n".join(lines), OutputType.WARNING)
207
+
208
+ def detect_index_changes(self) -> Dict[str, List[str]]:
209
+ """
210
+ 公共方法:检测索引变更(变更与删除)。
211
+ 返回:
212
+ {'changed': List[str], 'deleted': List[str]}
213
+ """
214
+ return self._detect_changed_or_deleted()
215
+
216
+ def _remove_sources_from_manifest(self, sources: List[str]) -> None:
217
+ """从manifest中移除指定源文件记录并保存。"""
218
+ if not sources:
219
+ return
220
+ manifest = self._load_manifest()
221
+ removed = 0
222
+ for src in set(sources):
223
+ if src in manifest:
224
+ manifest.pop(src, None)
225
+ removed += 1
226
+ if removed > 0:
227
+ self._save_manifest(manifest)
228
+ PrettyOutput.print(
229
+ f"已从索引清单中移除 {removed} 个已删除的源文件记录。", OutputType.INFO
230
+ )
231
+
232
+ def update_index_for_changes(self, changed: List[str], deleted: List[str]) -> None:
233
+ """
234
+ 公共方法:根据变更与删除列表更新索引。
235
+ - 对 deleted: 从向量库按 metadata.source 删除
236
+ - 对 changed: 先删除旧条目,再从源文件重建并添加
237
+ - 最后:从集合重建BM25索引,更新manifest
238
+ """
239
+ changed = list(
240
+ dict.fromkeys([p for p in (changed or []) if isinstance(p, str)])
241
+ )
242
+ deleted = list(
243
+ dict.fromkeys([p for p in (deleted or []) if isinstance(p, str)])
244
+ )
245
+
246
+ if not changed and not deleted:
247
+ return
248
+
249
+ # 先处理删除
250
+ delete_errors: List[str] = []
251
+ for src in deleted:
252
+ try:
253
+ self.collection.delete(where={"source": src}) # type: ignore[arg-type]
254
+ except Exception as e:
255
+ delete_errors.append(f"删除源 '{src}' 时出错: {e}")
256
+ if delete_errors:
257
+ PrettyOutput.print("\n".join(delete_errors), OutputType.WARNING)
258
+
259
+ # 再处理变更(重建)
260
+ docs_to_add: List[Document] = []
261
+ rebuild_errors: List[str] = []
262
+ for src in changed:
263
+ try:
264
+ # 删除旧条目
265
+ try:
266
+ self.collection.delete(where={"source": src}) # type: ignore[arg-type]
267
+ except Exception:
268
+ pass
269
+ # 读取源文件内容(作为单文档载入,由 add_documents 进行拆分与嵌入)
270
+ with open(src, "r", encoding="utf-8", errors="ignore") as f:
271
+ content = f.read()
272
+ docs_to_add.append(
273
+ Document(page_content=content, metadata={"source": src})
274
+ )
275
+ except Exception as e:
276
+ rebuild_errors.append(f"重建源 '{src}' 内容时出错: {e}")
277
+ if rebuild_errors:
278
+ PrettyOutput.print("\n".join(rebuild_errors), OutputType.WARNING)
279
+
280
+ if docs_to_add:
281
+ try:
282
+ # 复用现有拆分与嵌入逻辑
283
+ self.add_documents(docs_to_add)
284
+ except Exception as e:
285
+ PrettyOutput.print(f"添加变更文档到索引时出错: {e}", OutputType.ERROR)
286
+
287
+ # 重建BM25索引,确保删除后的语料被清理
288
+ try:
289
+ all_docs_in_collection = self.collection.get()
290
+ all_documents = all_docs_in_collection.get("documents") or []
291
+ self.bm25_corpus = [str(text).split() for text in all_documents if text]
292
+ self.bm25_index = BM25Okapi(self.bm25_corpus) if self.bm25_corpus else None
293
+ self._save_bm25_index()
294
+ except Exception as e:
295
+ PrettyOutput.print(f"重建BM25索引失败: {e}", OutputType.WARNING)
296
+
297
+ # 更新manifest:变更文件更新状态;删除文件从清单中移除
298
+ try:
299
+ if changed:
300
+ self._update_manifest_with_sources(changed)
301
+ if deleted:
302
+ self._remove_sources_from_manifest(deleted)
303
+ except Exception as e:
304
+ PrettyOutput.print(f"更新索引清单时出错: {e}", OutputType.WARNING)
305
+
306
+ PrettyOutput.print(
307
+ f"索引已更新:变更 {len(changed)} 个,删除 {len(deleted)} 个。",
308
+ OutputType.SUCCESS,
309
+ )
71
310
 
72
311
  def add_documents(
73
312
  self, documents: List[Document], chunk_size=1000, chunk_overlap=100
@@ -80,7 +319,10 @@ class ChromaRetriever:
80
319
  )
81
320
  chunks = text_splitter.split_documents(documents)
82
321
 
83
- print(f"📄 已将 {len(documents)} 个文档拆分为 {len(chunks)} 个块。")
322
+ PrettyOutput.print(
323
+ f"已将 {len(documents)} 个文档拆分为 {len(chunks)} 个块。",
324
+ OutputType.INFO,
325
+ )
84
326
 
85
327
  if not chunks:
86
328
  return
@@ -99,19 +341,33 @@ class ChromaRetriever:
99
341
  documents=chunk_texts,
100
342
  metadatas=cast(Any, metadatas),
101
343
  )
102
- print(f"✅ 成功将 {len(chunks)} 个块添加到 ChromaDB 集合中。")
344
+ PrettyOutput.print(
345
+ f"成功将 {len(chunks)} 个块添加到 ChromaDB 集合中。",
346
+ OutputType.SUCCESS,
347
+ )
103
348
 
104
349
  # 更新并保存BM25索引
105
350
  tokenized_chunks = [doc.split() for doc in chunk_texts]
106
351
  self.bm25_corpus.extend(tokenized_chunks)
107
352
  self.bm25_index = BM25Okapi(self.bm25_corpus)
108
353
  self._save_bm25_index()
109
-
110
- def retrieve(self, query: str, n_results: int = 5) -> List[Document]:
354
+ # 更新索引清单(用于检测源文件变更/删除)
355
+ source_list = [
356
+ md.get("source")
357
+ for md in metadatas
358
+ if md and isinstance(md.get("source"), str)
359
+ ]
360
+ self._update_manifest_with_sources(cast(List[str], source_list))
361
+
362
+ def retrieve(
363
+ self, query: str, n_results: int = 5, use_bm25: bool = True
364
+ ) -> List[Document]:
111
365
  """
112
366
  使用向量搜索和BM25执行混合检索,然后使用倒数排序融合(RRF)
113
367
  对结果进行融合。
114
368
  """
369
+ # 在检索前检查源文件变更/删除并提醒
370
+ self._warn_if_sources_changed()
115
371
  # 1. 向量搜索 (ChromaDB)
116
372
  query_embedding = self.embedding_manager.embed_query(query)
117
373
  vector_results = self.collection.query(
@@ -121,7 +377,7 @@ class ChromaRetriever:
121
377
 
122
378
  # 2. 关键字搜索 (BM25)
123
379
  bm25_docs = []
124
- if self.bm25_index:
380
+ if self.bm25_index and use_bm25:
125
381
  tokenized_query = query.split()
126
382
  doc_scores = self.bm25_index.get_scores(tokenized_query)
127
383
 
@@ -144,7 +400,7 @@ class ChromaRetriever:
144
400
  ]
145
401
 
146
402
  # 按分数排序并取最高结果
147
- bm25_results_with_docs.sort(key=lambda x: x[2], reverse=True)
403
+ bm25_results_with_docs.sort(key=lambda x: x[2], reverse=True) # type: ignore
148
404
 
149
405
  for doc_text, metadata, _ in bm25_results_with_docs[: n_results * 2]:
150
406
  bm25_docs.append(Document(page_content=doc_text, metadata=metadata))