jarvis-ai-assistant 0.1.222__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +1143 -245
- jarvis/jarvis_agent/agent_manager.py +97 -0
- jarvis/jarvis_agent/builtin_input_handler.py +12 -10
- jarvis/jarvis_agent/config_editor.py +57 -0
- jarvis/jarvis_agent/edit_file_handler.py +392 -99
- jarvis/jarvis_agent/event_bus.py +48 -0
- jarvis/jarvis_agent/events.py +157 -0
- jarvis/jarvis_agent/file_context_handler.py +79 -0
- jarvis/jarvis_agent/file_methodology_manager.py +117 -0
- jarvis/jarvis_agent/jarvis.py +1117 -147
- jarvis/jarvis_agent/main.py +78 -34
- jarvis/jarvis_agent/memory_manager.py +195 -0
- jarvis/jarvis_agent/methodology_share_manager.py +174 -0
- jarvis/jarvis_agent/prompt_manager.py +82 -0
- jarvis/jarvis_agent/prompts.py +46 -9
- jarvis/jarvis_agent/protocols.py +4 -1
- jarvis/jarvis_agent/rewrite_file_handler.py +141 -0
- jarvis/jarvis_agent/run_loop.py +146 -0
- jarvis/jarvis_agent/session_manager.py +9 -9
- jarvis/jarvis_agent/share_manager.py +228 -0
- jarvis/jarvis_agent/shell_input_handler.py +23 -3
- jarvis/jarvis_agent/stdio_redirect.py +295 -0
- jarvis/jarvis_agent/task_analyzer.py +212 -0
- jarvis/jarvis_agent/task_manager.py +154 -0
- jarvis/jarvis_agent/task_planner.py +496 -0
- jarvis/jarvis_agent/tool_executor.py +8 -4
- jarvis/jarvis_agent/tool_share_manager.py +139 -0
- jarvis/jarvis_agent/user_interaction.py +42 -0
- jarvis/jarvis_agent/utils.py +54 -0
- jarvis/jarvis_agent/web_bridge.py +189 -0
- jarvis/jarvis_agent/web_output_sink.py +53 -0
- jarvis/jarvis_agent/web_server.py +751 -0
- jarvis/jarvis_c2rust/__init__.py +26 -0
- jarvis/jarvis_c2rust/cli.py +613 -0
- jarvis/jarvis_c2rust/collector.py +258 -0
- jarvis/jarvis_c2rust/library_replacer.py +1122 -0
- jarvis/jarvis_c2rust/llm_module_agent.py +1300 -0
- jarvis/jarvis_c2rust/optimizer.py +960 -0
- jarvis/jarvis_c2rust/scanner.py +1681 -0
- jarvis/jarvis_c2rust/transpiler.py +2325 -0
- jarvis/jarvis_code_agent/build_validation_config.py +133 -0
- jarvis/jarvis_code_agent/code_agent.py +1605 -178
- jarvis/jarvis_code_agent/code_analyzer/__init__.py +62 -0
- jarvis/jarvis_code_agent/code_analyzer/base_language.py +74 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/__init__.py +44 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/base.py +102 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/cmake.py +59 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/detector.py +125 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/fallback.py +69 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/go.py +38 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_gradle.py +44 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/java_maven.py +38 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/makefile.py +50 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/nodejs.py +93 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/python.py +129 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/rust.py +54 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator/validator.py +154 -0
- jarvis/jarvis_code_agent/code_analyzer/build_validator.py +43 -0
- jarvis/jarvis_code_agent/code_analyzer/context_manager.py +363 -0
- jarvis/jarvis_code_agent/code_analyzer/context_recommender.py +18 -0
- jarvis/jarvis_code_agent/code_analyzer/dependency_analyzer.py +132 -0
- jarvis/jarvis_code_agent/code_analyzer/file_ignore.py +330 -0
- jarvis/jarvis_code_agent/code_analyzer/impact_analyzer.py +781 -0
- jarvis/jarvis_code_agent/code_analyzer/language_registry.py +185 -0
- jarvis/jarvis_code_agent/code_analyzer/language_support.py +89 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/__init__.py +31 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/c_cpp_language.py +231 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/go_language.py +183 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/python_language.py +219 -0
- jarvis/jarvis_code_agent/code_analyzer/languages/rust_language.py +209 -0
- jarvis/jarvis_code_agent/code_analyzer/llm_context_recommender.py +451 -0
- jarvis/jarvis_code_agent/code_analyzer/symbol_extractor.py +77 -0
- jarvis/jarvis_code_agent/code_analyzer/tree_sitter_extractor.py +48 -0
- jarvis/jarvis_code_agent/lint.py +275 -13
- jarvis/jarvis_code_agent/utils.py +142 -0
- jarvis/jarvis_code_analysis/checklists/loader.py +20 -6
- jarvis/jarvis_code_analysis/code_review.py +583 -548
- jarvis/jarvis_data/config_schema.json +339 -28
- jarvis/jarvis_git_squash/main.py +22 -13
- jarvis/jarvis_git_utils/git_commiter.py +171 -55
- jarvis/jarvis_mcp/sse_mcp_client.py +22 -15
- jarvis/jarvis_mcp/stdio_mcp_client.py +4 -4
- jarvis/jarvis_mcp/streamable_mcp_client.py +36 -16
- jarvis/jarvis_memory_organizer/memory_organizer.py +753 -0
- jarvis/jarvis_methodology/main.py +48 -63
- jarvis/jarvis_multi_agent/__init__.py +302 -43
- jarvis/jarvis_multi_agent/main.py +70 -24
- jarvis/jarvis_platform/ai8.py +40 -23
- jarvis/jarvis_platform/base.py +210 -49
- jarvis/jarvis_platform/human.py +11 -1
- jarvis/jarvis_platform/kimi.py +82 -76
- jarvis/jarvis_platform/openai.py +73 -1
- jarvis/jarvis_platform/registry.py +8 -15
- jarvis/jarvis_platform/tongyi.py +115 -101
- jarvis/jarvis_platform/yuanbao.py +89 -63
- jarvis/jarvis_platform_manager/main.py +194 -132
- jarvis/jarvis_platform_manager/service.py +122 -86
- jarvis/jarvis_rag/cli.py +156 -53
- jarvis/jarvis_rag/embedding_manager.py +155 -12
- jarvis/jarvis_rag/llm_interface.py +10 -13
- jarvis/jarvis_rag/query_rewriter.py +63 -12
- jarvis/jarvis_rag/rag_pipeline.py +222 -40
- jarvis/jarvis_rag/reranker.py +26 -3
- jarvis/jarvis_rag/retriever.py +270 -14
- jarvis/jarvis_sec/__init__.py +3605 -0
- jarvis/jarvis_sec/checkers/__init__.py +32 -0
- jarvis/jarvis_sec/checkers/c_checker.py +2680 -0
- jarvis/jarvis_sec/checkers/rust_checker.py +1108 -0
- jarvis/jarvis_sec/cli.py +116 -0
- jarvis/jarvis_sec/report.py +257 -0
- jarvis/jarvis_sec/status.py +264 -0
- jarvis/jarvis_sec/types.py +20 -0
- jarvis/jarvis_sec/workflow.py +219 -0
- jarvis/jarvis_smart_shell/main.py +405 -137
- jarvis/jarvis_stats/__init__.py +13 -0
- jarvis/jarvis_stats/cli.py +387 -0
- jarvis/jarvis_stats/stats.py +711 -0
- jarvis/jarvis_stats/storage.py +612 -0
- jarvis/jarvis_stats/visualizer.py +282 -0
- jarvis/jarvis_tools/ask_user.py +1 -0
- jarvis/jarvis_tools/base.py +18 -2
- jarvis/jarvis_tools/clear_memory.py +239 -0
- jarvis/jarvis_tools/cli/main.py +220 -144
- jarvis/jarvis_tools/execute_script.py +52 -12
- jarvis/jarvis_tools/file_analyzer.py +17 -12
- jarvis/jarvis_tools/generate_new_tool.py +46 -24
- jarvis/jarvis_tools/read_code.py +277 -18
- jarvis/jarvis_tools/read_symbols.py +141 -0
- jarvis/jarvis_tools/read_webpage.py +86 -13
- jarvis/jarvis_tools/registry.py +294 -90
- jarvis/jarvis_tools/retrieve_memory.py +227 -0
- jarvis/jarvis_tools/save_memory.py +194 -0
- jarvis/jarvis_tools/search_web.py +62 -28
- jarvis/jarvis_tools/sub_agent.py +205 -0
- jarvis/jarvis_tools/sub_code_agent.py +217 -0
- jarvis/jarvis_tools/virtual_tty.py +330 -62
- jarvis/jarvis_utils/builtin_replace_map.py +4 -5
- jarvis/jarvis_utils/clipboard.py +90 -0
- jarvis/jarvis_utils/config.py +607 -50
- jarvis/jarvis_utils/embedding.py +3 -0
- jarvis/jarvis_utils/fzf.py +57 -0
- jarvis/jarvis_utils/git_utils.py +251 -29
- jarvis/jarvis_utils/globals.py +174 -17
- jarvis/jarvis_utils/http.py +58 -79
- jarvis/jarvis_utils/input.py +899 -153
- jarvis/jarvis_utils/methodology.py +210 -83
- jarvis/jarvis_utils/output.py +220 -137
- jarvis/jarvis_utils/utils.py +1906 -135
- jarvis_ai_assistant-0.7.0.dist-info/METADATA +465 -0
- jarvis_ai_assistant-0.7.0.dist-info/RECORD +192 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/entry_points.txt +8 -2
- jarvis/jarvis_git_details/main.py +0 -265
- jarvis/jarvis_platform/oyi.py +0 -357
- jarvis/jarvis_tools/edit_file.py +0 -255
- jarvis/jarvis_tools/rewrite_file.py +0 -195
- jarvis_ai_assistant-0.1.222.dist-info/METADATA +0 -767
- jarvis_ai_assistant-0.1.222.dist-info/RECORD +0 -110
- /jarvis/{jarvis_git_details → jarvis_memory_organizer}/__init__.py +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {jarvis_ai_assistant-0.1.222.dist-info → jarvis_ai_assistant-0.7.0.dist-info}/top_level.txt +0 -0
jarvis/jarvis_rag/retriever.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pickle
|
|
3
|
-
|
|
3
|
+
import json
|
|
4
|
+
import hashlib
|
|
5
|
+
from typing import Any, Dict, List, Optional, cast
|
|
4
6
|
|
|
5
7
|
import chromadb
|
|
6
8
|
from langchain.docstore.document import Document
|
|
@@ -8,6 +10,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
8
10
|
from rank_bm25 import BM25Okapi # type: ignore
|
|
9
11
|
|
|
10
12
|
from .embedding_manager import EmbeddingManager
|
|
13
|
+
from jarvis.jarvis_utils.output import OutputType, PrettyOutput
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
class ChromaRetriever:
|
|
@@ -39,35 +42,271 @@ class ChromaRetriever:
|
|
|
39
42
|
self.collection = self.client.get_or_create_collection(
|
|
40
43
|
name=self.collection_name
|
|
41
44
|
)
|
|
42
|
-
print(
|
|
43
|
-
f"
|
|
45
|
+
PrettyOutput.print(
|
|
46
|
+
f"ChromaDB 客户端已在 '{db_path}' 初始化,集合为 '{collection_name}'。",
|
|
47
|
+
OutputType.SUCCESS,
|
|
44
48
|
)
|
|
45
49
|
|
|
46
50
|
# BM25索引设置
|
|
47
51
|
self.bm25_index_path = os.path.join(self.db_path, f"{collection_name}_bm25.pkl")
|
|
48
52
|
self._load_or_initialize_bm25()
|
|
53
|
+
# 清单文件用于检测源文件的变更/删除
|
|
54
|
+
self.manifest_path = os.path.join(
|
|
55
|
+
self.db_path, f"{collection_name}_manifest.json"
|
|
56
|
+
)
|
|
49
57
|
|
|
50
58
|
def _load_or_initialize_bm25(self):
    """Load the persisted BM25 corpus from disk, or start with an empty one.

    When the index file exists, the pickled corpus is read and a fresh
    ``BM25Okapi`` index is built from it; otherwise the corpus is set to an
    empty list and the index to ``None``.
    """
    index_file = self.bm25_index_path
    if not os.path.exists(index_file):
        PrettyOutput.print(
            "未找到 BM25 索引,将初始化一个新的。", OutputType.WARNING
        )
        self.bm25_corpus = []
        self.bm25_index = None
        return

    PrettyOutput.print("正在加载现有的 BM25 索引...", OutputType.INFO)
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # this presumes the index file under db_path is trusted - confirm.
    with open(index_file, "rb") as handle:
        payload = pickle.load(handle)
    # Only the stored corpus is used; the index itself is rebuilt from it.
    self.bm25_corpus = payload["corpus"]
    self.bm25_index = BM25Okapi(self.bm25_corpus)
    PrettyOutput.print("BM25 索引加载成功。", OutputType.SUCCESS)
|
|
63
73
|
|
|
64
74
|
def _save_bm25_index(self):
    """Persist the BM25 corpus and index to disk.

    Nothing is written when no index is currently held.
    """
    if not self.bm25_index:
        return
    PrettyOutput.print("正在保存 BM25 索引...", OutputType.INFO)
    with open(self.bm25_index_path, "wb") as handle:
        payload = {"corpus": self.bm25_corpus, "index": self.bm25_index}
        pickle.dump(payload, handle)
    PrettyOutput.print("BM25 索引保存成功。", OutputType.SUCCESS)
|
|
81
|
+
|
|
82
|
+
def _load_manifest(self) -> Dict[str, Dict[str, Any]]:
|
|
83
|
+
"""加载已索引文件清单,用于变更检测。"""
|
|
84
|
+
if os.path.exists(self.manifest_path):
|
|
85
|
+
try:
|
|
86
|
+
with open(self.manifest_path, "r", encoding="utf-8") as f:
|
|
87
|
+
data = json.load(f)
|
|
88
|
+
if isinstance(data, dict):
|
|
89
|
+
return data # type: ignore[return-value]
|
|
90
|
+
except Exception:
|
|
91
|
+
pass
|
|
92
|
+
return {}
|
|
93
|
+
|
|
94
|
+
def _save_manifest(self, manifest: Dict[str, Dict[str, Any]]) -> None:
|
|
95
|
+
"""保存已索引文件清单。"""
|
|
96
|
+
try:
|
|
97
|
+
with open(self.manifest_path, "w", encoding="utf-8") as f:
|
|
98
|
+
json.dump(manifest, f, ensure_ascii=False, indent=2)
|
|
99
|
+
except Exception as e:
|
|
100
|
+
PrettyOutput.print(f"保存索引清单失败: {e}", OutputType.WARNING)
|
|
101
|
+
|
|
102
|
+
def _compute_md5(
|
|
103
|
+
self, file_path: str, chunk_size: int = 1024 * 1024
|
|
104
|
+
) -> Optional[str]:
|
|
105
|
+
"""流式计算文件的MD5,避免占用过多内存。失败时返回None。"""
|
|
106
|
+
try:
|
|
107
|
+
md5 = hashlib.md5()
|
|
108
|
+
with open(file_path, "rb") as f:
|
|
109
|
+
while True:
|
|
110
|
+
data = f.read(chunk_size)
|
|
111
|
+
if not data:
|
|
112
|
+
break
|
|
113
|
+
md5.update(data)
|
|
114
|
+
return md5.hexdigest()
|
|
115
|
+
except Exception:
|
|
116
|
+
return None
|
|
117
|
+
|
|
118
|
+
def _update_manifest_with_sources(self, sources: List[str]) -> None:
|
|
119
|
+
"""根据本次新增文档的来源,更新索引清单(记录mtime与size)。"""
|
|
120
|
+
manifest = self._load_manifest()
|
|
121
|
+
updated = 0
|
|
122
|
+
for src in set(sources):
|
|
123
|
+
try:
|
|
124
|
+
if isinstance(src, str) and os.path.exists(src):
|
|
125
|
+
st = os.stat(src)
|
|
126
|
+
entry: Dict[str, Any] = {
|
|
127
|
+
"mtime": float(st.st_mtime),
|
|
128
|
+
"size": int(st.st_size),
|
|
129
|
+
}
|
|
130
|
+
md5sum = self._compute_md5(src)
|
|
131
|
+
if md5sum:
|
|
132
|
+
entry["md5"] = md5sum
|
|
133
|
+
manifest[src] = entry # type: ignore[dict-item]
|
|
134
|
+
updated += 1
|
|
135
|
+
except Exception:
|
|
136
|
+
continue
|
|
137
|
+
if updated > 0:
|
|
138
|
+
self._save_manifest(manifest)
|
|
139
|
+
PrettyOutput.print(
|
|
140
|
+
f"已更新索引清单,记录 {updated} 个源文件状态。", OutputType.INFO
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
def _detect_changed_or_deleted(self) -> Dict[str, List[str]]:
|
|
144
|
+
"""检测已记录的源文件是否发生变化或被删除。"""
|
|
145
|
+
manifest = self._load_manifest()
|
|
146
|
+
changed: List[str] = []
|
|
147
|
+
deleted: List[str] = []
|
|
148
|
+
for src, info in manifest.items():
|
|
149
|
+
try:
|
|
150
|
+
if not os.path.exists(src):
|
|
151
|
+
deleted.append(src)
|
|
152
|
+
continue
|
|
153
|
+
st = os.stat(src)
|
|
154
|
+
size_changed = int(info.get("size", -1)) != int(st.st_size)
|
|
155
|
+
if size_changed:
|
|
156
|
+
changed.append(src)
|
|
157
|
+
continue
|
|
158
|
+
md5_old = info.get("md5")
|
|
159
|
+
if md5_old:
|
|
160
|
+
# 仅在mtime变化时计算md5以降低开销
|
|
161
|
+
mtime_changed = (
|
|
162
|
+
abs(float(info.get("mtime", 0.0)) - float(st.st_mtime)) >= 1e-6
|
|
163
|
+
)
|
|
164
|
+
if mtime_changed:
|
|
165
|
+
md5_new = self._compute_md5(src)
|
|
166
|
+
if not md5_new or md5_new != md5_old:
|
|
167
|
+
changed.append(src)
|
|
168
|
+
else:
|
|
169
|
+
# 没有记录md5,回退使用mtime判断
|
|
170
|
+
mtime_changed = (
|
|
171
|
+
abs(float(info.get("mtime", 0.0)) - float(st.st_mtime)) >= 1e-6
|
|
172
|
+
)
|
|
173
|
+
if mtime_changed:
|
|
174
|
+
changed.append(src)
|
|
175
|
+
except Exception:
|
|
176
|
+
# 无法读取文件状态,视为发生变化
|
|
177
|
+
changed.append(src)
|
|
178
|
+
return {"changed": changed, "deleted": deleted}
|
|
179
|
+
|
|
180
|
+
def _warn_if_sources_changed(self) -> None:
    """Print one combined warning when indexed sources changed or vanished.

    At most five paths are listed per category, followed by a count of the
    remaining ones and a hint on how to re-index. Nothing is printed when no
    change was detected.
    """
    report = self._detect_changed_or_deleted()
    modified, removed = report["changed"], report["deleted"]
    if not (modified or removed):
        return

    preview_limit = 5
    # Collect everything first so the warning is emitted in a single call.
    message: List[str] = []
    if modified:
        message.append(
            f"检测到 {len(modified)} 个已索引文件发生变化,建议重新索引以保证检索准确性。"
        )
        for path in modified[:preview_limit]:
            message.append(f" 变更: {path}")
        hidden = len(modified) - preview_limit
        if hidden > 0:
            message.append(f" ... 以及另外 {hidden} 个文件")
    if removed:
        message.append(
            f"检测到 {len(removed)} 个已索引文件已被删除,建议清理并重新索引。"
        )
        for path in removed[:preview_limit]:
            message.append(f" 删除: {path}")
        hidden = len(removed) - preview_limit
        if hidden > 0:
            message.append(f" ... 以及另外 {hidden} 个文件")
    message.append(
        "提示:请使用 'jarvis-rag add <路径>' 重新索引相关文件,以更新向量库与BM25索引。"
    )
    PrettyOutput.print("\n".join(message), OutputType.WARNING)
|
|
207
|
+
|
|
208
|
+
def detect_index_changes(self) -> Dict[str, List[str]]:
    """Public entry point for detecting changed and deleted indexed sources.

    Returns:
        The result of ``_detect_changed_or_deleted``, i.e. a dict of the
        form ``{'changed': List[str], 'deleted': List[str]}``.
    """
    changes = self._detect_changed_or_deleted()
    return changes
|
|
215
|
+
|
|
216
|
+
def _remove_sources_from_manifest(self, sources: List[str]) -> None:
    """Drop the given source paths from the manifest and persist the result.

    The manifest is only rewritten (and an info message printed) when at
    least one of the paths was actually recorded in it.
    """
    if not sources:
        return
    manifest = self._load_manifest()
    recorded = [path for path in set(sources) if path in manifest]
    for path in recorded:
        del manifest[path]
    if recorded:
        self._save_manifest(manifest)
        PrettyOutput.print(
            f"已从索引清单中移除 {len(recorded)} 个已删除的源文件记录。", OutputType.INFO
        )
|
|
231
|
+
|
|
232
|
+
def update_index_for_changes(self, changed: List[str], deleted: List[str]) -> None:
    """Bring the index in line with changed and deleted source files.

    - deleted: entries whose metadata ``source`` equals the path are removed
      from the vector collection.
    - changed: the old entries are removed, then the file is re-read as a
      single document and re-added through ``add_documents`` (which splits
      and embeds it).
    - finally the BM25 corpus is rebuilt from the collection and the manifest
      is updated.

    Errors in individual steps are reported through ``PrettyOutput`` and do
    not abort the remaining steps.

    Args:
        changed: Paths of sources whose content changed. Non-string items
            and duplicates are ignored; ``None`` is treated as empty.
        deleted: Paths of sources that no longer exist. Same filtering.
    """
    changed = list(
        dict.fromkeys(p for p in (changed or []) if isinstance(p, str))
    )
    deleted = list(
        dict.fromkeys(p for p in (deleted or []) if isinstance(p, str))
    )

    if not changed and not deleted:
        return

    # Handle deletions first.
    delete_errors: List[str] = []
    for src in deleted:
        try:
            self.collection.delete(where={"source": src})  # type: ignore[arg-type]
        except Exception as e:
            delete_errors.append(f"删除源 '{src}' 时出错: {e}")
    if delete_errors:
        PrettyOutput.print("\n".join(delete_errors), OutputType.WARNING)

    # Then rebuild the changed sources.
    docs_to_add: List[Document] = []
    rebuild_errors: List[str] = []
    for src in changed:
        try:
            self.collection.delete(where={"source": src})  # type: ignore[arg-type]
        except Exception as e:
            # Report instead of swallowing: if the old entries could not be
            # removed, re-adding below may leave duplicate chunks.
            rebuild_errors.append(f"删除源 '{src}' 的旧条目时出错: {e}")
        try:
            # Load the file as one document; add_documents does the
            # splitting and embedding.
            with open(src, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
            docs_to_add.append(
                Document(page_content=content, metadata={"source": src})
            )
        except Exception as e:
            rebuild_errors.append(f"重建源 '{src}' 内容时出错: {e}")
    if rebuild_errors:
        PrettyOutput.print("\n".join(rebuild_errors), OutputType.WARNING)

    if docs_to_add:
        try:
            self.add_documents(docs_to_add)
        except Exception as e:
            PrettyOutput.print(f"添加变更文档到索引时出错: {e}", OutputType.ERROR)

    # Rebuild the BM25 corpus from the collection so removed chunks are gone.
    try:
        all_docs_in_collection = self.collection.get()
        all_documents = all_docs_in_collection.get("documents") or []
        self.bm25_corpus = [str(text).split() for text in all_documents if text]
        if self.bm25_corpus:
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            self._save_bm25_index()
        else:
            self.bm25_index = None
            # _save_bm25_index writes nothing without an index, so an old
            # index file would survive and bring the removed corpus back on
            # the next load. Remove it explicitly.
            if os.path.exists(self.bm25_index_path):
                os.remove(self.bm25_index_path)
    except Exception as e:
        PrettyOutput.print(f"重建BM25索引失败: {e}", OutputType.WARNING)

    # Manifest: refresh state of changed files, drop deleted ones.
    try:
        if changed:
            self._update_manifest_with_sources(changed)
        if deleted:
            self._remove_sources_from_manifest(deleted)
    except Exception as e:
        PrettyOutput.print(f"更新索引清单时出错: {e}", OutputType.WARNING)

    PrettyOutput.print(
        f"索引已更新:变更 {len(changed)} 个,删除 {len(deleted)} 个。",
        OutputType.SUCCESS,
    )
|
|
71
310
|
|
|
72
311
|
def add_documents(
|
|
73
312
|
self, documents: List[Document], chunk_size=1000, chunk_overlap=100
|
|
@@ -80,7 +319,10 @@ class ChromaRetriever:
|
|
|
80
319
|
)
|
|
81
320
|
chunks = text_splitter.split_documents(documents)
|
|
82
321
|
|
|
83
|
-
print(
|
|
322
|
+
PrettyOutput.print(
|
|
323
|
+
f"已将 {len(documents)} 个文档拆分为 {len(chunks)} 个块。",
|
|
324
|
+
OutputType.INFO,
|
|
325
|
+
)
|
|
84
326
|
|
|
85
327
|
if not chunks:
|
|
86
328
|
return
|
|
@@ -99,19 +341,33 @@ class ChromaRetriever:
|
|
|
99
341
|
documents=chunk_texts,
|
|
100
342
|
metadatas=cast(Any, metadatas),
|
|
101
343
|
)
|
|
102
|
-
print(
|
|
344
|
+
PrettyOutput.print(
|
|
345
|
+
f"成功将 {len(chunks)} 个块添加到 ChromaDB 集合中。",
|
|
346
|
+
OutputType.SUCCESS,
|
|
347
|
+
)
|
|
103
348
|
|
|
104
349
|
# 更新并保存BM25索引
|
|
105
350
|
tokenized_chunks = [doc.split() for doc in chunk_texts]
|
|
106
351
|
self.bm25_corpus.extend(tokenized_chunks)
|
|
107
352
|
self.bm25_index = BM25Okapi(self.bm25_corpus)
|
|
108
353
|
self._save_bm25_index()
|
|
109
|
-
|
|
110
|
-
|
|
354
|
+
# 更新索引清单(用于检测源文件变更/删除)
|
|
355
|
+
source_list = [
|
|
356
|
+
md.get("source")
|
|
357
|
+
for md in metadatas
|
|
358
|
+
if md and isinstance(md.get("source"), str)
|
|
359
|
+
]
|
|
360
|
+
self._update_manifest_with_sources(cast(List[str], source_list))
|
|
361
|
+
|
|
362
|
+
def retrieve(
|
|
363
|
+
self, query: str, n_results: int = 5, use_bm25: bool = True
|
|
364
|
+
) -> List[Document]:
|
|
111
365
|
"""
|
|
112
366
|
使用向量搜索和BM25执行混合检索,然后使用倒数排序融合(RRF)
|
|
113
367
|
对结果进行融合。
|
|
114
368
|
"""
|
|
369
|
+
# 在检索前检查源文件变更/删除并提醒
|
|
370
|
+
self._warn_if_sources_changed()
|
|
115
371
|
# 1. 向量搜索 (ChromaDB)
|
|
116
372
|
query_embedding = self.embedding_manager.embed_query(query)
|
|
117
373
|
vector_results = self.collection.query(
|
|
@@ -121,7 +377,7 @@ class ChromaRetriever:
|
|
|
121
377
|
|
|
122
378
|
# 2. 关键字搜索 (BM25)
|
|
123
379
|
bm25_docs = []
|
|
124
|
-
if self.bm25_index:
|
|
380
|
+
if self.bm25_index and use_bm25:
|
|
125
381
|
tokenized_query = query.split()
|
|
126
382
|
doc_scores = self.bm25_index.get_scores(tokenized_query)
|
|
127
383
|
|
|
@@ -144,7 +400,7 @@ class ChromaRetriever:
|
|
|
144
400
|
]
|
|
145
401
|
|
|
146
402
|
# 按分数排序并取最高结果
|
|
147
|
-
bm25_results_with_docs.sort(key=lambda x: x[2], reverse=True)
|
|
403
|
+
bm25_results_with_docs.sort(key=lambda x: x[2], reverse=True) # type: ignore
|
|
148
404
|
|
|
149
405
|
for doc_text, metadata, _ in bm25_results_with_docs[: n_results * 2]:
|
|
150
406
|
bm25_docs.append(Document(page_content=doc_text, metadata=metadata))
|