ultra-memory 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAWHUB.md +190 -0
- package/LICENSE +21 -0
- package/README.md +195 -0
- package/SKILL.md +383 -0
- package/package.json +107 -0
- package/platform/SYSTEM_PROMPT.md +184 -0
- package/platform/__pycache__/server.cpython-313.pyc +0 -0
- package/platform/openapi.yaml +305 -0
- package/platform/server.py +454 -0
- package/platform/tools_gemini.json +176 -0
- package/platform/tools_openai.json +207 -0
- package/scripts/__pycache__/cleanup.cpython-313.pyc +0 -0
- package/scripts/__pycache__/export.cpython-313.pyc +0 -0
- package/scripts/__pycache__/extract_entities.cpython-313.pyc +0 -0
- package/scripts/__pycache__/init.cpython-313.pyc +0 -0
- package/scripts/__pycache__/log_op.cpython-313.pyc +0 -0
- package/scripts/__pycache__/recall.cpython-313.pyc +0 -0
- package/scripts/__pycache__/restore.cpython-313.pyc +0 -0
- package/scripts/__pycache__/summarize.cpython-313.pyc +0 -0
- package/scripts/cleanup.py +156 -0
- package/scripts/export.py +158 -0
- package/scripts/extract_entities.py +289 -0
- package/scripts/init.py +243 -0
- package/scripts/log_op.py +328 -0
- package/scripts/mcp-server.js +341 -0
- package/scripts/recall.py +683 -0
- package/scripts/restore.py +267 -0
- package/scripts/summarize.py +389 -0
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
ultra-memory: memory recall script.

Searches the layered memory stores for content relevant to a query.
Enhancements: synonym/alias expansion + time-decay weighting + a context
window (one op before and one after each hit).
"""

import os
import sys
import json
import argparse
import re
from datetime import datetime, timezone
from pathlib import Path

# Force UTF-8 on stdout/stderr so mixed Chinese/English output prints
# correctly (e.g. Windows consoles default to a legacy codec).
if sys.stdout.encoding != "utf-8":
    sys.stdout.reconfigure(encoding="utf-8")
if sys.stderr.encoding != "utf-8":
    sys.stderr.reconfigure(encoding="utf-8")

# Root of the on-disk memory store; overridable via the ULTRA_MEMORY_HOME env var.
ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))

# Synonym/alias table: Chinese descriptions ↔ English function names / technical terms.
# At query time each token is expanded into its synonym set, improving
# cross-language retrieval precision.
SYNONYM_MAP = {
    # data processing
    "数据清洗": ["clean", "clean_df", "preprocess", "cleaner", "清洗", "data_clean"],
    "clean_df": ["数据清洗", "清洗", "preprocess", "数据处理", "clean"],
    "preprocess": ["预处理", "数据清洗", "clean_df", "数据处理"],
    "数据处理": ["clean_df", "preprocess", "transform", "处理数据"],
    # testing
    "测试": ["test", "unittest", "pytest", "spec", "assert"],
    "test": ["测试", "单元测试", "pytest", "unittest"],
    "单元测试": ["test", "unittest", "pytest"],
    # installation / dependencies
    "安装": ["install", "pip install", "npm install", "setup", "依赖"],
    "install": ["安装", "依赖", "setup"],
    "依赖": ["install", "dependency", "requirements", "安装"],
    # deployment
    "部署": ["deploy", "docker", "release", "发布"],
    "deploy": ["部署", "发布", "release"],
    # errors
    "报错": ["error", "exception", "traceback", "failed", "错误"],
    "error": ["报错", "错误", "exception", "traceback"],
    "错误": ["error", "exception", "报错", "traceback"],
    # configuration
    "配置": ["config", "settings", "setup", ".env"],
    "config": ["配置", "设置", "settings"],
    # APIs
    "接口": ["api", "endpoint", "route", "url"],
    "api": ["接口", "endpoint", "请求", "route"],
    # functions / methods
    "函数": ["def", "function", "method", "func"],
    "function": ["函数", "方法", "def"],
    # completion
    "完成": ["done", "finished", "milestone", "✅"],
    "done": ["完成", "finished", "milestone"],
}

# Time-decay half-life (seconds): newer operations get higher weight.
TIME_HALF_LIFE_SECONDS = 3600 * 24  # 24-hour half-life
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def expand_query(query: str) -> set[str]:
    """Tokenize *query* and expand each token with its SYNONYM_MAP group.

    Whenever a token matches a map key or any of its synonyms
    (case-insensitively), the key and the entire synonym list are added.
    """
    base_tokens = tokenize(query)
    result = set(base_tokens)
    for tok in base_tokens:
        for canonical, aliases in SYNONYM_MAP.items():
            lowered_aliases = {a.lower() for a in aliases}
            if tok == canonical.lower() or tok in lowered_aliases:
                result.add(canonical.lower())
                result |= lowered_aliases
    return result
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def tokenize(text: str) -> set[str]:
    """Lightweight mixed Chinese/English tokenizer (no external deps).

    English/numeric runs are matched case-insensitively; CJK characters are
    kept both as unigrams and as adjacent bigrams (bigrams improve short
    phrase matching).
    """
    ascii_tokens = re.findall(r'[a-zA-Z0-9_\-\.]+', text.lower())
    cjk_chars = re.findall(r'[\u4e00-\u9fff]', text)
    cjk_bigrams = [a + b for a, b in zip(cjk_chars, cjk_chars[1:])]
    return set(ascii_tokens) | set(cjk_bigrams) | set(cjk_chars)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def time_weight(ts_str: str) -> float:
    """
    Exponential time-decay weight for an ISO timestamp string.

    Fresh operations score close to 1.0; an operation one half-life (24h)
    old scores about 0.5. The weight is floored at 0.1 so old memories never
    vanish entirely, and any parse failure falls back to a neutral 0.5.
    """
    try:
        import math
        parsed = datetime.fromisoformat(ts_str.rstrip("Z")).replace(tzinfo=timezone.utc)
        age_s = (datetime.now(timezone.utc) - parsed).total_seconds()
        # weight = 0.5 ** (age / half_life)
        decayed = math.pow(0.5, age_s / TIME_HALF_LIFE_SECONDS)
        return max(0.1, decayed)
    except Exception:
        return 0.5
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def score_relevance(query_tokens: set, text: str, ts_str: str = "") -> float:
    """
    Keyword-overlap relevance blended with a time-decay weight.

    Score = (|query ∩ text tokens| / |query tokens|) * (0.7 + 0.3 * tw),
    i.e. recency contributes at most 30% of the final score. Returns 0.0
    when either token set is empty; no timestamp means full time weight.
    """
    candidate_tokens = tokenize(text)
    if not (query_tokens and candidate_tokens):
        return 0.0
    coverage = len(query_tokens & candidate_tokens) / max(len(query_tokens), 1)
    recency = time_weight(ts_str) if ts_str else 1.0
    return coverage * (0.7 + 0.3 * recency)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def load_all_ops(session_dir: Path) -> list[dict]:
    """Read every operation record from <session_dir>/ops.jsonl.

    Includes already-compacted ops (needed for context windows). Blank lines
    and unparseable JSON lines are skipped silently; a missing file yields [].
    """
    ops_path = session_dir / "ops.jsonl"
    if not ops_path.exists():
        return []
    records: list[dict] = []
    with open(ops_path, encoding="utf-8") as fh:
        for raw in fh:
            stripped = raw.strip()
            if stripped:
                try:
                    records.append(json.loads(stripped))
                except json.JSONDecodeError:
                    pass
    return records
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_context_window(all_ops: list[dict], target_seq: int, window: int = 1) -> dict:
    """Collect up to *window* ops on each side of *target_seq* (by seq number).

    Returns {"before": [...], "after": [...]} with both lists in ascending
    seq order; missing neighbour seqs are simply omitted.
    """
    by_seq = {entry["seq"]: entry for entry in all_ops}
    preceding = [by_seq[s] for s in range(target_seq - window, target_seq) if s in by_seq]
    following = [by_seq[s] for s in range(target_seq + 1, target_seq + window + 1) if s in by_seq]
    return {"before": preceding, "after": following}
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def search_ops(session_dir: Path, query_tokens: set, top_k: int) -> list[dict]:
    """Search the op log; hits carry time-decayed scores plus a ±1 context window.

    Results are sorted by score descending, ties broken by newest seq first.
    """
    everything = load_all_ops(session_dir)
    if not everything:
        return []

    hits = []
    for entry in everything:
        haystack = entry.get("summary", "") + " " + json.dumps(entry.get("detail", {}), ensure_ascii=False)
        relevance = score_relevance(query_tokens, haystack, entry.get("ts", ""))
        if relevance > 0:
            hits.append({
                "score": relevance,
                "source": "ops",
                "data": entry,
                "context": get_context_window(everything, entry["seq"], window=1),
            })

    hits.sort(key=lambda h: (-h["score"], -h["data"]["seq"]))
    return hits[:top_k]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def search_summary(session_dir: Path, query_tokens: set) -> list[dict]:
    """Search <session_dir>/summary.md line by line.

    Markdown headings (lines whose raw text starts with '#') and blank lines
    are skipped. Returns at most 3 hits with score > 0.1, best first.
    """
    summary_path = session_dir / "summary.md"
    if not summary_path.exists():
        return []
    with open(summary_path, encoding="utf-8") as fh:
        body = fh.read()

    hits = []
    for raw_line in body.split("\n"):
        paragraph = raw_line.strip()
        # Note: the heading check intentionally looks at the raw line.
        if not paragraph or raw_line.startswith("#"):
            continue
        relevance = score_relevance(query_tokens, paragraph)
        if relevance > 0.1:
            hits.append({"score": relevance, "source": "summary", "text": paragraph})

    hits.sort(key=lambda h: -h["score"])
    return hits[:3]
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def search_entities(query_tokens: set, top_k: int) -> list[dict]:
    """
    Layer 4: entity-index search (structured, precise retrieval).

    Good for answering questions such as:
    - "which functions did we use?"  → entity_type=function
    - "which files were touched?"    → entity_type=file
    - "which dependencies installed?"→ entity_type=dependency
    - "which decisions were made?"   → entity_type=decision
    Compared with bigram keyword matching, precision on structured queries
    improves noticeably.
    """
    entities_file = ULTRA_MEMORY_HOME / "semantic" / "entities.jsonl"
    if not entities_file.exists():
        return []

    # Entity-type aliases: maps query tokens to an entity type.
    TYPE_ALIASES = {
        "函数": "function", "function": "function", "方法": "function", "func": "function",
        "文件": "file", "file": "file", "路径": "file",
        "依赖": "dependency", "dependency": "dependency", "包": "dependency",
        "决策": "decision", "decision": "decision", "选择": "decision",
        "错误": "error", "error": "error", "报错": "error", "异常": "error",
        "类": "class", "class": "class",
    }

    # Detect whether the query mentions an entity type (enables exact type filtering).
    # Note: query_tokens is a set, so when several type words are present the
    # one picked depends on set iteration order.
    target_type = None
    for token in query_tokens:
        if token in TYPE_ALIASES:
            target_type = TYPE_ALIASES[token]
            break

    results = []
    seen_names: set[str] = set()  # dedup: keep only the newest record per (type, name)

    all_entities = []
    with open(entities_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                all_entities.append(json.loads(line))
            except json.JSONDecodeError:
                continue

    # Sort by ts descending (newest first) so dedup keeps the latest record.
    all_entities.sort(key=lambda e: e.get("ts", ""), reverse=True)

    for ent in all_entities:
        # Type filter.
        if target_type and ent.get("entity_type") != target_type:
            continue

        name = ent.get("name", "")
        context = ent.get("context", "")
        ent_text = name + " " + context

        score = score_relevance(query_tokens, ent_text, ent.get("ts", ""))

        # Exact match on the entity name earns a bonus floor.
        name_tokens = tokenize(name)
        exact_match = bool(query_tokens & name_tokens)
        if exact_match:
            score = max(score, 0.5)  # floor of 0.5

        # For a pure type query ("all functions", "all files"), boost so the
        # whole type is returned.
        # NOTE(review): this boost only applies while seen_names is empty,
        # i.e. until the first result is recorded — subsequent same-type
        # entities do not get it, which seems to contradict the comment above.
        # TODO confirm intended behavior.
        if target_type and not seen_names:
            score = max(score, 0.3)

        if score > 0.1:
            dedup_key = f"{ent.get('entity_type')}:{name}"
            if dedup_key not in seen_names:
                seen_names.add(dedup_key)
                results.append({
                    "score": score,
                    "source": "entity",
                    "data": ent,
                })

    results.sort(key=lambda x: -x["score"])
    return results[:top_k]
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
    """
    Layer 3: cross-session semantic search (lightweight mode — keyword
    overlap over the knowledge base plus the session index; synonym
    expansion happens in the caller).
    """
    semantic_root = ULTRA_MEMORY_HOME / "semantic"
    kb_path = semantic_root / "knowledge_base.jsonl"
    index_path = semantic_root / "session_index.json"

    matches: list[dict] = []

    if kb_path.exists():
        with open(kb_path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    continue
                blob = record.get("content", "") + " " + record.get("title", "")
                relevance = score_relevance(query_tokens, blob, record.get("ts", ""))
                if relevance > 0.1:
                    matches.append({"score": relevance, "source": "knowledge_base", "data": record})

    if index_path.exists():
        with open(index_path, encoding="utf-8") as fh:
            session_index = json.load(fh)
        for sess in session_index.get("sessions", []):
            blob = sess.get("project", "") + " " + (sess.get("last_milestone") or "")
            relevance = score_relevance(query_tokens, blob, sess.get("started_at", ""))
            if relevance > 0.1:
                matches.append({"score": relevance, "source": "history", "data": sess})

    matches.sort(key=lambda m: -m["score"])
    return matches[:top_k]
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ── TF-IDF 向量语义搜索层(第四层召回的增强)───────────────────────────
|
|
313
|
+
|
|
314
|
+
def is_sklearn_available() -> bool:
    """Report whether scikit-learn can be imported in this environment."""
    try:
        import sklearn  # noqa: F401
    except ImportError:
        return False
    return True
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def is_sentencetransformers_available() -> bool:
    """Report whether the optional sentence-transformers package is importable."""
    try:
        from sentence_transformers import SentenceTransformer  # noqa: F401
    except ImportError:
        return False
    return True
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
# NOTE(review): in-process TF-IDF cache keyed by session; appears unused in
# this file (the functions below use the on-disk tfidf_cache.json instead) —
# TODO confirm before removing.
_TFidfCache: dict[str, dict] = {}  # session_id → {vocab, idfs, doc_vectors, doc_texts}
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _get_tfidf_cache_path(session_dir: Path) -> Path:
|
|
332
|
+
return session_dir / "tfidf_cache.json"
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _text_from_op(op: dict) -> str:
|
|
336
|
+
"""提取 op 中可索引的文本"""
|
|
337
|
+
parts = [
|
|
338
|
+
op.get("summary", ""),
|
|
339
|
+
op.get("type", ""),
|
|
340
|
+
" ".join(op.get("tags", [])),
|
|
341
|
+
]
|
|
342
|
+
detail = op.get("detail", {})
|
|
343
|
+
if isinstance(detail, dict):
|
|
344
|
+
for v in detail.values():
|
|
345
|
+
if isinstance(v, str):
|
|
346
|
+
parts.append(v)
|
|
347
|
+
return " ".join(parts)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _build_tfidf_index(ops: list[dict]) -> dict:
    """
    Build an in-memory TF-IDF index over *ops* using only the stdlib (no
    sklearn is actually used here, despite this path sitting behind the
    sklearn availability gate). Zero external API dependencies.

    Returns {"vocab", "idfs", "doc_vectors", "doc_texts", "n_docs"} where
    `doc_vectors` are L2-normalised TF×IDF vectors aligned with `ops`.
    """
    import math
    from collections import Counter

    corpus = [_text_from_op(entry) for entry in ops]

    # Simple tokenizer: English word runs, plus every raw character
    # (covers Chinese char-by-char; incidentally also keeps ASCII chars).
    def _split(text: str) -> list[str]:
        import re
        return re.findall(r'[a-zA-Z0-9_]+', text.lower()) + list(text)

    token_lists = [_split(doc) for doc in corpus]

    # Vocabulary over the whole corpus.
    vocabulary = sorted({tok for toks in token_lists for tok in toks})
    index_of = {word: i for i, word in enumerate(vocabulary)}
    dims = len(vocabulary)

    # Document frequencies.
    total_docs = len(corpus)
    doc_freq = Counter()
    for toks in token_lists:
        doc_freq.update(set(toks))

    # Smoothed IDF: log((N + 1) / (df + 1)) + 1
    idf_values = [math.log((total_docs + 1) / (doc_freq[w] + 1)) + 1 for w in vocabulary]

    # Document vectors = TF × IDF, then L2-normalised.
    vectors = []
    for toks in token_lists:
        counts = Counter(toks)
        vec = [0.0] * dims
        for word, freq in counts.items():
            pos = index_of[word]
            vec[pos] = freq * idf_values[pos]
        magnitude = math.sqrt(sum(component ** 2 for component in vec))
        if magnitude > 0:
            vec = [component / magnitude for component in vec]
        vectors.append(vec)

    return {
        "vocab": vocabulary,
        "idfs": idf_values,
        "doc_vectors": vectors,
        "doc_texts": corpus,
        "n_docs": total_docs,
    }
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
413
|
+
import math
|
|
414
|
+
dot = sum(x * y for x, y in zip(a, b))
|
|
415
|
+
na = math.sqrt(sum(x * x for x in a))
|
|
416
|
+
nb = math.sqrt(sum(x * x for x in b))
|
|
417
|
+
if na == 0 or nb == 0:
|
|
418
|
+
return 0.0
|
|
419
|
+
return dot / (na * nb)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _search_tfidf(session_dir: Path, all_ops: list[dict],
                  query: str, top_k: int) -> list[dict]:
    """
    Pure-Python TF-IDF semantic search over the op log (fallback when
    sentence-transformers is unavailable).

    The per-session index is cached in tfidf_cache.json and rebuilt whenever
    the highest op seq changes. Cache I/O failures never abort the search.

    Returns up to `top_k` results shaped {"score", "source": "tfidf", "data"}.
    """
    import math
    import re
    from collections import Counter  # fix: was used below without being imported (NameError)

    cache_path = _get_tfidf_cache_path(session_dir)

    # Load the cached index if present; any failure just forces a rebuild.
    # fix: previously `doc_vectors` was left unbound when no cache file
    # existed, raising UnboundLocalError at the staleness check below.
    cache = None
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                loaded = json.load(f)
            # Sanity-check the shape before trusting it.
            if all(k in loaded for k in ("vocab", "idfs", "doc_vectors")):
                cache = loaded
        except Exception:
            cache = None

    # Rebuild when missing or stale (staleness = the highest seq changed).
    current_seq = max((op.get("seq", 0) for op in all_ops), default=0)
    if cache is None or cache.get("last_seq", -1) != current_seq:
        cache = _build_tfidf_index(all_ops)
        cache["last_seq"] = current_seq
        try:
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(cache, f)
        except Exception:
            pass  # cache write failure must not break the search

    vocab = cache["vocab"]
    idfs = cache["idfs"]
    doc_vectors = cache["doc_vectors"]

    # Vectorize the query with the same tokenizer used at index time:
    # English word runs plus every raw character.
    def tokens(text: str) -> list[str]:
        en = re.findall(r'[a-zA-Z0-9_]+', text.lower())
        zh = list(text)
        return en + zh

    word2idx = {w: i for i, w in enumerate(vocab)}
    vec_q = [0.0] * len(vocab)
    for w, freq in Counter(tokens(query)).items():
        if w in word2idx:
            idx = word2idx[w]
            vec_q[idx] = freq * idfs[idx]

    # L2-normalise the query vector.
    norm = math.sqrt(sum(v * v for v in vec_q))
    if norm > 0:
        vec_q = [v / norm for v in vec_q]

    # Cosine similarity against every document vector; threshold filters noise.
    scored = []
    for i, dv in enumerate(doc_vectors):
        score = _cosine_similarity(vec_q, dv)
        if score > 0.05:
            scored.append((score, i))
    scored.sort(key=lambda x: -x[0])

    # doc_vectors are aligned with all_ops, so the index maps straight back.
    return [{"score": score, "source": "tfidf", "data": all_ops[i]}
            for score, i in scored[:top_k]]
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _search_sentencetransformers(
    session_dir: Path, all_ops: list[dict],
    query: str, top_k: int
) -> list[dict]:
    """
    Vector semantic search via sentence-transformers (higher quality;
    requires `pip install sentence-transformers`).

    Uses all-MiniLM-L6-v2 (~22MB, runs locally, no API needed). Embeddings
    are cached per session in embed_cache.json and rebuilt when the highest
    op seq changes. Returns [] when the package is not installed.
    """
    import json as _json

    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        return []

    cache_path = session_dir / "embed_cache.json"

    # Load the embedding cache, invalidating it if the op log has advanced.
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cache = _json.load(f)
            cached_seq = cache.get("last_seq", -1)
            current_seq = max((op.get("seq", 0) for op in all_ops), default=0)
            if cached_seq != current_seq:
                cache = None
        except Exception:
            cache = None
    else:
        cache = None

    texts = [_text_from_op(op) for op in all_ops]

    # Rebuild embeddings when the cache is missing or stale.
    if cache is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(texts, show_progress_bar=False).tolist()
        current_seq = max((op.get("seq", 0) for op in all_ops), default=0)
        cache = {"embeddings": embeddings, "last_seq": current_seq}
        try:
            with open(cache_path, "w", encoding="utf-8") as f:
                _json.dump(cache, f)
        except Exception:
            pass

    # Embed the query.
    # NOTE(review): the model is instantiated here even when it was already
    # constructed above for the rebuild — could be reused; TODO confirm.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_emb = model.encode([query], show_progress_bar=False)[0].tolist()

    embeddings = cache["embeddings"]
    import math
    # Cosine similarity between the query and every cached embedding;
    # hits below 0.3 are treated as noise.
    scored = []
    for i, emb in enumerate(embeddings):
        dot = sum(a * b for a, b in zip(query_emb, emb))
        na = math.sqrt(sum(a * a for a in query_emb))
        nb = math.sqrt(sum(a * a for a in emb))
        score = dot / (na * nb) if na > 0 and nb > 0 else 0
        if score > 0.3:
            scored.append((score, i))
    scored.sort(key=lambda x: -x[0])

    # Embeddings are aligned with all_ops, so indices map straight back.
    results = []
    for score, i in scored[:top_k]:
        results.append({"score": score, "source": "embedding", "data": all_ops[i]})
    return results
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def search_tfidf(session_dir: Path, all_ops: list[dict],
                 query: str, top_k: int) -> list[dict]:
    """
    Semantic-search entry point: prefer sentence-transformers embeddings,
    fall back to the built-in TF-IDF scorer, and return [] when neither
    backend is available so the main recall flow is never blocked.
    """
    if is_sentencetransformers_available():
        return _search_sentencetransformers(session_dir, all_ops, query, top_k)
    if is_sklearn_available():
        return _search_tfidf(session_dir, all_ops, query, top_k)
    return []
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# ── 结果格式化 ──────────────────────────────────────────────────────────
|
|
578
|
+
|
|
579
|
+
def format_result(result: dict, show_context: bool = True) -> str:
    """
    Render one search hit as display text, dispatching on its "source"
    ("ops", "summary", "knowledge_base", "history", "entity", "tfidf",
    "embedding"). Unknown sources fall back to str(result).
    """
    source = result["source"]
    lines = []

    if source == "ops":
        op = result["data"]
        # "YYYY-MM-DDTHH:MM" → "YYYY-MM-DD HH:MM"
        ts = op["ts"][:16].replace("T", " ")
        lines.append(f"[ops #{op['seq']} · {ts}] {op['summary']}")
        # Show the surrounding context window (truncated summaries).
        if show_context and result.get("context"):
            ctx = result["context"]
            for before_op in ctx.get("before", []):
                lines.append(f"  ↑ [#{before_op['seq']}] {before_op['summary'][:60]}")
            for after_op in ctx.get("after", []):
                lines.append(f"  ↓ [#{after_op['seq']}] {after_op['summary'][:60]}")
    elif source == "summary":
        lines.append(f"[摘要] {result['text']}")
    elif source == "knowledge_base":
        d = result["data"]
        lines.append(f"[知识库 · {d.get('title', '?')}] {d.get('content', '')[:100]}")
    elif source == "history":
        d = result["data"]
        ts = d.get("started_at", "")[:10]
        lines.append(f"[历史会话 · {ts} · {d.get('project', '')}] {d.get('last_milestone', '无里程碑记录')}")
    elif source == "entity":
        d = result["data"]
        et = d.get("entity_type", "?")
        name = d.get("name", "?")
        ctx = d.get("context", "")
        ts = d.get("ts", "")[:16].replace("T", " ")
        # Type-specific suffix: package manager, decision rationale, or error message.
        extra = ""
        if et == "dependency":
            extra = f" [via {d.get('manager', '?')}]"
        elif et == "decision":
            rationale = d.get("rationale", "")
            extra = f" 依据: {rationale}" if rationale else ""
        elif et == "error":
            extra = f" ← {d.get('message', '')}"
        lines.append(f"[实体/{et} · {ts}] {name}{extra}")
        if ctx:
            lines.append(f"  来源: {ctx}")

    elif source in ("tfidf", "embedding"):
        d = result["data"]
        ts = d.get("ts", "")[:16].replace("T", " ")
        label = "TF-IDF" if source == "tfidf" else "向量"
        lines.append(f"[语义/{label} #{d.get('seq', '?')} · {ts}] {d.get('summary', '?')[:80]}")
        # Show at most the first two detail entries, truncated.
        detail = d.get("detail", {})
        if isinstance(detail, dict):
            for k, v in list(detail.items())[:2]:
                lines.append(f"  [{k}] {str(v)[:60]}")

    return "\n".join(lines) if lines else str(result)
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def recall(session_id: str, query: str, top_k: int = 5):
    """
    Run the full layered recall pipeline for one session and print the
    best matches: op log (+context), summary, cross-session semantic layer,
    entity index, and TF-IDF / embedding vector search.
    """
    # Expand the query with synonyms before searching.
    expanded = expand_query(query)
    session_dir = ULTRA_MEMORY_HOME / "sessions" / session_id

    found: list[dict] = []
    # Layer 1: op log (time-weighted, with ±1 context window)
    found += search_ops(session_dir, expanded, top_k)
    # Layer 2: session summary
    found += search_summary(session_dir, expanded)
    # Layer 3: cross-session semantic layer
    found += search_semantic(expanded, top_k)
    # Layer 4: structured entity index
    found += search_entities(expanded, top_k)
    # Layer 5: vector search (TF-IDF or sentence-transformers); uses the raw query
    ops_for_vectors = load_all_ops(session_dir)
    if ops_for_vectors:
        found += search_tfidf(session_dir, ops_for_vectors, query, top_k)

    # Rank all layers together and keep the top_k best.
    found.sort(key=lambda r: -r["score"])
    best = found[:top_k]

    if not best:
        print(f"[RECALL] 未找到与「{query}」相关的记忆")
        return

    print(f"\n[RECALL] 找到 {len(best)} 条相关记录(查询: {query}):\n")
    for rank, item in enumerate(best, 1):
        print(f"{rank}. {format_result(item, show_context=True)}")
        print()
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
if __name__ == "__main__":
    # CLI entry point: recall.py --session <id> --query <keywords> [--top-k N]
    parser = argparse.ArgumentParser(description="检索记忆")
    parser.add_argument("--session", required=True, help="会话 ID")
    parser.add_argument("--query", required=True, help="检索关键词")
    parser.add_argument("--top-k", type=int, default=5)
    args = parser.parse_args()
    recall(args.session, args.query, args.top_k)
|