ultra-memory 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,683 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: 记忆检索脚本
4
+ 支持从三层记忆中检索相关内容
5
+ 优化:同义词/别名映射 + 时间衰减权重 + 上下文窗口(前后各1条)
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import argparse
12
+ import re
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+
16
# Force UTF-8 on stdio so Chinese output renders correctly on every platform.
if sys.stdout.encoding != "utf-8":
    sys.stdout.reconfigure(encoding="utf-8")
if sys.stderr.encoding != "utf-8":
    sys.stderr.reconfigure(encoding="utf-8")

# Root of the memory store; overridable via the ULTRA_MEMORY_HOME env var.
ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))

# Synonym/alias table: Chinese descriptions ↔ English identifiers / tech terms.
# At query time the query tokens are expanded with these synonyms to improve
# cross-language retrieval precision.
SYNONYM_MAP = {
    # data processing
    "数据清洗": ["clean", "clean_df", "preprocess", "cleaner", "清洗", "data_clean"],
    "clean_df": ["数据清洗", "清洗", "preprocess", "数据处理", "clean"],
    "preprocess": ["预处理", "数据清洗", "clean_df", "数据处理"],
    "数据处理": ["clean_df", "preprocess", "transform", "处理数据"],
    # testing
    "测试": ["test", "unittest", "pytest", "spec", "assert"],
    "test": ["测试", "单元测试", "pytest", "unittest"],
    "单元测试": ["test", "unittest", "pytest"],
    # install / dependencies
    "安装": ["install", "pip install", "npm install", "setup", "依赖"],
    "install": ["安装", "依赖", "setup"],
    "依赖": ["install", "dependency", "requirements", "安装"],
    # deployment
    "部署": ["deploy", "docker", "release", "发布"],
    "deploy": ["部署", "发布", "release"],
    # errors
    "报错": ["error", "exception", "traceback", "failed", "错误"],
    "error": ["报错", "错误", "exception", "traceback"],
    "错误": ["error", "exception", "报错", "traceback"],
    # configuration
    "配置": ["config", "settings", "setup", ".env"],
    "config": ["配置", "设置", "settings"],
    # APIs
    "接口": ["api", "endpoint", "route", "url"],
    "api": ["接口", "endpoint", "请求", "route"],
    # functions / methods
    "函数": ["def", "function", "method", "func"],
    "function": ["函数", "方法", "def"],
    # completion
    "完成": ["done", "finished", "milestone", "✅"],
    "done": ["完成", "finished", "milestone"],
}

# Half-life (seconds) for exponential time decay: newer operations score higher.
TIME_HALF_LIFE_SECONDS = 3600 * 24  # 24-hour half-life
62
+
63
+
64
def expand_query(query: str) -> set[str]:
    """Expand the query's tokens with every matching synonym group.

    A SYNONYM_MAP entry contributes its key and all of its synonyms
    (lower-cased) whenever any query token matches either side.
    """
    base = tokenize(query)
    result = set(base)
    for key, synonyms in SYNONYM_MAP.items():
        lowered = {s.lower() for s in synonyms}
        if any(tok == key.lower() or tok in lowered for tok in base):
            result.add(key.lower())
            result |= lowered
    return result
74
+
75
+
76
def tokenize(text: str) -> set[str]:
    """Lightweight bilingual tokenizer (no external dependencies).

    English/numeric runs are matched as whole words; Chinese characters are
    indexed both as unigrams and bigrams (bigrams improve phrase matching).
    """
    ascii_words = re.findall(r'[a-zA-Z0-9_\-\.]+', text.lower())
    han = re.findall(r'[\u4e00-\u9fff]', text)
    han_bigrams = [a + b for a, b in zip(han, han[1:])]
    return set(ascii_words) | set(han_bigrams) | set(han)
84
+
85
+
86
def time_weight(ts_str: str) -> float:
    """Exponential time-decay weight for an ISO-8601 timestamp.

    Recent operations approach 1.0; an operation TIME_HALF_LIFE_SECONDS old
    weighs ~0.5. Floored at 0.1 so old memories never vanish completely.
    Malformed timestamps fall back to a neutral 0.5.

    BUG FIX vs. the original: `rstrip("Z")` stripped *runs* of trailing Z's,
    and `.replace(tzinfo=utc)` clobbered any real UTC offset already present
    in the timestamp (e.g. "+08:00"), silently shifting the instant. Now a
    trailing "Z" is translated to "+00:00", aware timestamps are respected,
    and only naive ones are assumed to be UTC.
    """
    try:
        raw = ts_str[:-1] + "+00:00" if ts_str.endswith("Z") else ts_str
        ts = datetime.fromisoformat(raw)
        if ts.tzinfo is None:
            # Naive timestamps are assumed to be UTC (matches writer side).
            ts = ts.replace(tzinfo=timezone.utc)
        age_seconds = (datetime.now(timezone.utc) - ts).total_seconds()
        # Exponential decay: weight = 0.5^(age / half_life), floored at 0.1.
        return max(0.1, 0.5 ** (age_seconds / TIME_HALF_LIFE_SECONDS))
    except Exception:
        return 0.5
102
+
103
+
104
def score_relevance(query_tokens: set, text: str, ts_str: str = "") -> float:
    """Keyword-overlap relevance score blended with a time-decay weight.

    The (synonym-expanded) query tokens participate in the overlap; the time
    weight contributes 30% of the final score. Returns 0.0 when either side
    tokenizes to nothing.
    """
    doc_tokens = tokenize(text)
    if not (query_tokens and doc_tokens):
        return 0.0
    base = len(query_tokens & doc_tokens) / max(len(query_tokens), 1)
    decay = time_weight(ts_str) if ts_str else 1.0
    return base * (0.7 + 0.3 * decay)
116
+
117
+
118
def load_all_ops(session_dir: Path) -> list[dict]:
    """Load every operation record (including compacted ones) from ops.jsonl.

    Blank and unparseable lines are skipped; a missing file yields [].
    """
    ops_file = session_dir / "ops.jsonl"
    if not ops_file.exists():
        return []
    records: list[dict] = []
    with open(ops_file, encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                records.append(json.loads(raw))
            except json.JSONDecodeError:
                pass  # tolerate a corrupt line rather than losing the log
    return records
134
+
135
+
136
def get_context_window(all_ops: list[dict], target_seq: int, window: int = 1) -> dict:
    """Collect up to *window* operations on each side of target_seq.

    Missing seq numbers are simply skipped; both lists are in ascending
    seq order.
    """
    by_seq = {op["seq"]: op for op in all_ops}
    offsets = range(1, window + 1)
    before = [by_seq[target_seq - d] for d in reversed(offsets) if (target_seq - d) in by_seq]
    after = [by_seq[target_seq + d] for d in offsets if (target_seq + d) in by_seq]
    return {"before": before, "after": after}
149
+
150
+
151
def search_ops(session_dir: Path, query_tokens: set, top_k: int) -> list[dict]:
    """Search the operation log; hits carry a time weight and a ±1 context window."""
    all_ops = load_all_ops(session_dir)
    if not all_ops:
        return []

    hits = []
    for op in all_ops:
        blob = "{} {}".format(
            op.get("summary", ""),
            json.dumps(op.get("detail", {}), ensure_ascii=False),
        )
        relevance = score_relevance(query_tokens, blob, op.get("ts", ""))
        if relevance <= 0:
            continue
        hits.append({
            "score": relevance,
            "source": "ops",
            "data": op,
            "context": get_context_window(all_ops, op["seq"], window=1),
        })

    # Highest score first; ties broken by the most recent seq.
    hits.sort(key=lambda h: (-h["score"], -h["data"]["seq"]))
    return hits[:top_k]
172
+
173
+
174
def search_summary(session_dir: Path, query_tokens: set) -> list[dict]:
    """Search summary.md paragraph by paragraph; returns at most 3 hits.

    Markdown headings (lines starting with '#') and blank lines are ignored.
    """
    summary_file = session_dir / "summary.md"
    if not summary_file.exists():
        return []
    content = summary_file.read_text(encoding="utf-8")

    hits = []
    for line in content.split("\n"):
        para = line.strip()
        if not para or para.startswith("#"):
            continue
        relevance = score_relevance(query_tokens, para)
        if relevance > 0.1:
            hits.append({"score": relevance, "source": "summary", "text": para})

    hits.sort(key=lambda h: -h["score"])
    return hits[:3]
189
+
190
+
191
def search_entities(query_tokens: set, top_k: int) -> list[dict]:
    """
    Layer 4: entity-index search (structured, precise retrieval).

    Good at answering:
      - "which functions did we use?"     → entity_type=function
      - "which files were touched?"       → entity_type=file
      - "which dependencies installed?"   → entity_type=dependency
      - "which decisions were made?"      → entity_type=decision
    Compared to bigram keyword matching, precision on structured queries is
    significantly higher.

    BUG FIX vs. the original: the type-query floor was guarded by
    `and not seen_names`, which is only true before the first entity is
    recorded — so only ONE entity of the requested type got the 0.3 floor,
    contradicting the stated intent of returning all entities of that type.
    The floor now applies to every entity matching the target type.
    """
    entities_file = ULTRA_MEMORY_HOME / "semantic" / "entities.jsonl"
    if not entities_file.exists():
        return []

    # Query word → entity type, for exact type filtering.
    TYPE_ALIASES = {
        "函数": "function", "function": "function", "方法": "function", "func": "function",
        "文件": "file", "file": "file", "路径": "file",
        "依赖": "dependency", "dependency": "dependency", "包": "dependency",
        "决策": "decision", "decision": "decision", "选择": "decision",
        "错误": "error", "error": "error", "报错": "error", "异常": "error",
        "类": "class", "class": "class",
    }

    # Detect whether the query names an entity type (first match wins).
    target_type = None
    for token in query_tokens:
        if token in TYPE_ALIASES:
            target_type = TYPE_ALIASES[token]
            break

    all_entities = []
    with open(entities_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                all_entities.append(json.loads(line))
            except json.JSONDecodeError:
                continue

    # Newest first, so the dedup below keeps the most recent record.
    all_entities.sort(key=lambda e: e.get("ts", ""), reverse=True)

    results = []
    seen_names: set[str] = set()  # dedup: keep only the newest record per (type, name)

    for ent in all_entities:
        # Exact type filter when the query names a type.
        if target_type and ent.get("entity_type") != target_type:
            continue

        name = ent.get("name", "")
        ent_text = name + " " + ent.get("context", "")
        score = score_relevance(query_tokens, ent_text, ent.get("ts", ""))

        # An exact name match earns a floor of 0.5.
        if query_tokens & tokenize(name):
            score = max(score, 0.5)

        # Type queries ("all functions", "all files") should surface every
        # entity of that type, so each one gets a floor of 0.3.
        if target_type:
            score = max(score, 0.3)

        if score > 0.1:
            dedup_key = f"{ent.get('entity_type')}:{name}"
            if dedup_key not in seen_names:
                seen_names.add(dedup_key)
                results.append({
                    "score": score,
                    "source": "entity",
                    "data": ent,
                })

    results.sort(key=lambda x: -x["score"])
    return results[:top_k]
272
+
273
+
274
def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
    """Search Layer 3 semantics (lightweight mode: keyword match + synonym expansion).

    Two cross-session sources are consulted: the knowledge base
    (knowledge_base.jsonl) and the session index (session_index.json).
    """
    semantic_dir = ULTRA_MEMORY_HOME / "semantic"
    hits: list[dict] = []

    kb_file = semantic_dir / "knowledge_base.jsonl"
    if kb_file.exists():
        with open(kb_file, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    entry = json.loads(raw)
                except json.JSONDecodeError:
                    continue
                blob = entry.get("content", "") + " " + entry.get("title", "")
                relevance = score_relevance(query_tokens, blob, entry.get("ts", ""))
                if relevance > 0.1:
                    hits.append({"score": relevance, "source": "knowledge_base", "data": entry})

    index_file = semantic_dir / "session_index.json"
    if index_file.exists():
        with open(index_file, encoding="utf-8") as fh:
            index = json.load(fh)
        for sess in index.get("sessions", []):
            blob = sess.get("project", "") + " " + (sess.get("last_milestone") or "")
            relevance = score_relevance(query_tokens, blob, sess.get("started_at", ""))
            if relevance > 0.1:
                hits.append({"score": relevance, "source": "history", "data": sess})

    hits.sort(key=lambda h: -h["score"])
    return hits[:top_k]
310
+
311
+
312
+ # ── TF-IDF 向量语义搜索层(第四层召回的增强)───────────────────────────
313
+
314
def is_sklearn_available() -> bool:
    """Return True when scikit-learn can be imported in this environment."""
    try:
        import sklearn  # noqa: F401
    except ImportError:
        return False
    return True
319
+
320
+
321
def is_sentencetransformers_available() -> bool:
    """Return True when the optional sentence-transformers package is importable."""
    try:
        from sentence_transformers import SentenceTransformer  # noqa: F401
    except ImportError:
        return False
    return True
326
+
327
+
328
+ _TFidfCache: dict[str, dict] = {} # session_id → {vocab, idfs, doc_vectors, doc_texts}
329
+
330
+
331
+ def _get_tfidf_cache_path(session_dir: Path) -> Path:
332
+ return session_dir / "tfidf_cache.json"
333
+
334
+
335
+ def _text_from_op(op: dict) -> str:
336
+ """提取 op 中可索引的文本"""
337
+ parts = [
338
+ op.get("summary", ""),
339
+ op.get("type", ""),
340
+ " ".join(op.get("tags", [])),
341
+ ]
342
+ detail = op.get("detail", {})
343
+ if isinstance(detail, dict):
344
+ for v in detail.values():
345
+ if isinstance(v, str):
346
+ parts.append(v)
347
+ return " ".join(parts)
348
+
349
+
350
def _build_tfidf_index(ops: list[dict]) -> dict:
    """
    Build an in-memory TF-IDF index — pure Python, no external API calls.
    (Despite the module's "sklearn" naming elsewhere, nothing here uses
    scikit-learn; it is a hand-rolled TF-IDF.)

    Returns {vocab, idfs, doc_vectors, doc_texts, n_docs}. Document vectors
    are L2-normalised so cosine similarity reduces to a dot product.
    """
    import math
    from collections import Counter

    def _tok(text: str) -> list[str]:
        # English/numeric runs as word tokens, plus every character
        # individually (covers Chinese without a segmenter).
        return re.findall(r'[a-zA-Z0-9_]+', text.lower()) + list(text)

    texts = [_text_from_op(op) for op in ops]
    tokenized = [_tok(t) for t in texts]

    # Vocabulary and its index lookup.
    vocab = sorted({w for doc in tokenized for w in doc})
    word2idx = {w: i for i, w in enumerate(vocab)}

    # Document frequencies.
    n_docs = len(texts)
    df = Counter()
    for doc in tokenized:
        df.update(set(doc))

    # Smoothed IDF: log((N+1)/(df+1)) + 1.
    idfs = [math.log((n_docs + 1) / (df[w] + 1)) + 1 for w in vocab]

    # Document vector = TF × IDF, then L2-normalised.
    doc_vectors = []
    for doc in tokenized:
        vec = [0.0] * len(vocab)
        for word, freq in Counter(doc).items():
            j = word2idx[word]
            vec[j] = freq * idfs[j]
        norm = math.sqrt(sum(v * v for v in vec))
        if norm > 0:
            vec = [v / norm for v in vec]
        doc_vectors.append(vec)

    return {
        "vocab": vocab,
        "idfs": idfs,
        "doc_vectors": doc_vectors,
        "doc_texts": texts,
        "n_docs": n_docs,
    }
410
+
411
+
412
+ def _cosine_similarity(a: list[float], b: list[float]) -> float:
413
+ import math
414
+ dot = sum(x * y for x, y in zip(a, b))
415
+ na = math.sqrt(sum(x * x for x in a))
416
+ nb = math.sqrt(sum(x * x for x in b))
417
+ if na == 0 or nb == 0:
418
+ return 0.0
419
+ return dot / (na * nb)
420
+
421
+
422
def _search_tfidf(session_dir: Path, all_ops: list[dict],
                  query: str, top_k: int) -> list[dict]:
    """
    Pure-Python TF-IDF semantic search (zero-dependency fallback).

    The index is cached on disk per session and rebuilt whenever the highest
    op seq changes. Cache I/O failures are non-fatal — search still runs.

    BUG FIXES vs. the original:
    - When the cache file did not exist, `doc_vectors` was referenced before
      assignment → NameError on every cold start. The index is now always
      materialised before use.
    - `Counter` was used to vectorise the query but never imported in this
      function → NameError on every query. It is now imported locally.
    """
    import math
    from collections import Counter

    cache_path = _get_tfidf_cache_path(session_dir)
    current_seq = max((op.get("seq", 0) for op in all_ops), default=0)

    # Load the disk cache only if it parses AND is still current.
    cache = None
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                loaded = json.load(f)
            if loaded.get("last_seq", -1) == current_seq:
                cache = loaded
        except Exception:
            cache = None  # corrupt cache → rebuild below

    if cache is None:
        cache = _build_tfidf_index(all_ops)
        cache["last_seq"] = current_seq
        try:
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(cache, f)
        except Exception:
            pass  # best effort — a failed write must not break search

    vocab = cache["vocab"]
    idfs = cache["idfs"]
    doc_vectors = cache["doc_vectors"]

    # Vectorise the query with the same tokenisation as the index:
    # English/numeric words plus every individual character.
    q_tokens = re.findall(r'[a-zA-Z0-9_]+', query.lower()) + list(query)
    word2idx = {w: i for i, w in enumerate(vocab)}
    vec_q = [0.0] * len(vocab)
    for word, freq in Counter(q_tokens).items():
        if word in word2idx:
            idx = word2idx[word]
            vec_q[idx] = freq * idfs[idx]

    # L2-normalise the query vector.
    norm = math.sqrt(sum(v * v for v in vec_q))
    if norm > 0:
        vec_q = [v / norm for v in vec_q]

    # Cosine similarity against every document vector.
    scored = []
    for i, dv in enumerate(doc_vectors):
        sim = _cosine_similarity(vec_q, dv)
        if sim > 0.05:  # threshold filters noise
            scored.append((sim, i))
    scored.sort(key=lambda x: -x[0])

    # all_ops and doc_vectors share ordering, so the index maps directly.
    return [{"score": s, "source": "tfidf", "data": all_ops[i]}
            for s, i in scored[:top_k]]
496
+
497
+
498
def _search_sentencetransformers(
    session_dir: Path, all_ops: list[dict],
    query: str, top_k: int
) -> list[dict]:
    """
    Vector semantic search via sentence-transformers (higher quality;
    requires `pip install sentence-transformers`).

    Uses all-MiniLM-L6-v2 (small, runs locally, no API calls). Document
    embeddings are cached per session and invalidated when the op log grows.

    FIX vs. the original: the SentenceTransformer model was instantiated
    twice — once to embed the documents and again to embed the query —
    doubling an expensive model-load. It is now created once and reused.
    """
    import json as _json
    import math

    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        return []

    cache_path = session_dir / "embed_cache.json"
    current_seq = max((op.get("seq", 0) for op in all_ops), default=0)

    # Load the embedding cache only if it parses AND is still current.
    cache = None
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                loaded = _json.load(f)
            if loaded.get("last_seq", -1) == current_seq:
                cache = loaded
        except Exception:
            cache = None

    # One model instance serves both document and query embedding.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    if cache is None:
        texts = [_text_from_op(op) for op in all_ops]
        embeddings = model.encode(texts, show_progress_bar=False).tolist()
        cache = {"embeddings": embeddings, "last_seq": current_seq}
        try:
            with open(cache_path, "w", encoding="utf-8") as f:
                _json.dump(cache, f)
        except Exception:
            pass  # cache write is best-effort

    query_emb = model.encode([query], show_progress_bar=False)[0].tolist()

    # Cosine similarity against every cached document embedding.
    scored = []
    for i, emb in enumerate(cache["embeddings"]):
        dot = sum(a * b for a, b in zip(query_emb, emb))
        na = math.sqrt(sum(a * a for a in query_emb))
        nb = math.sqrt(sum(a * a for a in emb))
        sim = dot / (na * nb) if na > 0 and nb > 0 else 0
        if sim > 0.3:  # embeddings need a higher floor than TF-IDF
            scored.append((sim, i))
    scored.sort(key=lambda x: -x[0])

    return [{"score": s, "source": "embedding", "data": all_ops[i]}
            for s, i in scored[:top_k]]
562
+
563
+
564
def search_tfidf(session_dir: Path, all_ops: list[dict],
                 query: str, top_k: int) -> list[dict]:
    """
    Semantic-search entry point: prefer sentence-transformers, otherwise fall
    back to the built-in pure-Python TF-IDF search.

    BUG FIX vs. the original: the TF-IDF fallback was gated on
    `is_sklearn_available()`, but `_search_tfidf` is pure Python and never
    imports sklearn — the gate silently disabled semantic search entirely on
    installs without scikit-learn. The fallback now always runs.
    """
    if is_sentencetransformers_available():
        return _search_sentencetransformers(session_dir, all_ops, query, top_k)
    return _search_tfidf(session_dir, all_ops, query, top_k)
575
+
576
+
577
+ # ── 结果格式化 ──────────────────────────────────────────────────────────
578
+
579
def format_result(result: dict, show_context: bool = True) -> str:
    """Render one search hit as human-readable text (one or more lines).

    Each source kind gets its own layout; unknown sources fall back to repr.
    """
    source = result["source"]
    out: list[str] = []

    if source == "ops":
        op = result["data"]
        when = op["ts"][:16].replace("T", " ")
        out.append(f"[ops #{op['seq']} · {when}] {op['summary']}")
        ctx = result.get("context")
        if show_context and ctx:
            out.extend(f"   ↑ [#{prev['seq']}] {prev['summary'][:60]}"
                       for prev in ctx.get("before", []))
            out.extend(f"   ↓ [#{nxt['seq']}] {nxt['summary'][:60]}"
                       for nxt in ctx.get("after", []))
    elif source == "summary":
        out.append(f"[摘要] {result['text']}")
    elif source == "knowledge_base":
        d = result["data"]
        out.append(f"[知识库 · {d.get('title', '?')}] {d.get('content', '')[:100]}")
    elif source == "history":
        d = result["data"]
        when = d.get("started_at", "")[:10]
        out.append(f"[历史会话 · {when} · {d.get('project', '')}] {d.get('last_milestone', '无里程碑记录')}")
    elif source == "entity":
        d = result["data"]
        kind = d.get("entity_type", "?")
        name = d.get("name", "?")
        when = d.get("ts", "")[:16].replace("T", " ")
        # Per-type extra detail appended after the name.
        if kind == "dependency":
            extra = f" [via {d.get('manager', '?')}]"
        elif kind == "decision":
            rationale = d.get("rationale", "")
            extra = f" 依据: {rationale}" if rationale else ""
        elif kind == "error":
            extra = f" ← {d.get('message', '')}"
        else:
            extra = ""
        out.append(f"[实体/{kind} · {when}] {name}{extra}")
        origin = d.get("context", "")
        if origin:
            out.append(f"   来源: {origin}")
    elif source in ("tfidf", "embedding"):
        d = result["data"]
        when = d.get("ts", "")[:16].replace("T", " ")
        label = "TF-IDF" if source == "tfidf" else "向量"
        out.append(f"[语义/{label} #{d.get('seq', '?')} · {when}] {d.get('summary', '?')[:80]}")
        detail = d.get("detail", {})
        if isinstance(detail, dict):
            out.extend(f"   [{k}] {str(v)[:60]}"
                       for k, v in list(detail.items())[:2])

    return "\n".join(out) if out else str(result)
632
+
633
+
634
def recall(session_id: str, query: str, top_k: int = 5):
    """Query every memory layer for *query* and print the top_k results."""
    # Expand the query with synonyms before matching.
    query_tokens = expand_query(query)

    session_dir = ULTRA_MEMORY_HOME / "sessions" / session_id
    found: list[dict] = []

    # Layer 1: operation log (time-weighted, with a ±1 context window).
    found += search_ops(session_dir, query_tokens, top_k)
    # Layer 2: session summary.
    found += search_summary(session_dir, query_tokens)
    # Layer 3: cross-session semantic layer.
    found += search_semantic(query_tokens, top_k)
    # Layer 4: structured entity index.
    found += search_entities(query_tokens, top_k)
    # Layer 5: vector search (TF-IDF or sentence-transformers).
    ops_for_tfidf = load_all_ops(session_dir)
    if ops_for_tfidf:
        found += search_tfidf(session_dir, ops_for_tfidf, query, top_k)

    # Merge: sort by score and keep the global top_k.
    # NOTE(review): no deduplication happens here — the same op can surface
    # from both layer 1 and layer 5; confirm whether that is intended.
    found.sort(key=lambda hit: -hit["score"])
    del found[top_k:]

    if not found:
        print(f"[RECALL] 未找到与「{query}」相关的记忆")
        return

    print(f"\n[RECALL] 找到 {len(found)} 条相关记录(查询: {query}):\n")
    for rank, hit in enumerate(found, 1):
        print(f"{rank}. {format_result(hit, show_context=True)}")
        print()
675
+
676
+
677
if __name__ == "__main__":
    # CLI entry point: ultra-memory recall over a single session.
    parser = argparse.ArgumentParser(description="检索记忆")
    parser.add_argument("--session", required=True, help="会话 ID")
    parser.add_argument("--query", required=True, help="检索关键词")
    parser.add_argument("--top-k", type=int, default=5)
    cli_args = parser.parse_args()
    recall(cli_args.session, cli_args.query, cli_args.top_k)