ultra-memory 3.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
ultra-memory: 矛盾检测核心模块
|
|
4
|
+
检测 user_profile.json 和 knowledge_base.jsonl 中的时序矛盾,
|
|
5
|
+
将旧记录标记为 superseded: true,召回时跳过失效记录。
|
|
6
|
+
|
|
7
|
+
只使用 Python 标准库,无外部依赖。
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import json
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
# ── Stopwords ───────────────────────────────────────────────────────────────
# Tokens dropped during keyword extraction: Chinese particles plus English
# function words. Used by _stopword_filter().

STOPWORDS = {
    "的", "了", "是", "在", "和", "与", "或", "以及",
    "a", "an", "the", "is", "was", "are", "were",
    "to", "of", "for", "with", "by", "from",
}

# ── Negation words ──────────────────────────────────────────────────────────
# A negation marker present in one text but absent in the other signals a
# possible contradiction (rule 1 in detect_knowledge_conflict).

NEGATION_WORDS = {
    "不", "没有", "无法", "不能", "不是", "别", "莫", "永不", "绝不",
    "not", "no", "never", "none", "cannot", "don't", "doesn't",
    "isn't", "aren't", "wasn't", "weren't", "won't", "wouldn't",
}

# ── Contradictory word pairs ────────────────────────────────────────────────
# Each frozenset holds two antonyms; one member appearing in each of two
# compared texts signals a possible contradiction (rule 3).

CONTRADICTORY_PAIRS = [
    frozenset(["成功", "失败"]),      # success / failure
    frozenset(["可以", "不能"]),      # can / cannot
    frozenset(["推荐", "不推荐"]),    # recommended / not recommended
    frozenset(["启用", "禁用"]),      # enabled / disabled
    frozenset(["enable", "disable"]),
    frozenset(["success", "failure"]),
    frozenset(["yes", "no"]),
    frozenset(["true", "false"]),
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ── 工具函数 ───────────────────────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _now_iso() -> str:
|
|
50
|
+
return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _stopword_filter(words: list[str]) -> list[str]:
    """Drop stopwords and single-character tokens, preserving input order."""
    kept = []
    for word in words:
        if len(word) > 1 and word.lower() not in STOPWORDS:
            kept.append(word)
    return kept
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ── 画像冲突检测 ───────────────────────────────────────────────────────────
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def detect_profile_conflict(new_data: dict, home: Path) -> list[dict]:
    """
    Detect field-level conflicts between *new_data* and user_profile.json.

    Returns a list of conflict records, one per changed field:
        {"field": ..., "old_value": ..., "new_value": ..., "superseded_at": ...}

    Rules:
      * "observed_patterns" is append-only and never conflicts.
      * "*_superseded" tombstone fields are skipped.
      * str fields conflict on any value change.
      * list fields conflict only when old elements were removed
        (pure additions are not conflicts).
      * dict fields are compared one level deep, key by key (str values only).

    Returns [] when the profile file is missing or unreadable.
    """
    profile_file = home / "semantic" / "user_profile.json"
    if not profile_file.exists():
        return []

    try:
        with open(profile_file, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, IOError):
        return []

    conflicts = []

    for field, new_value in new_data.items():
        # observed_patterns is append-only — never a conflict
        if field == "observed_patterns":
            continue

        # skip tombstone fields produced by mark_profile_superseded
        if field.endswith("_superseded"):
            continue

        if field not in profile:
            continue

        old_value = profile[field]

        # strings: old differs from new → conflict
        if isinstance(old_value, str) and isinstance(new_value, str):
            if old_value != new_value:
                conflicts.append({
                    "field": field,
                    "old_value": old_value,
                    "new_value": new_value,
                    "superseded_at": _now_iso(),
                })

        # lists: conflict only if elements were removed; additions are fine
        elif isinstance(old_value, list) and isinstance(new_value, list):
            old_set = set(str(v) for v in old_value)
            new_set = set(str(v) for v in new_value)
            if old_set and new_set and old_set != new_set:
                removed = old_set - new_set
                if removed:  # something was dropped → conflict
                    conflicts.append({
                        "field": field,
                        "old_value": old_value,
                        "new_value": new_value,
                        "superseded_at": _now_iso(),
                    })

        # dicts: recurse one level into string-valued sub-keys
        elif isinstance(old_value, dict) and isinstance(new_value, dict):
            for sub_key, sub_old in old_value.items():
                if sub_key not in new_value:
                    continue
                sub_new = new_value[sub_key]
                if isinstance(sub_old, str) and isinstance(sub_new, str):
                    if sub_old != sub_new:
                        conflicts.append({
                            "field": f"{field}.{sub_key}",
                            "old_value": sub_old,
                            # BUG FIX: record the actual new value. The old
                            # code stored the placeholder "(已更新)" here,
                            # which mark_profile_superseded then wrote back
                            # into the profile, clobbering the real update.
                            "new_value": sub_new,
                            "superseded_at": _now_iso(),
                        })

    return conflicts
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def mark_profile_superseded(home: Path, conflicts: list[dict]):
    """
    Record superseded old values in user_profile.json.

    For every conflict, a "<field>_superseded" tombstone holding the old
    value and its timestamp is added next to the field, and the field itself
    is set to the conflict's new value (dotted "a.b" fields are resolved one
    level at a time). The file is rewritten atomically (tmp + rename).
    """
    profile_file = home / "semantic" / "user_profile.json"
    if not conflicts or not profile_file.exists():
        return

    try:
        profile = json.loads(profile_file.read_text(encoding="utf-8"))
    except (IOError, json.JSONDecodeError):
        return

    for item in conflicts:
        field = item["field"]
        # tombstone preserving the replaced value
        profile[f"{field}_superseded"] = {
            "old_value": item["old_value"],
            "superseded_at": item["superseded_at"],
        }
        # write the new value into the main field; usually already done
        # during profile_update — this covers the remaining cases
        *parents, leaf = field.split(".")
        target = profile
        for part in parents:
            target = target.setdefault(part, {})
        target[leaf] = item["new_value"]

    # atomic write: tmp file first, then rename over the original
    tmp_file = profile_file.with_suffix(".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(profile, f, ensure_ascii=False, indent=2)
    tmp_file.replace(profile_file)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ── 知识库冲突检测 ────────────────────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _extract_keywords(text: str) -> set[str]:
    """Split *text* on whitespace/punctuation and return its keyword set
    (stopwords and single-character tokens removed)."""
    lowered = text.lower()
    tokens = re.split(r'[\s,;.()\[\]{}]+', lowered)
    return set(_stopword_filter(tokens))
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _title_similarity(title_a: str, title_b: str) -> float:
    """Keyword-overlap similarity: |A ∩ B| / max(|A|, |B|).

    Returns 0.0 when either title yields no keywords.
    """
    kw_a = _extract_keywords(title_a)
    kw_b = _extract_keywords(title_b)
    if kw_a and kw_b:
        return len(kw_a & kw_b) / max(len(kw_a), len(kw_b))
    return 0.0
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _has_negation(text: str) -> bool:
    """Return True if *text* contains a negation marker.

    ASCII negation words are matched as whole tokens — the old substring
    check fired on unrelated words ("no" inside "node", "not" inside
    "nothing"). CJK markers have no whitespace delimiters, so they are
    still matched as substrings.
    """
    text_lower = text.lower()
    # split keeps apostrophes so contractions like "don't" survive intact
    tokens = set(re.split(r"[^\w']+", text_lower))
    for word in NEGATION_WORDS:
        if word.isascii():
            if word in tokens:
                return True
        elif word in text_lower:
            return True
    return False
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _has_number_change(text_old: str, text_new: str) -> bool:
|
|
205
|
+
"""检测数值变化:两边都有数字但数字不同"""
|
|
206
|
+
nums_old = set(re.findall(r'\d+\.?\d*', text_old))
|
|
207
|
+
nums_new = set(re.findall(r'\d+\.?\d*', text_new))
|
|
208
|
+
if nums_old and nums_new and nums_old != nums_new:
|
|
209
|
+
return True
|
|
210
|
+
return False
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _has_contradictory_pair(text_old: str, text_new: str) -> bool:
    """Detect antonym pairs: each text contains a different side of a pair.

    ASCII pair words are matched as whitespace tokens; CJK pair words are
    matched as substrings — the old `.split()`-only check could never match
    the Chinese pairs, because Chinese text is not whitespace-delimited,
    leaving half of CONTRADICTORY_PAIRS dead.
    """
    old_lower = text_old.lower()
    new_lower = text_new.lower()
    old_tokens = set(old_lower.split())
    new_tokens = set(new_lower.split())

    def _hits(pair, lowered, tokens):
        # which members of this antonym pair appear in the text
        found = set()
        for word in pair:
            if word.isascii():
                if word in tokens:
                    found.add(word)
            elif word in lowered:
                found.add(word)
        return found

    for pair in CONTRADICTORY_PAIRS:
        words_old = _hits(pair, old_lower, old_tokens)
        words_new = _hits(pair, new_lower, new_tokens)
        # both texts touch the pair, but not with the same members
        if words_old and words_new and words_old != words_new:
            return True
    return False
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def detect_knowledge_conflict(new_entry: dict, home: Path) -> list[dict]:
    """
    Detect contradictions between *new_entry* and live entries in
    knowledge_base.jsonl.

    new_entry: {"title": "...", "content": "...", ...}
    Returns:   [{"seq": 1-based line number, "title": "...", "reason": "规则1/2/3"}]

    Old entries are compared only when their title similarity to the new
    entry is >= 0.5. Reasons: 规则1 — negation asymmetry, 规则2 — numeric
    change, 规则3 — antonym pair. Superseded entries are skipped.
    """
    kb_file = home / "semantic" / "knowledge_base.jsonl"
    if not kb_file.exists():
        return []

    new_title = new_entry.get("title", "")
    new_content = new_entry.get("content", "")

    # collect live (non-superseded) entries with their 1-based line numbers
    candidates = []
    with open(kb_file, encoding="utf-8") as f:
        for idx, raw in enumerate(f):
            if not raw.strip():
                continue
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if parsed.get("superseded"):
                continue  # already invalidated
            candidates.append((idx + 1, parsed))

    conflicts = []
    for seq, old_entry in candidates:
        old_title = old_entry.get("title", "")
        old_content = old_entry.get("content", "")

        # gate: titles must be similar before content rules are applied
        if _title_similarity(new_title, old_title) < 0.5:
            continue

        if _has_negation(old_content) != _has_negation(new_content):
            reason = "规则1"  # negation asymmetry
        elif _has_number_change(old_content, new_content):
            reason = "规则2"  # numeric change
        elif _has_contradictory_pair(old_content, new_content):
            reason = "规则3"  # antonym pair
        else:
            continue

        conflicts.append({
            "seq": seq,
            "title": old_title,
            "reason": reason,
        })

    return conflicts
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def mark_superseded(home: Path, jsonl_path: Path, seq_list: list[int]):
    """
    Mark the entries at the given 1-based line numbers of *jsonl_path*
    as superseded (adds "superseded": true and a "superseded_at" stamp).

    Atomic: writes a tmp file, then renames it over the original.
    *home* is unused here but kept for interface compatibility with callers.

    BUG FIX: the old code called tmp_file.write(...) on a Path object —
    pathlib.Path has no .write method, so any non-empty file raised
    AttributeError. It also silently dropped blank/unparseable lines;
    those are now copied through unchanged so no data is lost.
    """
    if not seq_list:
        return

    seq_set = set(seq_list)
    tmp_file = jsonl_path.with_suffix(".tmp")

    with open(jsonl_path, encoding="utf-8") as src, \
            open(tmp_file, "w", encoding="utf-8") as dst:
        for i, line in enumerate(src):
            line_num = i + 1  # line numbers are 1-based, counting all lines
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                dst.write(line)  # preserve blank/unparseable lines as-is
                continue

            if line_num in seq_set:
                entry["superseded"] = True
                entry["superseded_at"] = _now_iso()

            dst.write(json.dumps(entry, ensure_ascii=False) + "\n")

    tmp_file.replace(jsonl_path)
|
package/scripts/log_op.py
CHANGED
|
@@ -279,6 +279,34 @@ def log_op(
|
|
|
279
279
|
"compressed": False,
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
+
# ── 矛盾检测(写入 ops.jsonl 之前)──────────────────────────────────────
|
|
283
|
+
|
|
284
|
+
# 2A:画像冲突检测(user_instruction/decision + profile_update)
|
|
285
|
+
if op_type in ("user_instruction", "decision") and detail.get("profile_update"):
|
|
286
|
+
try:
|
|
287
|
+
from conflict_detector import detect_profile_conflict, mark_profile_superseded
|
|
288
|
+
conflicts = detect_profile_conflict(detail["profile_update"], ULTRA_MEMORY_HOME)
|
|
289
|
+
if conflicts:
|
|
290
|
+
entry["detail"]["profile_conflicts"] = conflicts
|
|
291
|
+
mark_profile_superseded(ULTRA_MEMORY_HOME, conflicts)
|
|
292
|
+
print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 处画像矛盾,旧记录已标记失效")
|
|
293
|
+
except Exception:
|
|
294
|
+
pass
|
|
295
|
+
|
|
296
|
+
# 2B:知识库冲突检测(milestone/decision + knowledge_entry)
|
|
297
|
+
if op_type in ("milestone", "decision") and detail.get("knowledge_entry"):
|
|
298
|
+
try:
|
|
299
|
+
from conflict_detector import detect_knowledge_conflict, mark_superseded
|
|
300
|
+
conflicts = detect_knowledge_conflict(detail["knowledge_entry"], ULTRA_MEMORY_HOME)
|
|
301
|
+
if conflicts:
|
|
302
|
+
kb_path = ULTRA_MEMORY_HOME / "semantic" / "knowledge_base.jsonl"
|
|
303
|
+
seq_list = [c["seq"] for c in conflicts]
|
|
304
|
+
mark_superseded(ULTRA_MEMORY_HOME, kb_path, seq_list)
|
|
305
|
+
entry["detail"]["knowledge_conflicts"] = conflicts
|
|
306
|
+
print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 条知识库矛盾,旧记录已标记失效")
|
|
307
|
+
except Exception:
|
|
308
|
+
pass
|
|
309
|
+
|
|
282
310
|
# 追加写入(append-only,永不覆盖)
|
|
283
311
|
with open(ops_file, "a", encoding="utf-8") as f:
|
|
284
312
|
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
package/scripts/recall.py
CHANGED
|
@@ -289,6 +289,9 @@ def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
|
|
|
289
289
|
entry = json.loads(line)
|
|
290
290
|
except json.JSONDecodeError:
|
|
291
291
|
continue
|
|
292
|
+
# 过滤已失效条目
|
|
293
|
+
if entry.get("superseded"):
|
|
294
|
+
continue
|
|
292
295
|
text = entry.get("content", "") + " " + entry.get("title", "")
|
|
293
296
|
ts = entry.get("ts", "")
|
|
294
297
|
score = score_relevance(query_tokens, text, ts)
|
|
@@ -309,6 +312,36 @@ def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
|
|
|
309
312
|
return results[:top_k]
|
|
310
313
|
|
|
311
314
|
|
|
315
|
+
def search_profile(query_tokens: set, home: Path) -> list[dict]:
    """Search user_profile.json for fields relevant to *query_tokens*.

    Fields whose key ends with "_superseded" (tombstones left by conflict
    detection) are skipped. Returns at most the 3 highest-scoring hits as
    {"score", "source": "profile", "data": {"field", "value"}} dicts.
    """
    profile_file = home / "semantic" / "user_profile.json"
    if not profile_file.exists():
        return []

    try:
        profile = json.loads(profile_file.read_text(encoding="utf-8"))
    except (IOError, json.JSONDecodeError):
        return []

    hits = []
    for field, value in profile.items():
        # skip superseded tombstone fields
        if field.endswith("_superseded"):
            continue
        score = score_relevance(query_tokens, f"{field} {value}")
        if score > 0.1:
            hits.append({
                "score": score,
                "source": "profile",
                "data": {"field": field, "value": value},
            })

    hits.sort(key=lambda item: -item["score"])
    return hits[:3]
|
|
343
|
+
|
|
344
|
+
|
|
312
345
|
# ── TF-IDF 向量语义搜索层(第四层召回的增强)───────────────────────────
|
|
313
346
|
|
|
314
347
|
def is_sklearn_available() -> bool:
|
|
@@ -628,6 +661,10 @@ def format_result(result: dict, show_context: bool = True) -> str:
|
|
|
628
661
|
for k, v in list(detail.items())[:2]:
|
|
629
662
|
lines.append(f" [{k}] {str(v)[:60]}")
|
|
630
663
|
|
|
664
|
+
elif source == "profile":
|
|
665
|
+
d = result["data"]
|
|
666
|
+
lines.append(f"[用户画像] {d['field']}: {d['value']}")
|
|
667
|
+
|
|
631
668
|
return "\n".join(lines) if lines else str(result)
|
|
632
669
|
|
|
633
670
|
|
|
@@ -650,6 +687,10 @@ def recall(session_id: str, query: str, top_k: int = 5):
|
|
|
650
687
|
semantic_results = search_semantic(query_tokens, top_k)
|
|
651
688
|
found.extend(semantic_results)
|
|
652
689
|
|
|
690
|
+
# 画像检索(从 user_profile.json 搜索相关字段)
|
|
691
|
+
profile_results = search_profile(query_tokens, ULTRA_MEMORY_HOME)
|
|
692
|
+
found.extend(profile_results)
|
|
693
|
+
|
|
653
694
|
# Layer 4: 实体索引(结构化精确检索)
|
|
654
695
|
entity_results = search_entities(query_tokens, top_k)
|
|
655
696
|
found.extend(entity_results)
|