ultra-memory 3.2.0 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,320 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: 矛盾检测核心模块
4
+ 检测 user_profile.json 和 knowledge_base.jsonl 中的时序矛盾,
5
+ 将旧记录标记为 superseded: true,召回时跳过失效记录。
6
+
7
+ 只使用 Python 标准库,无外部依赖。
8
+ """
9
+
10
+ import os
11
+ import re
12
+ import json
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+
16
+ # ── 停用词表 ────────────────────────────────────────────────────────────────
17
+
18
# Stopwords ignored during keyword extraction: Chinese function words plus
# common English function words. Membership is checked case-insensitively
# by the callers (tokens are lowercased first).
STOPWORDS = {
    # Chinese
    "的", "了", "是", "在", "和", "与", "或", "以及",
    # English
    "a", "an", "the", "is", "was", "are", "were",
    "to", "of", "for", "with", "by", "from",
}
23
+
24
+ # ── 否定词表 ────────────────────────────────────────────────────────────────
25
+
26
# Negation markers used by _has_negation() to decide whether a text asserts
# or denies something (conflict rule 1). Mixed Chinese / English set.
NEGATION_WORDS = {
    # Chinese
    "不", "没有", "无法", "不能", "不是", "别", "莫", "永不", "绝不",
    # English
    "not", "no", "never", "none", "cannot", "don't", "doesn't",
    "isn't", "aren't", "wasn't", "weren't", "won't", "wouldn't",
}
31
+
32
+ # ── 对立词对 ───────────────────────────────────────────────────────────────
33
+
34
# Pairs of mutually exclusive terms (conflict rule 3): if one text contains
# one side of a pair and the other text contains the other side, the two
# texts are considered contradictory. frozenset makes each pair unordered.
CONTRADICTORY_PAIRS = [
    frozenset(pair)
    for pair in (
        ("成功", "失败"),
        ("可以", "不能"),
        ("推荐", "不推荐"),
        ("启用", "禁用"),
        ("enable", "disable"),
        ("success", "failure"),
        ("yes", "no"),
        ("true", "false"),
    )
]
44
+
45
+
46
+ # ── 工具函数 ───────────────────────────────────────────────────────────────
47
+
48
+
49
+ def _now_iso() -> str:
50
+ return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
51
+
52
+
53
def _stopword_filter(words: list[str]) -> list[str]:
    """Drop stopwords and single-character tokens, preserving input order."""
    kept: list[str] = []
    for word in words:
        if len(word) > 1 and word.lower() not in STOPWORDS:
            kept.append(word)
    return kept
55
+
56
+
57
+ # ── 画像冲突检测 ───────────────────────────────────────────────────────────
58
+
59
+
60
def detect_profile_conflict(new_data: dict, home: Path) -> list[dict]:
    """
    Detect field conflicts between *new_data* and user_profile.json under *home*.

    Returns a list of conflict records:
        {"field": ..., "old_value": ..., "new_value": ..., "superseded_at": ...}

    Rules:
      * str  vs str : conflict when the values differ.
      * list vs list: conflict only when old elements were removed
        (pure additions are not conflicts).
      * dict vs dict: one level of recursion over string sub-values,
        reported with a dotted "field.sub_key" name.

    A missing or unreadable profile file yields [] (best-effort).
    """
    profile_file = home / "semantic" / "user_profile.json"
    if not profile_file.exists():
        return []

    try:
        with open(profile_file, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, IOError):
        return []

    conflicts = []

    for field, new_value in new_data.items():
        # observed_patterns is append-only; never conflict-checked.
        if field == "observed_patterns":
            continue
        # Skip bookkeeping keys written by mark_profile_superseded().
        if field.endswith("_superseded"):
            continue
        if field not in profile:
            continue

        old_value = profile[field]

        # String field: any difference is a conflict.
        if isinstance(old_value, str) and isinstance(new_value, str):
            if old_value != new_value:
                conflicts.append({
                    "field": field,
                    "old_value": old_value,
                    "new_value": new_value,
                    "superseded_at": _now_iso(),
                })

        # List field: only flag when elements of the old list were dropped.
        elif isinstance(old_value, list) and isinstance(new_value, list):
            old_set = {str(v) for v in old_value}
            new_set = {str(v) for v in new_value}
            if old_set and new_set and old_set - new_set:
                conflicts.append({
                    "field": field,
                    "old_value": old_value,
                    "new_value": new_value,
                    "superseded_at": _now_iso(),
                })

        # Dict field: compare string sub-values one level deep.
        elif isinstance(old_value, dict) and isinstance(new_value, dict):
            for sub_key, sub_old in old_value.items():
                if sub_key not in new_value:
                    continue
                sub_new = new_value[sub_key]
                if (isinstance(sub_old, str) and isinstance(sub_new, str)
                        and sub_old != sub_new):
                    conflicts.append({
                        "field": f"{field}.{sub_key}",
                        "old_value": sub_old,
                        # BUG FIX: record the actual new value. The previous
                        # code stored the placeholder string "(已更新)" here,
                        # and mark_profile_superseded() then wrote that literal
                        # placeholder into the profile, destroying the update.
                        "new_value": sub_new,
                        "superseded_at": _now_iso(),
                    })

    return conflicts
135
+
136
+
137
def mark_profile_superseded(home: Path, conflicts: list[dict]):
    """
    Apply *conflicts* (from detect_profile_conflict) to user_profile.json:
    record each old value under "<field>_superseded" and write the new value
    into the main field. Dotted field names ("a.b") address nested dicts.

    Robustness fix: if a dotted path runs into a non-dict intermediate value,
    that single conflict is skipped instead of raising TypeError — previously
    the whole function aborted and no updates were saved at all.

    Writes atomically (tmp file + rename). Missing/unreadable profile or an
    empty conflict list is a no-op (best-effort, mirrors detection side).
    """
    profile_file = home / "semantic" / "user_profile.json"
    if not profile_file.exists() or not conflicts:
        return

    try:
        with open(profile_file, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, IOError):
        return

    for conflict in conflicts:
        field = conflict["field"]
        # Preserve the superseded value (flat key, even for dotted fields).
        profile[f"{field}_superseded"] = {
            "old_value": conflict["old_value"],
            "superseded_at": conflict["superseded_at"],
        }
        # Write the new value into the main field (normally already done by
        # the profile_update path; this covers stragglers).
        if "." in field:
            parts = field.split(".")
            node = profile
            reachable = True
            for part in parts[:-1]:
                child = node.get(part)
                if child is None:
                    child = {}
                    node[part] = child
                if not isinstance(child, dict):
                    reachable = False  # non-dict in the way: skip, don't crash
                    break
                node = child
            if reachable:
                node[parts[-1]] = conflict["new_value"]
        else:
            profile[field] = conflict["new_value"]

    # Atomic write: dump to a sibling tmp file, then rename over the original.
    tmp_file = profile_file.with_suffix(".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(profile, f, ensure_ascii=False, indent=2)
    tmp_file.replace(profile_file)
176
+
177
+
178
+ # ── 知识库冲突检测 ────────────────────────────────────────────────────────
179
+
180
+
181
def _extract_keywords(text: str) -> set[str]:
    """Tokenize *text* (lowercased, split on whitespace/punctuation) and
    return the set of tokens that survive the stopword filter."""
    tokens = re.split(r'[\s,;.()\[\]{}]+', text.lower())
    filtered = _stopword_filter(tokens)
    return set(filtered)
185
+
186
+
187
def _title_similarity(title_a: str, title_b: str) -> float:
    """Keyword-overlap similarity: |A ∩ B| / max(|A|, |B|).

    Returns 0.0 when either title yields no keywords.
    """
    kw_a = _extract_keywords(title_a)
    kw_b = _extract_keywords(title_b)
    if not (kw_a and kw_b):
        return 0.0
    overlap = kw_a & kw_b
    return len(overlap) / max(len(kw_a), len(kw_b))
196
+
197
+
198
def _has_negation(text: str) -> bool:
    """Return True if *text* contains a negation marker from NEGATION_WORDS.

    Chinese markers are matched as substrings (CJK text has no word
    boundaries). ASCII markers are matched as whole words only — the
    previous pure-substring check fired on "now"/"know" (contain "no")
    and "notable" (contains "not"), producing false rule-1 conflicts.
    """
    text_lower = text.lower()
    for word in NEGATION_WORDS:
        if word.isascii():
            # Whole-word match for English terms (re.escape handles "don't").
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower):
                return True
        elif word in text_lower:
            return True
    return False
202
+
203
+
204
+ def _has_number_change(text_old: str, text_new: str) -> bool:
205
+ """检测数值变化:两边都有数字但数字不同"""
206
+ nums_old = set(re.findall(r'\d+\.?\d*', text_old))
207
+ nums_new = set(re.findall(r'\d+\.?\d*', text_new))
208
+ if nums_old and nums_new and nums_old != nums_new:
209
+ return True
210
+ return False
211
+
212
+
213
def _has_contradictory_pair(text_old: str, text_new: str) -> bool:
    """Conflict rule 3: the two texts take opposite sides of a word pair.

    ASCII terms are matched as whitespace-delimited tokens (as before).
    CJK terms are matched by substring — Chinese text is not whitespace-
    tokenized, so the previous token-only matching meant the Chinese pairs
    (e.g. 成功/失败) could essentially never fire.
    """
    old_lower = text_old.lower()
    new_lower = text_new.lower()
    old_tokens = set(old_lower.split())
    new_tokens = set(new_lower.split())

    def _hits(pair: frozenset, tokens: set, text: str) -> set:
        # Which terms of *pair* appear on this side.
        found = set()
        for term in pair:
            if term.isascii():
                if term in tokens:
                    found.add(term)
            elif term in text:
                found.add(term)
        return found

    for pair in CONTRADICTORY_PAIRS:
        words_old = _hits(pair, old_tokens, old_lower)
        words_new = _hits(pair, new_tokens, new_lower)
        # Both sides mention the pair, but not the same subset → contradiction.
        if words_old and words_new and words_old != words_new:
            return True
    return False
223
+
224
+
225
def detect_knowledge_conflict(new_entry: dict, home: Path) -> list[dict]:
    """
    Scan knowledge_base.jsonl for active entries that contradict *new_entry*.

    new_entry shape: {"title": "...", "content": "...", ...}
    Returns: [{"seq": <1-based line number>, "title": ..., "reason": "规则1|规则2|规则3"}]

    An old entry is examined only when its title similarity to the new entry
    is >= 0.5; it is then flagged by the first matching rule:
      规则1 — negation presence differs between old and new content
      规则2 — both contents contain numbers, but the numbers differ
      规则3 — contents sit on opposite sides of a contradictory word pair
    Blank, unparseable, and already-superseded lines are skipped.
    """
    kb_file = home / "semantic" / "knowledge_base.jsonl"
    if not kb_file.exists():
        return []

    new_title = new_entry.get("title", "")
    new_content = new_entry.get("content", "")

    conflicts = []
    with open(kb_file, encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            if not raw.strip():
                continue
            try:
                old_entry = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if old_entry.get("superseded"):
                continue  # already invalidated; never re-flag

            old_title = old_entry.get("title", "")
            old_content = old_entry.get("content", "")

            # Titles must overlap enough before content rules are applied.
            if _title_similarity(new_title, old_title) < 0.5:
                continue

            if _has_negation(old_content) != _has_negation(new_content):
                reason = "规则1"
            elif _has_number_change(old_content, new_content):
                reason = "规则2"
            elif _has_contradictory_pair(old_content, new_content):
                reason = "规则3"
            else:
                continue

            conflicts.append({
                "seq": line_no,
                "title": old_title,
                "reason": reason,
            })

    return conflicts
287
+
288
+
289
def mark_superseded(home: Path, jsonl_path: Path, seq_list: list[int]):
    """
    Mark the entries at the given 1-based line numbers of *jsonl_path* as
    superseded (adds "superseded": true and "superseded_at").

    *home* is unused; kept for call-site compatibility.

    Atomic rewrite: dump to a .tmp sibling, then rename over the original.

    Data-safety fix: blank and JSON-unparseable lines are now copied through
    unchanged. The previous version silently dropped them on rewrite, which
    both lost data in an append-only log and shifted the line numbering that
    detect_knowledge_conflict() computes over the raw file.
    """
    if not seq_list:
        return

    seq_set = set(seq_list)
    tmp_file = jsonl_path.with_suffix(".tmp")

    with open(jsonl_path, encoding="utf-8") as f_in, \
            open(tmp_file, "w", encoding="utf-8") as f_out:
        for line_no, raw in enumerate(f_in, start=1):
            entry = None
            if raw.strip():
                try:
                    entry = json.loads(raw)
                except json.JSONDecodeError:
                    entry = None
            if isinstance(entry, dict):
                if line_no in seq_set:
                    entry["superseded"] = True
                    entry["superseded_at"] = _now_iso()
                f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            else:
                # Blank / corrupt line: preserve as-is so numbering stays stable.
                f_out.write(raw if raw.endswith("\n") else raw + "\n")

    tmp_file.replace(jsonl_path)
package/scripts/log_op.py CHANGED
@@ -8,10 +8,45 @@ import os
8
8
  import sys
9
9
  import json
10
10
  import re
11
+ import time
12
+ import logging
11
13
  import argparse
14
+ import contextlib
12
15
  from datetime import datetime, timezone
13
16
  from pathlib import Path
14
17
 
18
+ logging.basicConfig(
19
+ level=logging.WARNING,
20
+ format="[ultra-memory] %(levelname)s %(message)s",
21
+ )
22
+ _log = logging.getLogger("ultra-memory.log_op")
23
+
24
+
25
@contextlib.contextmanager
def _advisory_lock(lock_path: Path, timeout: float = 5.0):
    """Cross-platform advisory file lock via a .lock sentinel file.

    Spins in 50 ms steps trying to create *lock_path* with O_EXCL (atomic
    create-if-absent). After *timeout* seconds it logs a warning and
    proceeds WITHOUT the lock — best-effort: a writer is never blocked
    forever. The sentinel is removed on exit only if this process created
    it, so a lock held by someone else is never deleted here.

    NOTE(review): a sentinel left behind by a crashed process is never
    reaped, so every later call pays the full timeout — confirm acceptable.
    """
    deadline = time.monotonic() + timeout
    got_lock = False
    while not got_lock:
        try:
            handle = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            if time.monotonic() >= deadline:
                _log.warning("文件锁等待超时 %s,直接继续写入", lock_path)
                break
            time.sleep(0.05)
        else:
            os.close(handle)
            got_lock = True
    try:
        yield
    finally:
        if got_lock:
            try:
                lock_path.unlink(missing_ok=True)
            except Exception:
                pass  # best-effort cleanup; never fail the caller
49
+
15
50
  if sys.stdout.encoding != "utf-8":
16
51
  sys.stdout.reconfigure(encoding="utf-8")
17
52
  if sys.stderr.encoding != "utf-8":
@@ -19,6 +54,20 @@ if sys.stderr.encoding != "utf-8":
19
54
 
20
55
  ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))
21
56
 
57
+ # 记忆注入标记(防止反馈环:AI 把自己的记忆输出又记录进去导致自引用积累)
58
# Memory-injection markers (anti-feedback-loop guard: keeps the AI's own
# recalled-memory output from being logged back in, which would otherwise
# accumulate self-referential noise). Applied in order by sanitize().
MEMORY_INJECTION_PATTERNS = [
    r'\[ultra-memory\][^\n]*',         # this script's own printed output
    r'MEMORY_READY[^\n]*',             # initialization signal
    r'COMPRESS_SUGGESTED[^\n]*',       # compression-suggestion signal
    r'SESSION_ID=sess_[A-Za-z0-9_]+',  # injected session id
    r'session_id:\s*sess_[A-Za-z0-9_]+',
    r'\[RECALL\][^\n]*',               # recall.py output header
    r'\[ops #\d+[^\]]*\][^\n]*',       # format_result "ops" lines
    r'\[知识库[^\]]*\][^\n]*',          # format_result knowledge-base lines
    r'\[实体/[^\]]*\][^\n]*',           # format_result entity lines
    r'\[摘要\][^\n]*',                  # format_result summary lines
]
70
+
22
71
  # 敏感词正则(防止记录密码/密钥)
23
72
  SENSITIVE_PATTERNS = [
24
73
  r'(?i)(password|passwd|pwd)\s*[=:]\s*\S+',
@@ -192,10 +241,20 @@ FILE_EXT_TAG_MAP = {
192
241
  }
193
242
 
194
243
 
244
def filter_memory_markers(text: str) -> str:
    """Strip memory-injection markers from *text* (anti-feedback-loop:
    prevents the AI's own memory output from being recorded again).

    Applies every MEMORY_INJECTION_PATTERNS regex case-insensitively and
    returns the stripped result; falsy input is returned unchanged.
    """
    if not text:
        return text
    cleaned = text
    for marker_re in MEMORY_INJECTION_PATTERNS:
        cleaned = re.sub(marker_re, "", cleaned, flags=re.IGNORECASE)
    return cleaned.strip()
251
+
252
+
195
253
def sanitize(text: str) -> str:
    """Scrub *text*: remove memory feedback-loop markers, then redact
    sensitive data (passwords/keys) via SENSITIVE_PATTERNS."""
    if not text:
        return text
    cleaned = filter_memory_markers(text)
    for secret_re in SENSITIVE_PATTERNS:
        cleaned = re.sub(secret_re, "[REDACTED]", cleaned)
    return cleaned
@@ -279,17 +338,57 @@ def log_op(
279
338
  "compressed": False,
280
339
  }
281
340
 
282
- # 追加写入(append-only,永不覆盖)
283
- with open(ops_file, "a", encoding="utf-8") as f:
284
- f.write(json.dumps(entry, ensure_ascii=False) + "\n")
285
-
286
- # 更新 meta
287
- meta["op_count"] = seq
288
- meta["last_op_at"] = entry["ts"]
289
- if op_type == "milestone":
290
- meta["last_milestone"] = summary
291
- with open(meta_file, "w", encoding="utf-8") as f:
292
- json.dump(meta, f, ensure_ascii=False, indent=2)
341
+ # ── 矛盾检测(写入 ops.jsonl 之前)──────────────────────────────────────
342
+
343
+ # 2A:画像冲突检测(user_instruction/decision + profile_update)
344
+ if op_type in ("user_instruction", "decision") and detail.get("profile_update"):
345
+ try:
346
+ import sys as _sys
347
+ _scripts_dir_cd = Path(__file__).parent
348
+ if str(_scripts_dir_cd) not in _sys.path:
349
+ _sys.path.insert(0, str(_scripts_dir_cd))
350
+ from conflict_detector import detect_profile_conflict, mark_profile_superseded
351
+ conflicts = detect_profile_conflict(detail["profile_update"], ULTRA_MEMORY_HOME)
352
+ if conflicts:
353
+ entry["detail"]["profile_conflicts"] = conflicts
354
+ mark_profile_superseded(ULTRA_MEMORY_HOME, conflicts)
355
+ print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 处画像矛盾,旧记录已标记失效")
356
+ except Exception as _e:
357
+ _log.debug("画像冲突检测失败(不影响主流程): %s", _e)
358
+
359
+ # 2B:知识库冲突检测(milestone/decision + knowledge_entry)
360
+ if op_type in ("milestone", "decision") and detail.get("knowledge_entry"):
361
+ try:
362
+ import sys as _sys
363
+ _scripts_dir_cd = Path(__file__).parent
364
+ if str(_scripts_dir_cd) not in _sys.path:
365
+ _sys.path.insert(0, str(_scripts_dir_cd))
366
+ from conflict_detector import detect_knowledge_conflict, mark_superseded
367
+ conflicts = detect_knowledge_conflict(detail["knowledge_entry"], ULTRA_MEMORY_HOME)
368
+ if conflicts:
369
+ kb_path = ULTRA_MEMORY_HOME / "semantic" / "knowledge_base.jsonl"
370
+ seq_list = [c["seq"] for c in conflicts]
371
+ mark_superseded(ULTRA_MEMORY_HOME, kb_path, seq_list)
372
+ entry["detail"]["knowledge_conflicts"] = conflicts
373
+ print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 条知识库矛盾,旧记录已标记失效")
374
+ except Exception as _e:
375
+ _log.debug("知识库冲突检测失败(不影响主流程): %s", _e)
376
+
377
+ # 追加写入(append-only,永不覆盖);文件锁保护并发写入
378
+ _lock_file = ops_file.with_suffix(".lock")
379
+ with _advisory_lock(_lock_file):
380
+ with open(ops_file, "a", encoding="utf-8") as f:
381
+ f.write(json.dumps(entry, ensure_ascii=False) + "\n")
382
+
383
+ # 更新 meta(在锁内,保证 op_count 单调递增)
384
+ meta["op_count"] = seq
385
+ meta["last_op_at"] = entry["ts"]
386
+ if op_type == "milestone":
387
+ meta["last_milestone"] = summary
388
+ _tmp_meta = meta_file.with_suffix(".tmp")
389
+ with open(_tmp_meta, "w", encoding="utf-8") as f:
390
+ json.dump(meta, f, ensure_ascii=False, indent=2)
391
+ _tmp_meta.replace(meta_file)
293
392
 
294
393
  # 自动提取结构化实体(写入 semantic/entities.jsonl)
295
394
  try:
@@ -299,8 +398,8 @@ def log_op(
299
398
  _sys.path.insert(0, str(_scripts_dir))
300
399
  from extract_entities import extract_and_store
301
400
  extract_and_store(session_id, dict(entry))
302
- except Exception:
303
- pass # 实体提取失败不影响主流程
401
+ except Exception as _e:
402
+ _log.debug("实体提取失败(不影响主流程): %s", _e)
304
403
 
305
404
  # 自动提取结构化事实(写入 evolution/facts.jsonl,异步不阻塞)
306
405
  try:
@@ -322,8 +421,9 @@ def log_op(
322
421
 
323
422
  # 多模态处理:检测媒体文件并后台提取
324
423
  try:
325
- _media_exts = {".pdf": "extract_from_pdf.py", ".png": "extract_from_image.py",
326
- ".jpg": "extract_from_image.py", ".jpeg": "extract_from_image.py",
424
+ _media_exts = {".pdf": "extract_from_pdf.py", ".docx": "extract_from_docx.py",
425
+ ".png": "extract_from_image.py", ".jpg": "extract_from_image.py",
426
+ ".jpeg": "extract_from_image.py",
327
427
  ".mp4": "transcribe_video.py", ".avi": "transcribe_video.py",
328
428
  ".mov": "transcribe_video.py"}
329
429
  _file_path = detail.get("path", "")