ultra-memory 3.1.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: 结构化事实提取引擎 (Evolution Engine Phase 1)
4
+ 从操作日志中提取 (subject, predicate, object) 三元组结构化事实,
5
+ 写入 evolution/facts.jsonl。
6
+
7
+ 与 extract_entities.py 的关系:
8
+ 实体层(entity)回答"什么名字/哪个文件/哪个函数"
9
+ 事实层(fact)回答"它做什么/它依赖什么/它的行为是什么"
10
+
11
+ 事实提取不依赖 LLM API,使用正则谓词模式 + 统计共现计算置信度。
12
+
13
+ 被 log_op.py 在每次写入后异步调用(subprocess.Popen,背景执行,不阻塞主流程)。
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import re
19
+ import json
20
+ import argparse
21
+ import hashlib
22
+ import subprocess
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from collections import Counter
26
+
27
# Force UTF-8 on stdout/stderr so the Chinese status output survives
# redirected pipes and legacy Windows consoles (cp936 etc.).
# BUG FIX: the original compared encoding != "utf-8" case-sensitively (so a
# stream reporting "UTF-8" was pointlessly reconfigured) and called
# reconfigure() unconditionally, which raises AttributeError on replaced
# streams (pytest capture, io.StringIO) or when encoding is None.
for _stream in (sys.stdout, sys.stderr):
    _enc = getattr(_stream, "encoding", None)
    if (_enc or "").lower() != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Root directory of the memory store; overridable (e.g. for tests) via the
# ULTRA_MEMORY_HOME environment variable.
ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))
33
+
34
+ # ── 谓词模式目录 ────────────────────────────────────────────────────────────
35
+ # 每个条目:(compiled_regex, predicate_label, source_type_filter_or_None)
36
+ # regex 必须包含 named group: (?P<subj>...) 和 (?P<obj>...)
37
+ # source_type_filter: None 表示所有类型都匹配
38
+
39
# ── Predicate pattern catalog ───────────────────────────────────────────────
# Each entry: (compiled_regex, predicate_label, source_type_filter_or_None)
# Every regex MUST expose a named group (?P<obj>...) — extraction reads the
# "obj" group; a (?P<subj>...) group is optional and currently unused.
# source_type_filter: None means the pattern applies to every op type.
PREDICATE_PATTERNS = [
    # Behavior / method predicates
    # BUG FIX: the original "(?:null|nuls?)" could never match the common
    # plural "nulls" ("null" + trailing "s" fails the following \s+);
    # "(?:nulls?|nuls?)" accepts null / nulls / nul / nuls.
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:fill|fills|filling)\s+(?:nulls?|nuls?)\s+with\s+(?P<obj>.+)', re.IGNORECASE),
     "fills_nulls_with", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:return|returns|returning)\s+(?P<obj>.+)', re.IGNORECASE),
     "returns", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:use|uses|using)\s+(?P<obj>.+)', re.IGNORECASE),
     "uses", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:accept|accepts|accepting)\s+(?P<obj>.+)', re.IGNORECASE),
     "accepts", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:raise|raises|raising)\s+(?P<obj>.+)', re.IGNORECASE),
     "raises", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:skip|skips|skipping)\s+(?P<obj>.+)', re.IGNORECASE),
     "skips", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:block|blocks|blocking)\s+(?P<obj>.+)', re.IGNORECASE),
     "blocks", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:fail|fails|failing)\s+(?:when|if|on)?\s*(?P<obj>.+)', re.IGNORECASE),
     "fails_on", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:parse|parses|parsing)\s+(?P<obj>.+)', re.IGNORECASE),
     "parses", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:export|exports|exporting)\s+(?P<obj>.+)', re.IGNORECASE),
     "exports", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:validate|validates|validating)\s+(?P<obj>.+)', re.IGNORECASE),
     "validates", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:encode|encodes|encoding)\s+(?P<obj>.+)', re.IGNORECASE),
     "encodes", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:decode|decodes|decoding)\s+(?P<obj>.+)', re.IGNORECASE),
     "decodes", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:read|reads|reading)\s+(?P<obj>.+)', re.IGNORECASE),
     "reads", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:write|writes|writing)\s+(?P<obj>.+)', re.IGNORECASE),
     "writes", None),

    # Dependency predicates
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:depend|depends|depending)\s+on\s+(?P<obj>.+)', re.IGNORECASE),
     "depends_on", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:require|requires|requiring)\s+(?P<obj>.+)', re.IGNORECASE),
     "requires", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:install|installs|installing)\s+(?P<obj>.+)', re.IGNORECASE),
     "installed_as", "bash_exec"),

    # Configuration predicates
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:set|sets|setting)\s+(?P<obj>.+)', re.IGNORECASE),
     "sets", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:default|defaults?)\s+(?:is|are)?\s*(?P<obj>.+)', re.IGNORECASE),
     "defaults_to", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:enabled|disabled)\s+by\s+(?P<obj>.+)', re.IGNORECASE),
     "enabled_by", None),
    (re.compile(r'(?P<subj>\b[\w\.]+)\s+(?:config|configure|configures)\s+(?P<obj>.+)', re.IGNORECASE),
     "configured_as", None),

    # User preference / behavior predicates (inferred from summaries)
    (re.compile(r'用户[以]?\s*(?:prefer|prefers|倾向|喜欢)\s+(?P<obj>\w.+)', re.IGNORECASE),
     "user_prefers", None),
    (re.compile(r'用户[以]?\s*(?:not|不想|不愿意|不要)\s+(?P<obj>\w.+)', re.IGNORECASE),
     "user_avoids", None),
    (re.compile(r'采用[了]?\s+(?P<obj>\w.+)', re.IGNORECASE),
     "adopted", None),
    # BUG FIX: this pattern captured its group as (?P<subj>...), but fact
    # extraction reads match.group("obj") — any match raised
    # "IndexError: no such group" and crashed extraction. Group renamed.
    (re.compile(r'选择[了]?\s+(?P<obj>\w.+)', re.IGNORECASE),
     "chose", None),
]
100
+
101
# Placeholder patterns used to canonicalize fact objects: concrete numbers
# and file paths are volatile, so they are collapsed before comparison.
NUMERIC_PLACEHOLDER = re.compile(r'\b\d+\.?\d*\b')
PATH_PLACEHOLDER = re.compile(r'[/\\]?[a-zA-Z0-9_\-\.]+\.(py|js|ts|jsx|tsx|vue|json|yaml|yml|md|sql|sh|go|rs|java|rb|toml)')

# ── Helpers ─────────────────────────────────────────────────────────────────


def normalize_object(obj: str) -> str:
    """Canonicalize an object string for comparison.

    Lowercases and trims the text, replaces numbers with ``<NUM>`` and
    file-path-like tokens with ``<PATH>``, then collapses runs of whitespace
    to single spaces — so semantically equivalent objects compare equal.
    """
    text = obj.strip().lower()
    text = NUMERIC_PLACEHOLDER.sub('<NUM>', text)
    text = PATH_PLACEHOLDER.sub('<PATH>', text)
    return re.sub(r'\s+', ' ', text)
115
+
116
+
117
def compute_fact_id(subject: str, predicate: str, obj: str) -> str:
    """Deterministic fact id: first 12 hex digits of SHA1 over the triple.

    The three fields are joined with NUL bytes so distinct triples cannot
    collide by concatenation (e.g. ("ab","c") vs ("a","bc")).
    """
    digest = hashlib.sha1("\x00".join((subject, predicate, obj)).encode())
    return digest.hexdigest()[:12]
121
+
122
+
123
+ def _now_iso() -> str:
124
+ return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
125
+
126
+
127
+ # ── 主体提取 ────────────────────────────────────────────────────────────────
128
+
129
+
130
+ _FUNC_DEF_PATTERN = re.compile(r'\bdef\s+([a-zA-Z_][a-zA-Z0-9_]+)', re.MULTILINE)
131
+ _FILE_EXT_PATTERN = re.compile(
132
+ r'\b([a-zA-Z0-9_\-\.]+/)*[a-zA-Z0-9_\-\.]+\.(py|js|ts|jsx|tsx|vue|html|css|json|yaml|yml|toml|md|sql|sh|go|rs|java|rb|php|cpp|c|h)\b',
133
+ re.IGNORECASE
134
+ )
135
+
136
+
137
+ def extract_subject_from_op(op: dict) -> str | None:
138
+ """
139
+ 从 op 中提取最适格的主体(subject):
140
+ 优先顺序:function def > file path > detail.path > bash command verb
141
+ """
142
+ summary = op.get("summary", "")
143
+ detail = op.get("detail", {})
144
+ combined = summary + " " + json.dumps(detail, ensure_ascii=False)
145
+
146
+ # 1. def function_name → 取 function name
147
+ def_match = _FUNC_DEF_PATTERN.search(combined)
148
+ if def_match:
149
+ return def_match.group(1)
150
+
151
+ # 2. 文件路径(detail.path 最准确)
152
+ file_path = detail.get("path", "")
153
+ if file_path:
154
+ name = Path(file_path).name
155
+ if name:
156
+ return name
157
+
158
+ # 3. 文本中的文件路径
159
+ file_match = _FILE_EXT_PATTERN.search(combined)
160
+ if file_match:
161
+ name = Path(file_match.group(0)).name
162
+ if name:
163
+ return name
164
+
165
+ # 4. bash 命令第一个实词
166
+ if op.get("type") == "bash_exec":
167
+ cmd = detail.get("cmd", summary)
168
+ words = cmd.strip().split()
169
+ if len(words) >= 2:
170
+ # e.g. "python clean_df.py" → "clean_df"
171
+ second = words[1]
172
+ if not second.startswith("-"):
173
+ return Path(second).stem or second
174
+
175
+ return None
176
+
177
+
178
+ # ── 核心提取逻辑 ───────────────────────────────────────────────────────────
179
+
180
+
181
def extract_facts_from_op(op: dict) -> list[dict]:
    """
    Extract structured (subject, predicate, object) fact triples from a single
    operation record.

    Returns a list of dicts of the form:
        {
          "fact_id": "fct_<sha12hex>",
          "ts": "...",
          "session_id": "sess_xxx",
          "op_seq": 42,
          "subject": "clean_df",
          "predicate": "fills_nulls_with",
          "object": "empty string for text, 0 for numeric",
          "confidence": 0.7,
          "source_summary": "...",
          "source_type": "file_write|...",
          "tags": [...],
          "contradiction_count": 0,
          "last_accessed": "...",
          "access_count": 1,
          "status": "active",
          "expires_at": None
        }
    """
    summary = op.get("summary", "")
    detail = op.get("detail", {})
    op_type = op.get("type", "")
    ts = op.get("ts", "")
    seq = op.get("seq", 0)
    tags = op.get("tags", [])
    session = op.get("_session_id", "")

    combined_text = summary + " " + json.dumps(detail, ensure_ascii=False)

    # Without a subject there is nothing to anchor the triples to.
    subject = extract_subject_from_op(op)
    if not subject:
        return []

    # Base confidence depends only on the op type, so compute it once
    # (hoisted out of the match loop — it was recomputed per match before).
    # Plain pattern matches start at 0.7; explicit decisions / user
    # instructions (0.8) and milestones (0.85) are trusted slightly more.
    if op_type in ("decision", "user_instruction"):
        base_confidence = 0.8
    elif op_type == "milestone":
        base_confidence = 0.85
    else:
        base_confidence = 0.7

    facts = []
    for pattern, predicate, type_filter in PREDICATE_PATTERNS:
        if type_filter and op_type != type_filter:
            continue

        for match in pattern.finditer(combined_text):
            # BUG FIX: match.group("obj") raises "IndexError: no such group"
            # for any catalog pattern compiled without an "obj" named group,
            # crashing extraction for the whole op. groupdict().get() instead
            # degrades to skipping that match.
            obj = (match.groupdict().get("obj") or "").strip()
            if len(obj) < 2:  # empty or too short to be a meaningful object
                continue

            fact_id = compute_fact_id(subject, predicate, obj)
            facts.append({
                "fact_id": f"fct_{fact_id}",
                "ts": ts,
                "session_id": session,
                "op_seq": seq,
                "subject": subject,
                "predicate": predicate,
                "object": obj,
                "confidence": base_confidence,
                "source_summary": summary[:100],
                "source_type": op_type,
                "tags": tags[:3] if tags else [],
                "contradiction_count": 0,
                "last_accessed": ts,
                "access_count": 1,
                "status": "active",
                "expires_at": None,
            })

    return facts
263
+
264
+
265
def _load_existing_facts() -> list[dict]:
    """Load every parseable JSON line from evolution/facts.jsonl.

    Returns an empty list when the file does not exist; unparseable or blank
    lines (e.g. torn writes in the append-only log) are silently skipped.
    """
    facts_file = ULTRA_MEMORY_HOME / "evolution" / "facts.jsonl"
    if not facts_file.exists():
        return []

    loaded = []
    with open(facts_file, encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                loaded.append(json.loads(raw))
            except json.JSONDecodeError:
                continue
    return loaded
281
+
282
+
283
def _cooccurrence_confidence(subject: str, predicate: str, existing_facts: list[dict]) -> float:
    """
    Co-occurrence confidence for a (subject, predicate) pair.

    The more consistent the recorded objects are for the same pair, the
    higher the confidence; fewer than two sightings keeps the pattern-match
    default of 0.7, and the result is floored at 0.5.
    """
    matching = [
        fact for fact in existing_facts
        if fact.get("subject") == subject and fact.get("predicate") == predicate
    ]
    if len(matching) < 2:
        return 0.7  # not enough evidence yet → keep the initial value

    normalized = [normalize_object(fact.get("object", "")) for fact in matching]
    dominant_count = Counter(normalized).most_common(1)[0][1]
    total = len(normalized)

    # confidence = 1 - (conflicting observations / total), floored at 0.5
    conflict_ratio = 1 - (dominant_count / total)
    return max(0.5, 1.0 - conflict_ratio)
304
+
305
+
306
+ # ── 存储 ────────────────────────────────────────────────────────────────────
307
+
308
+
309
def append_facts(facts: list[dict], session_id: str):
    """Append extracted facts to evolution/facts.jsonl (append-only).

    Backfills a missing/empty session_id on each fact before writing; a
    no-op when *facts* is empty.
    """
    if not facts:
        return

    facts_file = ULTRA_MEMORY_HOME / "evolution" / "facts.jsonl"
    facts_file.parent.mkdir(parents=True, exist_ok=True)

    for fact in facts:
        if not fact.get("session_id"):
            fact["session_id"] = session_id

    lines = [json.dumps(fact, ensure_ascii=False) + "\n" for fact in facts]
    with open(facts_file, "a", encoding="utf-8") as fh:
        fh.writelines(lines)
325
+
326
+
327
+ # ── 矛盾检测触发 ───────────────────────────────────────────────────────────
328
+
329
+
330
def trigger_contradiction_detection(session_id: str, fact_ids: list[str]):
    """
    Launch detect_contradictions.py as a detached background process.

    Best-effort: any failure is swallowed so the main logging flow is never
    blocked. The detector updates fact_metadata.json when it finishes.
    """
    try:
        scripts_dir = Path(__file__).parent
        popen_kwargs = {
            "stdout": subprocess.DEVNULL,
            "stderr": subprocess.DEVNULL,
            "start_new_session": True,
        }
        # BUG FIX: subprocess.STARTUPINFO exists only on Windows. The original
        # unconditional call raised AttributeError on macOS/Linux, which the
        # except below swallowed — so contradiction detection silently never
        # ran on POSIX. Only build STARTUPINFO on Windows.
        if os.name == "nt":
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            popen_kwargs["startupinfo"] = startupinfo
        subprocess.Popen(
            [sys.executable, str(scripts_dir / "detect_contradictions.py"),
             "--session", session_id,
             "--new-fact-ids"] + fact_ids,
            **popen_kwargs,
        )
    except Exception:
        pass  # detection is best-effort; never propagate failures
351
+
352
+
353
+ # ── 主入口 ─────────────────────────────────────────────────────────────────
354
+
355
+
356
def extract_and_store(session_id: str, op: dict) -> list[dict]:
    """
    Extract facts from a single op and persist them to facts.jsonl.

    Synchronous entry point used by log_op.py (and tests). Fires background
    contradiction detection when any facts were produced. Returns the list
    of newly extracted facts (possibly empty).
    """
    op["_session_id"] = session_id
    new_facts = extract_facts_from_op(op)
    if not new_facts:
        return new_facts

    append_facts(new_facts, session_id)
    # Kick off contradiction detection in the background.
    trigger_contradiction_detection(session_id, [f["fact_id"] for f in new_facts])
    return new_facts
369
+
370
+
371
def extract_batch(session_id: str, op_seq: int | None = None):
    """
    Batch extraction over a session's ops.jsonl.

    When *op_seq* is given, only the op with that seq is processed; otherwise
    every op whose seq has not yet produced facts is processed. Used by the
    ``--batch`` CLI mode (and per-op by log_op.py via ``--op-seq``).

    New facts get a confidence blended 50/50 from the pattern-match base and
    the co-occurrence confidence, are appended via ``append_facts``, and then
    background contradiction detection is triggered. Prints a short status
    report either way.
    """
    session_dir = ULTRA_MEMORY_HOME / "sessions" / session_id
    ops_file = session_dir / "ops.jsonl"
    if not ops_file.exists():
        print(f"[ultra-memory] ⚠️ ops.jsonl 不存在: {session_id}")
        return

    existing_facts = _load_existing_facts()

    # Seqs that already produced facts for this session (used to skip work).
    existing_seqs = {
        f.get("op_seq") for f in existing_facts
        if f.get("session_id") == session_id
    }

    all_ops = []
    with open(ops_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                op = json.loads(line)
                all_ops.append(op)
            # Tolerate torn/corrupt lines in the append-only log.
            except json.JSONDecodeError:
                continue

    # Select the ops still to process: a single seq when requested,
    # otherwise everything not yet extracted.
    if op_seq is not None:
        to_process = [op for op in all_ops if op.get("seq") == op_seq]
    else:
        to_process = [op for op in all_ops if op.get("seq") not in existing_seqs]

    if not to_process:
        print(f"[ultra-memory] ✅ 无新事实可提取 (session: {session_id})")
        return

    all_new_facts = []
    for op in to_process:
        op["_session_id"] = session_id
        facts = extract_facts_from_op(op)
        for fact in facts:
            # Blend base confidence with co-occurrence confidence computed
            # over both stored facts and the ones extracted in this run.
            cooc_conf = _cooccurrence_confidence(
                fact["subject"], fact["predicate"], existing_facts + all_new_facts
            )
            fact["confidence"] = round(
                (fact["confidence"] * 0.5 + cooc_conf * 0.5), 2
            )
            all_new_facts.append(fact)

    if all_new_facts:
        append_facts(all_new_facts, session_id)
        fact_ids = [f["fact_id"] for f in all_new_facts]
        trigger_contradiction_detection(session_id, fact_ids)
        print(f"[ultra-memory] ✅ 事实提取完成 (session: {session_id})")
        print(f" 新增 {len(all_new_facts)} 条事实")
    else:
        print(f"[ultra-memory] ✅ 无新事实可提取 (session: {session_id})")
434
+
435
+
436
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="从操作日志提取结构化事实三元组 (subject, predicate, object)"
    )
    parser.add_argument("--session", required=True, help="会话 ID")
    parser.add_argument(
        "--op-seq", type=int, default=None,
        help="只提取指定 seq 的操作(省略则批量提取未处理的)"
    )
    parser.add_argument(
        "--batch", action="store_true",
        help="批量提取:扫描整个 ops.jsonl"
    )
    args = parser.parse_args()

    # --batch scans everything unprocessed, so it overrides --op-seq.
    extract_batch(args.session, args.op_seq if not args.batch else None)

    if args.batch:
        # After a full scan, kick off contradiction detection in the background.
        evolution_dir = ULTRA_MEMORY_HOME / "evolution"
        evolution_dir.mkdir(parents=True, exist_ok=True)
        detect_script = Path(__file__).parent / "detect_contradictions.py"
        if detect_script.exists():
            try:
                popen_kwargs = {
                    "stdout": subprocess.DEVNULL,
                    "stderr": subprocess.DEVNULL,
                    "start_new_session": True,
                }
                # BUG FIX: subprocess.STARTUPINFO is Windows-only; the
                # unconditional call raised AttributeError on POSIX and the
                # except below silently skipped detection entirely.
                if os.name == "nt":
                    startupinfo = subprocess.STARTUPINFO()
                    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
                    popen_kwargs["startupinfo"] = startupinfo
                subprocess.Popen(
                    [sys.executable, str(detect_script),
                     "--session", args.session, "--batch"],
                    **popen_kwargs,
                )
            except Exception:
                pass  # best-effort; never fail the CLI exit
package/scripts/log_op.py CHANGED
@@ -279,6 +279,34 @@ def log_op(
279
279
  "compressed": False,
280
280
  }
281
281
 
282
+ # ── 矛盾检测(写入 ops.jsonl 之前)──────────────────────────────────────
283
+
284
+ # 2A:画像冲突检测(user_instruction/decision + profile_update)
285
+ if op_type in ("user_instruction", "decision") and detail.get("profile_update"):
286
+ try:
287
+ from conflict_detector import detect_profile_conflict, mark_profile_superseded
288
+ conflicts = detect_profile_conflict(detail["profile_update"], ULTRA_MEMORY_HOME)
289
+ if conflicts:
290
+ entry["detail"]["profile_conflicts"] = conflicts
291
+ mark_profile_superseded(ULTRA_MEMORY_HOME, conflicts)
292
+ print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 处画像矛盾,旧记录已标记失效")
293
+ except Exception:
294
+ pass
295
+
296
+ # 2B:知识库冲突检测(milestone/decision + knowledge_entry)
297
+ if op_type in ("milestone", "decision") and detail.get("knowledge_entry"):
298
+ try:
299
+ from conflict_detector import detect_knowledge_conflict, mark_superseded
300
+ conflicts = detect_knowledge_conflict(detail["knowledge_entry"], ULTRA_MEMORY_HOME)
301
+ if conflicts:
302
+ kb_path = ULTRA_MEMORY_HOME / "semantic" / "knowledge_base.jsonl"
303
+ seq_list = [c["seq"] for c in conflicts]
304
+ mark_superseded(ULTRA_MEMORY_HOME, kb_path, seq_list)
305
+ entry["detail"]["knowledge_conflicts"] = conflicts
306
+ print(f"[ultra-memory] ⚡ 检测到 {len(conflicts)} 条知识库矛盾,旧记录已标记失效")
307
+ except Exception:
308
+ pass
309
+
282
310
  # 追加写入(append-only,永不覆盖)
283
311
  with open(ops_file, "a", encoding="utf-8") as f:
284
312
  f.write(json.dumps(entry, ensure_ascii=False) + "\n")
@@ -302,6 +330,48 @@ def log_op(
302
330
  except Exception:
303
331
  pass # 实体提取失败不影响主流程
304
332
 
333
+ # 自动提取结构化事实(写入 evolution/facts.jsonl,异步不阻塞)
334
+ try:
335
+ import subprocess as _subprocess
336
+ _scripts_dir = Path(__file__).parent
337
+ _python = sys.executable
338
+ _startupinfo = _subprocess.STARTUPINFO()
339
+ _startupinfo.dwFlags |= _subprocess.STARTF_USESHOWWINDOW
340
+ _subprocess.Popen(
341
+ [_python, str(_scripts_dir / "extract_facts.py"),
342
+ "--session", session_id, "--op-seq", str(seq)],
343
+ stdout=_subprocess.DEVNULL,
344
+ stderr=_subprocess.DEVNULL,
345
+ start_new_session=True,
346
+ startupinfo=_startupinfo,
347
+ )
348
+ except Exception:
349
+ pass # 事实提取失败静默跳过
350
+
351
+ # 多模态处理:检测媒体文件并后台提取
352
+ try:
353
+ _media_exts = {".pdf": "extract_from_pdf.py", ".png": "extract_from_image.py",
354
+ ".jpg": "extract_from_image.py", ".jpeg": "extract_from_image.py",
355
+ ".mp4": "transcribe_video.py", ".avi": "transcribe_video.py",
356
+ ".mov": "transcribe_video.py"}
357
+ _file_path = detail.get("path", "")
358
+ if _file_path and op_type in ("file_read", "file_write"):
359
+ _ext = Path(_file_path).suffix.lower()
360
+ if _ext in _media_exts:
361
+ _script = _media_exts[_ext]
362
+ _multimodal_dir = Path(__file__).parent / "multimodal"
363
+ if (_multimodal_dir / _script).exists():
364
+ _subprocess.Popen(
365
+ [_python, str(_multimodal_dir / _script),
366
+ "--session", session_id, "--path", _file_path],
367
+ stdout=_subprocess.DEVNULL,
368
+ stderr=_subprocess.DEVNULL,
369
+ start_new_session=True,
370
+ startupinfo=_startupinfo,
371
+ )
372
+ except Exception:
373
+ pass # 多模态提取失败静默跳过
374
+
305
375
  # 检查是否需要触发压缩
306
376
  should_compress = False
307
377
  if seq > 0 and seq % 50 == 0:
@@ -0,0 +1,2 @@
1
+ # ultra-memory multimodal processing package
2
+ # Requires: pdfminer.six, pytesseract, whisper
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: 图片 OCR 提取 (Multimodal Phase 5)
4
+ 从图片文件中提取文字内容,写入 session 的 multimodal/ 目录,
5
+ 并触发事实提取。
6
+
7
+ 依赖: pytesseract + Tesseract OCR 引擎
8
+ 安装:
9
+ pip install pytesseract
10
+ Windows: 下载 https://github.com/UB-Mannheim/tesseract/wiki
11
+ macOS: brew install tesseract
12
+ Linux: sudo apt install tesseract-ocr
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import json
18
+ import argparse
19
+ import hashlib
20
+ import subprocess
21
+ from datetime import datetime, timezone
22
+ from pathlib import Path
23
+
24
# Force UTF-8 on stdout/stderr so the Chinese status output survives
# redirected pipes and legacy Windows consoles.
# BUG FIX: the original compared encoding != "utf-8" case-sensitively and
# called reconfigure() unconditionally, which raises AttributeError on
# replaced streams (pytest capture, io.StringIO) or when encoding is None.
for _stream in (sys.stdout, sys.stderr):
    _enc = getattr(_stream, "encoding", None)
    if (_enc or "").lower() != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Root directory of the memory store; overridable via the ULTRA_MEMORY_HOME
# environment variable.
ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))
30
+
31
+
32
+ def _now_iso() -> str:
33
+ return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
34
+
35
+
36
def extract_text_from_image(image_path: str) -> str:
    """
    OCR the image at *image_path* with pytesseract and return the stripped text.

    Tries English + Simplified Chinese first, falling back to English only.
    Returns "" (after printing a hint) when pytesseract / the Tesseract engine
    is missing or extraction fails for any reason.
    """
    try:
        import pytesseract
        from PIL import Image  # noqa: F401 — pytesseract needs Pillow to open str paths

        # BUG FIX: Tesseract's Chinese traineddata files are "chi_sim" /
        # "chi_tra" — there is no "chi", so the original lang="eng+chi" call
        # always raised and Chinese text was never OCR'd despite the
        # docstring. Fall back to English-only if chi_sim is not installed.
        try:
            text = pytesseract.image_to_string(image_path, lang="eng+chi_sim")
        except Exception:
            text = pytesseract.image_to_string(image_path, lang="eng")

        return text.strip()
    except ImportError:
        print("[ultra-memory] ⚠️ pytesseract 未安装: pip install pytesseract")
        print("[ultra-memory] ⚠️ 同时需要安装 Tesseract OCR 引擎")
        return ""
    except Exception as e:
        print(f"[ultra-memory] ⚠️ OCR 提取失败: {e}")
        return ""
59
+
60
+
61
def save_extracted_text(
    session_id: str,
    media_path: str,
    text: str,
    media_id: str,
) -> Path:
    """Write OCR text plus a small metadata header to the session's
    multimodal directory and return the output path
    (sessions/<id>/multimodal/<media file name>.txt)."""
    multimodal_dir = ULTRA_MEMORY_HOME / "sessions" / session_id / "multimodal"
    multimodal_dir.mkdir(parents=True, exist_ok=True)

    output_file = multimodal_dir / f"{Path(media_path).name}.txt"

    header = (
        f"# Extracted from: {media_path}\n"
        f"# Media ID: {media_id}\n"
        "# Type: image (OCR)\n"
        f"# Extracted at: {_now_iso()}\n"
        f"# Chars: {len(text)}\n"
        "---\n"
    )
    with open(output_file, "w", encoding="utf-8") as fh:
        fh.write(header + text)

    return output_file
85
+
86
+
87
def process_image(session_id: str, image_path: str) -> dict:
    """
    OCR a single image file and persist the extracted text.

    Returns a summary dict: always a "success" flag, plus either an "error"
    string or the media_id / output path / character count on success.
    """
    if not Path(image_path).exists():
        print(f"[ultra-memory] ⚠️ 图片文件不存在: {image_path}")
        return {"success": False, "error": "file not found"}

    text = extract_text_from_image(image_path)
    if not text.strip():
        return {"success": False, "error": "no text extracted"}

    # Deterministic media id derived from the source path.
    media_id = f"media_{hashlib.sha1(image_path.encode()).hexdigest()[:12]}"

    output_file = save_extracted_text(session_id, image_path, text, media_id)
    char_count = len(text)

    print(f"[ultra-memory] ✅ 图片 OCR 完成: {image_path}")
    print(f" 文件: {output_file.name}")
    print(f" 字符数: {char_count}")

    return {
        "success": True,
        "media_id": media_id,
        "session_id": session_id,
        "source_path": image_path,
        "output_file": str(output_file),
        "char_count": char_count,
        "processed_at": _now_iso(),
    }
122
+
123
+
124
+ # ── CLI ─────────────────────────────────────────────────────────────────────
125
+
126
+
127
if __name__ == "__main__":
    # CLI entry point: OCR one image into the given session's multimodal store.
    parser = argparse.ArgumentParser(description="从图片提取文字 (OCR)")
    parser.add_argument("--path", required=True, help="图片文件路径")
    parser.add_argument("--session", required=True, help="会话 ID")
    args = parser.parse_args()

    outcome = process_image(args.session, args.path)
    if not outcome["success"]:
        print(f"[ultra-memory] ❌ 图片处理失败: {outcome.get('error')}")
        sys.exit(1)
    sys.exit(0)