ultra-memory 3.1.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: PDF 文本提取 (Multimodal Phase 5)
4
+ 从 PDF 文件中提取文本内容,写入 session 的 multimodal/ 目录,
5
+ 并触发事实提取。
6
+
7
+ 依赖: pdfminer.six
8
+ 安装: pip install pdfminer.six
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import json
14
+ import argparse
15
+ import hashlib
16
+ import subprocess
17
+ from datetime import datetime, timezone
18
+ from pathlib import Path
19
+
20
+ if sys.stdout.encoding != "utf-8":
21
+ sys.stdout.reconfigure(encoding="utf-8")
22
+ if sys.stderr.encoding != "utf-8":
23
+ sys.stderr.reconfigure(encoding="utf-8")
24
+
25
+ ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))
26
+
27
+ CHUNK_SIZE = 500 # 每块字符数
28
+
29
+
30
+ def _now_iso() -> str:
31
+ return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
32
+
33
+
34
def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[tuple[int, str]]:
    """Split *text* into paragraph-aligned chunks of roughly *chunk_size* chars.

    Paragraphs (separated by blank lines) are accumulated until adding the
    next one would exceed *chunk_size*; a single oversized paragraph still
    becomes its own chunk. Returns ``[(chunk_index, chunk_text), ...]``.
    """
    chunks: list[tuple[int, str]] = []
    buffer: list[str] = []
    buffered_len = 0

    for paragraph in text.split("\n\n"):
        # Flush before appending, but never flush an empty buffer — this lets
        # an oversized paragraph occupy a chunk by itself.
        if buffer and buffered_len + len(paragraph) > chunk_size:
            chunks.append((len(chunks), "\n".join(buffer).strip()))
            buffer = []
            buffered_len = 0
        buffer.append(paragraph)
        buffered_len += len(paragraph)

    if buffer:
        chunks.append((len(chunks), "\n".join(buffer).strip()))

    return chunks
56
+
57
+
58
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF via pdfminer.six, preserving layout.

    Returns "" when pdfminer.six is not installed or extraction fails;
    the reason is printed instead of raised.
    """
    try:
        from pdfminer.high_level import extract_text
        from pdfminer.layout import LAParams

        return extract_text(pdf_path, laparams=LAParams())
    except ImportError:
        print("[ultra-memory] ⚠️ pdfminer.six 未安装: pip install pdfminer.six")
        return ""
    except Exception as e:
        print(f"[ultra-memory] ⚠️ PDF 提取失败: {e}")
        return ""
74
+
75
+
76
def save_extracted_text(
    session_id: str,
    media_path: str,
    text: str,
    media_id: str,
) -> Path:
    """Persist extracted PDF text into the session's multimodal/ directory.

    Writes a small metadata header followed by the raw text and returns the
    path of the file written.
    """
    target_dir = ULTRA_MEMORY_HOME / "sessions" / session_id / "multimodal"
    target_dir.mkdir(parents=True, exist_ok=True)

    output_file = target_dir / f"{Path(media_path).name}.txt"
    header = (
        f"# Extracted from: {media_path}\n"
        f"# Media ID: {media_id}\n"
        f"# Extracted at: {_now_iso()}\n"
        f"# Chars: {len(text)}\n"
        "---\n"
    )
    output_file.write_text(header + text, encoding="utf-8")
    return output_file
99
+
100
+
101
def trigger_fact_extraction(session_id: str, text_chunk: str, media_id: str):
    """Fire-and-forget launch of extract_facts.py to mine facts from *text_chunk*.

    The text is delivered via the child's stdin to avoid command-line escaping
    issues. Failures are deliberately swallowed: fact extraction is best-effort
    and must never break the main extraction pipeline.

    Args:
        session_id: Session whose facts are being extracted.
        text_chunk: Text to feed to the extractor's stdin.
        media_id: Media identifier (currently unused, kept for the call contract).
    """
    try:
        scripts_dir = Path(__file__).parent.parent
        python = sys.executable

        # Platform-specific detachment options. The previous code built a
        # STARTUPINFO unconditionally, which raises AttributeError on POSIX
        # (silently killing the launch inside the blanket except), and it
        # combined the Windows-only startupinfo with the POSIX-only
        # start_new_session flag.
        popen_kwargs = {}
        if os.name == "nt":
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            popen_kwargs["startupinfo"] = startupinfo
        else:
            popen_kwargs["start_new_session"] = True

        proc = subprocess.Popen(
            [python, str(scripts_dir / "extract_facts.py"),
             "--session", session_id, "--batch"],
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            **popen_kwargs,
        )
        # Actually deliver the chunk: the previous code opened a stdin pipe
        # but never wrote to it nor closed it, leaving the child blocked on
        # an open pipe.
        if proc.stdin is not None:
            try:
                proc.stdin.write(text_chunk.encode("utf-8"))
            finally:
                proc.stdin.close()
    except Exception:
        # Best-effort: never propagate failures from the background trigger.
        pass
125
+
126
+
127
def process_pdf(session_id: str, pdf_path: str) -> dict:
    """Process one PDF: extract its text, persist it, and report a summary.

    Returns ``{"success": False, "error": ...}`` on failure, otherwise a
    summary dict with media id, output path, character and chunk counts.
    """
    if not Path(pdf_path).exists():
        print(f"[ultra-memory] ⚠️ PDF 文件不存在: {pdf_path}")
        return {"success": False, "error": "file not found"}

    # Pull the raw text out of the PDF; bail if nothing usable came back.
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return {"success": False, "error": "no text extracted"}

    # Stable media id derived from the source path.
    digest = hashlib.sha1(pdf_path.encode()).hexdigest()
    media_id = f"media_{digest[:12]}"

    # Persist, then chunk for downstream processing.
    output_file = save_extracted_text(session_id, pdf_path, text, media_id)
    chunks = _chunk_text(text)
    char_count = len(text)

    print(f"[ultra-memory] ✅ PDF 提取完成: {pdf_path}")
    print(f" 文件: {output_file.name}")
    print(f" 字符数: {char_count}")
    print(f" 文本块: {len(chunks)} 块")

    return {
        "success": True,
        "media_id": media_id,
        "session_id": session_id,
        "source_path": pdf_path,
        "output_file": str(output_file),
        "char_count": char_count,
        "chunk_count": len(chunks),
        "processed_at": _now_iso(),
    }
166
+
167
+
168
+ # ── CLI ─────────────────────────────────────────────────────────────────────
169
+
170
+
171
if __name__ == "__main__":
    # CLI entry: extract text from a single PDF into the given session.
    parser = argparse.ArgumentParser(description="从 PDF 文件提取文本")
    parser.add_argument("--path", required=True, help="PDF 文件路径")
    parser.add_argument("--session", required=True, help="会话 ID")
    cli_args = parser.parse_args()

    outcome = process_pdf(cli_args.session, cli_args.path)
    if not outcome["success"]:
        print(f"[ultra-memory] ❌ PDF 处理失败: {outcome.get('error')}")
        sys.exit(1)
    sys.exit(0)
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ultra-memory: 视频转录 (Multimodal Phase 5)
4
+ 从视频文件中提取音频并转录为文字,写入 session 的 multimodal/ 目录。
5
+
6
+ 依赖: whisper (OpenAI 本地转录,无需 API)
7
+ 安装: pip install openai-whisper
8
+ 或: pip install whisper
9
+
10
+ 注意: whisper 模型较大(base≈1.5GB, small≈3GB, medium≈5GB, large≈10GB)
11
+ 首次运行会自动下载模型。建议从 base 开始测试。
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import argparse
18
+ import hashlib
19
+ import subprocess
20
+ import tempfile
21
+ import shutil
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+
25
+ if sys.stdout.encoding != "utf-8":
26
+ sys.stdout.reconfigure(encoding="utf-8")
27
+ if sys.stderr.encoding != "utf-8":
28
+ sys.stderr.reconfigure(encoding="utf-8")
29
+
30
+ ULTRA_MEMORY_HOME = Path(os.environ.get("ULTRA_MEMORY_HOME", Path.home() / ".ultra-memory"))
31
+
32
+ # whisper 模型大小映射
33
+ MODEL_SIZES = ["tiny", "base", "small", "medium", "large"]
34
+
35
+
36
+ def _now_iso() -> str:
37
+ return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
38
+
39
+
40
def transcribe_video(video_path: str, model_size: str = "base") -> str:
    """Transcribe a video locally with Whisper (language auto-detected).

    Returns the stripped transcript, or "" when whisper is unavailable or
    transcription fails; the reason is printed instead of raised.
    """
    try:
        import whisper
    except ImportError:
        print("[ultra-memory] ⚠️ whisper 未安装: pip install openai-whisper")
        print("[ultra-memory] ⚠️ 首次运行会自动下载模型(约 1.5GB for base)")
        return ""

    # Fall back to the default model on unrecognized sizes.
    size = model_size if model_size in MODEL_SIZES else "base"

    try:
        print(f"[ultra-memory] 加载 Whisper {size} 模型...")
        model = whisper.load_model(size)
        print(f"[ultra-memory] 开始转录: {video_path}")
        # language=None lets Whisper auto-detect the spoken language.
        outcome = model.transcribe(video_path, language=None, verbose=False)
        return outcome.get("text", "").strip()
    except Exception as e:
        print(f"[ultra-memory] ⚠️ 转录失败: {e}")
        return ""
64
+
65
+
66
def save_extracted_text(
    session_id: str,
    media_path: str,
    text: str,
    media_id: str,
    model_size: str,
) -> Path:
    """Persist a video transcript into the session's multimodal/ directory.

    Writes a small metadata header (including the Whisper model size used)
    followed by the transcript, and returns the path of the file written.
    """
    target_dir = ULTRA_MEMORY_HOME / "sessions" / session_id / "multimodal"
    target_dir.mkdir(parents=True, exist_ok=True)

    output_file = target_dir / f"{Path(media_path).name}.transcript.txt"
    header = (
        f"# Transcribed from: {media_path}\n"
        f"# Media ID: {media_id}\n"
        f"# Type: video (Whisper {model_size})\n"
        f"# Transcribed at: {_now_iso()}\n"
        f"# Chars: {len(text)}\n"
        "---\n"
    )
    output_file.write_text(header + text, encoding="utf-8")
    return output_file
91
+
92
+
93
def process_video(
    session_id: str,
    video_path: str,
    model_size: str = "base",
) -> dict:
    """Process one video: transcribe it, persist the transcript, report a summary.

    Returns ``{"success": False, "error": ...}`` on failure, otherwise a
    summary dict with media id, output path, character count and model size.
    """
    if not Path(video_path).exists():
        print(f"[ultra-memory] ⚠️ 视频文件不存在: {video_path}")
        return {"success": False, "error": "file not found"}

    # Run the local Whisper transcription; bail on an empty result.
    transcript = transcribe_video(video_path, model_size)
    if not transcript.strip():
        return {"success": False, "error": "transcription failed"}

    # Stable media id derived from the source path.
    digest = hashlib.sha1(video_path.encode()).hexdigest()
    media_id = f"media_{digest[:12]}"

    output_file = save_extracted_text(
        session_id, video_path, transcript, media_id, model_size
    )
    char_count = len(transcript)

    print(f"[ultra-memory] ✅ 视频转录完成: {video_path}")
    print(f" 文件: {output_file.name}")
    print(f" 字符数: {char_count}")

    return {
        "success": True,
        "media_id": media_id,
        "session_id": session_id,
        "source_path": video_path,
        "output_file": str(output_file),
        "char_count": char_count,
        "model_size": model_size,
        "processed_at": _now_iso(),
    }
135
+
136
+
137
+ # ── CLI ─────────────────────────────────────────────────────────────────────
138
+
139
+
140
if __name__ == "__main__":
    # CLI entry: transcribe a single video into the given session.
    parser = argparse.ArgumentParser(description="从视频提取文字转录")
    parser.add_argument("--path", required=True, help="视频文件路径")
    parser.add_argument("--session", required=True, help="会话 ID")
    parser.add_argument(
        "--model",
        default="base",
        choices=MODEL_SIZES,
        help="Whisper 模型大小 (默认: base)",
    )
    cli_args = parser.parse_args()

    outcome = process_video(cli_args.session, cli_args.path, cli_args.model)
    if not outcome["success"]:
        print(f"[ultra-memory] ❌ 视频处理失败: {outcome.get('error')}")
        sys.exit(1)
    sys.exit(0)
package/scripts/recall.py CHANGED
@@ -289,6 +289,9 @@ def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
289
289
  entry = json.loads(line)
290
290
  except json.JSONDecodeError:
291
291
  continue
292
+ # 过滤已失效条目
293
+ if entry.get("superseded"):
294
+ continue
292
295
  text = entry.get("content", "") + " " + entry.get("title", "")
293
296
  ts = entry.get("ts", "")
294
297
  score = score_relevance(query_tokens, text, ts)
@@ -309,6 +312,36 @@ def search_semantic(query_tokens: set, top_k: int) -> list[dict]:
309
312
  return results[:top_k]
310
313
 
311
314
 
315
def search_profile(query_tokens: set, home: Path) -> list[dict]:
    """Search user_profile.json for fields relevant to the query tokens.

    Skips fields whose key carries the ``_superseded`` suffix, scores each
    remaining ``"key value"`` string against *query_tokens*, and returns at
    most the 3 best matches as recall-result dicts (source="profile").
    Returns [] when the profile file is missing or unreadable.
    """
    # Profile lives under <home>/semantic/user_profile.json.
    profile_file = home / "semantic" / "user_profile.json"
    if not profile_file.exists():
        return []

    try:
        with open(profile_file, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, IOError):
        # Corrupt or unreadable profile: degrade to "no profile hits".
        return []

    results = []
    for key, value in profile.items():
        # Skip fields explicitly marked as superseded.
        if key.endswith("_superseded"):
            continue
        text = f"{key} {value}"
        # NOTE(review): search_semantic calls score_relevance(tokens, text, ts)
        # with a timestamp argument — confirm the ts parameter has a default,
        # otherwise this two-argument call raises TypeError.
        score = score_relevance(query_tokens, str(text))
        if score > 0.1:  # minimum relevance threshold for profile fields
            results.append({
                "score": score,
                "source": "profile",
                "data": {"field": key, "value": value},
            })

    # Highest score first; cap at 3 so profile hits don't crowd out
    # the other recall layers.
    results.sort(key=lambda x: -x["score"])
    return results[:3]
343
+
344
+
312
345
  # ── TF-IDF 向量语义搜索层(第四层召回的增强)───────────────────────────
313
346
 
314
347
  def is_sklearn_available() -> bool:
@@ -628,6 +661,10 @@ def format_result(result: dict, show_context: bool = True) -> str:
628
661
  for k, v in list(detail.items())[:2]:
629
662
  lines.append(f" [{k}] {str(v)[:60]}")
630
663
 
664
+ elif source == "profile":
665
+ d = result["data"]
666
+ lines.append(f"[用户画像] {d['field']}: {d['value']}")
667
+
631
668
  return "\n".join(lines) if lines else str(result)
632
669
 
633
670
 
@@ -650,6 +687,10 @@ def recall(session_id: str, query: str, top_k: int = 5):
650
687
  semantic_results = search_semantic(query_tokens, top_k)
651
688
  found.extend(semantic_results)
652
689
 
690
+ # 画像检索(从 user_profile.json 搜索相关字段)
691
+ profile_results = search_profile(query_tokens, ULTRA_MEMORY_HOME)
692
+ found.extend(profile_results)
693
+
653
694
  # Layer 4: 实体索引(结构化精确检索)
654
695
  entity_results = search_entities(query_tokens, top_k)
655
696
  found.extend(entity_results)