ai-browser-profile 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,929 @@
1
+ """MemoryDB — schema, upsert, search, mark_accessed, stats, profile, text_search, semantic_search."""
2
+
3
+ import logging
4
+ import sqlite3
5
+ from datetime import datetime, timezone
6
+ from typing import Optional
7
+
8
+ from ai_browser_profile.embeddings import (
9
+ embed_text, embed_batch, setup_embeddings_table, store_embedding, cosine_search,
10
+ is_available as embeddings_available,
11
+ )
12
+
13
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
14
+
15
# DDL script executed on every connection (see MemoryDB.__init__). Every
# statement uses IF NOT EXISTS, so re-running it on an existing DB is safe.
#   memories      - one row per (key, value) fact; superseded_by points at the
#                   row that replaced it (NULL while the row is current).
#   memory_tags   - tags attached to a memory, one row per (memory_id, tag).
#   memory_links  - typed relations between two memories.
#   metadata      - key/value settings (e.g. 'schema_version').
SCHEMA = """
CREATE TABLE IF NOT EXISTS memories (
    id INTEGER PRIMARY KEY,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    confidence REAL DEFAULT 1.0,
    source TEXT,
    appeared_count INTEGER DEFAULT 0,
    accessed_count INTEGER DEFAULT 0,
    created_at TEXT,
    last_appeared_at TEXT,
    last_accessed_at TEXT,
    superseded_by INTEGER REFERENCES memories(id),
    superseded_at TEXT,
    search_text TEXT,
    UNIQUE(key, value)
);

CREATE TABLE IF NOT EXISTS memory_tags (
    memory_id INTEGER REFERENCES memories(id) ON DELETE CASCADE,
    tag TEXT NOT NULL,
    PRIMARY KEY (memory_id, tag)
);
CREATE INDEX IF NOT EXISTS idx_tags ON memory_tags(tag);

CREATE TABLE IF NOT EXISTS memory_links (
    source_id INTEGER REFERENCES memories(id) ON DELETE CASCADE,
    target_id INTEGER REFERENCES memories(id) ON DELETE CASCADE,
    relation TEXT NOT NULL,
    created_at TEXT,
    PRIMARY KEY (source_id, target_id, relation)
);
CREATE INDEX IF NOT EXISTS idx_links_source ON memory_links(source_id);
CREATE INDEX IF NOT EXISTS idx_links_target ON memory_links(target_id);

CREATE INDEX IF NOT EXISTS idx_search_text ON memories(search_text);

CREATE TABLE IF NOT EXISTS metadata (
    key TEXT PRIMARY KEY,
    value TEXT
);
"""
57
+
58
+ # ── Key Schema ────────────────────────────────────────────────────────
59
+
60
# Maps a key prefix (the part of a key before ':') to its cardinality.
# MemoryDB.upsert consults this table: a "single" key with a new value
# supersedes the current row for that key; prefixes missing from the table
# are treated as "multi".
KEY_SCHEMA = {
    # Identity (single-value: new value supersedes old)
    "first_name": "single", "last_name": "single", "full_name": "single",
    "date_of_birth": "single", "gender": "single", "job_title": "single",
    "card_holder_name": "single",
    # Multi-value (one per suffix, e.g., account:github.com)
    "email": "multi", "phone": "multi", "username": "multi", "language": "multi",
    "street_address": "multi", "address_line_2": "multi",
    "city": "multi", "state": "multi",
    "zip": "multi", "country": "multi", "company": "multi",
    "account": "multi", "tool": "multi", "contact": "multi", "linkedin": "multi", "bookmark": "multi",
    "product": "multi", "project": "multi", "interest": "multi",
    "skill": "multi", "location": "multi", "relationship": "multi",
    "work": "multi", "business": "multi", "activity": "multi",
}
75
+
76
# The canonical tag vocabulary. Every target value in TAG_MIGRATION is a
# member of this set.
# NOTE(review): no code in the visible part of this file reads this set —
# presumably it is used for validation elsewhere; confirm.
CANONICAL_TAGS = {
    "identity", "contact_info", "address", "payment",
    "account", "tool", "contact", "work",
    "knowledge", "communication", "social", "finance",
}
81
+
82
# Legacy tag -> canonical tag. Used by MemoryDB._migrate_v2 to rewrite tags
# already stored in memory_tags, and by MemoryDB._normalize_tags to map
# incoming tags; tags that are not keys here pass through unchanged.
TAG_MIGRATION = {
    "email": "contact_info", "phone": "contact_info",
    "credential": "account", "dev": "tool", "ai": "tool",
    "location": "address", "company": "work",
    "business": "knowledge", "interest": "knowledge",
    "lifestyle": "knowledge", "product": "knowledge",
    "project": "knowledge", "skill": "knowledge",
    "activity": "knowledge", "language": "identity",
    "relationship": "contact", "real_estate": "knowledge",
    "spiritual": "knowledge", "autofill": "identity",
}
93
+
94
# Profile section mapping based on KEY_SCHEMA: section name -> the memory
# keys that belong to that section.
# NOTE(review): not referenced by the code visible in this file; confirm
# where it is consumed.
PROFILE_SECTIONS = {
    "identity": ["first_name", "last_name", "full_name", "email", "phone",
                 "date_of_birth", "gender", "job_title", "language"],
    "address": ["street_address", "address_line_2", "city", "state", "zip", "country"],
    "payment": ["card_holder_name"],
    "work": ["company"],
}
102
+
103
+
104
+ class MemoryDB:
105
def __init__(self, path: str = "memories.db", defer_embeddings: bool = False):
    """Open the SQLite database at *path* and bring its schema up to date.

    Args:
        path: Database file passed to ``sqlite3.connect``.
        defer_embeddings: When true, the embeddings table is not set up and
            ``_vec_ready`` stays ``False``; otherwise ``_vec_ready`` holds
            the result of ``setup_embeddings_table``.
    """
    self.path = path
    connection = sqlite3.connect(path)
    self.conn = connection

    # Connection-level settings first, then the (idempotent) DDL script.
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    connection.executescript(SCHEMA)
    self._migrate()

    self._defer_embeddings = defer_embeddings
    if defer_embeddings:
        self._vec_ready = False
    else:
        self._vec_ready = setup_embeddings_table(connection)
114
+
115
+ # ── Migration ──────────────────────────────────────────────────
116
+
117
+ def _migrate(self):
118
+ """Add new columns/tables to existing DBs and migrate to v2."""
119
+ cols = {r[1] for r in self.conn.execute("PRAGMA table_info(memories)").fetchall()}
120
+ if "superseded_by" not in cols:
121
+ self.conn.execute("ALTER TABLE memories ADD COLUMN superseded_by INTEGER REFERENCES memories(id)")
122
+ if "superseded_at" not in cols:
123
+ self.conn.execute("ALTER TABLE memories ADD COLUMN superseded_at TEXT")
124
+ if "search_text" not in cols:
125
+ self.conn.execute("ALTER TABLE memories ADD COLUMN search_text TEXT")
126
+ self.conn.execute("UPDATE memories SET search_text = key || ': ' || value WHERE search_text IS NULL")
127
+ self.conn.commit()
128
+ if "reviewed_at" not in cols:
129
+ self.conn.execute("ALTER TABLE memories ADD COLUMN reviewed_at TEXT")
130
+
131
+ # v2 migration: normalize confidence, migrate tags
132
+ version = self.get_meta("schema_version") or "1"
133
+ if version == "1":
134
+ self._migrate_v2()
135
+
136
def _migrate_v2(self):
    """Upgrade to schema v2: reset every confidence to 1.0 and canonicalize tags.

    Records ``schema_version = '2'`` in the metadata table and commits.
    """
    log.info("Migrating to schema v2: normalizing confidence, migrating tags")
    db = self.conn

    # Set all confidence to 1.0
    db.execute("UPDATE memories SET confidence = 1.0")

    # Migrate tags
    for legacy, canonical in TAG_MIGRATION.items():
        # Rename in place; OR IGNORE skips rows where the memory already
        # carries the canonical tag (the rename would violate the primary key).
        db.execute("""
            UPDATE OR IGNORE memory_tags SET tag = ? WHERE tag = ?
        """, (canonical, legacy))
        # Whatever still carries the legacy tag is such a duplicate — drop it.
        db.execute("DELETE FROM memory_tags WHERE tag = ?", (legacy,))

    db.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES ('schema_version', '2')")
    db.commit()
    log.info("Schema v2 migration complete")
155
+
156
+ # ── Tag Normalization ─────────────────────────────────────────
157
+
158
def _normalize_tags(self, tags: list[str]) -> list[str]:
    """Map tags through TAG_MIGRATION and drop duplicates.

    Tags that are not keys of TAG_MIGRATION pass through unchanged. The
    result keeps first-seen order, so the same input always yields the same
    output (a plain ``set`` would make the order vary between runs).
    """
    # dict.fromkeys deduplicates while preserving insertion order.
    return list(dict.fromkeys(TAG_MIGRATION.get(tag, tag) for tag in tags))
164
+
165
+ def _key_prefix(self, key: str) -> str:
166
+ """Extract key prefix before ':' delimiter."""
167
+ return key.split(":")[0] if ":" in key else key
168
+
169
+ # ── Upsert ─────────────────────────────────────────────────────
170
+
171
def upsert(self, key: str, value: str, tags: list[str],
           confidence: float = 1.0, source: str = "") -> Optional[int]:
    """Insert or update a memory with semantic dedup.

    Decision framework:
    1. Exact (key, value) match → bump appeared_count, merge source
    2. Semantic match (cosine >= 0.92, same key prefix) → supersede old
    3. Same exact key, different value, single-cardinality → supersede old
    4. Brand new → INSERT

    Args:
        key: Memory key, optionally ``prefix:suffix``.
        value: Memory value; surrounding whitespace is stripped.
        tags: Tags to attach (normalized via ``_normalize_tags``).
        confidence: Accepted for interface compatibility; every path below
            stores a confidence of 1.0.
        source: Origin label, merged into the row's comma-separated source
            list on an exact match.

    Returns:
        The id of the affected/new memory, or ``None`` when *value* is
        empty or whitespace-only (nothing is written in that case).
    """
    if not value or not value.strip():
        return None
    value = value.strip()
    now = datetime.now(timezone.utc).isoformat()
    search_text = f"{key}: {value}"
    tags = self._normalize_tags(tags)

    # Warn on unknown key prefix (soft — doesn't block)
    prefix = self._key_prefix(key)
    if prefix not in KEY_SCHEMA and not key.startswith(("autofill:", "address_type_")):
        log.debug("Unknown key prefix: %s (key=%s)", prefix, key)

    # 1. Exact (key, value) match
    existing = self.conn.execute(
        "SELECT id, source, appeared_count FROM memories WHERE key=? AND value=?",
        (key, value),
    ).fetchone()

    if existing:
        mem_id, old_source, appeared = existing
        # Sources are stored as a ", "-joined list. Compare whole entries:
        # a substring test would treat "hub" as already present in "github".
        merged = [s.strip() for s in (old_source or "").split(",") if s.strip()]
        for part in (source or "").split(","):
            part = part.strip()
            if part and part not in merged:
                merged.append(part)
        new_source = ", ".join(merged)
        self.conn.execute(
            "UPDATE memories SET source=?, appeared_count=?, last_appeared_at=?, search_text=?, confidence=1.0 WHERE id=?",
            (new_source, (appeared or 0) + 1, now, search_text, mem_id),
        )
        self._ensure_tags(mem_id, tags)
        self.conn.commit()
        return mem_id

    # 2. Semantic dedup — check for near-duplicate with same key prefix
    mem_id = self._try_semantic_supersede(key, value, search_text, tags, source, now)
    if mem_id:
        return mem_id

    # 3. Single-cardinality key supersession
    if KEY_SCHEMA.get(prefix, "multi") == "single":
        old_row = self.conn.execute(
            "SELECT id FROM memories WHERE key=? AND superseded_by IS NULL",
            (key,),
        ).fetchone()
        if old_row:
            return self._insert_and_supersede(key, value, search_text, tags, source, now, old_row[0])

    # 4. Brand new
    return self._insert_new(key, value, search_text, tags, source, now)
231
+
232
+ def _try_semantic_supersede(self, key: str, value: str, search_text: str,
233
+ tags: list[str], source: str, now: str) -> Optional[int]:
234
+ """Check for semantic near-duplicate. Returns new mem_id if superseded, else None."""
235
+ if not self._vec_ready:
236
+ return None
237
+
238
+ vec = embed_text(search_text)
239
+ if vec is None:
240
+ return None
241
+
242
+ prefix = self._key_prefix(key)
243
+ matches = cosine_search(self.conn, vec, limit=5, threshold=0.92)
244
+
245
+ for old_id, similarity in matches:
246
+ # Check same key prefix and not already superseded
247
+ old_row = self.conn.execute(
248
+ "SELECT key, value, superseded_by FROM memories WHERE id=?", (old_id,)
249
+ ).fetchone()
250
+ if not old_row or old_row[2] is not None:
251
+ continue
252
+ old_prefix = self._key_prefix(old_row[0])
253
+ if old_prefix != prefix:
254
+ continue
255
+ # Same key prefix, high similarity — supersede
256
+ log.debug(f"Semantic dedup: '{old_row[0]}: {old_row[1][:50]}' → '{key}: {value[:50]}' (sim={similarity:.3f})")
257
+ return self._insert_and_supersede(key, value, search_text, tags, source, now, old_id)
258
+
259
+ return None
260
+
261
+ def _insert_new(self, key: str, value: str, search_text: str,
262
+ tags: list[str], source: str, now: str) -> int:
263
+ """Insert a brand new memory."""
264
+ cursor = self.conn.execute(
265
+ "INSERT INTO memories (key, value, confidence, source, created_at, search_text, appeared_count, last_appeared_at) "
266
+ "VALUES (?, ?, 1.0, ?, ?, ?, 1, ?)",
267
+ (key, value, source, now, search_text, now),
268
+ )
269
+ mem_id = cursor.lastrowid
270
+ self._ensure_tags(mem_id, tags)
271
+ self._auto_link(mem_id, key, value)
272
+ self._store_embedding(mem_id, search_text)
273
+ self.conn.commit()
274
+ return mem_id
275
+
276
+ def _insert_and_supersede(self, key: str, value: str, search_text: str,
277
+ tags: list[str], source: str, now: str,
278
+ old_id: int) -> int:
279
+ """Insert new memory and supersede old one."""
280
+ cursor = self.conn.execute(
281
+ "INSERT INTO memories (key, value, confidence, source, created_at, search_text, appeared_count, last_appeared_at) "
282
+ "VALUES (?, ?, 1.0, ?, ?, ?, 1, ?)",
283
+ (key, value, source, now, search_text, now),
284
+ )
285
+ mem_id = cursor.lastrowid
286
+ self.conn.execute(
287
+ "UPDATE memories SET superseded_by=?, superseded_at=? WHERE id=?",
288
+ (mem_id, now, old_id),
289
+ )
290
+ self._ensure_tags(mem_id, tags)
291
+ self._auto_link(mem_id, key, value)
292
+ self._store_embedding(mem_id, search_text)
293
+ self.conn.commit()
294
+ return mem_id
295
+
296
+ def _ensure_tags(self, mem_id: int, tags: list[str]):
297
+ """Ensure all tags exist for a memory."""
298
+ for tag in tags:
299
+ self.conn.execute(
300
+ "INSERT OR IGNORE INTO memory_tags (memory_id, tag) VALUES (?, ?)",
301
+ (mem_id, tag),
302
+ )
303
+
304
+ def _store_embedding(self, mem_id: int, search_text: str):
305
+ """Compute and store embedding for a memory."""
306
+ if not self._vec_ready:
307
+ return
308
+ vec = embed_text(search_text)
309
+ if vec:
310
+ store_embedding(self.conn, mem_id, vec)
311
+
312
+ # ── Search ─────────────────────────────────────────────────────
313
+
314
def search(self, tags: list[str], limit: int = 20,
           include_superseded: bool = False) -> list[dict]:
    """Search memories by tags, ranked by hit_rate then accessed/appeared counts.

    A memory matches when it carries at least one of *tags*. Every returned
    memory has its appeared_count and accessed_count bumped by one in the
    database (and the change is committed); the counts in the returned
    dicts already include that bump, matching ``semantic_search`` and
    ``text_search``.

    Args:
        tags: Tags to match; an empty list yields no results.
        limit: Maximum number of rows returned.
        include_superseded: When true, superseded memories are included.

    Returns:
        A list of dicts with keys id, key, value, source, appeared_count,
        accessed_count and hit_rate (accessed/appeared as stored before the
        bump; 0.0 when appeared_count was 0).
    """
    if not tags:
        return []

    placeholders = ",".join("?" for _ in tags)
    superseded_filter = "" if include_superseded else "AND m.superseded_by IS NULL"
    # DISTINCT collapses the duplicates produced when one memory carries
    # several of the requested tags.
    rows = self.conn.execute(f"""
        SELECT DISTINCT m.id, m.key, m.value, m.source,
               m.appeared_count, m.accessed_count,
               CASE WHEN m.appeared_count = 0 THEN 0.0
                    ELSE CAST(m.accessed_count AS REAL) / m.appeared_count
               END AS hit_rate
        FROM memories m
        JOIN memory_tags t ON m.id = t.memory_id
        WHERE t.tag IN ({placeholders}) {superseded_filter}
        ORDER BY hit_rate DESC, m.accessed_count DESC, m.appeared_count DESC
        LIMIT ?
    """, (*tags, limit)).fetchall()

    results = [
        {
            "id": mem_id, "key": key, "value": value,
            "source": source, "appeared_count": appeared + 1,
            # +1 mirrors the UPDATE below (previously only appeared_count
            # reflected the bump).
            "accessed_count": accessed + 1, "hit_rate": hit_rate,
        }
        for mem_id, key, value, source, appeared, accessed, hit_rate in rows
    ]

    ids = [r["id"] for r in results]
    if ids:
        now = datetime.now(timezone.utc).isoformat()
        id_placeholders = ",".join("?" for _ in ids)
        self.conn.execute(
            f"UPDATE memories SET appeared_count = appeared_count + 1, "
            f"accessed_count = accessed_count + 1, "
            f"last_appeared_at = ?, last_accessed_at = ? "
            f"WHERE id IN ({id_placeholders})",
            (now, now, *ids),
        )
        self.conn.commit()

    return results
356
+
357
+ # ── Semantic Search ────────────────────────────────────────────
358
+
359
def semantic_search(self, query: str, limit: int = 20,
                    threshold: float = 0.3) -> list[dict]:
    """Search memories by embedding similarity, falling back to ``text_search``.

    The fallback is used when embeddings are not set up, when the query
    cannot be embedded, or when the vector search returns no matches.
    Superseded memories are skipped. Each returned memory has its appeared
    and accessed counts bumped by one (committed); the returned counts
    include the bump.

    Returns:
        A list of dicts with keys id, key, value, source, appeared_count,
        accessed_count and similarity.
    """
    matches = None
    if self._vec_ready:
        vec = embed_text(query, prefix="search_query")
        if vec is not None:
            matches = cosine_search(self.conn, vec, limit=limit, threshold=threshold)
    if not matches:
        return self.text_search(query, limit)

    now = datetime.now(timezone.utc).isoformat()
    found = []
    for mem_id, similarity in matches:
        record = self.conn.execute(
            "SELECT id, key, value, source, appeared_count, accessed_count, superseded_by "
            "FROM memories WHERE id=?",
            (mem_id,),
        ).fetchone()
        if record is None:
            continue
        rid, rkey, rvalue, rsource, appeared, accessed, superseded_by = record
        if superseded_by is not None:  # skip superseded
            continue
        found.append({
            "id": rid, "key": rkey, "value": rvalue,
            "source": rsource, "appeared_count": appeared + 1,
            "accessed_count": accessed + 1, "similarity": similarity,
        })

    # Auto-bump appeared + accessed for all returned results
    hit_ids = [item["id"] for item in found]
    if hit_ids:
        id_placeholders = ",".join("?" for _ in hit_ids)
        self.conn.execute(
            f"UPDATE memories SET appeared_count = appeared_count + 1, "
            f"accessed_count = accessed_count + 1, "
            f"last_appeared_at = ?, last_accessed_at = ? "
            f"WHERE id IN ({id_placeholders})",
            (now, now, *hit_ids),
        )
        self.conn.commit()

    return found
403
+
404
+ # ── Text Search ────────────────────────────────────────────────
405
+
406
def text_search(self, query: str, limit: int = 20) -> list[dict]:
    """Full-text-ish search across memories using LIKE matching.

    The query is lower-cased and split on whitespace; a memory matches when
    its search_text contains every word as a literal substring. Superseded
    memories are excluded. Each returned memory has its appeared and
    accessed counts bumped by one (committed); the returned counts include
    the bump.

    Args:
        query: Free-text query; an empty/whitespace-only query returns [].
        limit: Maximum number of rows returned.

    Returns:
        A list of dicts with keys id, key, value, source, appeared_count,
        accessed_count, hit_rate and score, sorted by (score, hit_rate)
        descending. score is the number of query words found in
        "key: value".
    """
    words = query.lower().split()
    if not words:
        return []

    def like_pattern(word: str) -> str:
        # '%' and '_' are LIKE wildcards; escape them (and the escape
        # character itself) so user text only ever matches literally.
        escaped = word.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    conditions = " AND ".join("LOWER(m.search_text) LIKE ? ESCAPE '\\'" for _ in words)
    params = [like_pattern(w) for w in words]
    rows = self.conn.execute(f"""
        SELECT m.id, m.key, m.value, m.source,
               m.appeared_count, m.accessed_count,
               CASE WHEN m.appeared_count = 0 THEN 0.0
                    ELSE CAST(m.accessed_count AS REAL) / m.appeared_count
               END AS hit_rate
        FROM memories m
        WHERE {conditions} AND m.superseded_by IS NULL
        ORDER BY hit_rate DESC, m.accessed_count DESC
        LIMIT ?
    """, (*params, limit)).fetchall()

    results = []
    for mem_id, key, value, source, appeared, accessed, hit_rate in rows:
        haystack = f"{key}: {value}".lower()
        results.append({
            "id": mem_id, "key": key, "value": value,
            "source": source, "appeared_count": appeared + 1, "accessed_count": accessed + 1,
            "hit_rate": hit_rate, "score": sum(1 for w in words if w in haystack),
        })
    results.sort(key=lambda x: (x["score"], x["hit_rate"]), reverse=True)

    # Auto-bump appeared + accessed for all returned results
    ids = [r["id"] for r in results]
    if ids:
        now = datetime.now(timezone.utc).isoformat()
        id_placeholders = ",".join("?" for _ in ids)
        self.conn.execute(
            f"UPDATE memories SET appeared_count = appeared_count + 1, "
            f"accessed_count = accessed_count + 1, "
            f"last_appeared_at = ?, last_accessed_at = ? "
            f"WHERE id IN ({id_placeholders})",
            (now, now, *ids),
        )
        self.conn.commit()

    return results
451
+
452
+ # ── Backfill Embeddings ────────────────────────────────────────
453
+
454
def backfill_embeddings(self) -> int:
    """Compute embeddings for current memories that do not have one yet.

    Only non-superseded memories are considered; the text embedded for each
    is "key: value". Commits once after storing.

    Returns:
        The number of embeddings stored (0 when embeddings are unavailable
        or nothing is missing).
    """
    if not self._vec_ready:
        log.warning("sqlite-vec not available, cannot backfill embeddings")
        return 0

    rows = self.conn.execute(
        "SELECT id, key, value FROM memories WHERE superseded_by IS NULL"
    ).fetchall()

    if not rows:
        return 0

    # Check which already have embeddings
    existing_ids = set()
    try:
        existing_ids.update(
            mid for (mid,) in self.conn.execute("SELECT memory_id FROM memory_embeddings")
        )
    except sqlite3.Error as exc:
        # Best effort: if the table cannot be read, treat every memory as
        # un-embedded — but leave a trace instead of failing silently.
        log.debug("Could not read existing embeddings: %s", exc)

    to_embed = [(r[0], f"{r[1]}: {r[2]}") for r in rows if r[0] not in existing_ids]
    if not to_embed:
        log.info("All memories already have embeddings")
        return 0

    log.info(f"Backfilling embeddings for {len(to_embed)} memories...")
    vectors = embed_batch([text for _, text in to_embed])

    count = 0
    for (mem_id, _), vec in zip(to_embed, vectors):
        if vec is not None:
            store_embedding(self.conn, mem_id, vec)
            count += 1

    self.conn.commit()
    log.info(f"Embedded {count} memories")
    return count
493
+
494
def regenerate_embeddings(self) -> int:
    """Delete every stored embedding and recompute them via ``backfill_embeddings``.

    Intended for use after the embedding model changes.

    Returns:
        The count reported by ``backfill_embeddings``, or 0 when embeddings
        are not set up.
    """
    if self._vec_ready:
        self.conn.execute("DELETE FROM memory_embeddings")
        self.conn.commit()
        log.info("Cleared all existing embeddings")
        return self.backfill_embeddings()

    log.warning("Embeddings table not available, cannot regenerate")
    return 0
504
+
505
+ # ── Contradiction / History ────────────────────────────────────
506
+
507
def history(self, key: str) -> list[dict]:
    """Return every row stored under *key*, oldest first, including superseded ones.

    Each dict carries id, key, value, confidence, source, created_at,
    superseded_by and superseded_at, which together expose the supersession
    chain.
    """
    fields = ("id", "key", "value", "confidence", "source", "created_at",
              "superseded_by", "superseded_at")
    cursor = self.conn.execute("""
        SELECT id, key, value, confidence, source, created_at,
               superseded_by, superseded_at
        FROM memories WHERE key=? ORDER BY created_at
    """, (key,))
    # The SELECT list is in the same order as `fields`.
    return [dict(zip(fields, record)) for record in cursor.fetchall()]
522
+
523
+ # ── Entity Linking ─────────────────────────────────────────────
524
+
525
def link(self, source_id: int, target_id: int, relation: str):
    """Record a *relation* link from memory *source_id* to memory *target_id*.

    An identical (source, target, relation) link that already exists is left
    untouched. This method does not commit.
    """
    created_at = datetime.now(timezone.utc).isoformat()
    link_row = (source_id, target_id, relation, created_at)
    self.conn.execute(
        "INSERT OR IGNORE INTO memory_links (source_id, target_id, relation, created_at) VALUES (?, ?, ?, ?)",
        link_row,
    )
532
+
533
def related(self, memory_id: int, relation: Optional[str] = None) -> list[dict]:
    """Return the memories linked to *memory_id*, in either link direction.

    Args:
        memory_id: The memory whose neighbours are wanted.
        relation: When truthy, only links with this relation are followed.

    Returns:
        A list of dicts with keys id, key, value, confidence and relation,
        describing the memory at the other end of each link.
    """
    if relation:
        sql = """
            SELECT m.id, m.key, m.value, m.confidence, ml.relation
            FROM memory_links ml
            JOIN memories m ON m.id = CASE WHEN ml.source_id = ? THEN ml.target_id ELSE ml.source_id END
            WHERE (ml.source_id = ? OR ml.target_id = ?) AND ml.relation = ?
        """
        args = (memory_id, memory_id, memory_id, relation)
    else:
        sql = """
            SELECT m.id, m.key, m.value, m.confidence, ml.relation
            FROM memory_links ml
            JOIN memories m ON m.id = CASE WHEN ml.source_id = ? THEN ml.target_id ELSE ml.source_id END
            WHERE ml.source_id = ? OR ml.target_id = ?
        """
        args = (memory_id, memory_id, memory_id)

    fields = ("id", "key", "value", "confidence", "relation")
    return [dict(zip(fields, record)) for record in self.conn.execute(sql, args).fetchall()]
553
+
554
+ def _auto_link(self, mem_id: int, key: str, value: str):
555
+ """Deterministic auto-linking on upsert."""
556
+ if key == "email":
557
+ accounts = self.conn.execute(
558
+ "SELECT id FROM memories WHERE key LIKE 'account:%' AND value=? AND id!=?",
559
+ (value, mem_id),
560
+ ).fetchall()
561
+ for (aid,) in accounts:
562
+ self.link(mem_id, aid, "belongs_to")
563
+
564
+ if key.startswith("account:"):
565
+ same_user = self.conn.execute(
566
+ "SELECT id FROM memories WHERE key LIKE 'account:%' AND value=? AND id!=?",
567
+ (value, mem_id),
568
+ ).fetchall()
569
+ for (sid,) in same_user:
570
+ self.link(mem_id, sid, "same_identity")
571
+
572
+ # ── Mark Accessed ──────────────────────────────────────────────
573
+
574
def mark_accessed(self, memory_id: int):
    """Increment accessed_count for *memory_id*, stamp last_accessed_at, and commit.

    Kept for backward compatibility — the search methods bump these
    counters themselves.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    bump_sql = "UPDATE memories SET accessed_count = accessed_count + 1, last_accessed_at = ? WHERE id = ?"
    self.conn.execute(bump_sql, (timestamp, memory_id))
    self.conn.commit()
582
+
583
+ # ── Stats ──────────────────────────────────────────────────────
584
+
585
def stats(self) -> dict:
    """Return summary stats about the memory database.

    Returns:
        A dict with:
          total_memories - count of non-superseded memories
          superseded     - count of superseded memories
          links          - number of rows in memory_links
          embedded       - number of stored embeddings (0 when the
                           embeddings table cannot be queried)
          by_tag         - {tag: count}, most frequent first
          top_accessed   - up to 10 memories with accessed_count > 0,
                           highest first, as {key, value, accessed}
    """
    total = self.conn.execute("SELECT COUNT(*) FROM memories WHERE superseded_by IS NULL").fetchone()[0]
    superseded = self.conn.execute("SELECT COUNT(*) FROM memories WHERE superseded_by IS NOT NULL").fetchone()[0]
    by_tag = self.conn.execute(
        "SELECT tag, COUNT(*) FROM memory_tags GROUP BY tag ORDER BY COUNT(*) DESC"
    ).fetchall()
    top_accessed = self.conn.execute(
        "SELECT key, value, accessed_count FROM memories WHERE accessed_count > 0 ORDER BY accessed_count DESC LIMIT 10"
    ).fetchall()
    links = self.conn.execute("SELECT COUNT(*) FROM memory_links").fetchone()[0]

    # Count embeddings. The table only exists once embeddings have been set
    # up, so a database-level failure here just means "none"; anything that
    # is not a database error is a real bug and is allowed to propagate.
    try:
        embedded = self.conn.execute("SELECT COUNT(*) FROM memory_embeddings").fetchone()[0]
    except sqlite3.Error:
        embedded = 0

    return {
        "total_memories": total,
        "superseded": superseded,
        "links": links,
        "embedded": embedded,
        "by_tag": {tag: count for tag, count in by_tag},
        "top_accessed": [{"key": k, "value": v, "accessed": n} for k, v, n in top_accessed],
    }
612
+
613
+ # ── Profile ────────────────────────────────────────────────────
614
+
615
def profile(self) -> dict:
    """Generate structured user profile from non-superseded memories.

    Values for each key are ranked by appeared_count (then accessed_count),
    and most fields require a minimum appeared_count before a value is
    included.

    Returns:
        A dict with keys: name, emails, phones, usernames, gender,
        date_of_birth, addresses (list of dicts; only the first carries
        city/state/zip/country), card_holder, active_cards (count of
        card_expiry values not yet past), companies, tools, accounts
        ({user: [domain, ...]}), total_contacts and projects.
    """
    import re  # function-scope import: only this method needs regexes

    rows = self.conn.execute("""
        SELECT m.key, m.value, m.appeared_count
        FROM memories m
        WHERE m.superseded_by IS NULL
        ORDER BY m.appeared_count DESC, m.accessed_count DESC
    """).fetchall()

    # key -> [(value, appeared_count), ...], best-ranked first.
    by_key: dict[str, list[tuple]] = {}
    for key, value, appeared in rows:
        by_key.setdefault(key, []).append((value, appeared or 0))

    def pick(k, n=1, min_appeared=1):
        """Pick top n values for a key, filtered by min appeared_count."""
        vals = [(v, a) for v, a in by_key.get(k, []) if a >= min_appeared]
        if n == 1:
            return vals[0][0] if vals else None
        return [v for v, _ in vals[:n]]

    def pick_prefixed(prefix, n=20, min_appeared=1):
        """Pick top n (suffix, top value) entries for keys starting with prefix."""
        items = []
        for k, vals in by_key.items():
            if k.startswith(prefix):
                top_val, top_appeared = vals[0]
                if top_appeared >= min_appeared:
                    items.append((k[len(prefix):], top_val, top_appeared))
        items.sort(key=lambda x: x[2], reverse=True)
        return [(name, val) for name, val, _ in items[:n]]

    def street_key(street):
        """Dedup key for a street: lower-cased, common abbreviations expanded.

        Word boundaries keep "st" from matching inside "street" or "state"
        (a plain substring replace turned "main street" into
        "main streetreet", so "Main St" and "Main Street" never merged).
        """
        lowered = street.lower()
        lowered = re.sub(r"\bblvd\b\.?", "boulevard", lowered)
        return re.sub(r"\bst\b\.?", "street", lowered)

    def company_key(company):
        """Dedup key for a company: lower-cased, trailing legal suffix removed."""
        lowered = company.lower().strip().rstrip(".,")
        return re.sub(r"[,\s]+(?:inc|llc|ltd|corp)$", "", lowered)

    # Identity
    name_parts = [pick("first_name"), pick("last_name")]
    full_name = pick("full_name") or " ".join(n for n in name_parts if n)
    emails = pick("email", n=10, min_appeared=5)
    phones = pick("phone", n=5, min_appeared=2)
    # Usernames: exclude values that look like emails
    raw_usernames = pick("username", n=20, min_appeared=2) or []
    usernames = [u for u in raw_usernames if "@" not in u and "." not in u]

    # Addresses — deduplicate similar streets, group into full addresses
    addresses = []
    seen_streets = set()
    unique_streets = []
    for s in pick("street_address", n=10, min_appeared=2) or []:
        s = s.rstrip(", ")  # clean trailing commas/spaces
        normalized = street_key(s)
        if normalized not in seen_streets:
            seen_streets.add(normalized)
            unique_streets.append(s)
    if unique_streets:
        # Only the top street is paired with the top city/state/zip/country.
        addresses.append({
            "street": unique_streets[0],
            "city": pick("city"), "state": pick("state"),
            "zip": pick("zip"), "country": pick("country"),
        })
        addresses.extend({"street": s} for s in unique_streets[1:])

    # Payment — count active (non-expired) cards
    card_holder = pick("card_holder_name")
    today = datetime.now()
    current_ym = (today.year, today.month)
    active_cards = []
    for e in pick("card_expiry", n=20) or []:
        if not (isinstance(e, str) and "/" in e):
            continue
        try:
            mm, yyyy = e.split("/")
            month, year = int(mm), int(yyyy)
        except ValueError:
            continue  # not MM/YY or MM/YYYY
        if year < 100:
            # Two-digit year: compared as text, "12/21" looked newer than
            # any four-digit year, so such cards never expired.
            year += 2000
        if (year, month) >= current_ym:
            active_cards.append(e)

    # Work — deduplicate case-insensitive, strip suffixes
    seen_companies = set()
    companies = []
    for c in pick("company", n=10, min_appeared=2) or []:
        norm = company_key(c)
        if norm not in seen_companies:
            seen_companies.add(norm)
            companies.append(c)

    # Tools — sorted by appeared_count
    tool_items = pick_prefixed("tool:", n=20)

    # Accounts — group by username/email, deduplicate domains
    accounts_by_user: dict[str, list[str]] = {}
    for domain, user in pick_prefixed("account:", n=100):
        accounts_by_user.setdefault(user, []).append(domain)

    # Contacts
    total_contacts = len(pick_prefixed("contact:", n=10000))

    # Projects (Notion)
    project_items = pick_prefixed("project:", n=20)

    return {
        "name": full_name or None,
        "emails": emails or [],
        "phones": phones or [],
        "usernames": usernames or [],
        "gender": pick("gender"),
        "date_of_birth": pick("date_of_birth"),
        "addresses": addresses,
        "card_holder": card_holder,
        "active_cards": len(active_cards),
        "companies": companies or [],
        "tools": [name for name, _ in tool_items],
        "accounts": accounts_by_user,
        "total_contacts": total_contacts,
        "projects": [name for name, _ in project_items],
    }
737
+
738
def profile_text(self) -> str:
    """Format profile as markdown text for LLM context injection.

    Builds the text from ``self.profile()``: a "## User Profile" heading
    followed by one line per non-empty section (name, emails, phones,
    handles, addresses, payment, companies, tools, accounts, projects,
    contacts).

    Returns:
        The lines joined with newlines.
    """
    p = self.profile()
    lines = ["## User Profile"]

    if p["name"]:
        lines.append(f"**Name:** {p['name']}")
        if p.get("gender"):
            # Gender is appended in parentheses to the name line just added.
            lines[-1] += f" ({p['gender']})"

    if p["emails"]:
        lines.append(f"**Emails:** {', '.join(p['emails'])}")

    if p["phones"]:
        lines.append(f"**Phones:** {', '.join(p['phones'])}")

    if p["usernames"]:
        lines.append(f"**Handles:** {', '.join(p['usernames'])}")

    # Addresses
    for i, addr in enumerate(p.get("addresses", [])):
        # parts always starts with the street (possibly ""), so it is never empty.
        parts = [addr.get("street", "")]
        city_state = ", ".join(filter(None, [addr.get("city"), addr.get("state")]))
        if city_state:
            parts.append(city_state)
        if addr.get("zip"):
            # The zip is glued onto the last part (city/state if present,
            # otherwise the street) with a space rather than a comma.
            parts[-1] = parts[-1] + " " + addr["zip"] if parts else addr["zip"]
        if addr.get("country"):
            parts.append(addr["country"])
        addr_str = ", ".join(filter(None, parts))
        if addr_str:
            # First address is unnumbered; later ones are "Address 2", "Address 3", ...
            label = "**Address:**" if i == 0 else "**Address " + str(i + 1) + ":**"
            lines.append(f"{label} {addr_str}")

    # Payment
    if p.get("card_holder") or p.get("active_cards"):
        card_parts = []
        if p["card_holder"]:
            card_parts.append(p["card_holder"])
        if p["active_cards"]:
            card_parts.append(f"{p['active_cards']} cards on file")
        lines.append(f"**Payment:** {', '.join(card_parts)}")

    # Companies
    if p.get("companies"):
        if isinstance(p["companies"], list):
            lines.append(f"**Companies:** {', '.join(p['companies'])}")
        else:
            lines.append(f"**Company:** {p['companies']}")

    # Tools
    if p.get("tools"):
        # At most 15 tools are listed.
        lines.append(f"**Top Tools:** {', '.join(p['tools'][:15])}")

    # Accounts grouped by identity
    if p.get("accounts"):
        lines.append("**Accounts:**")
        # Show the 8 identities with the most domains, largest first.
        for user, domains in sorted(p["accounts"].items(),
                                    key=lambda x: len(x[1]), reverse=True)[:8]:
            # Clean domain names: extract meaningful service name
            seen = set()
            short_domains = []
            for d in domains:
                # Use second-level domain as service name
                parts = d.replace("www.", "").split(".")
                if len(parts) >= 2:
                    short = parts[-2]  # e.g. "mercury" from "app.mercury.com"
                else:
                    short = parts[0]
                # Skip generic TLDs, gov subdomains, localhost
                if short in ("com", "co", "io", "ai", "org", "net", "ru",
                             "gov", "ca", "us", "localhost", "localhost:3000"):
                    # For .gov domains (dmv.ca.gov), use the subdomain
                    short = parts[0] if len(parts) > 2 else d
                if "localhost" in short:
                    continue
                if short not in seen:
                    seen.add(short)
                    short_domains.append(short)
            # List up to 6 services and summarize the rest as "+N".
            display = short_domains[:6]
            extra = f" +{len(short_domains) - 6}" if len(short_domains) > 6 else ""
            lines.append(f" {user}: {', '.join(display)}{extra}")

    # Projects
    if p.get("projects"):
        # Clean up Notion page titles, deduplicate by core name
        seen_proj = set()
        clean = []
        for name in p["projects"]:
            # Drop trailing spaces and "‣" markers.
            c = name.rstrip(" ‣").strip()
            # Extract last meaningful name for dedup (e.g. "Eugene O'Donald" from "TBD: Eugene O'Donald")
            core = c.split(" - ")[-1].split(": ")[-1].lower().strip()
            if core not in seen_proj and c:
                seen_proj.add(core)
                clean.append(c)
        if clean:
            # At most 10 projects are listed.
            lines.append(f"**Projects:** {', '.join(clean[:10])}")

    # Contacts
    if p.get("total_contacts"):
        lines.append(f"**Contacts:** {p['total_contacts']} total")

    return "\n".join(lines)
841
+
842
+ # ── Review Operations ─────────────────────────────────────────
843
+
844
def delete(self, memory_id: int):
    """Remove one memory and every row that refers to it, then commit.

    Tags and links that mention the memory are deleted, memories that were
    superseded by it get their superseded_by pointer cleared, and finally
    the memory row itself is removed.
    """
    only_id = (memory_id,)
    # Dependent rows go first; the memory row itself is removed last.
    statements = (
        ("DELETE FROM memory_tags WHERE memory_id=?", only_id),
        ("DELETE FROM memory_links WHERE source_id=? OR target_id=?", (memory_id, memory_id)),
        ("UPDATE memories SET superseded_by=NULL WHERE superseded_by=?", only_id),
        ("DELETE FROM memories WHERE id=?", only_id),
    )
    db = self.conn
    for sql, args in statements:
        db.execute(sql, args)
    db.commit()
851
+
852
def update_memory(self, memory_id: int, key: str = None, value: str = None,
                  confidence: float = None, tags: list[str] = None):
    """Apply the supplied field changes to one memory and commit.

    Arguments left as None are not touched. Whenever key or value is given,
    search_text is rebuilt as "<key>: <value>" from the new values, falling
    back to the stored ones; the rebuild is skipped when the row is not
    found. A non-None tags list replaces the memory's whole tag set.
    """
    candidates = (("key", key), ("value", value), ("confidence", confidence))
    # Insertion order of this dict is the column order of the SET clause.
    changes = {column: new for column, new in candidates if new is not None}

    if "key" in changes or "value" in changes:
        stored = self.conn.execute(
            "SELECT key, value FROM memories WHERE id=?", (memory_id,)
        ).fetchone()
        if stored:
            new_key = changes.get("key", stored[0])
            new_val = changes.get("value", stored[1])
            changes["search_text"] = f"{new_key}: {new_val}"

    if changes:
        assignments = ", ".join(f"{column}=?" for column in changes)
        self.conn.execute(
            f"UPDATE memories SET {assignments} WHERE id=?",
            [*changes.values(), memory_id],
        )

    if tags is not None:
        # Replace rather than merge: drop the old tag set, then add the new one.
        self.conn.execute("DELETE FROM memory_tags WHERE memory_id=?", (memory_id,))
        for tag in tags:
            self.conn.execute(
                "INSERT OR IGNORE INTO memory_tags (memory_id, tag) VALUES (?, ?)",
                (memory_id, tag),
            )
    self.conn.commit()
880
+
881
def get_unreviewed(self, limit: int = 100) -> list[dict]:
    """Return up to `limit` memories whose reviewed_at is NULL, lowest id first.

    Each result is a dict with the keys id, key, value, confidence, source,
    created_at, superseded_by and tags (the list of tag strings attached to
    that memory).
    """
    # Same order as the SELECT list below.
    columns = ("id", "key", "value", "confidence", "source", "created_at",
               "superseded_by")
    pending = self.conn.execute("""
        SELECT m.id, m.key, m.value, m.confidence, m.source, m.created_at,
               m.superseded_by
        FROM memories m
        WHERE m.reviewed_at IS NULL
        ORDER BY m.id
        LIMIT ?
    """, (limit,)).fetchall()

    unreviewed = []
    for record in pending:
        entry = {name: record[pos] for pos, name in enumerate(columns)}
        tag_rows = self.conn.execute(
            "SELECT tag FROM memory_tags WHERE memory_id=?", (record[0],)
        ).fetchall()
        entry["tags"] = [tag_row[0] for tag_row in tag_rows]
        unreviewed.append(entry)
    return unreviewed
902
+
903
def mark_reviewed(self, memory_ids: list[int]):
    """Set reviewed_at to the current UTC time for the given memory IDs.

    Args:
        memory_ids: IDs of the memories to stamp. An empty (or None) value
            is a no-op: nothing is written and nothing is committed.

    All rows receive the same ISO-8601 timestamp and the change is
    committed once at the end.
    """
    if not memory_ids:
        return
    now = datetime.now(timezone.utc).isoformat()
    # One two-parameter UPDATE per ID instead of a single "IN (?, ?, ...)"
    # statement: the number of bound parameters per statement stays constant,
    # so long ID lists cannot hit SQLite's per-statement variable limit.
    self.conn.executemany(
        "UPDATE memories SET reviewed_at=? WHERE id=?",
        ((now, memory_id) for memory_id in memory_ids),
    )
    self.conn.commit()
914
+
915
def get_meta(self, key: str) -> Optional[str]:
    """Look up `key` in the metadata table; return its value, or None if absent."""
    cursor = self.conn.execute("SELECT value FROM metadata WHERE key=?", (key,))
    found = cursor.fetchone()
    if not found:
        return None
    return found[0]
919
+
920
def set_meta(self, key: str, value: str):
    """Store `value` under `key` in the metadata table (INSERT OR REPLACE) and commit."""
    entry = (key, value)
    db = self.conn
    db.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", entry)
    db.commit()
924
+
925
+ # ── Close ──────────────────────────────────────────────────────
926
+
927
def close(self):
    """Commit pending changes, then close the database connection.

    The connection is closed even when the commit raises, so a failed
    commit cannot leak the open handle. The commit error still propagates
    to the caller.
    """
    try:
        self.conn.commit()
    finally:
        self.conn.close()