deja-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deja/core/store.py ADDED
@@ -0,0 +1,1413 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import struct
5
+ import sys
6
+ from datetime import datetime, timezone, timedelta
7
+ from pathlib import Path
8
+ from typing import Any, Optional
9
+
10
+ import aiosqlite
11
+ from ulid import ULID
12
+
13
+ from deja.config import Config
14
+
15
+
16
+ def _now_iso() -> str:
17
+ return datetime.now(timezone.utc).isoformat()
18
+
19
+
20
+ def _token_overlap(a: str, b: str) -> float:
21
+ """Simple token overlap ratio between two strings."""
22
+ tokens_a = set(a.lower().split())
23
+ tokens_b = set(b.lower().split())
24
+ if not tokens_a or not tokens_b:
25
+ return 0.0
26
+ intersection = tokens_a & tokens_b
27
+ union = tokens_a | tokens_b
28
+ return len(intersection) / len(union)
29
+
30
+
31
+ # ── embedding helpers ──────────────────────────────────────────────────────────
32
+
33
+ def _emb_to_bytes(embedding: list[float]) -> bytes:
34
+ return struct.pack(f"{len(embedding)}f", *embedding)
35
+
36
+
37
+ def _bytes_to_emb(data: bytes) -> list[float]:
38
+ n = len(data) // 4
39
+ return list(struct.unpack(f"{n}f", data))
40
+
41
+
42
+ def _cosine_similarity(a: list[float], b: list[float]) -> float:
43
+ dot = sum(x * y for x, y in zip(a, b))
44
+ mag_a = math.sqrt(sum(x * x for x in a))
45
+ mag_b = math.sqrt(sum(x * x for x in b))
46
+ if mag_a == 0.0 or mag_b == 0.0:
47
+ return 0.0
48
+ return dot / (mag_a * mag_b)
49
+
50
+
51
+ # ── activation ranking helpers ─────────────────────────────────────────────────
52
+
53
+ def _activation_score(
54
+ mem: dict,
55
+ task_match: float,
56
+ project: Optional[str],
57
+ now_dt: datetime,
58
+ ) -> float:
59
+ """Score a memory for activation ranking.
60
+
61
+ score = task_match * 2 + confidence + recency * 0.5 + reuse_norm * 0.5 + scope_fit
62
+
63
+ All components are in [0, 1] except task_match which is weighted 2×
64
+ because retrieval relevance is the primary signal.
65
+ """
66
+ try:
67
+ created_str = mem.get("created_at", "")
68
+ created = datetime.fromisoformat(created_str)
69
+ if created.tzinfo is None:
70
+ created = created.replace(tzinfo=timezone.utc)
71
+ days_old = (now_dt - created).days
72
+ except Exception:
73
+ days_old = 0
74
+ recency = max(0.0, 1.0 - days_old / 365.0)
75
+ reuse_norm = min(mem.get("reuse_count", 0), 10) / 10.0
76
+ scope_fit = 0.1 if (project and mem.get("project") == project) else 0.0
77
+ confidence = mem.get("confidence", 1.0)
78
+ return task_match * 2.0 + confidence + recency * 0.5 + reuse_norm * 0.5 + scope_fit
79
+
80
+
81
+ def _apply_confusability_penalty(
82
+ scored: list[tuple[dict, Optional[bytes], float]],
83
+ ) -> list[tuple[dict, Optional[bytes], float]]:
84
+ """Down-rank procedures highly similar to a higher-ranked procedure.
85
+
86
+ Processes in score-descending order. For each procedure, if its embedding is
87
+ >0.85 cosine-similar to any higher-ranked procedure, multiply its score by 0.6.
88
+ Only applies when the memory has a stored embedding.
89
+ """
90
+ scored_sorted = sorted(scored, key=lambda x: x[2], reverse=True)
91
+ claimed_embeddings: list[list[float]] = []
92
+ result: list[tuple[dict, Optional[bytes], float]] = []
93
+
94
+ for mem, emb_bytes, score in scored_sorted:
95
+ if mem.get("type") == "procedure" and emb_bytes:
96
+ mem_emb = _bytes_to_emb(emb_bytes)
97
+ if any(
98
+ _cosine_similarity(mem_emb, higher) > 0.85
99
+ for higher in claimed_embeddings
100
+ ):
101
+ score *= 0.6
102
+ else:
103
+ claimed_embeddings.append(mem_emb)
104
+ result.append((mem, emb_bytes, score))
105
+
106
+ return result
107
+
108
+
109
+ def _strip_embedding(mem: dict) -> dict:
110
+ """Remove the binary embedding field before returning to callers."""
111
+ m = dict(mem)
112
+ m.pop("embedding", None)
113
+ return m
114
+
115
+
116
+ # ── load budgeting ─────────────────────────────────────────────────────────────
117
+
118
# Default per-type slot limits for MemoryStore.load_budgeted(): the maximum
# number of memories of each type returned when the caller supplies no
# ``slots`` mapping. Types absent from the mapping are not selected at all and
# are reported only through ``overflow_hints``.
DEFAULT_LOAD_SLOTS: dict[str, int] = {
    "preference": 5,
    "gotcha": 5,
    "decision": 5,
    "pattern": 5,
    "procedure": 3,
    "progress": 3,  # only if updated within the last 7 days
}
126
+
127
+
128
+ def _parse_dt(dt_str: str) -> datetime:
129
+ """Parse an ISO datetime string, defaulting to UTC epoch on failure."""
130
+ try:
131
+ dt = datetime.fromisoformat(dt_str)
132
+ if dt.tzinfo is None:
133
+ dt = dt.replace(tzinfo=timezone.utc)
134
+ return dt
135
+ except Exception:
136
+ return datetime(1970, 1, 1, tzinfo=timezone.utc)
137
+
138
+
139
+ # ── store ──────────────────────────────────────────────────────────────────────
140
+
141
+ _GLOBAL_PROJECT_KEY = "__global__"
142
+
143
+
144
+ def _project_meta_key(project: Optional[str]) -> str:
145
+ return project if project is not None else _GLOBAL_PROJECT_KEY
146
+
147
+
148
+ class MemoryStore:
149
+ """SQLite-backed memory store with FTS5 full-text search and confidence scoring.
150
+
151
+ Confidence lifecycle
152
+ --------------------
153
+ Each memory has a ``confidence`` float in [0.0, 1.0] that tracks reliability:
154
+
155
+ - **Initial value** — set by the caller on ``save()``. Manual ``deja save`` calls
156
+ default to 1.0. LLM-extracted memories use the model's self-assessed confidence
157
+ (often 0.7–0.95 for inferred facts).
158
+
159
+ - **Deduplication reinforcement** — when ``save()`` finds an existing memory with
160
+ >80% token overlap and the same type+scope, it increments ``confidence`` by 0.05
161
+ (capped at 1.0) instead of inserting a duplicate. Repeated discoveries strengthen
162
+ rather than clutter the vault.
163
+
164
+ - **Load ordering** — ``load()`` and ``search()`` both order results by
165
+ ``confidence DESC``, so the most reliable memories appear first in context.
166
+
167
+ - **Decay** (Phase 2, scheduler) — memories not referenced for 2+ weeks have
168
+ confidence reduced by ~0.05/week. Keeps stale knowledge from dominating context.
169
+
170
+ - **Archival threshold** — memories whose confidence falls below 0.3 are archived
171
+ (``archived_at`` stamped). Archived memories are excluded from ``load``/``search``
172
+ but not deleted, preserving history.
173
+
174
+ Search (Phase 3)
175
+ ----------------
176
+ ``search()`` runs a hybrid pipeline:
177
+ 1. BM25 via FTS5 — fast keyword matching, ordered by BM25 rank.
178
+ 2. Embedding search — always runs when an adapter is configured (not a fallback).
179
+ Results merged with FTS results, deduped by ID.
180
+ 3. Activation ranking — all candidates re-scored by:
181
+ ``task_match * 2 + confidence + recency * 0.5 + reuse_count_norm * 0.5 + scope_fit``
182
+ ``task_match`` uses cosine similarity for any result with a stored embedding;
183
+ falls back to normalized FTS rank (0.1–1.0) otherwise.
184
+ 4. Confusability penalty — procedures with cosine >0.85 to a higher-ranked result
185
+ are down-ranked by 0.4×.
186
+ 5. ``reuse_count`` is incremented for each returned memory when ``track_usage=True``
187
+ (the default). Pass ``track_usage=False`` from benchmarks or batch jobs to prevent
188
+ cross-query accumulation artifacts.
189
+ """
190
+
191
+ def __init__(self, config: Config) -> None:
192
+ self._db_path = config.store.db_path
193
+ self._db: Optional[aiosqlite.Connection] = None
194
+
195
async def _get_db(self) -> aiosqlite.Connection:
    """Return the shared connection, opening and configuring it on first use.

    On first call the parent directory of the database file is created, the
    connection is opened with ``aiosqlite.Row`` as row factory, and WAL
    journaling plus foreign-key enforcement are switched on.
    """
    if self._db is not None:
        return self._db

    self._db_path.parent.mkdir(parents=True, exist_ok=True)
    self._db = await aiosqlite.connect(self._db_path)
    self._db.row_factory = aiosqlite.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        await self._db.execute(pragma)
    return self._db
203
+
204
async def init_db(self) -> None:
    """Create the schema if needed and bring older databases up to date.

    Creates the ``memories`` table, its FTS5 index (kept in sync by
    insert/delete/update triggers), and the entity, observation and
    reflection-meta tables. Every statement uses ``IF NOT EXISTS``, so the
    method is safe to call on an existing database.

    Afterwards, columns that were added in later schema versions are appended
    to ``memories`` when they are missing. Existing columns are detected via
    ``PRAGMA table_info`` rather than by swallowing ALTER TABLE errors, so a
    genuine failure (locked database, I/O error) is raised instead of leaving
    the schema silently incomplete.
    """
    db = await self._get_db()
    await db.executescript("""
        CREATE TABLE IF NOT EXISTS memories (
            id TEXT PRIMARY KEY,
            type TEXT NOT NULL,
            category TEXT NOT NULL DEFAULT 'agent',
            content TEXT NOT NULL,
            scope TEXT NOT NULL,
            project TEXT,
            source TEXT,
            confidence REAL NOT NULL DEFAULT 1.0,
            reuse_count INTEGER NOT NULL DEFAULT 0,
            domain TEXT,
            entity_graph TEXT,
            trigger TEXT,
            embedding BLOB,
            created_at TEXT NOT NULL,
            updated_at TEXT NOT NULL,
            last_confirmed TEXT,
            archived_at TEXT,
            invalidated_at TEXT
        );

        CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts
        USING fts5(content, type, scope, content=memories, content_rowid=rowid);

        CREATE TRIGGER IF NOT EXISTS memories_ai AFTER INSERT ON memories BEGIN
            INSERT INTO memories_fts(rowid, content, type, scope)
            VALUES (new.rowid, new.content, new.type, new.scope);
        END;

        CREATE TRIGGER IF NOT EXISTS memories_ad AFTER DELETE ON memories BEGIN
            INSERT INTO memories_fts(memories_fts, rowid, content, type, scope)
            VALUES ('delete', old.rowid, old.content, old.type, old.scope);
        END;

        CREATE TRIGGER IF NOT EXISTS memories_au AFTER UPDATE ON memories BEGIN
            INSERT INTO memories_fts(memories_fts, rowid, content, type, scope)
            VALUES ('delete', old.rowid, old.content, old.type, old.scope);
            INSERT INTO memories_fts(rowid, content, type, scope)
            VALUES (new.rowid, new.content, new.type, new.scope);
        END;

        CREATE TABLE IF NOT EXISTS entity_nodes (
            id TEXT PRIMARY KEY,
            project TEXT,
            entity TEXT NOT NULL,
            created_at TEXT NOT NULL
        );

        CREATE TABLE IF NOT EXISTS entity_edges (
            id TEXT PRIMARY KEY,
            project TEXT,
            subject_entity TEXT NOT NULL,
            predicate TEXT NOT NULL,
            object_entity TEXT NOT NULL,
            confidence REAL NOT NULL DEFAULT 0.5,
            confirmations INTEGER NOT NULL DEFAULT 1,
            is_negation INTEGER NOT NULL DEFAULT 0,
            first_seen_session TEXT,
            valid_from TEXT NOT NULL,
            invalidated_at TEXT
        );

        CREATE TABLE IF NOT EXISTS observations (
            id TEXT PRIMARY KEY,
            project TEXT,
            content TEXT NOT NULL,
            token_estimate INTEGER,
            created_at TEXT NOT NULL,
            reflector_pass INTEGER NOT NULL DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS reflection_meta (
            project TEXT PRIMARY KEY,
            last_observer_at TEXT,
            last_reflector_at TEXT,
            last_decay_at TEXT,
            last_promote_at TEXT,
            last_archive_at TEXT
        );
    """)
    await db.commit()

    # Migrations: columns that did not exist in earlier schema versions,
    # as (column name, column definition) pairs.
    column_migrations = [
        ("reuse_count", "INTEGER NOT NULL DEFAULT 0"),
        ("domain", "TEXT"),
        ("embedding", "BLOB"),
        ("trigger", "TEXT"),
    ]
    # PRAGMA table_info yields one row per column; index 1 is the column
    # name. Positional access works regardless of the row factory in use.
    async with db.execute("PRAGMA table_info(memories)") as cursor:
        existing_columns = {row[1] for row in await cursor.fetchall()}

    for column, definition in column_migrations:
        if column not in existing_columns:
            await db.execute(
                f"ALTER TABLE memories ADD COLUMN {column} {definition}"
            )
    await db.commit()
304
+
305
async def save(self, memory: dict, embedding: Optional[bytes] = None) -> str:
    """Persist a memory, reinforcing an existing near-duplicate instead of re-inserting.

    A near-duplicate is an FTS candidate with the same type and scope whose
    token overlap with the new content exceeds 0.8. On such a hit the existing
    row gets +0.05 confidence (capped at 1.0), +1 reuse_count, fresh
    ``last_confirmed``/``updated_at`` stamps and the union of both trigger
    phrase lists; the incoming ``embedding`` is not stored in that case.

    Args:
        memory: must contain ``content``; ``type`` defaults to ``"pattern"``,
            ``scope`` to ``"global"``, ``category`` to ``"agent"``,
            ``confidence`` to 1.0 and ``reuse_count`` to 0.
        embedding: pre-computed embedding bytes, or None when no embedding
            provider is configured.

    Returns:
        The id of the reinforced row, or of the newly inserted one.
    """
    db = await self._get_db()
    content = memory["content"]
    mem_type = memory.get("type", "pattern")
    scope = memory.get("scope", "global")
    project = memory.get("project")

    # The raw FTS helper is used for the dedup lookup because it never
    # touches reuse_count.
    candidates = await self._search_fts(content, project, limit=5)
    duplicate = next(
        (
            row
            for row in candidates
            if row["type"] == mem_type
            and row["scope"] == scope
            and _token_overlap(content, row["content"]) > 0.8
        ),
        None,
    )

    if duplicate is not None:
        # Rediscovering the same knowledge counts as a strong confirmation.
        stamp = _now_iso()
        boosted_confidence = min(1.0, duplicate["confidence"] + 0.05)
        bumped_reuse = duplicate.get("reuse_count", 0) + 1

        stored_trigger = duplicate.get("trigger") or ""
        incoming_trigger = memory.get("trigger") or ""
        merged_trigger: Optional[str] = None
        if stored_trigger or incoming_trigger:
            # Union of both comma-separated phrase lists, deduplicated and sorted.
            phrases: set[str] = set()
            for raw in (stored_trigger, incoming_trigger):
                phrases.update(part.strip() for part in raw.split(",") if part.strip())
            merged_trigger = ", ".join(sorted(phrases))

        await db.execute(
            """
            UPDATE memories
            SET confidence = ?, reuse_count = ?, last_confirmed = ?, updated_at = ?,
                trigger = ?
            WHERE id = ?
            """,
            (
                boosted_confidence,
                bumped_reuse,
                stamp,
                stamp,
                merged_trigger,
                duplicate["id"],
            ),
        )
        await db.commit()
        return duplicate["id"]

    # No duplicate found: insert a brand-new row.
    stamp = _now_iso()
    mem_id = str(ULID())
    row_values = (
        mem_id,
        mem_type,
        memory.get("category", "agent"),
        content,
        scope,
        project,
        memory.get("source"),
        memory.get("confidence", 1.0),
        memory.get("reuse_count", 0),
        memory.get("domain"),
        memory.get("entity_graph"),
        memory.get("trigger"),
        embedding,
        stamp,  # created_at
        stamp,  # updated_at
        stamp,  # last_confirmed
    )
    await db.execute(
        """
        INSERT INTO memories (
            id, type, category, content, scope, project, source,
            confidence, reuse_count, domain, entity_graph, trigger, embedding,
            created_at, updated_at, last_confirmed
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        row_values,
    )
    await db.commit()
    return mem_id
383
+
384
async def save_embedding(self, memory_id: str, embedding: bytes) -> None:
    """Attach an embedding to an existing memory row (used for backfill).

    Also refreshes the row's ``updated_at`` stamp and commits.
    """
    conn = await self._get_db()
    values = (embedding, _now_iso(), memory_id)
    await conn.execute(
        "UPDATE memories SET embedding = ?, updated_at = ? WHERE id = ?",
        values,
    )
    await conn.commit()
392
+
393
async def get_memories_without_embeddings(
    self, project: Optional[str] = None
) -> list[dict]:
    """List active memories that still lack an embedding (for backfill).

    With a ``project``, the result is limited to global memories plus that
    project's own; without one, every scope is included. Each returned dict
    carries only ``id`` and ``content``.
    """
    db = await self._get_db()
    clauses = ["archived_at IS NULL", "invalidated_at IS NULL", "embedding IS NULL"]
    bindings: list[Any] = []
    if project:
        clauses.append("(scope = 'global' OR (scope = ? AND project = ?))")
        bindings += [f"project:{project}", project]

    sql = "SELECT id, content FROM memories WHERE " + " AND ".join(clauses)
    async with db.execute(sql, bindings) as cursor:
        rows = await cursor.fetchall()
    return [dict(row) for row in rows]
408
+
409
async def load(self, project: Optional[str] = None) -> list[dict]:
    """Return active memories visible to ``project``, highest confidence first.

    With a project: global memories plus those scoped to that project.
    Without one: global memories only. Archived and invalidated rows are
    excluded, and the binary ``embedding`` field is removed from every result.
    """
    db = await self._get_db()

    if project:
        visibility = "(scope = 'global' OR (scope = ? AND project = ?))"
        bindings: list[Any] = [f"project:{project}", project]
    else:
        visibility = "scope = 'global'"
        bindings = []

    sql = f"""
        SELECT * FROM memories
        WHERE {visibility}
          AND archived_at IS NULL
          AND invalidated_at IS NULL
        ORDER BY confidence DESC
    """
    async with db.execute(sql, bindings) as cursor:
        fetched = await cursor.fetchall()
    return [_strip_embedding(dict(record)) for record in fetched]
430
+
431
async def load_budgeted(
    self,
    project: Optional[str] = None,
    slots: Optional[dict[str, int]] = None,
    context: Optional[str] = None,
    embedding_adapter: Any = None,
) -> dict:
    """Load memories with type-slot allocation to bound output size.

    Selects the top-N memories per type using type-appropriate ranking:
      - gotcha/decision/pattern/preference: top-N by confidence DESC
      - procedure: top-N by reuse_count DESC, then confidence DESC
      - progress: top-N by updated_at DESC, only if updated within 7 days

    When ``context`` is provided, every type bucket is instead ranked by
    activation score (task_match * 2 + confidence + recency * 0.5
    + reuse_norm * 0.5 + scope_fit). task_match is the cosine similarity
    between the context and the memory's stored embedding when
    ``embedding_adapter`` is given and the memory has an embedding; otherwise
    it is the normalised FTS rank (0.1-1.0). Memories with no relevance signal
    get task_match=0.0. The 7-day filter on progress memories applies in both
    modes.

    Args:
        project: project name, or None for global scope only.
        slots: per-type limits; defaults to a copy of DEFAULT_LOAD_SLOTS.
        context: free-text description of the current task, or None.
        embedding_adapter: object with an async ``embed(text)`` method, or
            None to rely on FTS rank alone.

    Returns a dict with:
        memories:       selected subset (list of dicts, embeddings stripped)
        total:          total active memory count
        overflow:       count of memories not selected
        overflow_hints: list of {"type": str, "overflow": int} for each cut type
        project:        project name or "global"
    """
    if slots is None:
        slots = dict(DEFAULT_LOAD_SLOTS)

    all_memories = await self.load(project)
    now = datetime.now(timezone.utc)
    cutoff_7d = now - timedelta(days=7)

    # --- context-aware relevance scoring -----------------------------------
    task_match_by_id: dict[str, float] = {}
    if context:
        # FTS pass: turn keyword-match rank positions into a 0.1-1.0 score.
        fts_results = await self._search_fts(
            context, project, limit=len(all_memories) or 200
        )
        fts_count = max(len(fts_results), 1)
        for rank, mem in enumerate(fts_results):
            task_match_by_id[mem["id"]] = max(0.1, 1.0 - rank / fts_count)

        # Embedding pass: cosine similarity overrides FTS rank when available.
        # load() strips the embedding field from its results, so the blobs
        # are fetched here with the same visibility filter load() uses.
        if embedding_adapter is not None:
            try:
                query_emb = await embedding_adapter.embed(context)

                if project:
                    scope_filter = "(scope = 'global' OR (scope = ? AND project = ?))"
                    scope_params: list[Any] = [f"project:{project}", project]
                else:
                    scope_filter = "scope = 'global'"
                    scope_params = []

                db = await self._get_db()
                async with db.execute(
                    f"""
                    SELECT id, embedding FROM memories
                    WHERE {scope_filter}
                      AND archived_at IS NULL
                      AND invalidated_at IS NULL
                      AND embedding IS NOT NULL
                    """,
                    scope_params,
                ) as cursor:
                    embedding_rows = await cursor.fetchall()

                # Only score memories that are part of this load's result set.
                loaded_ids = {mem["id"] for mem in all_memories}
                for row in embedding_rows:
                    mem_id, emb_bytes = row[0], row[1]
                    if mem_id in loaded_ids and isinstance(emb_bytes, bytes):
                        task_match_by_id[mem_id] = _cosine_similarity(
                            query_emb, _bytes_to_emb(emb_bytes)
                        )
            except Exception as e:
                # Best effort: fall back to whatever FTS scores were gathered.
                print(f"[deja] load --context embedding error: {e}", file=sys.stderr)
    # -----------------------------------------------------------------------

    by_type: dict[str, list[dict]] = {}
    for mem in all_memories:
        by_type.setdefault(mem.get("type", "pattern"), []).append(mem)

    selected: list[dict] = []
    overflow_hints: list[dict] = []

    for mem_type, limit in slots.items():
        mems = list(by_type.get(mem_type, []))

        # Stale progress notes are dropped regardless of ranking mode.
        if mem_type == "progress":
            mems = [
                m for m in mems
                if _parse_dt(m.get("updated_at", "")) >= cutoff_7d
            ]

        if context:
            # Rank by activation score using context relevance.
            mems.sort(
                key=lambda m: _activation_score(
                    m, task_match_by_id.get(m["id"], 0.0), project, now
                ),
                reverse=True,
            )
        elif mem_type == "progress":
            mems.sort(key=lambda m: m.get("updated_at", ""), reverse=True)
        elif mem_type == "procedure":
            mems.sort(
                key=lambda m: (m.get("reuse_count", 0), m.get("confidence", 0.0)),
                reverse=True,
            )
        else:
            mems.sort(key=lambda m: m.get("confidence", 0.0), reverse=True)

        chosen = mems[:limit]
        leftover = len(mems) - len(chosen)
        selected.extend(chosen)
        if leftover > 0:
            overflow_hints.append({"type": mem_type, "overflow": leftover})

    # Memory types without a slot are never selected; report them in full.
    known = set(slots)
    for mem_type, mems in by_type.items():
        if mem_type not in known and mems:
            overflow_hints.append({"type": mem_type, "overflow": len(mems)})

    return {
        "memories": selected,
        "total": len(all_memories),
        "overflow": len(all_memories) - len(selected),
        "overflow_hints": overflow_hints,
        "project": project or "global",
    }
538
+
539
async def list_all(self) -> list[dict]:
    """Return every active memory in every scope, sorted by scope then confidence.

    Archived and invalidated rows are left out; the binary ``embedding`` field
    is removed from each result.
    """
    sql = """
        SELECT * FROM memories
        WHERE archived_at IS NULL AND invalidated_at IS NULL
        ORDER BY scope ASC, confidence DESC
    """
    db = await self._get_db()
    async with db.execute(sql) as cursor:
        fetched = await cursor.fetchall()
    return [_strip_embedding(dict(record)) for record in fetched]
551
+
552
async def list_filtered(
    self,
    project: Optional[str] = None,
    mem_type: Optional[str] = None,
    status: str = "active",
    limit: int = 200,
    offset: int = 0,
) -> tuple[list[dict], int]:
    """Return one page of memories plus the total match count (web viewer).

    Args:
        project: None for all projects, ``'__global__'`` for global scope
            only, anything else filters on that project name.
        mem_type: restrict to a single memory type when given.
        status: ``'active'``, ``'archived'`` or ``'invalidated'``; any other
            value (e.g. ``'all'``) applies no status filter.
        limit: page size.
        offset: number of rows to skip.

    Returns:
        ``(memories, total_matching_count)``; memories are ordered by
        confidence then ``updated_at`` (both descending) and have their
        ``embedding`` field removed.
    """
    db = await self._get_db()
    clauses: list[str] = []
    bindings: list[Any] = []

    if project == "__global__":
        clauses.append("scope = 'global'")
    elif project is not None:
        clauses.append("project = ?")
        bindings.append(project)

    if mem_type:
        clauses.append("type = ?")
        bindings.append(mem_type)

    status_clauses = {
        "active": "archived_at IS NULL AND invalidated_at IS NULL",
        "archived": "archived_at IS NOT NULL",
        "invalidated": "invalidated_at IS NOT NULL",
    }
    status_clause = status_clauses.get(status)  # "all" or unknown: no filter
    if status_clause is not None:
        clauses.append(status_clause)

    where = "WHERE " + " AND ".join(clauses) if clauses else ""

    count_sql = f"SELECT COUNT(*) as n FROM memories {where}"
    async with db.execute(count_sql, bindings) as cur:
        count_row = await cur.fetchone()
    total = count_row["n"] if count_row else 0

    page_sql = (
        f"SELECT * FROM memories {where} "
        "ORDER BY confidence DESC, updated_at DESC LIMIT ? OFFSET ?"
    )
    async with db.execute(page_sql, bindings + [limit, offset]) as cur:
        page = await cur.fetchall()

    return [_strip_embedding(dict(record)) for record in page], total
601
+
602
async def list_projects(self) -> list[dict]:
    """Return each project name with its active memory count, busiest first.

    Rows without a project, and archived or invalidated rows, are not counted.
    Each entry has the shape ``{"name": str, "count": int}``.
    """
    sql = """
        SELECT project, COUNT(*) as count FROM memories
        WHERE project IS NOT NULL
          AND archived_at IS NULL AND invalidated_at IS NULL
        GROUP BY project ORDER BY count DESC
    """
    db = await self._get_db()
    async with db.execute(sql) as cur:
        grouped = await cur.fetchall()

    projects: list[dict] = []
    for record in grouped:
        projects.append({"name": record["project"], "count": record["count"]})
    return projects
615
+
616
async def _search_by_trigger(
    self,
    query: str,
    project: Optional[str] = None,
    mem_type: Optional[str] = None,
) -> list[dict]:
    """Find memories whose trigger phrase occurs inside ``query``.

    A memory matches when at least one of its comma-separated trigger phrases
    is a case-insensitive substring of the query; the direction is "query
    contains phrase", never the reverse.

        query "kubectl apply -f k8s/" matches trigger "kubectl apply"
        query "kubectl apply" does not match trigger "terraform apply"

    Candidates are read in ``reuse_count DESC, confidence DESC`` order and at
    most the first five matches are returned, as full rows (embedding
    included). A database error is reported on stderr and yields an empty
    list.
    """
    db = await self._get_db()
    needle_space = query.lower()

    clauses = ["m.trigger IS NOT NULL"]
    bindings: list[Any] = []
    if project:
        clauses.append("(m.scope = 'global' OR (m.scope = ? AND m.project = ?))")
        bindings.extend([f"project:{project}", project])
    if mem_type:
        clauses.append("m.type = ?")
        bindings.append(mem_type)
    clauses.append("m.archived_at IS NULL")
    clauses.append("m.invalidated_at IS NULL")

    sql = (
        "SELECT m.* FROM memories m WHERE "
        + " AND ".join(clauses)
        + " ORDER BY m.reuse_count DESC, m.confidence DESC"
    )
    try:
        async with db.execute(sql, bindings) as cursor:
            fetched = await cursor.fetchall()
    except Exception as e:
        print(f"[deja] Trigger search error: {e}", file=sys.stderr)
        return []

    def _fires(trigger_field: str) -> bool:
        # True when any non-empty phrase of the trigger appears in the query.
        for piece in trigger_field.split(","):
            phrase = piece.strip().lower()
            if phrase and phrase in needle_space:
                return True
        return False

    hits = [mem for mem in map(dict, fetched) if _fires(mem["trigger"])]
    return hits[:5]
673
+
674
async def _search_fts(
    self,
    query: str,
    project: Optional[str] = None,
    mem_type: Optional[str] = None,
    limit: int = 20,
) -> list[dict]:
    """Run a raw FTS5 keyword search over active memories.

    The query text is wrapped in double quotes (embedded quotes doubled) before
    being bound to MATCH, so it is matched as one quoted FTS5 string rather
    than parsed as FTS5 query syntax. Results come back in FTS rank order as
    full rows, embedding bytes included. Nothing is written to the database;
    search() and the dedup check in save() rely on that.

    A failing query is reported on stderr and yields an empty list.
    """
    db = await self._get_db()
    quoted = '"' + query.replace('"', '""') + '"'
    bindings: list[Any] = [quoted]

    filters: list[str] = []
    if project:
        filters.append("(m.scope = 'global' OR (m.scope = ? AND m.project = ?))")
        bindings += [f"project:{project}", project]
    if mem_type:
        filters.append("m.type = ?")
        bindings.append(mem_type)
    bindings.append(limit)

    extra_clause = "AND " + " AND ".join(filters) if filters else ""
    sql = f"""
        SELECT m.* FROM memories m
        JOIN memories_fts ON m.rowid = memories_fts.rowid
        WHERE memories_fts MATCH ?
        {extra_clause}
        AND m.archived_at IS NULL
        AND m.invalidated_at IS NULL
        ORDER BY memories_fts.rank
        LIMIT ?
    """
    try:
        async with db.execute(sql, bindings) as cursor:
            fetched = await cursor.fetchall()
        return [dict(record) for record in fetched]
    except Exception as e:
        print(f"[deja] FTS search error: {e}", file=sys.stderr)
        return []
721
+
722
async def _search_embedding(
    self,
    query_embedding: list[float],
    project: Optional[str] = None,
    mem_type: Optional[str] = None,
    limit: int = 20,
) -> list[dict]:
    """Rank active memories by cosine similarity to ``query_embedding``.

    Only rows with a stored embedding are considered. Every candidate is
    loaded and scored in Python; rows whose embedding cannot be decoded or
    compared are skipped. The ``limit`` most similar rows are returned as full
    rows, embedding bytes included (search() needs them for the confusability
    penalty).
    """
    db = await self._get_db()
    clauses = [
        "archived_at IS NULL",
        "invalidated_at IS NULL",
        "embedding IS NOT NULL",
    ]
    bindings: list[Any] = []
    if project:
        clauses.append("(scope = 'global' OR (scope = ? AND project = ?))")
        bindings += [f"project:{project}", project]
    if mem_type:
        clauses.append("type = ?")
        bindings.append(mem_type)

    sql = "SELECT * FROM memories WHERE " + " AND ".join(clauses)
    async with db.execute(sql, bindings) as cursor:
        candidates = [dict(record) for record in await cursor.fetchall()]

    ranked: list[tuple[float, dict]] = []
    for candidate in candidates:
        try:
            similarity = _cosine_similarity(
                query_embedding, _bytes_to_emb(candidate["embedding"])
            )
        except Exception:
            continue
        ranked.append((similarity, candidate))

    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [candidate for _, candidate in ranked[:limit]]
765
+
766
async def _increment_reuse_for_ids(self, memory_ids: list[str]) -> None:
    """Add 1 to ``reuse_count`` for each listed memory (search activation signal).

    Also refreshes ``updated_at`` on those rows and commits. An empty id list
    is a no-op that never touches the database.
    """
    if memory_ids:
        db = await self._get_db()
        id_slots = ",".join(["?"] * len(memory_ids))
        sql = (
            "UPDATE memories SET reuse_count = reuse_count + 1, updated_at = ? "
            f"WHERE id IN ({id_slots})"
        )
        await db.execute(sql, [_now_iso()] + memory_ids)
        await db.commit()
778
+
779
async def search(
    self,
    query: str,
    project: Optional[str] = None,
    mem_type: Optional[str] = None,
    limit: int = 20,
    embedding_adapter: Any = None,
    track_usage: bool = True,
) -> list[dict]:
    """Hybrid search: trigger phrases, BM25 (FTS5) keywords and embedding similarity.

    Pipeline:
      0. Trigger pass - memories with a trigger phrase contained in the query.
      1. FTS5 BM25 keyword search - always runs.
      2. Embedding similarity search - runs whenever ``embedding_adapter`` is
         provided; a failure is reported on stderr and ignored.
      3. Merge, deduplicated by id; on duplicates the earlier source wins
         (trigger, then FTS, then embedding).
      4. Activation ranking - each candidate is scored as
         task_match * 2 + confidence + recency * 0.5 + reuse_norm * 0.5 + scope_fit.
         task_match is 2.0 for trigger hits, the cosine similarity for results
         with a stored embedding (when a query embedding exists), and the
         normalised rank position (0.1-1.0) otherwise.
      5. Confusability penalty - a procedure with cosine >0.85 to a
         higher-ranked procedure has its score multiplied by 0.6.
      6. The top ``limit`` results are returned with embeddings stripped.
      7. reuse_count is incremented for the returned memories, but only when
         ``embedding_adapter`` is provided and ``track_usage`` is True. Pass
         track_usage=False from benchmarks or batch jobs.
    """
    # Step 0: trigger pass - memories whose trigger phrase appears in the query.
    trigger_results = await self._search_by_trigger(query, project, mem_type)

    # Step 1: FTS5 keyword search.
    fts_results = await self._search_fts(query, project, mem_type, limit)

    # Step 2: embedding search. It runs alongside FTS rather than as a
    # fallback, so poor keyword matches cannot block semantic matches;
    # activation ranking decides between them.
    embedding_results: list[dict] = []
    query_embedding: Optional[list[float]] = None
    if embedding_adapter is not None:
        try:
            query_embedding = await embedding_adapter.embed(query)
            embedding_results = await self._search_embedding(
                query_embedding, project, mem_type, limit
            )
        except Exception as e:
            print(f"[deja] Embedding search error: {e}", file=sys.stderr)

    # Step 3: merge and dedup by id, remembering each hit's source and rank.
    seen_ids: set[str] = set()
    merged: list[tuple[dict, str, int]] = []
    ordered_sources = (
        ("trigger", trigger_results),
        ("fts", fts_results),
        ("emb", embedding_results),
    )
    for source, hits in ordered_sources:
        for rank, mem in enumerate(hits):
            if mem["id"] in seen_ids:
                continue
            seen_ids.add(mem["id"])
            merged.append((mem, source, rank))

    if not merged:
        return []

    # Step 4: activation ranking.
    now_dt = datetime.now(timezone.utc)
    scored: list[tuple[dict, Optional[bytes], float]] = []
    for mem, source, rank in merged:
        stored = mem.get("embedding")
        emb_bytes = stored if isinstance(stored, bytes) else None

        if source == "trigger":
            # Deliberately above the maximum cosine of 1.0 so trigger hits
            # are favoured over FTS/embedding results.
            task_match = 2.0
        elif query_embedding is not None and emb_bytes:
            # Cosine puts FTS and embedding hits on one scale; otherwise the
            # top FTS hit would always score 1.0 even when it is wrong.
            task_match = _cosine_similarity(query_embedding, _bytes_to_emb(emb_bytes))
        else:
            # No embedding to compare: fall back to rank position (0.1-1.0).
            task_match = max(0.1, 1.0 - rank / max(len(fts_results), 1))

        scored.append(
            (mem, emb_bytes, _activation_score(mem, task_match, project, now_dt))
        )

    # Step 5: confusability penalty for near-duplicate procedures.
    scored = _apply_confusability_penalty(scored)

    # Step 6: final ordering and cut-off.
    scored.sort(key=lambda entry: entry[2], reverse=True)
    final = [_strip_embedding(mem) for mem, _, _ in scored[:limit]]

    # Step 7: passive activation signal.
    if embedding_adapter is not None and track_usage and final:
        await self._increment_reuse_for_ids([m["id"] for m in final])

    return final
889
+
890
async def archive(self, memory_id: str) -> None:
    """Soft-delete a memory by stamping its ``archived_at`` column.

    The row itself is kept. ``archived_at`` and ``updated_at`` both receive
    the same current UTC timestamp and the change is committed immediately.
    No error is raised when ``memory_id`` matches no row.
    """
    conn = await self._get_db()
    stamp = _now_iso()
    sql = "UPDATE memories SET archived_at = ?, updated_at = ? WHERE id = ?"
    await conn.execute(sql, (stamp, stamp, memory_id))
    await conn.commit()
898
+
899
async def update_memory(self, memory_id: str, fields: dict) -> bool:
    """Patch the editable metadata of a stored memory.

    Only the ``trigger`` and ``type`` keys of ``fields`` are honoured; every
    other key, and any key whose value is ``None``, is ignored. Content is
    not editable through this method.

    An incoming ``trigger`` never replaces the stored one. Both values are
    split on commas, stripped, unioned, and written back as a sorted,
    ", "-joined string.

    Returns True when the UPDATE touched a row. Returns False when no usable
    field was supplied, when a trigger update targets an unknown id, or when
    the memory is archived or invalidated (the UPDATE excludes those rows).
    """
    permitted = ("trigger", "type")
    changes = {
        name: value
        for name, value in fields.items()
        if name in permitted and value is not None
    }
    if not changes:
        return False

    conn = await self._get_db()
    stamp = _now_iso()

    if "trigger" in changes:
        async with conn.execute(
            "SELECT trigger FROM memories WHERE id = ?", (memory_id,)
        ) as cur:
            current = await cur.fetchone()
        if current is None:
            return False

        def _phrases(raw: str) -> set:
            # Comma-separated phrases, whitespace-trimmed, blanks dropped.
            return {part.strip() for part in raw.split(",") if part.strip()}

        stored = _phrases((current["trigger"] or "") if current else "")
        incoming = _phrases(changes["trigger"] or "")
        changes["trigger"] = ", ".join(sorted(stored | incoming))

    # Column names come from the fixed `permitted` tuple, so interpolating
    # them into the statement is safe; values are always bound.
    assignments = ", ".join(f"{column} = ?" for column in changes)
    bound = [*changes.values(), stamp, memory_id]
    result = await conn.execute(
        f"UPDATE memories SET {assignments}, updated_at = ? WHERE id = ? "
        f"AND archived_at IS NULL AND invalidated_at IS NULL",
        bound,
    )
    await conn.commit()
    return result.rowcount > 0
939
+
940
async def list_for_export(
    self,
    project: Optional[str] = None,
    types: Optional[list[str]] = None,
    include_archived: bool = False,
) -> list[dict]:
    """Return memories for export, oldest first, without embedding bytes.

    Filters (all optional, combined with AND):
    - ``project``: keep only rows whose scope is ``project:<project>``;
      global memories are not included.
    - ``types``: keep only rows whose type is in this list.
    - ``include_archived``: when False (default), archived and invalidated
      rows are both excluded.
    """
    conn = await self._get_db()
    clauses: list[str] = []
    bound: list[Any] = []

    if project:
        clauses.append("scope = ?")
        bound.append(f"project:{project}")

    if types:
        marks = ",".join(["?"] * len(types))
        clauses.append(f"type IN ({marks})")
        bound += types

    if not include_archived:
        clauses += ["archived_at IS NULL", "invalidated_at IS NULL"]

    where_clause = f"WHERE {' AND '.join(clauses)}" if clauses else ""
    sql = f"SELECT * FROM memories {where_clause} ORDER BY created_at ASC"

    async with conn.execute(sql, bound) as cur:
        fetched = await cur.fetchall()
    return [_strip_embedding(dict(record)) for record in fetched]
973
+
974
async def upsert(
    self,
    memory: dict,
    merge_strategy: str = "skip",
) -> str:
    """Insert or update a memory during import based on merge strategy.

    Strategies (applied only when a row with ``memory["id"]`` already exists;
    otherwise the record is simply inserted):
    - skip: do nothing.
    - overwrite: replace every supplied column on the existing record.
    - update-confidence: if the content matches, bump confidence by 0.05
      (capped at 1.0) and refresh ``last_confirmed``; otherwise do nothing.

    Any other strategy name behaves like "skip".

    Column names are taken from the keys of ``memory`` and must be
    interpolated into the SQL text (identifiers cannot be bound as
    parameters). Because import data is external input, every key is checked
    to be a plain ASCII identifier before any statement is built.

    Returns:
        "inserted", "overwritten", "updated" or "skipped".

    Raises:
        KeyError: if ``memory`` has no ``"id"`` key.
        ValueError: if any key of ``memory`` is not a plain identifier.
    """
    db = await self._get_db()
    mem_id = memory["id"]

    # Reject anything that could smuggle SQL through a column name.
    invalid = [
        key
        for key in memory
        if not (isinstance(key, str) and key.isascii() and key.isidentifier())
    ]
    if invalid:
        raise ValueError(f"Invalid memory field name(s): {invalid!r}")

    existing = await self.get(mem_id)
    if not existing:
        # New record, just insert
        fields = list(memory)
        placeholders = ",".join("?" for _ in fields)
        query = f"INSERT INTO memories ({','.join(fields)}) VALUES ({placeholders})"
        await db.execute(query, [memory[f] for f in fields])
        await db.commit()
        return "inserted"

    if merge_strategy == "overwrite":
        fields = [f for f in memory if f != "id"]
        if not fields:
            # Only the id was supplied: there is nothing to overwrite, and
            # an UPDATE with an empty SET list would be invalid SQL.
            return "skipped"
        set_clause = ",".join(f"{f} = ?" for f in fields)
        query = f"UPDATE memories SET {set_clause} WHERE id = ?"
        params = [memory[f] for f in fields] + [mem_id]
        await db.execute(query, params)
        await db.commit()
        return "overwritten"

    if merge_strategy == "update-confidence" and memory["content"] == existing["content"]:
        new_confidence = min(1.0, existing["confidence"] + 0.05)
        now = _now_iso()
        await db.execute(
            """
            UPDATE memories
            SET confidence = ?, last_confirmed = ?, updated_at = ?
            WHERE id = ?
            """,
            (new_confidence, now, now, mem_id),
        )
        await db.commit()
        return "updated"

    # "skip", a content mismatch, or an unrecognised strategy.
    return "skipped"
1029
+
1030
async def get(self, memory_id: str) -> Optional[dict]:
    """Fetch one memory by primary key.

    The lookup applies no archived/invalidated filter. Returns the row as a
    dict passed through ``_strip_embedding``, or None when no row has that id.
    """
    conn = await self._get_db()
    async with conn.execute(
        "SELECT * FROM memories WHERE id = ?", (memory_id,)
    ) as cur:
        record = await cur.fetchone()
    if not record:
        return None
    return _strip_embedding(dict(record))
1037
+
1038
async def invalidate(self, memory_id: str) -> None:
    """Flag a memory as superseded by newer information.

    Sets ``invalidated_at`` and ``updated_at`` to the same current UTC
    timestamp and commits. The row is not deleted, and an unknown id is
    silently ignored.
    """
    conn = await self._get_db()
    stamp = _now_iso()
    sql = "UPDATE memories SET invalidated_at = ?, updated_at = ? WHERE id = ?"
    await conn.execute(sql, (stamp, stamp, memory_id))
    await conn.commit()
1047
+
1048
async def save_observation(self, project: Optional[str], content: str) -> str:
    """Append a single observation row and return its newly generated id.

    The row gets a fresh ULID, a rough token estimate (two per
    whitespace-separated word of ``content``), the current UTC timestamp and
    ``reflector_pass = 0``. The insert is committed before returning.
    """
    conn = await self._get_db()
    new_id = str(ULID())
    created = _now_iso()
    approx_tokens = 2 * len(content.split())
    record = (new_id, project, content, approx_tokens, created)
    await conn.execute(
        """
        INSERT INTO observations (id, project, content, token_estimate, created_at, reflector_pass)
        VALUES (?, ?, ?, ?, ?, 0)
        """,
        record,
    )
    await conn.commit()
    return new_id
1063
+
1064
async def list_observations(self, project: Optional[str] = None) -> list[dict]:
    """Return observations ordered by creation time, oldest first.

    With ``project`` set, only that project's observations are returned.
    With ``project=None`` every observation is returned, whatever its project.
    Each row is converted to a plain dict.
    """
    conn = await self._get_db()
    if project is None:
        pending = conn.execute(
            "SELECT * FROM observations ORDER BY created_at ASC"
        )
    else:
        pending = conn.execute(
            "SELECT * FROM observations WHERE project = ? ORDER BY created_at ASC",
            (project,),
        )
    async with pending as cur:
        fetched = await cur.fetchall()
    return list(map(dict, fetched))
1079
+
1080
async def replace_observations(
    self, project: Optional[str], new_texts: list[str]
) -> None:
    """Replace the full observation log for a project with condensed versions.

    All existing observations for ``project`` are deleted (``project=None``
    targets rows whose project is NULL) and one new row per entry in
    ``new_texts`` is inserted. Each new row gets a fresh ULID, a token
    estimate of two per whitespace-separated word, a shared timestamp and
    ``reflector_pass = 1``.

    The replacement is all-or-nothing. Replacement rows are built before the
    database is touched, and if any statement fails the pending changes are
    rolled back before the exception is re-raised. Otherwise a later,
    unrelated ``commit()`` on the same connection could persist a deleted or
    half-written log.
    """
    db = await self._get_db()
    now = _now_iso()
    # Build every row up front so bad input fails before the DELETE runs.
    rows = [
        (str(ULID()), project, text, len(text.split()) * 2, now)
        for text in new_texts
    ]
    try:
        if project is not None:
            await db.execute("DELETE FROM observations WHERE project = ?", (project,))
        else:
            await db.execute("DELETE FROM observations WHERE project IS NULL")
        if rows:
            await db.executemany(
                """
                INSERT INTO observations (id, project, content, token_estimate, created_at, reflector_pass)
                VALUES (?, ?, ?, ?, ?, 1)
                """,
                rows,
            )
        await db.commit()
    except Exception:
        # Undo the DELETE / partial INSERTs so they cannot leak into a later commit.
        await db.rollback()
        raise
1101
+
1102
async def get_reflection_meta(self, project: Optional[str] = None) -> Optional[dict]:
    """Look up the reflection bookkeeping row for a project.

    ``project`` is first mapped to its storage key via ``_project_meta_key``.
    Returns the row as a dict, or None when no row exists for that key.
    """
    conn = await self._get_db()
    meta_key = _project_meta_key(project)
    async with conn.execute(
        "SELECT * FROM reflection_meta WHERE project = ?", (meta_key,)
    ) as cur:
        found = await cur.fetchone()
    if found:
        return dict(found)
    return None
1111
+
1112
async def set_reflection_meta(self, project: Optional[str] = None, **fields) -> None:
    """Insert or update reflection metadata fields for a project.

    ``project`` is mapped to its storage key via ``_project_meta_key``.
    - No row yet: a row is inserted with every ``last_*_at`` column set to
      None, overlaid with ``fields``.
    - Row exists: only the columns named in ``fields`` are updated.
    - Row exists and ``fields`` is empty: nothing is executed. An UPDATE
      with an empty SET list would be a SQL syntax error.

    Keyword names are interpolated into the statement as column names, so
    callers must pass only real ``reflection_meta`` columns.
    """
    db = await self._get_db()
    key = _project_meta_key(project)
    existing = await self.get_reflection_meta(project)
    if existing is None:
        all_fields: dict = {
            "project": key,
            "last_observer_at": None,
            "last_reflector_at": None,
            "last_decay_at": None,
            "last_promote_at": None,
            "last_archive_at": None,
        }
        all_fields.update(fields)
        cols = ",".join(all_fields.keys())
        placeholders = ",".join("?" for _ in all_fields)
        await db.execute(
            f"INSERT INTO reflection_meta ({cols}) VALUES ({placeholders})",
            list(all_fields.values()),
        )
    elif fields:
        set_clause = ",".join(f"{f} = ?" for f in fields)
        await db.execute(
            f"UPDATE reflection_meta SET {set_clause} WHERE project = ?",
            list(fields.values()) + [key],
        )
    else:
        # Existing row and nothing to change.
        return
    await db.commit()
1140
+
1141
async def list_for_reflection(
    self,
    project: Optional[str] = None,
    since: Optional[str] = None,
) -> list[dict]:
    """Collect the active memories the Observer should look at.

    Archived and invalidated rows are always excluded.
    - ``project=None``: no scope filter (global and every project).
    - ``project='X'``: only rows with ``scope = 'project:X'``.
    - ``since``: when truthy, only rows whose ``updated_at`` is strictly
      greater than this ISO timestamp.

    Rows come back ordered by ``updated_at`` ascending, each passed through
    ``_strip_embedding``.
    """
    conn = await self._get_db()
    filters = ["archived_at IS NULL", "invalidated_at IS NULL"]
    bound: list[Any] = []

    if project is not None:
        filters.append("scope = ?")
        bound.append(f"project:{project}")

    if since:
        filters.append("updated_at > ?")
        bound.append(since)

    sql = f"SELECT * FROM memories WHERE {' AND '.join(filters)} ORDER BY updated_at ASC"
    async with conn.execute(sql, bound) as cur:
        fetched = await cur.fetchall()
    return [_strip_embedding(dict(record)) for record in fetched]
1169
+
1170
async def decay_unconfirmed(
    self,
    days_threshold: int,
    decay_per_week: float,
    user_decay_per_week: float,
) -> int:
    """Reduce confidence on memories not confirmed in days_threshold days.

    Candidates are active (not archived, not invalidated) memories whose
    ``last_confirmed`` is NULL or older than ``days_threshold`` days.

    Two decay rates are applied based on memory category:
    - category='user': uses ``user_decay_per_week``.
    - any other category: uses ``decay_per_week``.

    The new confidence is ``confidence - rate * weeks_since``, floored at 0.0,
    where ``weeks_since`` is whole days since ``last_confirmed`` divided by 7
    (or ``days_threshold / 7`` when never confirmed). Rows whose confidence
    would change by 0.001 or less are left untouched.

    A ``last_confirmed`` value without a UTC offset (e.g. from an imported
    record) is interpreted as UTC. Subtracting a naive datetime from the
    aware "now" would otherwise raise TypeError and abort the whole pass.

    NOTE(review): each call subtracts the decay for the *total* time since
    last confirmation from the *current* confidence, so repeated runs
    compound. Presumably callers rate-limit this; confirm.

    Returns number of memories whose confidence was updated.
    """
    db = await self._get_db()
    now = datetime.now(timezone.utc)
    threshold_iso = (now - timedelta(days=days_threshold)).isoformat()
    # One timestamp for the whole batch, in the same aware-UTC ISO format
    # the rest of the store writes.
    now_iso = now.isoformat()

    async with db.execute(
        """
        SELECT id, category, confidence, last_confirmed FROM memories
        WHERE archived_at IS NULL
        AND invalidated_at IS NULL
        AND (last_confirmed IS NULL OR last_confirmed < ?)
        """,
        (threshold_iso,),
    ) as cursor:
        rows = [dict(r) for r in await cursor.fetchall()]

    count = 0
    for row in rows:
        lc = row["last_confirmed"]
        if lc:
            confirmed_at = datetime.fromisoformat(lc)
            if confirmed_at.tzinfo is None:
                # Never mix naive and aware datetimes: assume UTC.
                confirmed_at = confirmed_at.replace(tzinfo=timezone.utc)
            weeks_since = (now - confirmed_at).days / 7.0
        else:
            weeks_since = days_threshold / 7.0

        rate = user_decay_per_week if row["category"] == "user" else decay_per_week
        new_conf = max(0.0, row["confidence"] - rate * weeks_since)
        if abs(new_conf - row["confidence"]) > 0.001:
            await db.execute(
                "UPDATE memories SET confidence = ?, updated_at = ? WHERE id = ?",
                (new_conf, now_iso, row["id"]),
            )
            count += 1

    if count:
        await db.commit()
    return count
1221
+
1222
async def archive_below_threshold(self, threshold: float) -> int:
    """Archive every active memory whose confidence is strictly below ``threshold``.

    Only rows that are neither archived nor invalidated are considered. Both
    ``archived_at`` and ``updated_at`` are set to the current UTC timestamp.

    Returns the number of rows archived, as reported by the cursor.
    """
    conn = await self._get_db()
    stamp = _now_iso()
    sql = """
        UPDATE memories SET archived_at = ?, updated_at = ?
        WHERE confidence < ? AND archived_at IS NULL AND invalidated_at IS NULL
        """
    async with conn.execute(sql, (stamp, stamp, threshold)) as cur:
        archived = cur.rowcount
    await conn.commit()
    return archived
1238
+
1239
async def increment_reuse_count(self, project: Optional[str] = None) -> int:
    """Bump ``reuse_count`` by one on every active memory (run after a Reflector pass).

    A memory that is still active after compression is treated as confirmed
    worth keeping, which is why the whole active set is incremented.

    With ``project`` set, only rows with ``scope = 'project:<project>'`` are
    touched; otherwise all active rows are. ``updated_at`` is refreshed on
    each affected row.

    Returns the number of rows updated.
    """
    conn = await self._get_db()
    filters = ["archived_at IS NULL", "invalidated_at IS NULL"]
    scope_args: list[Any] = []
    if project is not None:
        filters.append("scope = ?")
        scope_args.append(f"project:{project}")
    where = f"WHERE {' AND '.join(filters)}"
    async with conn.execute(
        f"UPDATE memories SET reuse_count = reuse_count + 1, updated_at = ? {where}",
        [_now_iso(), *scope_args],
    ) as cur:
        touched = cur.rowcount
    await conn.commit()
    return touched
1260
+
1261
async def promote_patterns_to_global(self, min_project_count: int) -> int:
    """Copy recurring pattern/procedure memories into the global scope.

    Active, non-global memories of type 'pattern' or 'procedure' are scanned
    oldest first. Each unprocessed memory seeds a cluster made of itself plus
    every later memory from a different project whose content has a token
    overlap above 0.7 with the seed. When the cluster spans at least
    ``min_project_count`` distinct (non-empty) projects:

    - an FTS lookup on the seed's content checks whether a global memory
      already shows up among the top 5 hits; if none does, the
      highest-confidence member of the cluster is copied into a new
      global row with source 'deja_promote';
    - every member of the cluster is marked processed either way.

    Returns the number of new global rows inserted. Nothing is committed
    when that number is zero.
    """
    conn = await self._get_db()
    async with conn.execute(
        """
        SELECT * FROM memories
        WHERE type IN ('pattern', 'procedure')
        AND scope != 'global'
        AND archived_at IS NULL
        AND invalidated_at IS NULL
        ORDER BY created_at ASC
        """
    ) as cur:
        candidates = [dict(record) for record in await cur.fetchall()]

    promoted_total = 0
    handled: set[str] = set()

    for idx, seed in enumerate(candidates):
        if seed["id"] in handled:
            continue

        # Seed plus every later look-alike that lives in another project.
        cluster = [seed] + [
            other
            for other in candidates[idx + 1:]
            if other["project"] != seed["project"]
            and other["id"] not in handled
            and _token_overlap(seed["content"], other["content"]) > 0.7
        ]

        projects_seen = {member["project"] for member in cluster if member.get("project")}
        if len(projects_seen) < min_project_count:
            continue

        # Raw FTS5 search (no side effects) to detect an existing global copy.
        fts_hits = await self._search_fts(
            seed["content"], None, mem_type=seed["type"], limit=5
        )
        already_global = any(hit["scope"] == "global" for hit in fts_hits)

        if not already_global:
            winner = max(cluster, key=lambda member: member["confidence"])
            stamp = _now_iso()
            global_id = str(ULID())
            await conn.execute(
                """
                INSERT INTO memories
                (id, type, category, content, scope, project, source,
                confidence, reuse_count, domain, entity_graph, trigger,
                created_at, updated_at, last_confirmed)
                VALUES (?, ?, ?, ?, 'global', NULL, 'deja_promote',
                ?, ?, ?, NULL, ?, ?, ?, ?)
                """,
                (
                    global_id,
                    winner["type"],
                    winner.get("category", "agent"),
                    winner["content"],
                    winner["confidence"],
                    winner.get("reuse_count", 0),
                    winner.get("domain"),
                    winner.get("trigger"),
                    stamp, stamp, stamp,
                ),
            )
            promoted_total += 1

        handled.update(member["id"] for member in cluster)

    if promoted_total:
        await conn.commit()
    return promoted_total
1332
+
1333
async def get_stats(self, project: Optional[str] = None) -> dict:
    """Summarise the memory store for one project, or for everything.

    A truthy ``project`` restricts memory counts to ``scope = 'project:<name>'``
    and observation counts to that project. Otherwise no filter is applied
    and the result is labelled "global".

    Returned keys:
    - ``project``: the project name, or "global".
    - ``by_type`` / ``active``: active memories per type, and their sum.
    - ``archived``: memories with ``archived_at`` set.
    - ``invalidated``: memories invalidated but not archived.
    - ``observations``: number of observation rows.
    - ``token_estimate``: two per whitespace-separated word of active content.
    - ``with_embeddings``: active memories with a non-NULL embedding.
    - ``last_observer_at`` / ``last_reflector_at`` / ``last_decay_at``: taken
      from the reflection metadata row, or None when there is none.
    """
    conn = await self._get_db()

    scoped = bool(project)
    scope_filter = "scope = ?" if scoped else "1=1"
    scope_args: list[Any] = [f"project:{project}"] if scoped else []

    async def fetch_n(sql: str, args: list) -> int:
        # Run a single-row "COUNT(*) as n" query and return the count.
        async with conn.execute(sql, args) as cur:
            return (await cur.fetchone())["n"]

    async with conn.execute(
        f"""
        SELECT type, COUNT(*) as cnt FROM memories
        WHERE {scope_filter} AND archived_at IS NULL AND invalidated_at IS NULL
        GROUP BY type
        """,
        scope_args,
    ) as cur:
        per_type = {rec["type"]: rec["cnt"] for rec in await cur.fetchall()}

    archived_n = await fetch_n(
        f"SELECT COUNT(*) as n FROM memories WHERE {scope_filter} AND archived_at IS NOT NULL",
        scope_args,
    )

    invalidated_n = await fetch_n(
        f"""SELECT COUNT(*) as n FROM memories
        WHERE {scope_filter} AND invalidated_at IS NOT NULL AND archived_at IS NULL""",
        scope_args,
    )

    async with conn.execute(
        f"""SELECT content FROM memories
        WHERE {scope_filter} AND archived_at IS NULL AND invalidated_at IS NULL""",
        scope_args,
    ) as cur:
        approx_tokens = 0
        for rec in await cur.fetchall():
            approx_tokens += len(rec["content"].split()) * 2

    # Count memories with embeddings
    embedded_n = await fetch_n(
        f"""SELECT COUNT(*) as n FROM memories
        WHERE {scope_filter} AND archived_at IS NULL AND invalidated_at IS NULL
        AND embedding IS NOT NULL""",
        scope_args,
    )

    if scoped:
        obs_clause, obs_args = "WHERE project = ?", [project]
    else:
        obs_clause, obs_args = "", []
    observations_n = await fetch_n(
        f"SELECT COUNT(*) as n FROM observations {obs_clause}", obs_args
    )

    meta = (await self.get_reflection_meta(project)) or {}
    return {
        "project": project or "global",
        "active": sum(per_type.values()),
        "by_type": per_type,
        "archived": archived_n,
        "invalidated": invalidated_n,
        "observations": observations_n,
        "token_estimate": approx_tokens,
        "with_embeddings": embedded_n,
        "last_observer_at": meta.get("last_observer_at"),
        "last_reflector_at": meta.get("last_reflector_at"),
        "last_decay_at": meta.get("last_decay_at"),
    }
1409
+
1410
async def close(self) -> None:
    """Close the cached database connection, if one is open.

    After closing, ``self._db`` is reset to None, so calling this again is a
    no-op.
    """
    if self._db is None:
        return
    await self._db.close()
    self._db = None