@context-vault/core 2.17.0 → 2.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index/db.js CHANGED
@@ -42,7 +42,69 @@ function runTransaction(db, fn) {
42
42
  }
43
43
  }
44
44
 
45
- export const SCHEMA_DDL = `
45
+ // Local-mode schema: no multi-tenancy or encryption columns.
46
+ // Identity uniqueness is scoped to (kind, identity_key) — no user_id.
47
+ export const LOCAL_SCHEMA_DDL = `
48
+ CREATE TABLE IF NOT EXISTS vault (
49
+ id TEXT PRIMARY KEY,
50
+ kind TEXT NOT NULL,
51
+ category TEXT NOT NULL DEFAULT 'knowledge',
52
+ title TEXT,
53
+ body TEXT NOT NULL,
54
+ meta TEXT,
55
+ tags TEXT,
56
+ source TEXT,
57
+ file_path TEXT UNIQUE,
58
+ identity_key TEXT,
59
+ expires_at TEXT,
60
+ superseded_by TEXT,
61
+ created_at TEXT DEFAULT (datetime('now')),
62
+ updated_at TEXT,
63
+ hit_count INTEGER DEFAULT 0,
64
+ last_accessed_at TEXT,
65
+ source_files TEXT,
66
+ tier TEXT DEFAULT 'working' CHECK(tier IN ('ephemeral', 'working', 'durable')),
67
+ related_to TEXT
68
+ );
69
+
70
+ CREATE INDEX IF NOT EXISTS idx_vault_kind ON vault(kind);
71
+ CREATE INDEX IF NOT EXISTS idx_vault_category ON vault(category);
72
+ CREATE INDEX IF NOT EXISTS idx_vault_category_created ON vault(category, created_at DESC);
73
+ CREATE INDEX IF NOT EXISTS idx_vault_updated ON vault(updated_at DESC);
74
+ CREATE UNIQUE INDEX IF NOT EXISTS idx_vault_identity ON vault(kind, identity_key) WHERE identity_key IS NOT NULL AND category = 'entity';
75
+ CREATE INDEX IF NOT EXISTS idx_vault_superseded ON vault(superseded_by) WHERE superseded_by IS NOT NULL;
76
+ CREATE INDEX IF NOT EXISTS idx_vault_tier ON vault(tier);
77
+
78
+ -- Single FTS5 table
79
+ CREATE VIRTUAL TABLE IF NOT EXISTS vault_fts USING fts5(
80
+ title, body, tags, kind,
81
+ content='vault', content_rowid='rowid'
82
+ );
83
+
84
+ -- FTS sync triggers
85
+ CREATE TRIGGER IF NOT EXISTS vault_ai AFTER INSERT ON vault BEGIN
86
+ INSERT INTO vault_fts(rowid, title, body, tags, kind)
87
+ VALUES (new.rowid, new.title, new.body, new.tags, new.kind);
88
+ END;
89
+ CREATE TRIGGER IF NOT EXISTS vault_ad AFTER DELETE ON vault BEGIN
90
+ INSERT INTO vault_fts(vault_fts, rowid, title, body, tags, kind)
91
+ VALUES ('delete', old.rowid, old.title, old.body, old.tags, old.kind);
92
+ END;
93
+ CREATE TRIGGER IF NOT EXISTS vault_au AFTER UPDATE ON vault BEGIN
94
+ INSERT INTO vault_fts(vault_fts, rowid, title, body, tags, kind)
95
+ VALUES ('delete', old.rowid, old.title, old.body, old.tags, old.kind);
96
+ INSERT INTO vault_fts(rowid, title, body, tags, kind)
97
+ VALUES (new.rowid, new.title, new.body, new.tags, new.kind);
98
+ END;
99
+
100
+ -- Single vec table (384-dim float32 for all-MiniLM-L6-v2)
101
+ CREATE VIRTUAL TABLE IF NOT EXISTS vault_vec USING vec0(embedding float[384]);
102
+ `;
103
+
104
+ // Hosted-mode schema: adds multi-tenancy (user_id, team_id) and at-rest
105
+ // encryption columns (body_encrypted, title_encrypted, meta_encrypted, iv).
106
+ // Identity uniqueness is scoped to (user_id, kind, identity_key).
107
+ export const HOSTED_SCHEMA_DDL = `
46
108
  CREATE TABLE IF NOT EXISTS vault (
47
109
  id TEXT PRIMARY KEY,
48
110
  kind TEXT NOT NULL,
@@ -67,7 +129,8 @@ export const SCHEMA_DDL = `
67
129
  hit_count INTEGER DEFAULT 0,
68
130
  last_accessed_at TEXT,
69
131
  source_files TEXT,
70
- tier TEXT DEFAULT 'working' CHECK(tier IN ('ephemeral', 'working', 'durable'))
132
+ tier TEXT DEFAULT 'working' CHECK(tier IN ('ephemeral', 'working', 'durable')),
133
+ related_to TEXT
71
134
  );
72
135
 
73
136
  CREATE INDEX IF NOT EXISTS idx_vault_kind ON vault(kind);
@@ -106,7 +169,13 @@ export const SCHEMA_DDL = `
106
169
  CREATE VIRTUAL TABLE IF NOT EXISTS vault_vec USING vec0(embedding float[384]);
107
170
  `;
108
171
 
109
- export async function initDatabase(dbPath) {
172
+ // Backward-compatible alias kept for external consumers that reference SCHEMA_DDL.
173
+ export const SCHEMA_DDL = HOSTED_SCHEMA_DDL;
174
+
175
+ // Current target schema version. Bump this on every migration.
176
+ const CURRENT_VERSION = 14;
177
+
178
+ export async function initDatabase(dbPath, { mode = "local" } = {}) {
110
179
  const sqliteVec = await loadSqliteVec();
111
180
 
112
181
  function createDb(path) {
@@ -121,6 +190,8 @@ export async function initDatabase(dbPath) {
121
190
  return db;
122
191
  }
123
192
 
193
+ const schemaDdl = mode === "hosted" ? HOSTED_SCHEMA_DDL : LOCAL_SCHEMA_DDL;
194
+
124
195
  const db = createDb(dbPath);
125
196
  const version = db.prepare("PRAGMA user_version").get().user_version;
126
197
 
@@ -155,14 +226,14 @@ export async function initDatabase(dbPath) {
155
226
  } catch {}
156
227
 
157
228
  const freshDb = createDb(dbPath);
158
- freshDb.exec(SCHEMA_DDL);
159
- freshDb.exec("PRAGMA user_version = 12");
229
+ freshDb.exec(schemaDdl);
230
+ freshDb.exec(`PRAGMA user_version = ${CURRENT_VERSION}`);
160
231
  return freshDb;
161
232
  }
162
233
 
163
234
  if (version < 5) {
164
- db.exec(SCHEMA_DDL);
165
- db.exec("PRAGMA user_version = 12");
235
+ db.exec(schemaDdl);
236
+ db.exec(`PRAGMA user_version = ${CURRENT_VERSION}`);
166
237
  } else if (version === 5) {
167
238
  // v5 -> v6 migration: add multi-tenancy + encryption columns
168
239
  // Wrapped in transaction with duplicate-column guards for idempotent retry
@@ -344,12 +415,108 @@ export async function initDatabase(dbPath) {
344
415
  });
345
416
  }
346
417
 
418
+ if (version >= 5 && version <= 12) {
419
+ // v12 -> v13 migration: add related_to column for graph linking
420
+ runTransaction(db, () => {
421
+ try {
422
+ db.exec(`ALTER TABLE vault ADD COLUMN related_to TEXT`);
423
+ } catch (e) {
424
+ if (!e.message.includes("duplicate column")) throw e;
425
+ }
426
+ db.exec("PRAGMA user_version = 13");
427
+ });
428
+ }
429
+
430
+ if (version >= 5 && version <= 13) {
431
+ // v13 -> v14 migration: separate local and hosted schemas.
432
+ // Local mode: drop the 6 hosted-only columns (user_id, team_id,
433
+ // body_encrypted, title_encrypted, meta_encrypted, iv) and rebuild
434
+ // the identity index without user_id.
435
+ // Hosted mode: no structural change — just bump version.
436
+ runTransaction(db, () => {
437
+ if (mode === "local") {
438
+ // Must drop indexes that reference the columns before dropping columns.
439
+ db.exec(`DROP INDEX IF EXISTS idx_vault_user`);
440
+ db.exec(`DROP INDEX IF EXISTS idx_vault_team`);
441
+ db.exec(`DROP INDEX IF EXISTS idx_vault_identity`);
442
+ const dropColumnSafe = (col) => {
443
+ try {
444
+ db.exec(`ALTER TABLE vault DROP COLUMN ${col}`);
445
+ } catch (e) {
446
+ // Column may not exist on older schemas that never had it added.
447
+ if (!e.message.includes("no such column")) throw e;
448
+ }
449
+ };
450
+ dropColumnSafe("user_id");
451
+ dropColumnSafe("team_id");
452
+ dropColumnSafe("body_encrypted");
453
+ dropColumnSafe("title_encrypted");
454
+ dropColumnSafe("meta_encrypted");
455
+ dropColumnSafe("iv");
456
+ // Recreate identity uniqueness index scoped to (kind, identity_key),
457
+ // restricted to entity-category entries only (knowledge/event entries
458
+ // with identity_key are informational and may duplicate).
459
+ db.exec(
460
+ `CREATE UNIQUE INDEX IF NOT EXISTS idx_vault_identity ON vault(kind, identity_key) WHERE identity_key IS NOT NULL AND category = 'entity'`,
461
+ );
462
+ }
463
+ db.exec(`PRAGMA user_version = ${CURRENT_VERSION}`);
464
+ });
465
+ }
466
+
347
467
  return db;
348
468
  }
349
469
 
350
- export function prepareStatements(db) {
470
+ export function prepareStatements(db, mode = "local") {
351
471
  try {
472
+ if (mode === "local") {
473
+ // Local mode: no user_id, team_id, or encryption columns.
474
+ // insertEntry has 15 params (no user_id).
475
+ // getByIdentityKey and upsertByIdentityKey have no user_id WHERE clause.
476
+ return {
477
+ _mode: "local",
478
+ insertEntry: db.prepare(
479
+ `INSERT INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at, source_files, tier) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
480
+ ),
481
+ updateEntry: db.prepare(
482
+ `UPDATE vault SET title = ?, body = ?, meta = ?, tags = ?, source = ?, category = ?, identity_key = ?, expires_at = ?, updated_at = datetime('now') WHERE file_path = ?`,
483
+ ),
484
+ deleteEntry: db.prepare(`DELETE FROM vault WHERE id = ?`),
485
+ getRowid: db.prepare(`SELECT rowid FROM vault WHERE id = ?`),
486
+ getRowidByPath: db.prepare(
487
+ `SELECT rowid FROM vault WHERE file_path = ?`,
488
+ ),
489
+ getEntryById: db.prepare(`SELECT * FROM vault WHERE id = ?`),
490
+ getByIdentityKey: db.prepare(
491
+ `SELECT * FROM vault WHERE kind = ? AND identity_key = ?`,
492
+ ),
493
+ upsertByIdentityKey: db.prepare(
494
+ `UPDATE vault SET title = ?, body = ?, meta = ?, tags = ?, source = ?, category = ?, file_path = ?, expires_at = ?, source_files = ?, updated_at = datetime('now') WHERE kind = ? AND identity_key = ?`,
495
+ ),
496
+ updateSourceFiles: db.prepare(
497
+ `UPDATE vault SET source_files = ? WHERE id = ?`,
498
+ ),
499
+ updateRelatedTo: db.prepare(
500
+ `UPDATE vault SET related_to = ? WHERE id = ?`,
501
+ ),
502
+ insertVecStmt: db.prepare(
503
+ `INSERT INTO vault_vec (rowid, embedding) VALUES (?, ?)`,
504
+ ),
505
+ deleteVecStmt: db.prepare(`DELETE FROM vault_vec WHERE rowid = ?`),
506
+ updateSupersededBy: db.prepare(
507
+ `UPDATE vault SET superseded_by = ? WHERE id = ?`,
508
+ ),
509
+ clearSupersededByRef: db.prepare(
510
+ `UPDATE vault SET superseded_by = NULL WHERE superseded_by = ?`,
511
+ ),
512
+ };
513
+ }
514
+
515
+ // Hosted mode: full schema with user_id scoping and encryption support.
516
+ // insertEntry has 16 params (includes user_id).
517
+ // getByIdentityKey and upsertByIdentityKey scope by user_id IS ?.
352
518
  return {
519
+ _mode: "hosted",
353
520
  insertEntry: db.prepare(
354
521
  `INSERT INTO vault (id, user_id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at, source_files, tier) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
355
522
  ),
@@ -372,6 +539,9 @@ export function prepareStatements(db) {
372
539
  updateSourceFiles: db.prepare(
373
540
  `UPDATE vault SET source_files = ? WHERE id = ?`,
374
541
  ),
542
+ updateRelatedTo: db.prepare(
543
+ `UPDATE vault SET related_to = ? WHERE id = ?`,
544
+ ),
375
545
  insertVecStmt: db.prepare(
376
546
  `INSERT INTO vault_vec (rowid, embedding) VALUES (?, ?)`,
377
547
  ),
@@ -66,37 +66,53 @@ export async function indexEntry(
66
66
  const cat = category || categoryFor(kind);
67
67
  const effectiveTier = tier || defaultTierFor(kind);
68
68
  const userIdVal = userId || null;
69
+ const isLocal = ctx.stmts._mode === "local";
69
70
 
70
71
  let wasUpdate = false;
71
72
 
72
- // Entity upsert: check by (kind, identity_key, user_id) first
73
+ // Entity upsert: check by (kind, identity_key[, user_id]) first.
74
+ // Local mode omits user_id — all entries are user-agnostic.
73
75
  if (cat === "entity" && identity_key) {
74
- const existing = ctx.stmts.getByIdentityKey.get(
75
- kind,
76
- identity_key,
77
- userIdVal,
78
- );
76
+ const existing = isLocal
77
+ ? ctx.stmts.getByIdentityKey.get(kind, identity_key)
78
+ : ctx.stmts.getByIdentityKey.get(kind, identity_key, userIdVal);
79
79
  if (existing) {
80
- ctx.stmts.upsertByIdentityKey.run(
81
- title || null,
82
- body,
83
- metaJson,
84
- tagsJson,
85
- source || "claude-code",
86
- cat,
87
- filePath,
88
- expires_at || null,
89
- sourceFilesJson,
90
- kind,
91
- identity_key,
92
- userIdVal,
93
- );
80
+ if (isLocal) {
81
+ ctx.stmts.upsertByIdentityKey.run(
82
+ title || null,
83
+ body,
84
+ metaJson,
85
+ tagsJson,
86
+ source || "claude-code",
87
+ cat,
88
+ filePath,
89
+ expires_at || null,
90
+ sourceFilesJson,
91
+ kind,
92
+ identity_key,
93
+ );
94
+ } else {
95
+ ctx.stmts.upsertByIdentityKey.run(
96
+ title || null,
97
+ body,
98
+ metaJson,
99
+ tagsJson,
100
+ source || "claude-code",
101
+ cat,
102
+ filePath,
103
+ expires_at || null,
104
+ sourceFilesJson,
105
+ kind,
106
+ identity_key,
107
+ userIdVal,
108
+ );
109
+ }
94
110
  wasUpdate = true;
95
111
  }
96
112
  }
97
113
 
98
114
  if (!wasUpdate) {
99
- // Prepare encryption if ctx.encrypt is available
115
+ // Prepare encryption if ctx.encrypt is available (hosted mode only)
100
116
  let encrypted = null;
101
117
  if (ctx.encrypt) {
102
118
  encrypted = await ctx.encrypt({ title, body, meta });
@@ -104,7 +120,8 @@ export async function indexEntry(
104
120
 
105
121
  try {
106
122
  if (encrypted) {
107
- // Encrypted insert: store preview in body column for FTS, full content in encrypted columns
123
+ // Hosted-mode encrypted insert: store preview in body for FTS,
124
+ // full content in encrypted columns.
108
125
  const bodyPreview = body.slice(0, 200);
109
126
  ctx.stmts.insertEntryEncrypted.run(
110
127
  id,
@@ -128,7 +145,27 @@ export async function indexEntry(
128
145
  sourceFilesJson,
129
146
  effectiveTier,
130
147
  );
148
+ } else if (isLocal) {
149
+ // Local mode: no user_id column — 15 params.
150
+ ctx.stmts.insertEntry.run(
151
+ id,
152
+ kind,
153
+ cat,
154
+ title || null,
155
+ body,
156
+ metaJson,
157
+ tagsJson,
158
+ source || "claude-code",
159
+ filePath,
160
+ identity_key || null,
161
+ expires_at || null,
162
+ createdAt,
163
+ createdAt,
164
+ sourceFilesJson,
165
+ effectiveTier,
166
+ );
131
167
  } else {
168
+ // Hosted mode without encryption: 16 params (includes user_id).
132
169
  ctx.stmts.insertEntry.run(
133
170
  id,
134
171
  userIdVal,
@@ -262,10 +299,14 @@ export async function reindex(ctx, opts = {}) {
262
299
 
263
300
  if (!existsSync(ctx.config.vaultDir)) return stats;
264
301
 
265
- // Use INSERT OR IGNORE for reindex — handles files with duplicate frontmatter IDs
266
- // user_id is NULL for reindex (always local mode)
302
+ // Use INSERT OR IGNORE for reindex — handles files with duplicate frontmatter IDs.
303
+ // Local mode: no user_id column (13 columns, 13 bound params).
304
+ // Hosted mode: user_id is NULL for file-sourced entries (14 columns, 13 bound params; user_id is a NULL literal).
305
+ const isLocalReindex = ctx.stmts._mode === "local";
267
306
  const upsertEntry = ctx.db.prepare(
268
- `INSERT OR IGNORE INTO vault (id, user_id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at) VALUES (?, NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
307
+ isLocalReindex
308
+ ? `INSERT OR IGNORE INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
309
+ : `INSERT OR IGNORE INTO vault (id, user_id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at) VALUES (?, NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
269
310
  );
270
311
 
271
312
  // Auto-discover kind directories, supporting both:
@@ -317,7 +358,7 @@ export async function reindex(ctx, opts = {}) {
317
358
  // P3: Fetch all mutable fields for change detection
318
359
  const dbRows = ctx.db
319
360
  .prepare(
320
- "SELECT id, file_path, body, title, tags, meta FROM vault WHERE kind = ?",
361
+ "SELECT id, file_path, body, title, tags, meta, related_to FROM vault WHERE kind = ?",
321
362
  )
322
363
  .all(kind);
323
364
  const dbByPath = new Map(dbRows.map((r) => [r.file_path, r]));
@@ -343,6 +384,12 @@ export async function reindex(ctx, opts = {}) {
343
384
  // Extract identity_key and expires_at from frontmatter
344
385
  const identity_key = fmMeta.identity_key || null;
345
386
  const expires_at = fmMeta.expires_at || null;
387
+ const related_to = Array.isArray(fmMeta.related_to)
388
+ ? fmMeta.related_to
389
+ : null;
390
+ const relatedToJson = related_to?.length
391
+ ? JSON.stringify(related_to)
392
+ : null;
346
393
 
347
394
  // Derive folder from disk location (source of truth)
348
395
  const meta = { ...(parsed.meta || {}) };
@@ -372,6 +419,9 @@ export async function reindex(ctx, opts = {}) {
372
419
  fmMeta.updated || created,
373
420
  );
374
421
  if (result.changes > 0) {
422
+ if (relatedToJson && ctx.stmts.updateRelatedTo) {
423
+ ctx.stmts.updateRelatedTo.run(relatedToJson, id);
424
+ }
375
425
  if (category !== "event") {
376
426
  const rowidResult = ctx.stmts.getRowid.get(id);
377
427
  if (rowidResult?.rowid) {
@@ -396,8 +446,16 @@ export async function reindex(ctx, opts = {}) {
396
446
  const bodyChanged = existing.body !== parsed.body;
397
447
  const tagsChanged = tagsJson !== (existing.tags || null);
398
448
  const metaChanged = metaJson !== (existing.meta || null);
399
-
400
- if (bodyChanged || titleChanged || tagsChanged || metaChanged) {
449
+ const relatedToChanged =
450
+ relatedToJson !== (existing.related_to || null);
451
+
452
+ if (
453
+ bodyChanged ||
454
+ titleChanged ||
455
+ tagsChanged ||
456
+ metaChanged ||
457
+ relatedToChanged
458
+ ) {
401
459
  ctx.stmts.updateEntry.run(
402
460
  parsed.title || null,
403
461
  parsed.body,
@@ -409,6 +467,9 @@ export async function reindex(ctx, opts = {}) {
409
467
  expires_at,
410
468
  filePath,
411
469
  );
470
+ if (relatedToChanged && ctx.stmts.updateRelatedTo) {
471
+ ctx.stmts.updateRelatedTo.run(relatedToJson, existing.id);
472
+ }
412
473
 
413
474
  // Queue re-embed if title or body changed (vector ops deferred to Phase 2)
414
475
  if ((bodyChanged || titleChanged) && category !== "event") {
package/src/index.js CHANGED
@@ -29,6 +29,11 @@ export {
29
29
  parseEntryFromMarkdown,
30
30
  } from "./core/frontmatter.js";
31
31
  export { gatherVaultStatus } from "./core/status.js";
32
+ export {
33
+ PLURAL_TO_SINGULAR,
34
+ planMigration,
35
+ executeMigration,
36
+ } from "./core/migrate-dirs.js";
32
37
 
33
38
  // Capture layer
34
39
  export {
@@ -11,8 +11,6 @@ const NEAR_DUP_THRESHOLD = 0.92;
11
11
 
12
12
  const RRF_K = 60;
13
13
 
14
- const MMR_LAMBDA = 0.7;
15
-
16
14
  /**
17
15
  * Exponential recency decay score based on updated_at timestamp.
18
16
  * Returns e^(-decayRate * ageDays) for valid dates, or 0.5 as a neutral
@@ -132,108 +130,16 @@ export function reciprocalRankFusion(rankedLists, k = RRF_K) {
132
130
  return scores;
133
131
  }
134
132
 
135
- /**
136
- * Jaccard similarity between two strings based on word sets.
137
- * Used as a fallback for MMR when embedding vectors are unavailable.
138
- *
139
- * @param {string} a
140
- * @param {string} b
141
- * @returns {number} Similarity in [0, 1].
142
- */
143
- export function jaccardSimilarity(a, b) {
144
- const wordsA = new Set((a ?? "").toLowerCase().split(/\W+/).filter(Boolean));
145
- const wordsB = new Set((b ?? "").toLowerCase().split(/\W+/).filter(Boolean));
146
- if (wordsA.size === 0 && wordsB.size === 0) return 1;
147
- if (wordsA.size === 0 || wordsB.size === 0) return 0;
148
- let intersection = 0;
149
- for (const w of wordsA) if (wordsB.has(w)) intersection++;
150
- return intersection / (wordsA.size + wordsB.size - intersection);
151
- }
152
-
153
- /**
154
- * Maximal Marginal Relevance reranking.
155
- *
156
- * Selects up to n candidates that balance relevance to the query and
157
- * diversity from already-selected results.
158
- *
159
- * MMR_score = lambda * querySim(doc) - (1 - lambda) * max(sim(doc, selected))
160
- *
161
- * @param {Array<object>} candidates - Entries with at least {id, title, body}.
162
- * @param {Map<string, number>} querySimMap - Map of id -> relevance score.
163
- * @param {Map<string, Float32Array|null>} embeddingMap - Map of id -> embedding (null if unavailable).
164
- * @param {number} n - Number of results to select.
165
- * @param {number} lambda - Trade-off weight (default MMR_LAMBDA = 0.7).
166
- * @returns {Array<object>} Reranked subset of candidates (length <= n).
167
- */
168
- export function maximalMarginalRelevance(
169
- candidates,
170
- querySimMap,
171
- embeddingMap,
172
- n,
173
- lambda = MMR_LAMBDA,
174
- ) {
175
- if (candidates.length === 0) return [];
176
-
177
- const remaining = [...candidates];
178
- const selected = [];
179
- const selectedVecs = [];
180
- const selectedEntries = [];
181
-
182
- while (selected.length < n && remaining.length > 0) {
183
- let bestIdx = -1;
184
- let bestScore = -Infinity;
185
-
186
- for (let i = 0; i < remaining.length; i++) {
187
- const candidate = remaining[i];
188
- const relevance = querySimMap.get(candidate.id) ?? 0;
189
-
190
- let maxRedundancy = 0;
191
- if (selectedVecs.length > 0) {
192
- const vec = embeddingMap.get(candidate.id);
193
- for (let j = 0; j < selectedVecs.length; j++) {
194
- let sim;
195
- if (vec && selectedVecs[j]) {
196
- sim = dotProduct(vec, selectedVecs[j]);
197
- } else {
198
- const selEntry = selectedEntries[j];
199
- sim = jaccardSimilarity(
200
- `${candidate.title} ${candidate.body}`,
201
- `${selEntry.title} ${selEntry.body}`,
202
- );
203
- }
204
- if (sim > maxRedundancy) maxRedundancy = sim;
205
- }
206
- }
207
-
208
- const score = lambda * relevance - (1 - lambda) * maxRedundancy;
209
- if (score > bestScore) {
210
- bestScore = score;
211
- bestIdx = i;
212
- }
213
- }
214
-
215
- if (bestIdx === -1) break;
216
-
217
- const chosen = remaining.splice(bestIdx, 1)[0];
218
- selected.push(chosen);
219
- selectedVecs.push(embeddingMap.get(chosen.id) ?? null);
220
- selectedEntries.push(chosen);
221
- }
222
-
223
- return selected;
224
- }
225
-
226
133
  /**
227
134
  * Hybrid search combining FTS5 text matching and vector similarity,
228
- * with RRF merging and MMR reranking for diversity.
135
+ * with RRF merging, recency decay, and near-duplicate suppression.
229
136
  *
230
137
  * Pipeline:
231
138
  * 1. FTS5 ranked list
232
139
  * 2. Vector (semantic) ranked list
233
140
  * 3. RRF: merge the two ranked lists into a single score
234
- * 4. Apply recency decay to RRF scores
235
- * 5. MMR: rerank top candidates for diversity (uses embeddings or Jaccard fallback)
236
- * 6. Near-duplicate suppression on the final selection
141
+ * 4. Recency decay: penalise old events (knowledge/entity entries unaffected)
142
+ * 5. Near-duplicate suppression (cosine similarity > 0.92 threshold)
237
143
  *
238
144
  * @param {import('../server/types.js').BaseCtx} ctx
239
145
  * @param {string} query
@@ -383,20 +289,6 @@ export async function hybridSearch(
383
289
  rrfScores.set(id, (rrfScores.get(id) ?? 0) * boost);
384
290
  }
385
291
 
386
- // Stage 3b: Frequency signal — log(1 + hit_count) / log(1 + max_hit_count)
387
- const allRows = [...rowMap.values()];
388
- const maxHitCount = Math.max(...allRows.map((e) => e.hit_count || 0), 0);
389
- if (maxHitCount > 0) {
390
- const logMax = Math.log(1 + maxHitCount);
391
- for (const entry of allRows) {
392
- const freqScore = Math.log(1 + (entry.hit_count || 0)) / logMax;
393
- rrfScores.set(
394
- entry.id,
395
- (rrfScores.get(entry.id) ?? 0) + freqScore * 0.13,
396
- );
397
- }
398
- }
399
-
400
292
  // Attach final score to each entry and sort by RRF score descending
401
293
  const candidates = [...rowMap.values()].map((entry) => ({
402
294
  ...entry,
@@ -404,7 +296,7 @@ export async function hybridSearch(
404
296
  }));
405
297
  candidates.sort((a, b) => b.score - a.score);
406
298
 
407
- // Stage 4: Fetch embeddings for all candidates that have a rowid
299
+ // Stage 4: Fetch embeddings for near-duplicate suppression
408
300
  const embeddingMap = new Map();
409
301
  if (queryVec && idToRowid.size > 0) {
410
302
  const rowidToId = new Map();
@@ -429,34 +321,15 @@ export async function hybridSearch(
429
321
  }
430
322
  }
431
323
  } catch (_) {
432
- // Embeddings unavailable — MMR will fall back to Jaccard similarity
324
+ // Embeddings unavailable — near-dup suppression skipped
433
325
  }
434
326
  }
435
327
 
436
- // Use vecSim as the query-relevance signal for MMR; fall back to RRF score
437
- const querySimMap = new Map();
438
- for (const candidate of candidates) {
439
- querySimMap.set(
440
- candidate.id,
441
- vecSimMap.has(candidate.id)
442
- ? vecSimMap.get(candidate.id)
443
- : candidate.score,
444
- );
445
- }
446
-
447
- // Stage 5: MMR — rerank for diversity using embeddings or Jaccard fallback
448
- const mmrSelected = maximalMarginalRelevance(
449
- candidates,
450
- querySimMap,
451
- embeddingMap,
452
- offset + limit,
453
- );
454
-
455
- // Stage 6: Near-duplicate suppression (hard filter, not reorder)
456
- if (queryVec && embeddingMap.size > 0 && mmrSelected.length > limit) {
328
+ // Stage 5: Near-duplicate suppression (cosine similarity > 0.92 threshold)
329
+ if (queryVec && embeddingMap.size > 0) {
457
330
  const selected = [];
458
331
  const selectedVecs = [];
459
- for (const candidate of mmrSelected) {
332
+ for (const candidate of candidates) {
460
333
  if (selected.length >= offset + limit) break;
461
334
  const vec = embeddingMap.get(candidate.id);
462
335
  if (vec && selectedVecs.length > 0) {
@@ -475,7 +348,7 @@ export async function hybridSearch(
475
348
  return dedupedPage;
476
349
  }
477
350
 
478
- const finalPage = mmrSelected.slice(offset, offset + limit);
351
+ const finalPage = candidates.slice(offset, offset + limit);
479
352
  trackAccess(ctx.db, finalPage);
480
353
  return finalPage;
481
354
  }