@context-vault/core 2.14.0 → 2.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,10 +7,27 @@
7
7
  * Agent Constraint: Read-only access to DB. Never writes.
8
8
  */
9
9
 
10
- const FTS_WEIGHT = 0.4;
11
- const VEC_WEIGHT = 0.6;
12
10
  const NEAR_DUP_THRESHOLD = 0.92;
13
11
 
12
+ const RRF_K = 60;
13
+
14
+ const MMR_LAMBDA = 0.7;
15
+
16
/**
 * Exponential recency decay score based on updated_at timestamp.
 * Returns e^(-decayRate * ageDays) for valid past dates, clamped to 1 for
 * future timestamps (clock skew), or 0.5 as a neutral score when updatedAt
 * is null/undefined or unparseable.
 *
 * @param {string|null|undefined} updatedAt - ISO timestamp
 * @param {number} decayRate - Decay rate per day (default 0.05)
 * @returns {number} Score in [0, 1]
 */
export function recencyDecayScore(updatedAt, decayRate = 0.05) {
  if (updatedAt == null) return 0.5;
  const ts = new Date(updatedAt).getTime();
  // Invalid date strings parse to NaN; treat them like a missing timestamp
  // instead of propagating NaN into the score.
  if (Number.isNaN(ts)) return 0.5;
  // Clamp negative ages (future timestamps) so the score never exceeds 1.
  const ageDays = Math.max(0, (Date.now() - ts) / 86400000);
  return Math.exp(-decayRate * ageDays);
}
30
+
14
31
  /**
15
32
  * Dot product of two Float32Array vectors (cosine similarity for unit vectors).
16
33
  */
@@ -57,6 +74,7 @@ export function recencyBoost(createdAt, category, decayDays = 30) {
57
74
  */
58
75
  export function buildFilterClauses({
59
76
  categoryFilter,
77
+ excludeEvents = false,
60
78
  since,
61
79
  until,
62
80
  userIdFilter,
@@ -77,6 +95,9 @@ export function buildFilterClauses({
77
95
  clauses.push("e.category = ?");
78
96
  params.push(categoryFilter);
79
97
  }
98
+ if (excludeEvents && !categoryFilter) {
99
+ clauses.push("e.category != 'event'");
100
+ }
80
101
  if (since) {
81
102
  clauses.push("e.created_at >= ?");
82
103
  params.push(since);
@@ -93,7 +114,126 @@ export function buildFilterClauses({
93
114
  }
94
115
 
95
116
  /**
96
- * Hybrid search combining FTS5 text matching and vector similarity.
117
/**
 * Reciprocal Rank Fusion: merge multiple ranked lists into a single score.
 * Each document receives 1/(k + rank) from each list it appears in.
 *
 * @param {Array<string[]>} rankedLists - Arrays of document IDs in rank order (best first).
 * @param {number} k - Smoothing constant (default RRF_K = 60).
 * @returns {Map<string, number>} Map of id -> RRF score.
 */
export function reciprocalRankFusion(rankedLists, k = RRF_K) {
  const fused = new Map();
  for (const rankedIds of rankedLists) {
    rankedIds.forEach((docId, position) => {
      // 1-based rank: the top document of a list contributes 1 / (k + 1).
      const contribution = 1 / (k + position + 1);
      fused.set(docId, (fused.get(docId) ?? 0) + contribution);
    });
  }
  return fused;
}
134
+
135
/**
 * Jaccard similarity between two strings based on word sets.
 * Used as a fallback for MMR when embedding vectors are unavailable.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Similarity in [0, 1].
 */
export function jaccardSimilarity(a, b) {
  const toWordSet = (text) =>
    new Set(
      (text ?? "")
        .toLowerCase()
        .split(/\W+/)
        .filter(Boolean),
    );
  const setA = toWordSet(a);
  const setB = toWordSet(b);
  // Two empty texts are considered identical; one empty text shares nothing.
  if (setA.size === 0 && setB.size === 0) return 1;
  if (setA.size === 0 || setB.size === 0) return 0;
  const shared = [...setA].filter((word) => setB.has(word)).length;
  return shared / (setA.size + setB.size - shared);
}
152
+
153
/**
 * Maximal Marginal Relevance reranking.
 *
 * Selects up to n candidates that balance relevance to the query and
 * diversity from already-selected results.
 *
 * MMR_score = lambda * querySim(doc) - (1 - lambda) * max(sim(doc, selected))
 *
 * @param {Array<object>} candidates - Entries with at least {id, title, body}.
 * @param {Map<string, number>} querySimMap - Map of id -> relevance score.
 * @param {Map<string, Float32Array|null>} embeddingMap - Map of id -> embedding (null if unavailable).
 * @param {number} n - Number of results to select.
 * @param {number} lambda - Trade-off weight (default MMR_LAMBDA = 0.7).
 * @returns {Array<object>} Reranked subset of candidates (length <= n).
 */
export function maximalMarginalRelevance(
  candidates,
  querySimMap,
  embeddingMap,
  n,
  lambda = MMR_LAMBDA,
) {
  if (candidates.length === 0) return [];

  const remaining = [...candidates];
  const selected = [];
  // Parallel to `selected`: the embedding of each chosen entry, or null when
  // unavailable (triggers the Jaccard fallback for that pair).
  const selectedVecs = [];

  while (selected.length < n && remaining.length > 0) {
    let bestIdx = -1;
    let bestScore = -Infinity;

    for (let i = 0; i < remaining.length; i++) {
      const candidate = remaining[i];
      const relevance = querySimMap.get(candidate.id) ?? 0;
      const vec = embeddingMap.get(candidate.id);

      // Redundancy: maximum similarity to anything already selected.
      let maxRedundancy = 0;
      for (let j = 0; j < selectedVecs.length; j++) {
        let sim;
        if (vec && selectedVecs[j]) {
          sim = dotProduct(vec, selectedVecs[j]);
        } else {
          // Embedding missing on either side — fall back to lexical overlap.
          const picked = selected[j];
          sim = jaccardSimilarity(
            `${candidate.title} ${candidate.body}`,
            `${picked.title} ${picked.body}`,
          );
        }
        if (sim > maxRedundancy) maxRedundancy = sim;
      }

      // First strict maximum wins — ties keep the earlier (higher-ranked) candidate.
      const score = lambda * relevance - (1 - lambda) * maxRedundancy;
      if (score > bestScore) {
        bestScore = score;
        bestIdx = i;
      }
    }

    if (bestIdx === -1) break;

    const chosen = remaining.splice(bestIdx, 1)[0];
    selected.push(chosen);
    selectedVecs.push(embeddingMap.get(chosen.id) ?? null);
  }

  return selected;
}
225
+
226
+ /**
227
+ * Hybrid search combining FTS5 text matching and vector similarity,
228
+ * with RRF merging and MMR reranking for diversity.
229
+ *
230
+ * Pipeline:
231
+ * 1. FTS5 ranked list
232
+ * 2. Vector (semantic) ranked list
233
+ * 3. RRF: merge the two ranked lists into a single score
234
+ * 4. Apply recency decay to RRF scores
235
+ * 5. MMR: rerank top candidates for diversity (uses embeddings or Jaccard fallback)
236
+ * 6. Near-duplicate suppression on the final selection
97
237
  *
98
238
  * @param {import('../server/types.js').BaseCtx} ctx
99
239
  * @param {string} query
@@ -106,6 +246,7 @@ export async function hybridSearch(
106
246
  {
107
247
  kindFilter = null,
108
248
  categoryFilter = null,
249
+ excludeEvents = false,
109
250
  since = null,
110
251
  until = null,
111
252
  limit = 20,
@@ -116,11 +257,13 @@ export async function hybridSearch(
116
257
  includeSuperseeded = false,
117
258
  } = {},
118
259
  ) {
119
- const results = new Map();
260
+ const rowMap = new Map();
120
261
  const idToRowid = new Map();
121
262
  let queryVec = null;
263
+
122
264
  const extraFilters = buildFilterClauses({
123
265
  categoryFilter,
266
+ excludeEvents,
124
267
  since,
125
268
  until,
126
269
  userIdFilter,
@@ -128,7 +271,9 @@ export async function hybridSearch(
128
271
  includeSuperseeded,
129
272
  });
130
273
 
131
- // FTS5 search
274
+ const ftsRankedIds = [];
275
+
276
+ // Stage 1a: FTS5 — collect ranked list of IDs
132
277
  const ftsQuery = buildFtsQuery(query);
133
278
  if (ftsQuery) {
134
279
  try {
@@ -145,25 +290,21 @@ export async function hybridSearch(
145
290
  const ftsSQL = `SELECT e.*, rank FROM vault_fts f JOIN vault e ON f.rowid = e.rowid WHERE ${whereParts.join(" AND ")} ORDER BY rank LIMIT 15`;
146
291
  const rows = ctx.db.prepare(ftsSQL).all(...ftsParams);
147
292
 
148
- // Normalize FTS scores to [0, 1]
149
- const ftsScores = rows.map((r) => Math.abs(r.rank || 0));
150
- const maxFts = Math.max(...ftsScores, 1);
151
-
152
- for (let i = 0; i < rows.length; i++) {
153
- const { rank: _rank, ...row } = rows[i];
154
- const normalized = ftsScores[i] / maxFts;
155
- results.set(row.id, { ...row, score: normalized * FTS_WEIGHT });
293
+ for (const { rank: _rank, ...row } of rows) {
294
+ ftsRankedIds.push(row.id);
295
+ if (!rowMap.has(row.id)) rowMap.set(row.id, row);
156
296
  }
157
297
  } catch (err) {
158
- if (err.message?.includes("fts5: syntax error")) {
159
- // Expected: malformed query, fall through to vector search
160
- } else {
298
+ if (!err.message?.includes("fts5: syntax error")) {
161
299
  console.error(`[retrieve] FTS search error: ${err.message}`);
162
300
  }
163
301
  }
164
302
  }
165
303
 
166
- // Vector similarity search (skipped if embedding unavailable)
304
+ const vecRankedIds = [];
305
+ const vecSimMap = new Map();
306
+
307
+ // Stage 1b: Vector similarity — collect ranked list of IDs and raw similarity scores
167
308
  try {
168
309
  const vecCount = ctx.db
169
310
  .prepare("SELECT COUNT(*) as c FROM vault_vec")
@@ -171,7 +312,6 @@ export async function hybridSearch(
171
312
  if (vecCount > 0) {
172
313
  queryVec = await ctx.embed(query);
173
314
  if (queryVec) {
174
- // Increase limits in hosted mode to compensate for post-filtering
175
315
  const hasPostFilter = userIdFilter !== undefined || teamIdFilter;
176
316
  const vecLimit = hasPostFilter
177
317
  ? kindFilter
@@ -187,7 +327,6 @@ export async function hybridSearch(
187
327
  .all(queryVec, vecLimit);
188
328
 
189
329
  if (vecRows.length) {
190
- // Batch hydration: single query instead of N+1
191
330
  const rowids = vecRows.map((vr) => vr.rowid);
192
331
  const placeholders = rowids.map(() => "?").join(",");
193
332
  const hydrated = ctx.db
@@ -207,6 +346,7 @@ export async function hybridSearch(
207
346
  if (teamIdFilter && row.team_id !== teamIdFilter) continue;
208
347
  if (kindFilter && row.kind !== kindFilter) continue;
209
348
  if (categoryFilter && row.category !== categoryFilter) continue;
349
+ if (excludeEvents && row.category === "event") continue;
210
350
  if (since && row.created_at < since) continue;
211
351
  if (until && row.created_at > until) continue;
212
352
  if (row.expires_at && new Date(row.expires_at) <= new Date())
@@ -214,70 +354,111 @@ export async function hybridSearch(
214
354
 
215
355
  const { rowid: _rowid, ...cleanRow } = row;
216
356
  idToRowid.set(cleanRow.id, Number(row.rowid));
357
+
217
358
  // sqlite-vec returns L2 distance [0, 2] for normalized vectors.
218
- // Convert to similarity [1, 0] with: 1 - distance/2
219
- const vecScore = Math.max(0, 1 - vr.distance / 2) * VEC_WEIGHT;
220
- const existing = results.get(cleanRow.id);
221
- if (existing) {
222
- existing.score += vecScore;
223
- } else {
224
- results.set(cleanRow.id, { ...cleanRow, score: vecScore });
225
- }
359
+ // Convert to similarity [0, 1]: 1 - distance/2
360
+ const vecSim = Math.max(0, 1 - vr.distance / 2);
361
+ vecSimMap.set(cleanRow.id, vecSim);
362
+ vecRankedIds.push(cleanRow.id);
363
+
364
+ if (!rowMap.has(cleanRow.id)) rowMap.set(cleanRow.id, cleanRow);
226
365
  }
227
366
  }
228
367
  }
229
368
  }
230
369
  } catch (err) {
231
- if (err.message?.includes("no such table")) {
232
- // Expected on fresh vaults with no vec table yet
233
- } else {
370
+ if (!err.message?.includes("no such table")) {
234
371
  console.error(`[retrieve] Vector search error: ${err.message}`);
235
372
  }
236
373
  }
237
374
 
238
- // Apply category-aware recency boost
239
- for (const [, entry] of results) {
240
- entry.score *= recencyBoost(entry.created_at, entry.category, decayDays);
375
+ if (rowMap.size === 0) return [];
376
+
377
+ // Stage 2: RRF — merge FTS and vector ranked lists into a single score
378
+ const rrfScores = reciprocalRankFusion([ftsRankedIds, vecRankedIds]);
379
+
380
+ // Stage 3: Apply category-aware recency boost to RRF scores
381
+ for (const [id, entry] of rowMap) {
382
+ const boost = recencyBoost(entry.created_at, entry.category, decayDays);
383
+ rrfScores.set(id, (rrfScores.get(id) ?? 0) * boost);
241
384
  }
242
385
 
243
- const sorted = [...results.values()].sort((a, b) => b.score - a.score);
386
+ // Stage 3b: Frequency signal — log(1 + hit_count) / log(1 + max_hit_count)
387
+ const allRows = [...rowMap.values()];
388
+ const maxHitCount = Math.max(...allRows.map((e) => e.hit_count || 0), 0);
389
+ if (maxHitCount > 0) {
390
+ const logMax = Math.log(1 + maxHitCount);
391
+ for (const entry of allRows) {
392
+ const freqScore = Math.log(1 + (entry.hit_count || 0)) / logMax;
393
+ rrfScores.set(
394
+ entry.id,
395
+ (rrfScores.get(entry.id) ?? 0) + freqScore * 0.13,
396
+ );
397
+ }
398
+ }
244
399
 
245
- // Near-duplicate suppression: when embeddings are available and we have more
246
- // candidates than needed, skip results that are too similar to already-selected ones.
247
- if (queryVec && idToRowid.size > 0 && sorted.length > limit) {
248
- const rowidsToFetch = sorted
249
- .filter((c) => idToRowid.has(c.id))
250
- .map((c) => idToRowid.get(c.id));
400
+ // Attach final score to each entry and sort by RRF score descending
401
+ const candidates = [...rowMap.values()].map((entry) => ({
402
+ ...entry,
403
+ score: rrfScores.get(entry.id) ?? 0,
404
+ }));
405
+ candidates.sort((a, b) => b.score - a.score);
251
406
 
252
- const embeddingMap = new Map();
253
- if (rowidsToFetch.length > 0) {
254
- try {
255
- const placeholders = rowidsToFetch.map(() => "?").join(",");
256
- const vecData = ctx.db
257
- .prepare(
258
- `SELECT rowid, embedding FROM vault_vec WHERE rowid IN (${placeholders})`,
259
- )
260
- .all(...rowidsToFetch);
261
- for (const row of vecData) {
262
- const buf = row.embedding;
263
- if (buf) {
264
- embeddingMap.set(
265
- Number(row.rowid),
266
- new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4),
267
- );
268
- }
407
+ // Stage 4: Fetch embeddings for all candidates that have a rowid
408
+ const embeddingMap = new Map();
409
+ if (queryVec && idToRowid.size > 0) {
410
+ const rowidToId = new Map();
411
+ for (const [id, rowid] of idToRowid) rowidToId.set(rowid, id);
412
+
413
+ const rowidsToFetch = [...idToRowid.values()];
414
+ try {
415
+ const placeholders = rowidsToFetch.map(() => "?").join(",");
416
+ const vecData = ctx.db
417
+ .prepare(
418
+ `SELECT rowid, embedding FROM vault_vec WHERE rowid IN (${placeholders})`,
419
+ )
420
+ .all(...rowidsToFetch);
421
+ for (const row of vecData) {
422
+ const id = rowidToId.get(Number(row.rowid));
423
+ const buf = row.embedding;
424
+ if (id && buf) {
425
+ embeddingMap.set(
426
+ id,
427
+ new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4),
428
+ );
269
429
  }
270
- } catch (_) {
271
- return sorted.slice(offset, offset + limit);
272
430
  }
431
+ } catch (_) {
432
+ // Embeddings unavailable — MMR will fall back to Jaccard similarity
273
433
  }
434
+ }
435
+
436
+ // Use vecSim as the query-relevance signal for MMR; fall back to RRF score
437
+ const querySimMap = new Map();
438
+ for (const candidate of candidates) {
439
+ querySimMap.set(
440
+ candidate.id,
441
+ vecSimMap.has(candidate.id)
442
+ ? vecSimMap.get(candidate.id)
443
+ : candidate.score,
444
+ );
445
+ }
274
446
 
447
+ // Stage 5: MMR — rerank for diversity using embeddings or Jaccard fallback
448
+ const mmrSelected = maximalMarginalRelevance(
449
+ candidates,
450
+ querySimMap,
451
+ embeddingMap,
452
+ offset + limit,
453
+ );
454
+
455
+ // Stage 6: Near-duplicate suppression (hard filter, not reorder)
456
+ if (queryVec && embeddingMap.size > 0 && mmrSelected.length > limit) {
275
457
  const selected = [];
276
458
  const selectedVecs = [];
277
- for (const candidate of sorted) {
459
+ for (const candidate of mmrSelected) {
278
460
  if (selected.length >= offset + limit) break;
279
- const rowid = idToRowid.get(candidate.id);
280
- const vec = rowid !== undefined ? embeddingMap.get(rowid) : null;
461
+ const vec = embeddingMap.get(candidate.id);
281
462
  if (vec && selectedVecs.length > 0) {
282
463
  let maxSim = 0;
283
464
  for (const sv of selectedVecs) {
@@ -289,8 +470,31 @@ export async function hybridSearch(
289
470
  selected.push(candidate);
290
471
  if (vec) selectedVecs.push(vec);
291
472
  }
292
- return selected.slice(offset, offset + limit);
473
+ const dedupedPage = selected.slice(offset, offset + limit);
474
+ trackAccess(ctx.db, dedupedPage);
475
+ return dedupedPage;
293
476
  }
294
477
 
295
- return sorted.slice(offset, offset + limit);
478
+ const finalPage = mmrSelected.slice(offset, offset + limit);
479
+ trackAccess(ctx.db, finalPage);
480
+ return finalPage;
481
+ }
482
+
483
/**
 * Increment hit_count and set last_accessed_at for a batch of retrieved entries.
 * Single batched UPDATE for efficiency.
 *
 * @param {import('node:sqlite').DatabaseSync} db
 * @param {Array<{id: string}>} entries
 */
function trackAccess(db, entries) {
  if (!entries.length) return;
  const ids = entries.map((entry) => entry.id);
  const placeholders = ids.map(() => "?").join(",");
  const sql = `UPDATE vault SET hit_count = hit_count + 1, last_accessed_at = datetime('now') WHERE id IN (${placeholders})`;
  try {
    db.prepare(sql).run(...ids);
  } catch (_) {
    // Non-fatal: frequency tracking is best-effort
  }
}
@@ -146,6 +146,13 @@ export function handler(_args, ctx) {
146
146
  for (const w of growth.warnings) {
147
147
  lines.push(` ${w.message}`);
148
148
  }
149
+ if (growth.kindBreakdown.length) {
150
+ lines.push("");
151
+ lines.push(" Breakdown by kind:");
152
+ for (const { kind, count, pct } of growth.kindBreakdown) {
153
+ lines.push(` ${kind}: ${count.toLocaleString()} (${pct}%)`);
154
+ }
155
+ }
149
156
  if (growth.actions.length) {
150
157
  lines.push("", "Suggested growth actions:");
151
158
  for (const a of growth.actions) {
@@ -20,6 +20,12 @@ export const inputSchema = {
20
20
  .array(z.string())
21
21
  .optional()
22
22
  .describe("Optional tag filters — entries must match at least one"),
23
+ buckets: z
24
+ .array(z.string())
25
+ .optional()
26
+ .describe(
27
+ "Filter by project-scoped buckets. Each name expands to a 'bucket:<name>' tag. Composes with 'tags' via OR (entries matching any tag or any bucket are included).",
28
+ ),
23
29
  kinds: z
24
30
  .array(z.string())
25
31
  .optional()
@@ -99,7 +105,7 @@ function slugifyTopic(topic) {
99
105
  }
100
106
 
101
107
  export async function handler(
102
- { topic, tags, kinds, identity_key },
108
+ { topic, tags, buckets, kinds, identity_key },
103
109
  ctx,
104
110
  { ensureIndexed },
105
111
  ) {
@@ -116,6 +122,9 @@ export async function handler(
116
122
  await ensureIndexed();
117
123
 
118
124
  const normalizedKinds = kinds?.map(normalizeKind) ?? [];
125
+ // Expand buckets to bucket: prefixed tags and merge with explicit tags
126
+ const bucketTags = buckets?.length ? buckets.map((b) => `bucket:${b}`) : [];
127
+ const effectiveTags = [...(tags ?? []), ...bucketTags];
119
128
 
120
129
  let candidates = [];
121
130
 
@@ -143,10 +152,10 @@ export async function handler(
143
152
  });
144
153
  }
145
154
 
146
- if (tags?.length) {
155
+ if (effectiveTags.length) {
147
156
  candidates = candidates.filter((r) => {
148
157
  const entryTags = r.tags ? JSON.parse(r.tags) : [];
149
- return tags.some((t) => entryTags.includes(t));
158
+ return effectiveTags.some((t) => entryTags.includes(t));
150
159
  });
151
160
  }
152
161