npm - @loreai/core - Versions diffs - 0.17.0 → 0.18.0 - Mend

@loreai/core 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (235) hide show

package/dist/bun/agents-file.d.ts +4 -0
package/dist/bun/agents-file.d.ts.map +1 -1
package/dist/bun/config.d.ts +2 -0
package/dist/bun/config.d.ts.map +1 -1
package/dist/bun/curator.d.ts +45 -0
package/dist/bun/curator.d.ts.map +1 -1
package/dist/bun/data-dir.d.ts +18 -0
package/dist/bun/data-dir.d.ts.map +1 -0
package/dist/bun/db.d.ts +12 -0
package/dist/bun/db.d.ts.map +1 -1
package/dist/bun/distillation.d.ts.map +1 -1
package/dist/bun/embedding-vendor.d.ts +22 -38
package/dist/bun/embedding-vendor.d.ts.map +1 -1
package/dist/bun/embedding-worker-types.d.ts +17 -12
package/dist/bun/embedding-worker-types.d.ts.map +1 -1
package/dist/bun/embedding-worker.d.ts +9 -2
package/dist/bun/embedding-worker.d.ts.map +1 -1
package/dist/bun/embedding-worker.js +38864 -33
package/dist/bun/embedding-worker.js.map +4 -4
package/dist/bun/embedding.d.ts +30 -22
package/dist/bun/embedding.d.ts.map +1 -1
package/dist/bun/gradient.d.ts +8 -1
package/dist/bun/gradient.d.ts.map +1 -1
package/dist/bun/import/detect.d.ts +14 -0
package/dist/bun/import/detect.d.ts.map +1 -0
package/dist/bun/import/extract.d.ts +43 -0
package/dist/bun/import/extract.d.ts.map +1 -0
package/dist/bun/import/history.d.ts +40 -0
package/dist/bun/import/history.d.ts.map +1 -0
package/dist/bun/import/index.d.ts +17 -0
package/dist/bun/import/index.d.ts.map +1 -0
package/dist/bun/import/providers/aider.d.ts +2 -0
package/dist/bun/import/providers/aider.d.ts.map +1 -0
package/dist/bun/import/providers/claude-code.d.ts +2 -0
package/dist/bun/import/providers/claude-code.d.ts.map +1 -0
package/dist/bun/import/providers/cline.d.ts +2 -0
package/dist/bun/import/providers/cline.d.ts.map +1 -0
package/dist/bun/import/providers/codex.d.ts +2 -0
package/dist/bun/import/providers/codex.d.ts.map +1 -0
package/dist/bun/import/providers/continue.d.ts +2 -0
package/dist/bun/import/providers/continue.d.ts.map +1 -0
package/dist/bun/import/providers/index.d.ts +19 -0
package/dist/bun/import/providers/index.d.ts.map +1 -0
package/dist/bun/import/providers/opencode.d.ts +2 -0
package/dist/bun/import/providers/opencode.d.ts.map +1 -0
package/dist/bun/import/providers/pi.d.ts +2 -0
package/dist/bun/import/providers/pi.d.ts.map +1 -0
package/dist/bun/import/types.d.ts +82 -0
package/dist/bun/import/types.d.ts.map +1 -0
package/dist/bun/index.d.ts +4 -1
package/dist/bun/index.d.ts.map +1 -1
package/dist/bun/index.js +2217 -224
package/dist/bun/index.js.map +4 -4
package/dist/bun/instruction-detect.d.ts +66 -0
package/dist/bun/instruction-detect.d.ts.map +1 -0
package/dist/bun/log.d.ts +9 -0
package/dist/bun/log.d.ts.map +1 -1
package/dist/bun/ltm.d.ts +40 -0
package/dist/bun/ltm.d.ts.map +1 -1
package/dist/bun/pattern-extract.d.ts +7 -0
package/dist/bun/pattern-extract.d.ts.map +1 -1
package/dist/bun/prompt.d.ts +1 -1
package/dist/bun/prompt.d.ts.map +1 -1
package/dist/bun/recall.d.ts.map +1 -1
package/dist/bun/search.d.ts +5 -3
package/dist/bun/search.d.ts.map +1 -1
package/dist/bun/temporal.d.ts.map +1 -1
package/dist/bun/types.d.ts +1 -1
package/dist/node/agents-file.d.ts +4 -0
package/dist/node/agents-file.d.ts.map +1 -1
package/dist/node/config.d.ts +2 -0
package/dist/node/config.d.ts.map +1 -1
package/dist/node/curator.d.ts +45 -0
package/dist/node/curator.d.ts.map +1 -1
package/dist/node/data-dir.d.ts +18 -0
package/dist/node/data-dir.d.ts.map +1 -0
package/dist/node/db.d.ts +12 -0
package/dist/node/db.d.ts.map +1 -1
package/dist/node/distillation.d.ts.map +1 -1
package/dist/node/embedding-vendor.d.ts +22 -38
package/dist/node/embedding-vendor.d.ts.map +1 -1
package/dist/node/embedding-worker-types.d.ts +17 -12
package/dist/node/embedding-worker-types.d.ts.map +1 -1
package/dist/node/embedding-worker.d.ts +9 -2
package/dist/node/embedding-worker.d.ts.map +1 -1
package/dist/node/embedding-worker.js +38864 -33
package/dist/node/embedding-worker.js.map +4 -4
package/dist/node/embedding.d.ts +30 -22
package/dist/node/embedding.d.ts.map +1 -1
package/dist/node/gradient.d.ts +8 -1
package/dist/node/gradient.d.ts.map +1 -1
package/dist/node/import/detect.d.ts +14 -0
package/dist/node/import/detect.d.ts.map +1 -0
package/dist/node/import/extract.d.ts +43 -0
package/dist/node/import/extract.d.ts.map +1 -0
package/dist/node/import/history.d.ts +40 -0
package/dist/node/import/history.d.ts.map +1 -0
package/dist/node/import/index.d.ts +17 -0
package/dist/node/import/index.d.ts.map +1 -0
package/dist/node/import/providers/aider.d.ts +2 -0
package/dist/node/import/providers/aider.d.ts.map +1 -0
package/dist/node/import/providers/claude-code.d.ts +2 -0
package/dist/node/import/providers/claude-code.d.ts.map +1 -0
package/dist/node/import/providers/cline.d.ts +2 -0
package/dist/node/import/providers/cline.d.ts.map +1 -0
package/dist/node/import/providers/codex.d.ts +2 -0
package/dist/node/import/providers/codex.d.ts.map +1 -0
package/dist/node/import/providers/continue.d.ts +2 -0
package/dist/node/import/providers/continue.d.ts.map +1 -0
package/dist/node/import/providers/index.d.ts +19 -0
package/dist/node/import/providers/index.d.ts.map +1 -0
package/dist/node/import/providers/opencode.d.ts +2 -0
package/dist/node/import/providers/opencode.d.ts.map +1 -0
package/dist/node/import/providers/pi.d.ts +2 -0
package/dist/node/import/providers/pi.d.ts.map +1 -0
package/dist/node/import/types.d.ts +82 -0
package/dist/node/import/types.d.ts.map +1 -0
package/dist/node/index.d.ts +4 -1
package/dist/node/index.d.ts.map +1 -1
package/dist/node/index.js +2217 -224
package/dist/node/index.js.map +4 -4
package/dist/node/instruction-detect.d.ts +66 -0
package/dist/node/instruction-detect.d.ts.map +1 -0
package/dist/node/log.d.ts +9 -0
package/dist/node/log.d.ts.map +1 -1
package/dist/node/ltm.d.ts +40 -0
package/dist/node/ltm.d.ts.map +1 -1
package/dist/node/pattern-extract.d.ts +7 -0
package/dist/node/pattern-extract.d.ts.map +1 -1
package/dist/node/prompt.d.ts +1 -1
package/dist/node/prompt.d.ts.map +1 -1
package/dist/node/recall.d.ts.map +1 -1
package/dist/node/search.d.ts +5 -3
package/dist/node/search.d.ts.map +1 -1
package/dist/node/temporal.d.ts.map +1 -1
package/dist/node/types.d.ts +1 -1
package/dist/types/agents-file.d.ts +4 -0
package/dist/types/agents-file.d.ts.map +1 -1
package/dist/types/config.d.ts +2 -0
package/dist/types/config.d.ts.map +1 -1
package/dist/types/curator.d.ts +45 -0
package/dist/types/curator.d.ts.map +1 -1
package/dist/types/data-dir.d.ts +18 -0
package/dist/types/data-dir.d.ts.map +1 -0
package/dist/types/db.d.ts +12 -0
package/dist/types/db.d.ts.map +1 -1
package/dist/types/distillation.d.ts.map +1 -1
package/dist/types/embedding-vendor.d.ts +22 -38
package/dist/types/embedding-vendor.d.ts.map +1 -1
package/dist/types/embedding-worker-types.d.ts +17 -12
package/dist/types/embedding-worker-types.d.ts.map +1 -1
package/dist/types/embedding-worker.d.ts +9 -2
package/dist/types/embedding-worker.d.ts.map +1 -1
package/dist/types/embedding.d.ts +30 -22
package/dist/types/embedding.d.ts.map +1 -1
package/dist/types/gradient.d.ts +8 -1
package/dist/types/gradient.d.ts.map +1 -1
package/dist/types/import/detect.d.ts +14 -0
package/dist/types/import/detect.d.ts.map +1 -0
package/dist/types/import/extract.d.ts +43 -0
package/dist/types/import/extract.d.ts.map +1 -0
package/dist/types/import/history.d.ts +40 -0
package/dist/types/import/history.d.ts.map +1 -0
package/dist/types/import/index.d.ts +17 -0
package/dist/types/import/index.d.ts.map +1 -0
package/dist/types/import/providers/aider.d.ts +2 -0
package/dist/types/import/providers/aider.d.ts.map +1 -0
package/dist/types/import/providers/claude-code.d.ts +2 -0
package/dist/types/import/providers/claude-code.d.ts.map +1 -0
package/dist/types/import/providers/cline.d.ts +2 -0
package/dist/types/import/providers/cline.d.ts.map +1 -0
package/dist/types/import/providers/codex.d.ts +2 -0
package/dist/types/import/providers/codex.d.ts.map +1 -0
package/dist/types/import/providers/continue.d.ts +2 -0
package/dist/types/import/providers/continue.d.ts.map +1 -0
package/dist/types/import/providers/index.d.ts +19 -0
package/dist/types/import/providers/index.d.ts.map +1 -0
package/dist/types/import/providers/opencode.d.ts +2 -0
package/dist/types/import/providers/opencode.d.ts.map +1 -0
package/dist/types/import/providers/pi.d.ts +2 -0
package/dist/types/import/providers/pi.d.ts.map +1 -0
package/dist/types/import/types.d.ts +82 -0
package/dist/types/import/types.d.ts.map +1 -0
package/dist/types/index.d.ts +4 -1
package/dist/types/index.d.ts.map +1 -1
package/dist/types/instruction-detect.d.ts +66 -0
package/dist/types/instruction-detect.d.ts.map +1 -0
package/dist/types/log.d.ts +9 -0
package/dist/types/log.d.ts.map +1 -1
package/dist/types/ltm.d.ts +40 -0
package/dist/types/ltm.d.ts.map +1 -1
package/dist/types/pattern-extract.d.ts +7 -0
package/dist/types/pattern-extract.d.ts.map +1 -1
package/dist/types/prompt.d.ts +1 -1
package/dist/types/prompt.d.ts.map +1 -1
package/dist/types/recall.d.ts.map +1 -1
package/dist/types/search.d.ts +5 -3
package/dist/types/search.d.ts.map +1 -1
package/dist/types/temporal.d.ts.map +1 -1
package/dist/types/types.d.ts +1 -1
package/package.json +2 -4
package/src/agents-file.ts +41 -13
package/src/config.ts +31 -18
package/src/curator.ts +111 -75
package/src/data-dir.ts +76 -0
package/src/db.ts +110 -11
package/src/distillation.ts +10 -2
package/src/embedding-vendor.ts +23 -40
package/src/embedding-worker-types.ts +19 -11
package/src/embedding-worker.ts +111 -47
package/src/embedding.ts +196 -171
package/src/gradient.ts +9 -1
package/src/import/detect.ts +37 -0
package/src/import/extract.ts +137 -0
package/src/import/history.ts +99 -0
package/src/import/index.ts +45 -0
package/src/import/providers/aider.ts +207 -0
package/src/import/providers/claude-code.ts +339 -0
package/src/import/providers/cline.ts +324 -0
package/src/import/providers/codex.ts +369 -0
package/src/import/providers/continue.ts +304 -0
package/src/import/providers/index.ts +32 -0
package/src/import/providers/opencode.ts +272 -0
package/src/import/providers/pi.ts +332 -0
package/src/import/types.ts +91 -0
package/src/index.ts +5 -0
package/src/instruction-detect.ts +275 -0
package/src/log.ts +91 -3
package/src/ltm.ts +316 -3
package/src/pattern-extract.ts +41 -0
package/src/prompt.ts +7 -1
package/src/recall.ts +43 -5
package/src/search.ts +7 -5
package/src/temporal.ts +8 -6
package/src/types.ts +1 -1

package/src/ltm.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { uuidv7 } from "uuidv7";
 import { db, ensureProject } from "./db";
 import { config } from "./config";
-import { ftsQuery, EMPTY_QUERY, extractTopTerms, runRelaxedSearch } from "./search";
+import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms, filterTerms, runRelaxedSearch } from "./search";
 import * as embedding from "./embedding";
 import * as latReader from "./lat-reader";
 import * as log from "./log";
@@ -50,6 +50,10 @@ export function create(input: {
       ? ensureProject(input.projectPath)
       : null;
+  // IF-2: Global entries (pid=null) must be cross-project to avoid a data hole
+  // where forSession() can't find them in either the project or cross-project pool.
+  const crossProject = pid === null ? true : (input.crossProject ?? false);
   // Dedup guard: if an entry with the same project_id + title already exists,
   // update its content instead of inserting a duplicate. This prevents the
   // curator from creating multiple entries for the same concept across sessions.
@@ -90,6 +94,16 @@ export function create(input: {
       update(crossExisting.id, { content: input.content });
       return crossExisting.id;
     }
+    // Fuzzy dedup: check for title-similar entries via FTS5 + word-overlap.
+    // This catches near-duplicates the curator creates with slightly different
+    // titles for the same concept (e.g. "Upgrade lock bug" vs "Upgrade binary
+    // lock re-entry bug"). Placed after exact checks (cheaper checks first).
+    const fuzzyMatch = findFuzzyDuplicate({ title: input.title, projectId: pid });
+    if (fuzzyMatch) {
+      update(fuzzyMatch.id, { content: input.content });
+      return fuzzyMatch.id;
+    }
   }
   const id = input.id ?? uuidv7();
@@ -106,7 +120,7 @@ export function create(input: {
       input.title,
       input.content,
       input.session ?? null,
-      (input.crossProject ?? false) ? 1 : 0,
+      crossProject ? 1 : 0,
       now,
       now,
     );
@@ -130,8 +144,10 @@ export function update(
     params.push(input.content);
   }
   if (input.confidence !== undefined) {
+    // Clamp to [0.0, 1.0] — an LLM-provided value outside this range would
+    // give disproportionate scoring weight (>1) or silently soft-delete (<0.2).
     sets.push("confidence = ?");
-    params.push(input.confidence);
+    params.push(Math.max(0, Math.min(1, input.confidence)));
   }
   sets.push("updated_at = ?");
   params.push(Date.now());
@@ -153,6 +169,100 @@ export function remove(id: string) {
   db().query("DELETE FROM knowledge WHERE id = ?").run(id);
 }
+// ---------------------------------------------------------------------------
+// Fuzzy title dedup — word-overlap similarity
+// ---------------------------------------------------------------------------
+/**
+ * Measure how many meaningful words two titles share.
+ *
+ * Each title is tokenized with `filterTerms()`, lower-cased, and collapsed
+ * into a set, so the comparison is case-insensitive and ignores repeats.
+ * NOTE(review): stopword and single-char filtering is presumed to happen
+ * inside `filterTerms()` (imported from ./search) — confirm there.
+ *
+ * Returns { coefficient, intersectionSize } where:
+ * - coefficient = |A ∩ B| / min(|A|, |B|) (0–1). This is the overlap
+ *   coefficient, not Jaccard: a short title whose words all appear in a
+ *   longer title scores 1.0.
+ * - intersectionSize = number of distinct words present in both titles
+ *
+ * Both values are 0 when either title yields no terms.
+ */
+function titleOverlap(a: string, b: string): { coefficient: number; intersectionSize: number } {
+  const wordsA = new Set(filterTerms(a).map((w) => w.toLowerCase()));
+  const wordsB = new Set(filterTerms(b).map((w) => w.toLowerCase()));
+  if (wordsA.size === 0 || wordsB.size === 0) return { coefficient: 0, intersectionSize: 0 }; // also avoids dividing by zero below
+  const intersection = [...wordsA].filter((w) => wordsB.has(w));
+  return {
+    coefficient: intersection.length / Math.min(wordsA.size, wordsB.size),
+    intersectionSize: intersection.length,
+  };
+}
+/** Minimum word-overlap coefficient to consider two titles as duplicates.
+ *  Compared with `>=` against the coefficient from `titleOverlap()`. */
+const FUZZY_DEDUP_THRESHOLD = 0.7;
+/** Minimum number of overlapping meaningful words required for a fuzzy match.
+ *  Prevents false positives on short titles where 2-3 common words produce
+ *  a high overlap coefficient despite being genuinely different entries.
+ *  A title pair must clear this AND `FUZZY_DEDUP_THRESHOLD` to match. */
+const FUZZY_DEDUP_MIN_OVERLAP = 4;
+/** Minimum cosine similarity for embedding-based dedup (compared with `>=`).
+ *  Empirically tuned against 312 Nomic v1.5 entries:
+ *  - 0.935+: all genuine duplicates (same topic, different wording)
+ *  - 0.92–0.935: contains false positives from same-subsystem entries
+ *    (e.g. "BGE Small unusable" ↔ "Nomic OOM" scored 0.9326 — related
+ *    but distinct bugs). Star clustering amplifies this by bridging.
+ *  - <0.92: mixed or unrelated entries */
+const EMBEDDING_DEDUP_THRESHOLD = 0.935;
+/**
+ * Find an existing knowledge entry whose title is fuzzy-similar to the given title.
+ *
+ * Uses FTS5 to find up to 5 candidates (ordered by `bm25()` with the
+ * configured `ftsWeights`), then applies word-overlap filtering: a candidate
+ * matches when its `titleOverlap()` coefficient is at least
+ * `FUZZY_DEDUP_THRESHOLD` and it shares at least `FUZZY_DEDUP_MIN_OVERLAP`
+ * words with the title. Returns a single match for use in the `create()`
+ * dedup guard.
+ * NOTE(review): described as the same algorithm used by `check()`; that
+ * function is not visible here — confirm.
+ *
+ * Candidates are limited to entries with confidence above 0.2 that are
+ * either flagged cross-project or belong to `projectId` (entries with no
+ * project when `projectId` is null).
+ *
+ * @param input.title - Title to look up; an empty FTS query yields null.
+ * @param input.projectId - Project scope, or null for project-less entries.
+ * @param input.excludeId - Optional entry id to leave out of the candidates.
+ * @returns The first matching entry (id + title), or null if no fuzzy match
+ *   or if the lookup throws.
+ */
+export function findFuzzyDuplicate(input: {
+  title: string;
+  projectId: string | null;
+  excludeId?: string;
+}): { id: string; title: string } | null {
+  const q = ftsQueryOr(input.title);
+  if (q === EMPTY_QUERY) return null;
+  const { title: tw, content: cw, category: catw } = config().search.ftsWeights;
+  try {
+    // Build query scoped to the given project (or project-less entries when projectId is null) plus cross-project entries
+    const excludeClause = input.excludeId ? "AND k.id != ?" : "";
+    const sql = input.projectId !== null
+      ? `SELECT k.id, k.title FROM knowledge_fts f
+         CROSS JOIN knowledge k ON k.rowid = f.rowid
+         WHERE knowledge_fts MATCH ?
+         AND (k.project_id = ? OR k.cross_project = 1)
+         AND k.confidence > 0.2
+         ${excludeClause}
+         ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`
+      : `SELECT k.id, k.title FROM knowledge_fts f
+         CROSS JOIN knowledge k ON k.rowid = f.rowid
+         WHERE knowledge_fts MATCH ?
+         AND (k.project_id IS NULL OR k.cross_project = 1)
+         AND k.confidence > 0.2
+         ${excludeClause}
+         ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`;
+    const params: (string | number)[] = input.projectId !== null // bind order mirrors the ? placeholders: MATCH, [project_id], [excludeId], three bm25 weights
+      ? [q, input.projectId, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw]
+      : [q, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw];
+    const candidates = db().query(sql).all(...params) as Array<{ id: string; title: string }>;
+    for (const candidate of candidates) {
+      const { coefficient, intersectionSize } = titleOverlap(input.title, candidate.title);
+      if (coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP) {
+        return candidate;
+      }
+    }
+  } catch {
+    // Intended for FTS5 errors, but any error thrown above is swallowed here — fall through to no match
+  }
+  return null;
+}
 export function forProject(
   projectPath: string,
   includeCross = true,
@@ -418,6 +528,17 @@ export function all(): KnowledgeEntry[] {
     .all() as KnowledgeEntry[];
 }
+/**
+ * Return all cross-project and global (user-level) knowledge entries.
+ *
+ * Selects rows where `project_id IS NULL` or `cross_project = 1`, skipping
+ * entries whose confidence is 0.2 or lower.
+ *
+ * @returns Matching entries ordered by confidence, then `updated_at`, both
+ *   descending.
+ */
+export function crossProject(): KnowledgeEntry[] {
+  return db()
+    .query(
+      `SELECT ${KNOWLEDGE_COLS} FROM knowledge
+       WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2
+       ORDER BY confidence DESC, updated_at DESC`,
+    )
+    .all() as KnowledgeEntry[];
+}
 // LIKE-based fallback for when FTS5 fails unexpectedly.
 function searchLike(input: {
   query: string;
@@ -832,3 +953,195 @@ export function check(projectPath: string): IntegrityIssue[] {
   return issues;
 }
+// ---------------------------------------------------------------------------
+// Deduplication — embedding-based semantic clustering with word-overlap fallback
+// ---------------------------------------------------------------------------
+export type DedupCluster = {
+  surviving: { id: string; title: string }; // the entry kept for this cluster
+  merged: Array<{ id: string; title: string }>; // the other cluster members, removed unless dry-run
+};
+export type DedupResult = {
+  clusters: DedupCluster[]; // one item per duplicate group that was found
+  totalRemoved: number; // sum of merged entries across clusters; counted even when nothing was deleted (dry run)
+};
+/**
+ * Deduplication algorithm applied to a list of knowledge entries.
+ *
+ * Uses two complementary signals with "star" clustering (no transitive
+ * chains) to prevent snowball merging:
+ *
+ * 1. **Title word-overlap** (overlap coefficient on meaningful words, as
+ *    computed by `titleOverlap()`; this is not Jaccard) — catches entries
+ *    with similar titles regardless of content wording.
+ * 2. **Embedding cosine similarity** (when embeddings are available) — catches
+ *    entries with different titles but semantically identical content. Nomic
+ *    v1.5 produces a same-domain spread of 0.46–0.70 for distinct entries,
+ *    making threshold-based dedup viable at 0.935+ (lower thresholds catch
+ *    related-but-distinct entries as false positives, especially via star
+ *    clustering where a hub entry bridges two distinct topics).
+ *
+ * Pairs matching either signal are clustered together. For each cluster,
+ * picks a survivor (highest confidence, then most recently updated, then
+ * shortest title) and removes the rest.
+ */
+/**
+ * Core dedup logic — operates on an arbitrary list of entries.
+ *
+ * @param entries  Candidate entries; fewer than two yields an empty result
+ * @param dryRun   If true, report clusters without deleting anything
+ * @returns        Clusters (largest first) and `totalRemoved`, which counts
+ *                 merged entries even in a dry run
+ */
+function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
+  if (entries.length < 2) return { clusters: [], totalRemoved: 0 };
+  // --- Build neighbor map using title overlap + embedding similarity ---
+  // Two entries are considered neighbors (potential duplicates) if EITHER:
+  //   (a) title word-overlap ≥ 0.7 with ≥ 4 shared words, OR
+  //   (b) embedding cosine similarity ≥ 0.935
+  // Star clustering (no transitivity) prevents snowball merging.
+  // O(n²) pairwise comparison. NOTE(review): the n ≤ 25 (maxEntries) cap cited for this is not enforced in this function — confirm callers bound n.
+  // Load embeddings for the given entries (if available).
+  // We query directly rather than using vectorSearch() because we need
+  // pairwise comparison among entries, not a query-vs-all search.
+  const embeddingMap = new Map<string, Float32Array>();
+  {
+    const entryIds = entries.map((e) => e.id);
+    // Build parameterized IN clause for the entry IDs (one placeholder per entry)
+    const placeholders = entryIds.map(() => "?").join(",");
+    const rows = db()
+      .query(`SELECT id, embedding FROM knowledge WHERE embedding IS NOT NULL AND id IN (${placeholders})`)
+      .all(...entryIds) as Array<{ id: string; embedding: Buffer }>;
+    for (const row of rows) {
+      try {
+        embeddingMap.set(row.id, embedding.fromBlob(row.embedding));
+      } catch {
+        // Skip corrupted embeddings — entry falls back to title-overlap only.
+        log.info(`skipping corrupted embedding for entry ${row.id}`);
+      }
+    }
+  }
+  // Pre-compute neighbors for all pairs (each pair is evaluated from both sides)
+  type DedupHit = { id: string; score: number };
+  const neighborMap = new Map<string, DedupHit[]>();
+  for (const entry of entries) {
+    const neighbors: DedupHit[] = [];
+    const entryVec = embeddingMap.get(entry.id);
+    for (const other of entries) {
+      if (other.id === entry.id) continue;
+      // Signal 1: title word-overlap (both the coefficient and the shared-word count must pass)
+      const { coefficient, intersectionSize } = titleOverlap(entry.title, other.title);
+      const titleMatch = coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP;
+      // Signal 2: embedding cosine similarity (only when both vectors exist and have equal length)
+      let embeddingMatch = false;
+      let similarity = 0;
+      if (entryVec) {
+        const otherVec = embeddingMap.get(other.id);
+        if (otherVec && entryVec.length === otherVec.length) {
+          similarity = embedding.cosineSimilarity(entryVec, otherVec);
+          embeddingMatch = similarity >= EMBEDDING_DEDUP_THRESHOLD;
+        }
+      }
+      if (titleMatch || embeddingMatch) {
+        // Use the stronger signal as the match score for cluster priority; the coefficient counts here even if the title gate itself failed
+        neighbors.push({ id: other.id, score: Math.max(coefficient, similarity) });
+      }
+    }
+    neighbors.sort((a, b) => b.score - a.score);
+    neighborMap.set(entry.id, neighbors);
+  }
+  // Greedy star clustering — process entries with most neighbors first
+  const claimed = new Set<string>();
+  const rawClusters = new Map<string, string[]>();
+  const sortedIds = [...neighborMap.keys()].sort(
+    (a, b) => neighborMap.get(b)!.length - neighborMap.get(a)!.length,
+  );
+  for (const centerId of sortedIds) {
+    if (claimed.has(centerId)) continue;
+    claimed.add(centerId);
+    const members = [centerId];
+    for (const { id: neighborId } of neighborMap.get(centerId)!) {
+      if (claimed.has(neighborId)) continue;
+      claimed.add(neighborId);
+      members.push(neighborId);
+    }
+    if (members.length > 1) {
+      rawClusters.set(centerId, members);
+    }
+  }
+  // Build clusters and pick survivors
+  const entryById = new Map(entries.map((e) => [e.id, e]));
+  const result: DedupCluster[] = [];
+  let totalRemoved = 0;
+  for (const members of rawClusters.values()) {
+    if (members.length < 2) continue;
+    // Pick survivor: highest confidence → most recent → shortest title
+    const sorted = members
+      .map((id) => entryById.get(id)!)
+      .filter(Boolean)
+      .sort((a, b) => {
+        if (b.confidence !== a.confidence) return b.confidence - a.confidence;
+        if (b.updated_at !== a.updated_at) return b.updated_at - a.updated_at;
+        return a.title.length - b.title.length;
+      });
+    const survivor = sorted[0];
+    const merged = sorted.slice(1);
+    result.push({
+      surviving: { id: survivor.id, title: survivor.title },
+      merged: merged.map((e) => ({ id: e.id, title: e.title })),
+    });
+    if (!dryRun) {
+      for (const entry of merged) {
+        remove(entry.id);
+      }
+    }
+    totalRemoved += merged.length;
+  }
+  // Sort clusters by size descending for readability
+  result.sort((a, b) => b.merged.length - a.merged.length);
+  return { clusters: result, totalRemoved };
+}
+export async function deduplicate(
+  projectPath: string,
+  opts?: { dryRun?: boolean },
+): Promise<DedupResult> {
+  const entries = forProject(projectPath, false); // second argument is includeCross = false — presumably leaves cross-project entries out; confirm in forProject()
+  return _dedup(entries, opts?.dryRun ?? true); // dry run unless the caller passes dryRun: false
+}
+/**
+ * Deduplicate global entries — rows whose `project_id` is NULL.
+ *
+ * Entries that belong to a project are not considered here, even when they
+ * are flagged `cross_project = 1`. Rows with a confidence of 0.2 or lower
+ * are skipped.
+ *
+ * @param opts.dryRun   If true (default), report clusters without deleting
+ * @returns             Cluster report and count of removed entries
+ */
+export async function deduplicateGlobal(
+  opts?: { dryRun?: boolean },
+): Promise<DedupResult> {
+  const entries = db()
+    .query(
+      `SELECT ${KNOWLEDGE_COLS} FROM knowledge
+       WHERE project_id IS NULL
+       AND confidence > 0.2
+       ORDER BY confidence DESC, updated_at DESC`,
+    )
+    .all() as KnowledgeEntry[];
+  return _dedup(entries, opts?.dryRun ?? true);
+}

package/src/pattern-extract.ts CHANGED Viewed

@@ -12,6 +12,13 @@
  *   - "prefers X for Y"
  *   - "going with X because Y"
  *
+ * Also matches process instruction patterns from distilled observations
+ * where the observer normalizes user assertions:
+ *   - "User stated always X"
+ *   - "User said never Y"
+ *   - "User stated make sure to X"
+ *   - "User stated don't forget to X"
+ *
  * Extracted entries participate in the normal curator cycle — the curator
  * can consolidate or remove them based on actual value. The extraction is
  * a cheap seed, not a permanent fixture.
@@ -76,6 +83,33 @@ const PATTERNS: PatternDef[] = [
     category: "preference",
     titleFn: (m) => `Typically uses ${m[1].trim()}`,
   },
+  // Process instruction patterns — match distilled observations recording
+  // user assertions about workflow/process rules. The distillation observer
+  // normalizes user instructions into "User stated always X" phrasing.
+  // These require "stated/asserted/said" to avoid overlapping with the
+  // existing "typically uses" pattern above (which already handles
+  // "user always use/prefer/go with X").
+  {
+    regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?always (.+?)(?:\.|,|$)/gi,
+    category: "preference",
+    titleFn: (m) => `Always ${m[1].trim()}`,
+  },
+  {
+    regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?never (.+?)(?:\.|,|$)/gi,
+    category: "preference",
+    titleFn: (m) => `Never ${m[1].trim()}`,
+  },
+  {
+    regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?make sure to (.+?)(?:\.|,|$)/gi,
+    category: "preference",
+    titleFn: (m) => `Make sure to ${m[1].trim()}`,
+  },
+  {
+    regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?(?:don't|do not) forget (?:to )?(.+?)(?:\.|,|$)/gi,
+    category: "preference",
+    titleFn: (m) => `Always ${m[1].trim()}`,
+  },
 ];
 /**
@@ -96,6 +130,13 @@ export function extractPatterns(observations: string): ExtractedPattern[] {
     regex.lastIndex = 0;
     let match: RegExpMatchArray | null;
     while ((match = regex.exec(observations)) !== null) {
+      // Skip false positives: template placeholders (e.g. "X", "Y"),
+      // quoted fragments, or very short captures that are clearly not
+      // real technology/tool names. Plain apostrophes (') are allowed
+      // since they appear in valid names like "Bun's test runner".
+      const captures = match.slice(1);
+      if (captures.some((c) => c && (c.trim().length <= 2 || /["\u201C\u201D`\u2018\u2019]/.test(c)))) continue;
       const title = titleFn(match);
       const key = title.toLowerCase();
       if (seen.has(key)) continue;

package/src/prompt.ts CHANGED Viewed

@@ -222,6 +222,10 @@ Focus ONLY on knowledge that helps a coding agent work effectively on THIS codeb
 - Environment/tooling setup details that affect development
 - Important relationships between components that aren't obvious from reading the code
 - User preferences and working style specific to how they use this project
+- Repeated user instructions — when the user says things like "always", "never",
+  "make sure to", "don't forget to", these are high-value preference candidates.
+  If you see instruction-like language, prioritize extracting it as a "preference" entry.
+  These instructions represent how the user wants to work and should persist across sessions.
 Do NOT extract:
 - Task-specific details (file currently being edited, current bug being fixed)
@@ -316,7 +320,9 @@ IMPORTANT:
 2. When updating, REPLACE the content with a complete rewrite — never append.
 3. If entries cover the same system from different angles, merge them: update one, delete the rest.
 4. Only create a new entry for genuinely distinct knowledge with no existing home.
-5. Keep all entries under 150 words. If an existing entry is too long, use an update op to trim it.`;
+5. Keep all entries under 150 words. If an existing entry is too long, use an update op to trim it.
+6. Pay special attention to user instructions ("always do X", "never do Y", "make sure to X").
+   These are strong signals for "preference" entries with high confidence.`;
 }
 /**

package/src/recall.ts CHANGED Viewed

@@ -475,14 +475,27 @@ export async function searchRecall(
     }
   }
+  // Determine vector boost weight: for queries with enough meaningful terms,
+  // boost vector search lists so semantic similarity outweighs keyword noise.
+  const queryTermCount = filterTerms(query).length;
+  const vectorWeight =
+    queryTermCount >= (searchConfig?.vectorBoostMinTerms ?? 3)
+      ? (searchConfig?.vectorBoostWeight ?? 1.5)
+      : 1;
   // Collect per-query RRF lists. Original query is always first; if expansion
   // produced extras, we still weight the original twice by adding both original
   // and expanded lists (RRF naturally weights items appearing in more lists).
   const allRrfLists: Array<{
     items: TaggedResult[];
     key: (r: TaggedResult) => string;
+    weight?: number;
   }> = [];
+  // Track where primary (first-query) lists end so the MAX_RRF_LISTS cap
+  // trims expanded-query lists first, preserving vector/supplemental lists.
+  let primaryListEnd = 0;
   for (const q of queries) {
     const knowledgeResults: ltm.ScoredKnowledgeEntry[] = [];
     if (knowledgeEnabled && scope !== "session") {
@@ -568,7 +581,15 @@ export async function searchRecall(
         key: (r) => `t:${r.item.id}`,
       });
     }
+    // Mark the end of the first (original) query's lists. Supplemental lists
+    // (vector, lat.md, cross-project, quality, exact-match) are appended after
+    // the loop and should be preserved over expanded-query lists when capping.
+    if (primaryListEnd === 0) {
+      primaryListEnd = allRrfLists.length;
+    }
   }
+  const perQueryListEnd = allRrfLists.length;
   // Vector search on the original query (not expansions — avoid redundant embeds).
   if (embedding.isAvailable() && scope !== "session") {
@@ -593,6 +614,7 @@ export async function searchRecall(
           allRrfLists.push({
             items: vectorTagged,
             key: (r) => `k:${r.item.id}`,
+            weight: vectorWeight,
           });
         }
       }
@@ -618,6 +640,7 @@ export async function searchRecall(
           allRrfLists.push({
             items: distVectorTagged,
             key: (r) => `d:${r.item.id}`,
+            weight: vectorWeight,
           });
         }
       }
@@ -648,6 +671,7 @@ export async function searchRecall(
           allRrfLists.push({
             items: temporalVectorTagged,
             key: (r) => `t:${r.item.id}`,
+            weight: vectorWeight,
           });
         }
       }
@@ -786,6 +810,25 @@ export async function searchRecall(
     }
   }
+  // Cap the number of RRF lists to prevent score inflation from marginal items.
+  // With query expansion (3 queries × 4 sources + supplemental lists), the list
+  // count can exceed 15. Every extra list adds to an item's cumulative RRF score,
+  // so marginal items appearing in many lists can clear the relevance floor.
+  //
+  // Priority: primary (original query BM25 + recency) and supplemental
+  // (vector, lat.md, cross-project, quality, exact-match) are high-value.
+  // Expanded-query BM25 lists are lowest priority — trim those first.
+  const MAX_RRF_LISTS = 10;
+  if (allRrfLists.length > MAX_RRF_LISTS) {
+    // Layout: [0..primaryListEnd) = primary, [primaryListEnd..perQueryListEnd) = expanded, [perQueryListEnd..) = supplemental
+    const primary = allRrfLists.slice(0, primaryListEnd);
+    const expanded = allRrfLists.slice(primaryListEnd, perQueryListEnd);
+    const supplemental = allRrfLists.slice(perQueryListEnd);
+    const budget = Math.max(0, MAX_RRF_LISTS - primary.length - supplemental.length);
+    allRrfLists.length = 0;
+    allRrfLists.push(...primary, ...expanded.slice(0, budget), ...supplemental);
+  }
   const fused = reciprocalRankFusion<TaggedResult>(allRrfLists);
   // Cap output: return at most 3x the per-source limit. With 7+ RRF sources
@@ -885,11 +928,6 @@ export async function runRecall(input: RecallInput): Promise<RecallResult> {
     return recallById(input.id);
   }
-  // Short-circuit vague queries — stopwords-only would match everything.
-  if (ftsQuery(input.query) === EMPTY_QUERY) {
-    return "Query too vague — try using specific keywords, file names, or technical terms.";
-  }
   const fused = await searchRecall(input);
   const recallCfg = input.searchConfig?.recall;
   return formatFusedResults(fused, {

package/src/search.ts CHANGED Viewed

@@ -302,29 +302,31 @@ export function normalizeRank(
 /**
  * Reciprocal Rank Fusion: merge multiple ranked lists into a single ranked list.
  *
- * RRF score = Σ(1 / (k + rank_i)) for each list where the item appears.
+ * RRF score = Σ(weight / (k + rank_i)) for each list where the item appears.
  * k = 60 is standard (from Cormack et al., 2009; also used by QMD).
  *
  * RRF is rank-based, not score-based — raw score magnitude differences across
  * different FTS5 tables don't matter. Only relative ordering within each list.
  *
- * @param lists  Each list provides items (in ranked order) and a key function
- *               for deduplication. Items at the front of the array are rank 0.
+ * @param lists  Each list provides items (in ranked order), a key function
+ *               for deduplication, and an optional weight (default 1).
+ *               Items at the front of the array are rank 0.
  * @param k      Smoothing constant. Default 60.
  * @returns      Fused list sorted by RRF score descending. When items appear
  *               in multiple lists, the first occurrence's item is kept.
  */
 export function reciprocalRankFusion<T>(
-  lists: Array<{ items: T[]; key: (item: T) => string }>,
+  lists: Array<{ items: T[]; key: (item: T) => string; weight?: number }>,
   k = 60,
 ): Array<{ item: T; score: number }> {
   const scores = new Map<string, { item: T; score: number }>();
   for (const list of lists) {
+    const w = list.weight ?? 1;
     for (let rank = 0; rank < list.items.length; rank++) {
       const item = list.items[rank];
       const id = list.key(item);
-      const rrfScore = 1 / (k + rank);
+      const rrfScore = w / (k + rank);
       const existing = scores.get(id);
       if (existing) {
         existing.score += rrfScore;

package/src/temporal.ts CHANGED Viewed

@@ -171,6 +171,8 @@ export function markDistilled(ids: string[]) {
     .run(...ids);
 }
+// Only searches undistilled messages — distilled content is already represented
+// in distillation search results and would duplicate/dilute temporal hits.
 // LIKE-based fallback for when FTS5 fails unexpectedly.
 function searchLike(input: {
   pid: string;
@@ -186,8 +188,8 @@ function searchLike(input: {
   const conditions = terms.map(() => "LOWER(content) LIKE ?").join(" AND ");
   const likeParams = terms.map((t) => `%${t}%`);
   const query = input.sessionID
-    ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?`
-    : `SELECT * FROM temporal_messages WHERE project_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?`;
+    ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?`
+    : `SELECT * FROM temporal_messages WHERE project_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?`;
   const params = input.sessionID
     ? [input.pid, input.sessionID, ...likeParams, input.limit]
     : [input.pid, ...likeParams, input.limit];
@@ -208,11 +210,11 @@ export function search(input: {
   const ftsSQL = input.sessionID
     ? `SELECT m.* FROM temporal_fts f
        CROSS JOIN temporal_messages m ON m.rowid = f.rowid
-       WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ?
+       WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0
        ORDER BY rank LIMIT ?`
     : `SELECT m.* FROM temporal_fts f
        CROSS JOIN temporal_messages m ON m.rowid = f.rowid
-       WHERE f.content MATCH ? AND m.project_id = ?
+       WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0
        ORDER BY rank LIMIT ?`;
   try {
@@ -251,11 +253,11 @@ export function searchScored(input: {
   const ftsSQL = input.sessionID
     ? `SELECT m.*, rank FROM temporal_fts f
        CROSS JOIN temporal_messages m ON m.rowid = f.rowid
-       WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ?
+       WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0
        ORDER BY rank LIMIT ?`
     : `SELECT m.*, rank FROM temporal_fts f
        CROSS JOIN temporal_messages m ON m.rowid = f.rowid
-       WHERE f.content MATCH ? AND m.project_id = ?
+       WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0
        ORDER BY rank LIMIT ?`;
   try {

package/src/types.ts CHANGED Viewed

@@ -183,7 +183,7 @@ export type LoreMessageWithParts = {
  * Host adapters implement this:
  * - OpenCode: wraps `client.session.create()` + `client.session.prompt()`
  * - Pi: wraps `complete()` from `@mariozechner/pi-ai`
- * - Standalone: direct `fetch()` to provider APIs
+ * - Gateway: direct `fetch()` to provider APIs
  */
 export interface LLMClient {
   /**