npm - @betterdb/memory - Versions diffs - 0.2.0 → 0.4.0 - Mend

@betterdb/memory 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +46 -6
package/package.json +3 -1
package/scripts/aging-worker.ts +4 -1
package/scripts/setup-index.ts +10 -3
package/src/client/memory-store.ts +406 -0
package/src/client/model.ts +10 -10
package/src/client/providers/local.ts +58 -0
package/src/client/valkey.ts +9 -0
package/src/config.ts +25 -2
package/src/hooks/pre-tool.ts +10 -10
package/src/hooks/session-end.ts +4 -2
package/src/hooks/session-start.ts +22 -10
package/src/index.ts +318 -21
package/src/mcp/server.ts +62 -42
package/src/memory/aging.ts +78 -196
package/src/memory/recall.ts +169 -0
package/src/memory/retrieval.ts +73 -70

package/README.md CHANGED Viewed

@@ -54,7 +54,7 @@ docker run -d --name betterdb-valkey -p 6379:6379 -v betterdb-valkey-data:/data
 ### MCP Tools
 Claude can use these mid-conversation:
-- `search_context` — Semantic search over past sessions
+- `search_context` — Semantic search over past sessions. Escalates project+branch → project → cross-project, and takes an optional `tags` filter (`decision`, `pattern`, `problem`, `open-thread`)
 - `store_insight` — Save a decision, pattern, or warning
 - `list_open_threads` — Show unresolved items
 - `forget` — Delete a specific memory
@@ -63,9 +63,11 @@ Claude can use these mid-conversation:
 ```bash
 bunx @betterdb/memory install        # Set up hooks + MCP server
-bunx @betterdb/memory status         # Check health
+bunx @betterdb/memory status         # Check health + recall scoring config
 bunx @betterdb/memory uninstall      # Remove everything
 bunx @betterdb/memory maintain       # Run aging/compression manually
+bunx @betterdb/memory forget         # Bulk-delete by scope (dry run; --apply to delete)
+                                     #   --project <name> | --all-projects --branch <b> --tags <a,b>
 bunx @betterdb/memory docker-valkey  # Manage Docker Valkey container
 ```
@@ -80,23 +82,62 @@ Copy `.env.example` to `.env` and fill in your values before running `bunx @bett
 | `BETTERDB_VALKEY_URL` | `redis://localhost:6379` | Valkey connection URL |
 | `BETTERDB_VALKEY_INDEX_NAME` | `betterdb-memory-index` | Valkey search index name |
 | `BETTERDB_EMBED_DIM` | `1024` | Embedding dimensions |
-| `BETTERDB_MAX_CONTEXT_MEMORIES` | `5` | Memories injected per session |
+| `BETTERDB_MAX_CONTEXT_MEMORIES` | `5` | Max memories injected per session (after gating) |
 | `BETTERDB_CONTEXT_FILE` | `.betterdb_context.md` | Context injection file |
 | `BETTERDB_ALLOW_REMOTE_FALLBACK` | `true` | Fall back to remote APIs if local models unavailable |
+#### Recall Gating
+Recall over-fetches a candidate pool, gates it by relevance, and escalates on a
+miss (project+branch → project → cross-project). Memories are stored with their
+git branch as a native thread scope and content-type tags, so recall can narrow
+to the current branch first and filter by type. `search_context` returns nothing
+only when nothing clears the bar — so a miss is honest, not a silent drop.
+The gate is **relative**, not an absolute similarity threshold: embed models
+compress cosine similarity into different, narrow bands (mxbai-embed-large packs
+everything into ~0.7–0.88), so a fixed threshold doesn't transfer across models.
+Instead, `floor` drops genuine noise, and hits within `margin` of the top match
+are kept; confidence comes from the scale-independent top-vs-next gap.
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `BETTERDB_RECALL_FLOOR` | `0.5` | Similarity floor — drops noise and loosens the store's own distance gate |
+| `BETTERDB_RECALL_MARGIN` | `0.05` | Keep hits within this similarity of the top match |
+| `BETTERDB_RECALL_SEPARATION` | `0.04` | Top-vs-next gap above which a match is "high" confidence |
+| `BETTERDB_RECALL_POOL_K` | `10` | Rung-1 over-fetch pool (project) |
+| `BETTERDB_RECALL_POOL_K_WIDE` | `20` | Rung-2/3 over-fetch pool (wider / cross-project) |
+| `BETTERDB_ALLOW_CROSS_PROJECT` | `true` | Allow escalation / `scope="all"` to search across projects |
+Ranking within the gated pool uses a composite score (similarity + recency +
+importance), owned by `@betterdb/agent-memory`. Recency is the system's single
+time-decay — a half-life applied at query time, not a stored per-memory aging
+pass. These knobs tune it; defaults match the store's.
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `BETTERDB_RECALL_HALF_LIFE_DAYS` | `7` | Age at which a memory's recency term halves |
+| `BETTERDB_RECALL_WEIGHT_SIMILARITY` | `0.6` | Weight of semantic similarity in the composite score |
+| `BETTERDB_RECALL_WEIGHT_RECENCY` | `0.25` | Weight of recency |
+| `BETTERDB_RECALL_WEIGHT_IMPORTANCE` | `0.15` | Weight of stored importance |
 #### Model Providers
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `BETTERDB_EMBED_PROVIDER` | auto-detect | Force embed provider: `ollama`, `voyage`, `openai`, `groq`, `together` |
+| `BETTERDB_EMBED_PROVIDER` | auto-detect | Force embed provider: `local`, `ollama`, `voyage`, `openai`, `groq`, `together` |
 | `BETTERDB_SUMMARIZE_PROVIDER` | auto-detect | Force summarize provider: `ollama`, `anthropic`, `openai`, `groq`, `together` |
 | `BETTERDB_EMBED_MODEL` | `mxbai-embed-large` | Ollama embedding model name |
 | `BETTERDB_SUMMARIZE_MODEL` | `mistral:7b` | Ollama summarization model name |
 | `BETTERDB_OLLAMA_URL` | `http://localhost:11434` | Ollama API URL |
+#### Embeddings work with zero config
+If no embedding provider is detected (no Ollama models, no API keys), BetterDB falls back to **on-device embeddings** via `@xenova/transformers` (`all-MiniLM-L6-v2`, 384-dim, Apache-2.0). No API key, no running service — the model weights download once on first use and are cached thereafter. Auto-detected providers (Ollama, then API keys) take priority when available.
 #### API Keys
-At least one embedding provider and one summarization provider must be available. Ollama is free and local; the others require API keys.
+Embeddings always work (on-device fallback above). A summarization provider is still required — Ollama is free and local; the others require API keys.
 | Variable | Provider | Used for |
 |----------|----------|----------|
@@ -110,7 +151,6 @@ At least one embedding provider and one summarization provider must be available
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `BETTERDB_DECAY_RATE` | `0.95` | Memory importance decay per day |
 | `BETTERDB_COMPRESS_THRESHOLD` | `0.3` | Importance threshold for compression |
 | `BETTERDB_DISTILL_MIN_SESSIONS` | `5` | Min sessions before knowledge distillation |
 | `BETTERDB_AGING_INTERVAL_HOURS` | `6` | Hours between automatic aging runs |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@betterdb/memory",
-  "version": "0.2.0",
+  "version": "0.4.0",
   "description": "BetterDB Memory for Claude Code — Valkey-powered persistent memory across sessions",
   "license": "MIT",
   "author": "BetterDB Inc. <hello@betterdb.com>",
@@ -46,9 +46,11 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
+    "@betterdb/agent-memory": "^0.2.1",
     "iovalkey": "^0.2.1",
     "ollama": "^0.5.14",
     "@modelcontextprotocol/sdk": "^1.12.1",
+    "@xenova/transformers": "^2.17.2",
     "zod": "^3.24.4",
     "zod-to-json-schema": "^3.24.5",
     "@anthropic-ai/sdk": "latest"

package/scripts/aging-worker.ts CHANGED Viewed

@@ -7,16 +7,19 @@
  *   bun run scripts/aging-worker.ts
  */
 import { getValkeyClient } from "../src/client/valkey.js";
+import { getPluginMemoryStore } from "../src/client/memory-store.js";
 import { createModelClient } from "../src/client/model.js";
 import { AgingPipeline } from "../src/memory/aging.js";
 try {
   const valkeyClient = await getValkeyClient();
   const modelClient = await createModelClient();
+  const store = await getPluginMemoryStore((t) => modelClient.embed(t));
-  const pipeline = new AgingPipeline(valkeyClient, modelClient);
+  const pipeline = new AgingPipeline(valkeyClient, store, modelClient);
   await pipeline.runFullPipeline();
+  await store.close();
   await valkeyClient.quit();
 } catch (err) {
   console.error("[betterdb] Aging worker failed:", err);

package/scripts/setup-index.ts CHANGED Viewed

@@ -1,14 +1,21 @@
 #!/usr/bin/env bun
 import { getValkeyClient } from "../src/client/valkey.js";
+import { getPluginMemoryStore } from "../src/client/memory-store.js";
 import { createModelClient } from "../src/client/model.js";
-import { config } from "../src/config.js";
 const client = await getValkeyClient();
 const modelClient = await createModelClient();
-await client.ensureIndex(modelClient.embedDim, modelClient.preset.embedModel);
-console.log("Index ready:", config.valkey.indexName);
+// Create the episodic vector index that MemoryStore reads/writes
+// (betterdb:mem:idx) — the same one `install` builds. Record the active
+// provider/dimension first so a later provider swap is caught.
+await client.assertEmbedDim(modelClient.embedDim, modelClient.preset.embedModel);
+const store = await getPluginMemoryStore((t) => modelClient.embed(t));
+await store.ensureIndex();
+console.log("Index ready: betterdb:mem:idx");
 console.log("Embedding dimension:", modelClient.embedDim);
 console.log("Preset:", modelClient.preset.embedModel, "/", modelClient.preset.summarizeModel);
+await store.close();
 await client.quit();

package/src/client/memory-store.ts ADDED Viewed

@@ -0,0 +1,406 @@
+import {
+  MemoryStore,
+  similarityFromDistance,
+  type ConsolidateOptions,
+  type ConsolidateResult,
+  type EmbedFn,
+  type MemoryItem,
+  type MemoryScope,
+  type MemoryStats,
+  type MemoryStoreClient,
+} from "@betterdb/agent-memory";
+import {
+  EpisodicMemorySchema,
+  type EpisodicMemory,
+} from "../memory/schema.js";
+import { getValkeyClient } from "./valkey.js";
+import { config } from "../config.js";
+const SECONDS_PER_DAY = 86400; // 24 * 60 * 60; converts the configured half-life from days to seconds
+// Store name fixes the index (`betterdb:mem:idx`) and key prefix
+// (`betterdb:mem:{id}`) that @betterdb/agent-memory derives internally.
+const STORE_NAME = "betterdb";
+// --- EpisodicMemory <-> MemoryItem mapping ---
+//
+// agent-memory's MemoryItem is flat (content + importance + tags + scope),
+// while the plugin's EpisodicMemory carries a structured `summary` plus
+// `branch` and an original `timestamp`. The text handed to the store for
+// embedding is built by `buildEmbedText` (the one-line summary plus decisions,
+// patterns, solved problems, and open threads), and the structured `summary`,
+// `branch`, and `timestamp` are stashed as JSON in the free-form `source`
+// field so the full memory can be reconstructed. The remaining fields map
+// directly:
+//   project         -> namespace
+//   branch          -> threadId       (also kept in `source`)
+//   summary         -> tags           (derived by `memoryTags`)
+//   importanceScore -> importance
+//   accessCount     -> accessCount    (tracked natively, bumped on recall)
+//   lastAccessed    -> lastAccessedAt (tracked natively)
+interface SourcePayload { // JSON shape that episodicToSource writes into MemoryItem.source
+  summary: EpisodicMemory["summary"]; // the full structured session summary
+  branch: string; // copied verbatim from EpisodicMemory.branch
+  timestamp: string; // copied verbatim from EpisodicMemory.timestamp
+}
+/** A recalled memory plus the relevance (for gating) and composite score (for ranking) computed for it. */
+export interface ScoredMemory {
+  memory: EpisodicMemory; // rebuilt from the stored item via itemToEpisodic
+  /** Cosine similarity to the query, 0..1 (higher = more relevant); carries no recency or importance term. */
+  relevance: number;
+  /** Composite recall score (similarity + recency + importance), as reported by the store. */
+  score: number;
+}
+/**
+ * Serialize the parts of an episodic memory that MemoryItem has no native
+ * field for — `summary`, `branch`, and `timestamp` — into the JSON string
+ * stored in the item's `source`.
+ *
+ * @param memory The episodic memory being stored.
+ * @returns JSON text with keys `summary`, `branch`, `timestamp`, in that order.
+ */
+export function episodicToSource(memory: EpisodicMemory): string {
+  const { summary, branch, timestamp } = memory;
+  const payload: SourcePayload = { summary, branch, timestamp };
+  return JSON.stringify(payload);
+}
+/**
+ * Derive content-type tags from the summary sections a memory actually fills.
+ * The tags are stored natively rather than inside the opaque `source` blob so
+ * recall can filter on them — e.g. only decisions, or only unresolved open
+ * threads.
+ *
+ * @param memory The episodic memory to classify.
+ * @returns Zero or more of `decision`, `pattern`, `problem`, `open-thread`,
+ *   always in that order.
+ */
+export function memoryTags(memory: EpisodicMemory): string[] {
+  const { decisions, patterns, problemsSolved, openThreads } = memory.summary;
+  // One row per tag: the tag name and whether its summary section is non-empty.
+  const sections: Array<[tag: string, populated: boolean]> = [
+    ["decision", decisions.length > 0],
+    ["pattern", patterns.length > 0],
+    ["problem", problemsSolved.length > 0],
+    ["open-thread", openThreads.length > 0],
+  ];
+  return sections.filter(([, populated]) => populated).map(([tag]) => tag);
+}
+/**
+ * Build the text that gets embedded for a memory: the one-line summary
+ * followed by one labelled line per non-empty section (decisions, patterns,
+ * problems solved, open threads). Embedding only `oneLineSummary` hid that
+ * structured detail from recall, so it is folded into the vector here.
+ * `filesChanged` is deliberately left out: bare file paths are generic and
+ * dominate the similarity band with noise.
+ *
+ * @param memory The episodic memory to render.
+ * @returns Newline-joined text; entries within a section are joined by "; ".
+ */
+export function buildEmbedText(memory: EpisodicMemory): string {
+  const { oneLineSummary, decisions, patterns, problemsSolved, openThreads } =
+    memory.summary;
+  const lines: string[] = [oneLineSummary];
+  // Append "<label>: a; b; c" only when the section has at least one entry.
+  const addSection = (label: string, entries: readonly string[]): void => {
+    if (entries.length > 0) lines.push(`${label}: ${entries.join("; ")}`);
+  };
+  addSection("Decisions", decisions);
+  addSection("Patterns", patterns);
+  addSection(
+    "Problems solved",
+    problemsSolved.map((p) => `${p.problem} → ${p.resolution}`),
+  );
+  addSection("Open threads", openThreads);
+  return lines.join("\n");
+}
+/**
+ * Rebuild an EpisodicMemory from a stored MemoryItem.
+ *
+ * When the item's `source` holds the JSON payload written by
+ * `episodicToSource`, the summary, branch, and timestamp come from it.
+ * Otherwise a minimal memory is synthesized from the item's content.
+ *
+ * @param item The stored item to convert.
+ * @returns The reconstructed memory, or `null` when the result does not pass
+ *   `EpisodicMemorySchema` or one of the item's dates cannot be represented
+ *   as an ISO string.
+ */
+export function itemToEpisodic(item: MemoryItem): EpisodicMemory | null {
+  // `Date.prototype.toISOString` throws a RangeError on an invalid date. One
+  // malformed item must not abort a whole recall/list, so treat it like any
+  // other unparseable item and report null instead.
+  const toIso = (value: number | string | Date): string | null => {
+    const date = new Date(value);
+    return Number.isNaN(date.getTime()) ? null : date.toISOString();
+  };
+  const lastAccessed = toIso(item.lastAccessedAt);
+  if (lastAccessed === null) return null;
+  let summary: EpisodicMemory["summary"];
+  let branch: string;
+  let timestamp: string;
+  const payload = parseSourcePayload(item.source);
+  if (payload) {
+    summary = payload.summary;
+    branch = payload.branch;
+    timestamp = payload.timestamp;
+  } else {
+    // A flat item with no SourcePayload — e.g. a memory produced by
+    // MemoryStore.consolidate(), whose `source` is its own marker, not our
+    // JSON. Synthesize a minimal episodic memory from the content so merged
+    // summaries stay first-class for recall, listing, and injection.
+    const createdAt = toIso(item.createdAt);
+    if (createdAt === null) return null;
+    summary = {
+      decisions: [],
+      patterns: [],
+      problemsSolved: [],
+      openThreads: [],
+      filesChanged: [],
+      oneLineSummary: item.content,
+    };
+    branch = "consolidated";
+    timestamp = createdAt;
+  }
+  // The schema is the final gate: a payload with a missing or mistyped field
+  // fails here and the item is dropped rather than surfaced half-formed.
+  const parsed = EpisodicMemorySchema.safeParse({
+    memoryId: item.id,
+    project: item.namespace ?? "unknown",
+    branch,
+    timestamp,
+    summary,
+    importanceScore: item.importance,
+    accessCount: item.accessCount,
+    lastAccessed,
+  });
+  return parsed.success ? parsed.data : null;
+}
+/**
+ * Decode a MemoryItem `source` string into a SourcePayload.
+ *
+ * @param source The raw `source` value; may be absent or not JSON at all.
+ * @returns The decoded payload when `source` is JSON for an object with a
+ *   truthy `summary`; otherwise `null`. `branch` and `timestamp` are not
+ *   checked here.
+ */
+function parseSourcePayload(source: string | undefined): SourcePayload | null {
+  if (!source) return null;
+  let candidate: Partial<SourcePayload>;
+  try {
+    candidate = JSON.parse(source) as Partial<SourcePayload>;
+  } catch {
+    // Not JSON — e.g. a marker string written by something other than us.
+    return null;
+  }
+  const looksLikePayload =
+    Boolean(candidate) &&
+    typeof candidate === "object" &&
+    Boolean(candidate.summary);
+  return looksLikePayload ? (candidate as SourcePayload) : null;
+}
+// --- Adapter ---
+//
+// Drop-in replacement for the episodic-vector subset of ValkeyClient, backed
+// by @betterdb/agent-memory's MemoryStore. Knowledge entries and work queues
+// stay on the existing ValkeyClient — they have no MemoryStore analog.
+export class PluginMemoryStore {
+  /** Page size used when walking MemoryStore.list. */
+  private static readonly LIST_PAGE_SIZE = 100;
+  private readonly store: MemoryStore;
+  /**
+   * @param client Connection the underlying MemoryStore issues commands on.
+   * @param embed Embedding function; optional for callers that only
+   *   list/get/delete.
+   */
+  constructor(client: MemoryStoreClient, embed?: EmbedFn) {
+    this.store = new MemoryStore({
+      client,
+      name: STORE_NAME,
+      embedFn: embed,
+      // Composite-score decay/blend from plugin config. This is the single
+      // time-decay in the system (recency, applied at query time) — there is
+      // no separate importance-aging pass. configRefresh:false keeps these
+      // values fixed rather than letting a Valkey config key override them.
+      halfLifeSeconds: config.recall.halfLifeDays * SECONDS_PER_DAY,
+      weights: {
+        similarity: config.recall.weightSimilarity,
+        recency: config.recall.weightRecency,
+        importance: config.recall.weightImportance,
+      },
+      // The plugin owns its own analytics/discovery story; keep the store quiet
+      // and offline so it pulls in no posthog/otel network behavior.
+      discovery: false,
+      configRefresh: false,
+      analytics: { disabled: true },
+    });
+  }
+  /** Create the `betterdb:mem:idx` vector index if absent (idempotent). */
+  ensureIndex(): Promise<void> {
+    return this.store.ensureIndex();
+  }
+  /**
+   * Store an episodic memory and return its generated id. The vector is derived
+   * from {@link buildEmbedText} (summary + structured detail) inside
+   * MemoryStore — callers no longer precompute an embedding. The full episodic
+   * memory is preserved in `source` for reconstruction; the embed text only
+   * shapes the vector.
+   */
+  storeMemory(memory: EpisodicMemory): Promise<string> {
+    return this.store.remember(buildEmbedText(memory), {
+      importance: memory.importanceScore,
+      namespace: memory.project,
+      // Branch as the native thread scope; content-type tags for filtered
+      // recall. Both are queryable, unlike the free-form `source` payload.
+      threadId: memory.branch,
+      tags: memoryTags(memory),
+      source: episodicToSource(memory),
+    });
+  }
+  /**
+   * KNN recall ranked by MemoryStore's composite score. Unlike the raw store,
+   * this returns each memory *with* its relevance so callers can gate on it —
+   * `relevance` is cosine similarity (0..1, higher = closer) derived from the
+   * hit's raw distance; `score` is the composite (similarity + recency +
+   * importance). Omit `project` to search across all namespaces; pass `branch`
+   * to scope to a git branch (native thread) and `tags` to filter by
+   * content type. Hits that cannot be rebuilt into an EpisodicMemory are
+   * dropped. `reinforce` defaults to true when omitted.
+   */
+  async recall(
+    query: string,
+    opts: {
+      project?: string;
+      branch?: string;
+      tags?: string[];
+      k: number;
+      threshold?: number;
+      reinforce?: boolean;
+    },
+  ): Promise<ScoredMemory[]> {
+    const hits = await this.store.recall(query, {
+      ...(opts.project !== undefined ? { namespace: opts.project } : {}),
+      ...(opts.branch !== undefined ? { threadId: opts.branch } : {}),
+      ...(opts.tags !== undefined && opts.tags.length > 0
+        ? { tags: opts.tags }
+        : {}),
+      k: opts.k,
+      ...(opts.threshold !== undefined ? { threshold: opts.threshold } : {}),
+      reinforce: opts.reinforce ?? true,
+    });
+    const out: ScoredMemory[] = [];
+    for (const hit of hits) {
+      const memory = itemToEpisodic(hit.item);
+      if (memory) {
+        out.push({
+          memory,
+          score: hit.score,
+          relevance: similarityFromDistance(hit.similarity),
+        });
+      }
+    }
+    return out;
+  }
+  /** KNN recall from a precomputed embedding (see {@link recall}). */
+  async searchMemories(
+    embedding: number[],
+    project: string,
+    topK: number,
+  ): Promise<EpisodicMemory[]> {
+    const hits = await this.store.recallByVector(embedding, {
+      namespace: project,
+      k: topK,
+    });
+    return hits
+      .map((hit) => itemToEpisodic(hit.item))
+      .filter((m): m is EpisodicMemory => m !== null);
+  }
+  /**
+   * Walk MemoryStore.list page by page, yielding every item that matches
+   * `filter`. Stops on an empty page or once the running offset reaches the
+   * reported total. Shared by {@link listMemories} and {@link listByScope} so
+   * both paginate identically.
+   */
+  private async *scanItems(filter: {
+    namespace?: string;
+    threadId?: string;
+    tags?: string[];
+  }): AsyncGenerator<MemoryItem, void, undefined> {
+    let offset = 0;
+    for (;;) {
+      const { items, total } = await this.store.list({
+        ...filter,
+        limit: PluginMemoryStore.LIST_PAGE_SIZE,
+        offset,
+      });
+      if (items.length === 0) return;
+      for (const item of items) yield item;
+      offset += items.length;
+      if (offset >= total) return;
+    }
+  }
+  /**
+   * List stored memories, optionally scoped to `project` and filtered by a
+   * minimum importance. Paginates through MemoryStore.list so callers that
+   * scan all memories (open-thread aggregation, distillation) get the full set.
+   * Pass `max` to stop early once that many matches are collected, so callers
+   * that only need a bounded slice don't materialize the whole store.
+   */
+  async listMemories(
+    project?: string,
+    minImportance?: number,
+    max?: number,
+  ): Promise<EpisodicMemory[]> {
+    const out: EpisodicMemory[] = [];
+    // Leave `namespace` out entirely when no project is given, matching how
+    // recall and listByScope build their filters.
+    const filter = project !== undefined ? { namespace: project } : {};
+    for await (const item of this.scanItems(filter)) {
+      const memory = itemToEpisodic(item);
+      if (!memory) continue;
+      if (minImportance !== undefined && memory.importanceScore < minImportance) {
+        continue;
+      }
+      out.push(memory);
+      if (max !== undefined && out.length >= max) return out;
+    }
+    return out;
+  }
+  /**
+   * List memories matching a scope (project namespace, branch thread, and/or
+   * content-type tags) using the SAME native index filter as
+   * {@link forgetByScope} — so a `listByScope` preview is exactly the set a
+   * `forgetByScope` with the same scope would delete. {@link listMemories}
+   * narrows only by project and importance; this method additionally filters
+   * on the natively stored branch and tags, so memories stored before native
+   * tagging are matched identically by the preview and the delete.
+   */
+  async listByScope(scope: {
+    project?: string;
+    branch?: string;
+    tags?: string[];
+  }): Promise<EpisodicMemory[]> {
+    const out: EpisodicMemory[] = [];
+    const filter = {
+      ...(scope.project !== undefined ? { namespace: scope.project } : {}),
+      ...(scope.branch !== undefined ? { threadId: scope.branch } : {}),
+      ...(scope.tags !== undefined && scope.tags.length > 0
+        ? { tags: scope.tags }
+        : {}),
+    };
+    for await (const item of this.scanItems(filter)) {
+      const memory = itemToEpisodic(item);
+      if (memory) out.push(memory);
+    }
+    return out;
+  }
+  /**
+   * Merge a selection of memories into one summary memory (and delete the
+   * sources). Selection criteria — scope, age, or max importance — are passed
+   * through to MemoryStore.consolidate.
+   */
+  consolidate(options: ConsolidateOptions): Promise<ConsolidateResult> {
+    return this.store.consolidate(options);
+  }
+  /** Fetch one memory by id; null when it is absent or cannot be rebuilt. */
+  async getMemory(memoryId: string): Promise<EpisodicMemory | null> {
+    const item = await this.store.get(memoryId);
+    return item ? itemToEpisodic(item) : null;
+  }
+  /** Delete one memory by id. */
+  async deleteMemory(memoryId: string): Promise<void> {
+    await this.store.forget(memoryId);
+  }
+  /**
+   * Bulk-delete every memory matching a scope (project namespace, branch
+   * thread, and/or tags). Returns the number deleted. At least one scope field
+   * should be set — an empty scope would match the whole store.
+   */
+  forgetByScope(scope: {
+    project?: string;
+    branch?: string;
+    tags?: string[];
+  }): Promise<number> {
+    const s: MemoryScope & { tags?: string[] } = {};
+    if (scope.project !== undefined) s.namespace = scope.project;
+    if (scope.branch !== undefined) s.threadId = scope.branch;
+    if (scope.tags !== undefined && scope.tags.length > 0) s.tags = scope.tags;
+    return this.store.forgetByScope(s);
+  }
+  /** Live store stats: item count, evictions, and active composite config. */
+  stats(): Promise<MemoryStats> {
+    return this.store.stats();
+  }
+  /** Close the underlying MemoryStore. */
+  close(): Promise<void> {
+    return this.store.close();
+  }
+}
+/**
+ * Build a PluginMemoryStore on top of the shared ValkeyClient, so the whole
+ * plugin keeps running on one iovalkey socket (the client's `.call()`
+ * satisfies MemoryStoreClient).
+ *
+ * @param embed Embedding function. Required by callers that will
+ *   remember/recall/ensureIndex; read-only callers (list/get/delete) may
+ *   omit it.
+ * @returns A store adapter bound to the shared connection.
+ */
+export async function getPluginMemoryStore(
+  embed?: EmbedFn,
+): Promise<PluginMemoryStore> {
+  const { redis } = await getValkeyClient();
+  return new PluginMemoryStore(redis, embed);
+}

package/src/client/model.ts CHANGED Viewed

@@ -74,6 +74,7 @@ export { AnthropicSummarizeClient } from "./providers/anthropic.js";
 export { VoyageEmbedClient } from "./providers/voyage.js";
 export { GroqEmbedClient, GroqSummarizeClient } from "./providers/groq.js";
 export { TogetherEmbedClient, TogetherSummarizeClient } from "./providers/together.js";
+export { LocalEmbedClient } from "./providers/local.js";
 export { buildSummarizePrompt } from "./providers/_prompt.js";
 // --- Provider Detection ---
@@ -146,15 +147,10 @@ async function resolveEmbedProvider(
     return new TogetherEmbedClient(p.togetherKey);
   }
-  throw new Error(
-    `No embedding provider available. Options:\n` +
-      `  1. Install Ollama and run: ollama pull mxbai-embed-large\n` +
-      `  2. Set VOYAGE_API_KEY for Voyage AI (voyage-3, dim=1024)\n` +
-      `  3. Set OPENAI_API_KEY for OpenAI (text-embedding-3-small, dim=1536)\n` +
-      `  4. Set GROQ_API_KEY for Groq (nomic-embed-text-v1_5, dim=768)\n` +
-      `  5. Set TOGETHER_API_KEY for Together AI (m2-bert-80M-8k-retrieval, dim=768)\n\n` +
-      `Note: ANTHROPIC_API_KEY does not provide embeddings — pair it with another embed provider.`,
-  );
+  // On-device fallback: zero-config, no API key, no service. Ensures a fresh
+  // install produces embeddings even with nothing else installed.
+  const { LocalEmbedClient } = await import("./providers/local.js");
+  return new LocalEmbedClient();
 }
 async function resolveSummarizeProvider(
@@ -217,6 +213,10 @@ function createExplicitEmbedProvider(
   p: typeof config.providers,
 ): ModelClient {
   switch (name) {
+    case "local": {
+      const { LocalEmbedClient } = require("./providers/local.js");
+      return new LocalEmbedClient();
+    }
     case "ollama": {
       const { OllamaModelClient } = require("./providers/ollama.js");
       return new OllamaModelClient(PRESET_CLEAN, config.ollama.url);
@@ -242,7 +242,7 @@ function createExplicitEmbedProvider(
       return new TogetherEmbedClient(p.togetherKey);
     }
     default:
-      throw new Error(`Unknown embed provider: ${name}. Valid: ollama, openai, voyage, groq, together`);
+      throw new Error(`Unknown embed provider: ${name}. Valid: local, ollama, openai, voyage, groq, together`);
   }
 }

package/src/client/providers/local.ts ADDED Viewed

@@ -0,0 +1,58 @@
+import type { SessionSummary } from "../../memory/schema.js";
+import type { ModelClient, ModelPreset } from "../model.js";
+// On-device embeddings via @xenova/transformers — no API key, no running
+// service. Weights (all-MiniLM-L6-v2, Apache-2.0, 384-dim) download once on
+// first use and are cached under the transformers cache dir thereafter.
+const MODEL_ID = "Xenova/all-MiniLM-L6-v2"; // model id passed to the transformers pipeline and reported as preset.embedModel
+const EMBED_DIM = 384; // reported as embedDim; not checked against the extractor's actual output length here
+type FeatureExtractor = ( // structural type for the callable that pipeline() resolves to, limited to the options used in this file
+  text: string,
+  options: { pooling: "mean"; normalize: boolean },
+) => Promise<{ data: Float32Array }>;
+interface TransformersModule { // the slice of @xenova/transformers this file relies on
+  pipeline(
+    task: "feature-extraction",
+    model: string,
+  ): Promise<FeatureExtractor>;
+}
+// Lazy singleton: the model loads once and is reused across embed calls, and
+// @xenova/transformers is only imported when local embeddings are actually used.
+let extractorPromise: Promise<FeatureExtractor> | null = null;
+/**
+ * Return the shared feature-extraction pipeline, starting the load on first
+ * use. Concurrent callers share the same in-flight promise.
+ *
+ * A failed load is not cached: if the import or the pipeline construction
+ * rejects (for example the first weight download fails), the slot is cleared
+ * so the next call retries instead of replaying the same rejection for the
+ * rest of the process.
+ *
+ * @returns A promise for the extractor; rejects with the underlying load error.
+ */
+function getExtractor(): Promise<FeatureExtractor> {
+  if (extractorPromise === null) {
+    const pending: Promise<FeatureExtractor> = import(
+      "@xenova/transformers"
+    ).then((mod) =>
+      (mod as unknown as TransformersModule).pipeline(
+        "feature-extraction",
+        MODEL_ID,
+      ),
+    );
+    // Side-channel handler only: callers still receive the rejection through
+    // `pending` itself. The identity check avoids clearing a newer attempt.
+    pending.catch(() => {
+      if (extractorPromise === pending) extractorPromise = null;
+    });
+    extractorPromise = pending;
+  }
+  return extractorPromise;
+}
+/**
+ * Embed-only ModelClient backed by the on-device feature-extraction pipeline.
+ * It reports a fixed dimension and model id and cannot summarize.
+ */
+export class LocalEmbedClient implements ModelClient {
+  readonly embedDim = EMBED_DIM;
+  readonly preset: ModelPreset = {
+    embedModel: MODEL_ID,
+    summarizeModel: "n/a",
+    embedDim: EMBED_DIM,
+  };
+  /**
+   * Embed `text` with mean pooling and normalization enabled.
+   *
+   * @returns The extractor's output copied into a plain number array.
+   */
+  async embed(text: string): Promise<number[]> {
+    const extractor = await getExtractor();
+    const { data } = await extractor(text, { pooling: "mean", normalize: true });
+    return Array.from(data);
+  }
+  /** Always rejects: this provider only produces embeddings. */
+  async summarize(_transcript: string): Promise<SessionSummary> {
+    throw new Error(
+      "Local embeddings provider does not summarize — configure a summarize provider (Ollama, Anthropic, OpenAI, Groq, or Together)",
+    );
+  }
+}