audrey 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +310 -643
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/cases.js +421 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +255 -0
- package/benchmarks/run.js +514 -0
- package/docs/assets/benchmarks/local-benchmark.svg +45 -0
- package/docs/assets/benchmarks/operations-benchmark.svg +45 -0
- package/docs/assets/benchmarks/published-memory-standards.svg +50 -0
- package/docs/benchmarking.md +151 -0
- package/docs/production-readiness.md +96 -0
- package/examples/fintech-ops-demo.js +67 -0
- package/examples/healthcare-ops-demo.js +67 -0
- package/examples/stripe-demo.js +105 -0
- package/mcp-server/config.js +81 -24
- package/mcp-server/index.js +611 -75
- package/mcp-server/serve.js +482 -0
- package/package.json +24 -5
- package/src/audrey.js +51 -13
- package/src/consolidate.js +70 -54
- package/src/db.js +22 -1
- package/src/embedding.js +16 -12
- package/src/encode.js +8 -2
- package/src/fts.js +134 -0
- package/src/import.js +28 -0
- package/src/llm.js +6 -3
- package/src/migrate.js +2 -2
- package/src/recall.js +253 -32
- package/src/utils.js +25 -0
- package/types/index.d.ts +434 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# Benchmarking Audrey
|
|
2
|
+
|
|
3
|
+
Audrey now ships with a memory benchmark harness that does three different jobs:
|
|
4
|
+
|
|
5
|
+
1. It runs Audrey against a local retrieval suite inspired by LongMemEval, plus privacy and abstention checks that matter in production.
|
|
6
|
+
2. It runs Audrey against an operation-level suite for update, overwrite, delete, merge, and abstain behavior.
|
|
7
|
+
3. It overlays published leaderboard numbers from leading memory systems on LoCoMo so you can place Audrey in the current market and research landscape without pretending the measurements are identical.
|
|
8
|
+
|
|
9
|
+
That split is deliberate. A lot of memory tooling mixes internal demos with external benchmark claims. Audrey should not do that.
|
|
10
|
+
|
|
11
|
+
## Run It
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm run bench:memory
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
The package script is the intended operator entrypoint:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npm run bench:memory
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Artifacts are written to `benchmarks/output/`:
|
|
24
|
+
|
|
25
|
+
- `summary.json`
|
|
26
|
+
- `report.html`
|
|
27
|
+
- `local-overall.svg`
|
|
28
|
+
- `retrieval-overall.svg`
|
|
29
|
+
- `operations-overall.svg`
|
|
30
|
+
- `published-locomo.svg`
|
|
31
|
+
|
|
32
|
+
For CI, JSON-only output is available:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
npm run bench:memory:json
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For regression gating, use:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npm run bench:memory:check
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
That command fails if Audrey falls below its minimum local score, pass rate, or required lead over the strongest naive baseline.
|
|
45
|
+
|
|
46
|
+
To refresh the committed SVGs used in the README:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
npm run bench:memory:readme-assets
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
That writes stable chart assets to `docs/assets/benchmarks/` so the GitHub repo surface shows the same benchmark posture as the generated report.
|
|
53
|
+
|
|
54
|
+
To run a single local track:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
npm run bench:memory:retrieval
|
|
58
|
+
npm run bench:memory:operations
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## What The Local Retrieval Benchmark Measures
|
|
62
|
+
|
|
63
|
+
The retrieval suite covers eight memory families:
|
|
64
|
+
|
|
65
|
+
- `information_extraction`
|
|
66
|
+
- `knowledge_updates`
|
|
67
|
+
- `multi_session_reasoning`
|
|
68
|
+
- `temporal_reasoning`
|
|
69
|
+
- `abstention`
|
|
70
|
+
- `conflict_resolution`
|
|
71
|
+
- `procedural_learning`
|
|
72
|
+
- `privacy_boundary`
|
|
73
|
+
|
|
74
|
+
This is intentionally closer to how operators evaluate memory in production than a single retrieval-accuracy number. Audrey should not only retrieve facts. It should:
|
|
75
|
+
|
|
76
|
+
- prefer fresh state over stale state
|
|
77
|
+
- avoid leaking private memory
|
|
78
|
+
- consolidate repeated episodes into reusable procedures
|
|
79
|
+
- handle conflict without amplifying low-reliability noise
|
|
80
|
+
|
|
81
|
+
## What The Local Operations Benchmark Measures
|
|
82
|
+
|
|
83
|
+
The operations suite covers four lifecycle families:
|
|
84
|
+
|
|
85
|
+
- `update_overwrite`
|
|
86
|
+
- `delete_and_abstain`
|
|
87
|
+
- `semantic_merge`
|
|
88
|
+
- `procedural_merge`
|
|
89
|
+
|
|
90
|
+
This suite exists because leading memory systems are often compared on offline recall, while real agent memory succeeds or fails on memory operations:
|
|
91
|
+
|
|
92
|
+
- can a newer fact overwrite stale state without leaking both
|
|
93
|
+
- can a delete actually prevent future recall
|
|
94
|
+
- can repeated raw events merge into reusable semantic knowledge
|
|
95
|
+
- can repeated events merge into an actionable procedure instead of another inert blob of text
|
|
96
|
+
|
|
97
|
+
Those are not implementation details. They are the actual product surface of memory.
|
|
98
|
+
|
|
99
|
+
## What The Published Leaderboard Means
|
|
100
|
+
|
|
101
|
+
The LoCoMo chart in the generated report is a research context layer, not a claim that Audrey has already reproduced those exact scores.
|
|
102
|
+
|
|
103
|
+
Current published anchors included in the report:
|
|
104
|
+
|
|
105
|
+
- MIRIX: LoCoMo `85.4` from the MIRIX paper
|
|
106
|
+
- Letta Filesystem: LoCoMo `74.0` from Letta's benchmark write-up
|
|
107
|
+
- Mem0 Graph Memory: LoCoMo `68.5` from the Mem0 paper
|
|
108
|
+
- Mem0: LoCoMo `66.9` from the Mem0 paper
|
|
109
|
+
- OpenAI Memory baseline: LoCoMo `52.9` as reported in the Mem0 paper
|
|
110
|
+
|
|
111
|
+
Use this chart to answer: "Where is the frontier today?" not "Has Audrey already matched that exact benchmark protocol?"
|
|
112
|
+
|
|
113
|
+
## March 23, 2026 Research Readout
|
|
114
|
+
|
|
115
|
+
The most important memory trends right now:
|
|
116
|
+
|
|
117
|
+
1. Typed memory systems are replacing flat retrieval.
|
|
118
|
+
MemOS frames memory as an operating system concern with scheduling and memory-object abstractions, not just vector lookup.
|
|
119
|
+
|
|
120
|
+
2. Realistic long-horizon benchmarks are replacing toy recall tests.
|
|
121
|
+
LongMemEval emphasizes multi-session reasoning, temporal updates, abstraction, and knowledge revision.
|
|
122
|
+
|
|
123
|
+
3. Context engineering is now a first-class competitor to retrieval-only memory.
|
|
124
|
+
Letta's filesystem and memory-block work argues that editable context structure can outperform simpler retrieval-only designs.
|
|
125
|
+
|
|
126
|
+
4. Production memory is now judged on latency and token cost too.
|
|
127
|
+
Mem0 explicitly reports quality alongside lower token and latency overhead.
|
|
128
|
+
|
|
129
|
+
5. Temporal and multimodal memory are moving into the frontier.
|
|
130
|
+
MIRIX pushes beyond text-only episodic recall into typed multimodal memory with compression.
|
|
131
|
+
|
|
132
|
+
## What Audrey Should Do Next
|
|
133
|
+
|
|
134
|
+
The benchmark highlights the next credible roadmap for Audrey:
|
|
135
|
+
|
|
136
|
+
- first-party LoCoMo and LongMemEval adapters so Audrey can publish directly reproducible external benchmark numbers
|
|
137
|
+
- contradiction-state and truth-resolution benchmark cases, not just retrieval outcomes
|
|
138
|
+
- cost, latency, and storage curves against long-context baselines and simpler memory systems
|
|
139
|
+
- a typed memory graph layer for cross-memory state transitions and time-aware reasoning
|
|
140
|
+
|
|
141
|
+
## Source Links
|
|
142
|
+
|
|
143
|
+
- LongMemEval: [arXiv 2410.10813](https://arxiv.org/abs/2410.10813)
|
|
144
|
+
- Mem0: [arXiv 2504.19413](https://arxiv.org/abs/2504.19413)
|
|
145
|
+
- MIRIX: [arXiv 2507.07957](https://arxiv.org/abs/2507.07957)
|
|
146
|
+
- MemOS: [arXiv 2507.03724](https://arxiv.org/abs/2507.03724)
|
|
147
|
+
- MemGPT: [arXiv 2310.08560](https://arxiv.org/abs/2310.08560)
|
|
148
|
+
- Letta memory blocks: [Letta blog](https://www.letta.com/blog/memory-blocks)
|
|
149
|
+
- Letta benchmarking: [Letta benchmark write-up](https://www.letta.com/blog/benchmarking-ai-agent-memory)
|
|
150
|
+
- LoCoMo benchmark repo: [snap-research/locomo](https://github.com/snap-research/locomo)
|
|
151
|
+
- LongMemEval repo: [xiaowu0162/LongMemEval](https://github.com/xiaowu0162/LongMemEval)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Audrey Production Readiness
|
|
2
|
+
|
|
3
|
+
Audrey is ready to be the memory layer inside a production agent system, but it is not a complete regulated-platform package by itself. Treat it as stateful infrastructure: pin providers, isolate tenants, monitor health, and wrap it with the controls your environment requires.
|
|
4
|
+
|
|
5
|
+
## Best Vertical Fit
|
|
6
|
+
|
|
7
|
+
### 1. Financial Services Operations
|
|
8
|
+
|
|
9
|
+
Best fit:
|
|
10
|
+
|
|
11
|
+
- Payments operations copilots
|
|
12
|
+
- Fraud and dispute investigation agents
|
|
13
|
+
- KYC/KYB review assistants
|
|
14
|
+
- Internal support agents that need durable incident and policy memory
|
|
15
|
+
|
|
16
|
+
Why Audrey fits:
|
|
17
|
+
|
|
18
|
+
- Contradiction tracking helps surface conflicting customer, tool, and policy evidence.
|
|
19
|
+
- Confidence scoring and source lineage make escalations more reviewable.
|
|
20
|
+
- Local SQLite storage keeps memory close to the application boundary.
|
|
21
|
+
- Dream-cycle consolidation turns repeated incidents into reusable operational principles.
|
|
22
|
+
|
|
23
|
+
Guardrails:
|
|
24
|
+
|
|
25
|
+
- Do not store PAN, CVV, raw bank credentials, or secrets in memory.
|
|
26
|
+
- Isolate memory stores by environment, customer, and business unit.
|
|
27
|
+
- Keep export and purge paths in your incident-response runbook.
|
|
28
|
+
- Add encryption at rest and backup retention outside Audrey.
|
|
29
|
+
|
|
30
|
+
### 2. Healthcare Operations
|
|
31
|
+
|
|
32
|
+
Best fit:
|
|
33
|
+
|
|
34
|
+
- Care coordination assistants
|
|
35
|
+
- Prior-authorization workflow agents
|
|
36
|
+
- Intake, referral, and scheduling copilots
|
|
37
|
+
- Internal knowledge assistants for clinical operations teams
|
|
38
|
+
|
|
39
|
+
Why Audrey fits:
|
|
40
|
+
|
|
41
|
+
- Longitudinal recall preserves operational context across multi-step handoffs.
|
|
42
|
+
- Private memories support role-specific context without making it part of public recall.
|
|
43
|
+
- Contradiction detection helps catch conflicting workflow instructions and stale operating assumptions.
|
|
44
|
+
- Local embeddings allow offline-first or reduced-data-egress deployments.
|
|
45
|
+
|
|
46
|
+
Guardrails:
|
|
47
|
+
|
|
48
|
+
- Audrey is not a medical device and should not be treated as a clinical decision engine.
|
|
49
|
+
- Use de-identified or minimum-necessary data unless the full deployment boundary is HIPAA-ready.
|
|
50
|
+
- Enforce access controls and audit logging in the host application, not just in Audrey.
|
|
51
|
+
- Separate patient-facing and staff-only memory scopes.
|
|
52
|
+
|
|
53
|
+
## Production Checklist
|
|
54
|
+
|
|
55
|
+
1. Pin `AUDREY_EMBEDDING_PROVIDER` and `AUDREY_LLM_PROVIDER` explicitly. Do not rely on key-based auto-detection in production.
|
|
56
|
+
2. Set a dedicated `AUDREY_DATA_DIR` per environment and per tenant boundary.
|
|
57
|
+
3. Add a health check that runs `npx audrey status --json --fail-on-unhealthy`.
|
|
58
|
+
4. Alert on `health.healthy=false` or `health.reembed_recommended=true`.
|
|
59
|
+
5. Schedule `npx audrey dream` during low-traffic windows so consolidation and decay stay current.
|
|
60
|
+
6. Back up the SQLite data directory before changing embedding dimensions or providers.
|
|
61
|
+
7. Treat re-embedding as a controlled maintenance action and validate with `npx audrey status`.
|
|
62
|
+
8. Keep API keys, bearer tokens, and raw credentials out of encoded memory content.
|
|
63
|
+
9. Decide whether `private` memories are allowed for your use case and document who can create them.
|
|
64
|
+
10. Add application-level encryption, access control, logging, and retention policies around Audrey.
|
|
65
|
+
11. On graceful shutdown paths, call `await brain.waitForIdle()` before `brain.close()` so tracked background work drains cleanly.
|
|
66
|
+
|
|
67
|
+
## Operations Commands
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Human-readable health
|
|
71
|
+
npx audrey status
|
|
72
|
+
|
|
73
|
+
# Monitoring-friendly health
|
|
74
|
+
npx audrey status --json
|
|
75
|
+
|
|
76
|
+
# Fail the process if the index is unhealthy or unreadable
|
|
77
|
+
npx audrey status --json --fail-on-unhealthy
|
|
78
|
+
|
|
79
|
+
# Nightly memory maintenance
|
|
80
|
+
npx audrey dream
|
|
81
|
+
|
|
82
|
+
# Repair vector/index drift after provider or dimension changes
|
|
83
|
+
npx audrey reembed
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Example Deployment Pattern
|
|
87
|
+
|
|
88
|
+
Use Audrey as a local sidecar to the agent service:
|
|
89
|
+
|
|
90
|
+
- One Audrey data directory per tenant or deployment shard
|
|
91
|
+
- Health checks wired to `status --json`
|
|
92
|
+
- Scheduled dream/reembed jobs
|
|
93
|
+
- Backups handled by the host platform
|
|
94
|
+
- Regulated-data filtering handled before `memory_encode`
|
|
95
|
+
|
|
96
|
+
That keeps Audrey focused on memory integrity while the host system owns compliance, tenancy, and transport security.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { Audrey } from '../src/index.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Walkthrough of Audrey as a payments-operations memory layer:
 * encode three payout-incident episodes, consolidate them into a
 * procedural principle, then recall them during a simulated incident.
 * Uses mock embeddings, so no API keys or network access are needed.
 */
async function demo() {
  console.log('=== Audrey Demo: Financial Services Operations ===\n');

  const brain = new Audrey({
    dataDir: './fintech-demo-data',
    agent: 'payments-ops-agent',
    embedding: { provider: 'mock', dimensions: 64 },
  });

  // Every memory in this walkthrough belongs to the same payout-incident workflow.
  const incidentContext = { domain: 'finserv', workflow: 'payout-incident' };

  // Three episodic observations from different sources, encoded in order.
  const incidents = [
    {
      content: 'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute for marketplace merchants.',
      source: 'direct-observation',
      salience: 0.9,
      tags: ['payments', 'payouts', 'rate-limit'],
    },
    {
      content: 'On-call notes show payout incident volume drops after retry batches are capped at 50 merchants per worker.',
      source: 'tool-result',
      salience: 0.8,
      tags: ['payments', 'payouts', 'ops'],
    },
    {
      content: 'Risk operations requested automatic escalation when payout failures affect more than three merchants in the same hour.',
      source: 'told-by-user',
      salience: 0.7,
      tags: ['payments', 'escalation', 'risk'],
    },
  ];

  console.log('--- Encoding payment-operations incidents ---');
  for (const episode of incidents) {
    await brain.encode({ ...episode, context: { ...incidentContext } });
  }

  console.log('\n--- Consolidating incidents into an ops principle ---');
  await brain.consolidate({
    minClusterSize: 3,
    // Negative threshold so hash-based mock embeddings still form a cluster.
    similarityThreshold: -0.3,
    extractPrinciple: () => ({
      content: 'When payout retries spike, cap retry batches and escalate once multiple merchants are affected in the same hour.',
      type: 'procedural',
      conditions: ['payout failures > 3 merchants per hour', 'processor returns 429 or throttling errors'],
    }),
  });

  console.log('\n--- Recalling during a live payout incident ---');
  const recalled = await brain.recall('payout retries throttled by processor', {
    limit: 5,
    context: { ...incidentContext },
  });

  for (const memory of recalled) {
    console.log(`[${memory.type}] ${memory.content}`);
  }

  brain.close();

  // Demo data is throwaway; remove it so reruns start from a clean state.
  const { rmSync } = await import('node:fs');
  rmSync('./fintech-demo-data', { recursive: true, force: true });
}

demo().catch(err => {
  console.error(err);
  process.exit(1);
});
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { Audrey } from '../src/index.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Walkthrough of Audrey as a healthcare-operations memory layer:
 * encode observations from three care workflows, consolidate them into a
 * reusable procedural principle, then recall during a handoff.
 * Uses mock embeddings, so no API keys or network access are needed.
 */
async function demo() {
  console.log('=== Audrey Demo: Healthcare Operations ===\n');

  const brain = new Audrey({
    dataDir: './healthcare-demo-data',
    agent: 'care-ops-agent',
    embedding: { provider: 'mock', dimensions: 64 },
  });

  // Episodic observations, each scoped to its own care-operations workflow.
  const observations = [
    {
      content: 'Referral queue delays drop when missing imaging notes are requested before prior-authorization submission.',
      source: 'direct-observation',
      salience: 0.9,
      tags: ['healthcare-ops', 'prior-auth', 'referrals'],
      context: { domain: 'healthcare', workflow: 'prior-auth' },
    },
    {
      content: 'Scheduling team reports the highest callback completion rate between 4pm and 6pm for discharge follow-up.',
      source: 'tool-result',
      salience: 0.8,
      tags: ['healthcare-ops', 'follow-up', 'scheduling'],
      context: { domain: 'healthcare', workflow: 'discharge-followup' },
    },
    {
      content: 'Care coordinators want interpreter requirements captured in every handoff note before outreach starts.',
      source: 'told-by-user',
      salience: 0.7,
      tags: ['healthcare-ops', 'handoff', 'interpreter'],
      context: { domain: 'healthcare', workflow: 'care-coordination' },
    },
  ];

  console.log('--- Encoding care-coordination observations ---');
  for (const observation of observations) {
    await brain.encode(observation);
  }

  console.log('\n--- Consolidating into a reusable workflow ---');
  await brain.consolidate({
    minClusterSize: 3,
    // Negative threshold so hash-based mock embeddings still form a cluster.
    similarityThreshold: -0.3,
    extractPrinciple: () => ({
      content: 'For care-coordination workflows, collect missing documentation and communication preferences before outreach or prior-auth submission.',
      type: 'procedural',
      conditions: ['prior-auth missing documentation', 'handoff note lacks outreach constraints'],
    }),
  });

  console.log('\n--- Recalling during a care-coordination handoff ---');
  const recalled = await brain.recall('care coordination handoff missing documentation', {
    limit: 5,
    context: { domain: 'healthcare', workflow: 'care-coordination' },
  });

  for (const memory of recalled) {
    console.log(`[${memory.type}] ${memory.content}`);
  }

  brain.close();

  // Demo data is throwaway; remove it so reruns start from a clean state.
  const { rmSync } = await import('node:fs');
  rmSync('./healthcare-demo-data', { recursive: true, force: true });
}

demo().catch(err => {
  console.error(err);
  process.exit(1);
});
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// examples/stripe-demo.js
|
|
2
|
+
// Proof-of-concept demo showing the full Audrey pipeline:
|
|
3
|
+
// encode episodic memories → consolidate into principles → recall proactively
|
|
4
|
+
//
|
|
5
|
+
// Run: node examples/stripe-demo.js
|
|
6
|
+
// No external dependencies required (uses mock embeddings).
|
|
7
|
+
|
|
8
|
+
import { Audrey } from '../src/index.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Proof-of-concept demo of the full Audrey pipeline:
 * encode episodic memories -> consolidate into principles -> recall proactively.
 * Runs offline with mock embeddings; no API keys or network access required.
 */
async function demo() {
  console.log('=== Audrey Demo: Stripe Rate Limit Learning ===\n');

  const brain = new Audrey({
    dataDir: './demo-data',
    agent: 'stripe-agent',
    embedding: { provider: 'mock', dimensions: 64 },
  });

  // Log lifecycle events so each pipeline stage is visible in the output.
  brain.on('encode', ({ id, content }) => {
    console.log(` [ENCODE] ${id.slice(0, 8)}... "${content.slice(0, 60)}"`);
  });

  brain.on('consolidation', ({ principlesExtracted, clustersFound }) => {
    console.log(` [CONSOLIDATE] Found ${clustersFound} clusters, extracted ${principlesExtracted} principles`);
  });

  brain.on('reinforcement', ({ episodeId, similarity }) => {
    console.log(` [REINFORCE] Episode ${episodeId.slice(0, 8)}... reinforced existing knowledge (sim: ${similarity?.toFixed(2) || 'N/A'})`);
  });

  // --- Scenario: Agent encounters Stripe rate limits ---

  console.log('--- Episode 1: First rate limit hit ---');
  await brain.encode({
    content: 'Stripe API returned HTTP 429 when batch-processing 150 payments per second',
    source: 'direct-observation',
    salience: 0.9,
    causal: { trigger: 'batch-payment-job', consequence: 'payment-queue-stalled' },
    tags: ['stripe', 'rate-limit', 'production'],
  });

  console.log('\n--- Episode 2: Second hit from different code path ---');
  await brain.encode({
    content: 'Stripe webhook verification endpoint returned 429 Too Many Requests during high traffic',
    source: 'tool-result',
    salience: 0.7,
    causal: { trigger: 'webhook-flood', consequence: 'missed-webhook-events' },
    tags: ['stripe', 'rate-limit', 'webhooks'],
  });

  console.log('\n--- Episode 3: Third observation from monitoring ---');
  await brain.encode({
    content: 'Stripe API rate limit triggered at approximately 100 requests per second threshold',
    source: 'direct-observation',
    salience: 0.8,
    tags: ['stripe', 'rate-limit', 'monitoring'],
  });

  // --- Consolidation ---
  console.log('\n--- Running consolidation ("sleep" cycle) ---');
  await brain.consolidate({
    minClusterSize: 3,
    // Mock embeddings are hash-based (not semantic), so cosine similarity
    // between related texts is near-random. In production with real embeddings
    // (e.g. OpenAI text-embedding-3-small), a threshold of 0.80+ works well.
    // We drop it here so the demo pipeline runs end-to-end.
    similarityThreshold: -0.3,
    // The episodes argument was unused, so the callback takes no parameters.
    extractPrinciple: () => ({
      content: `Stripe enforces ~100 req/s rate limit across all endpoints. Exceeding this causes 429 errors that can stall payment queues and cause missed webhooks. Implement request throttling.`,
      type: 'semantic',
    }),
  });

  // --- Proactive recall ---
  console.log('\n--- Agent encounters Stripe again, recalls proactively ---');
  const memories = await brain.recall('stripe api request rate', {
    minConfidence: 0.3,
    limit: 5,
  });

  console.log(`\nRecalled ${memories.length} memories:`);
  for (const mem of memories) {
    console.log(` [${mem.type.toUpperCase()}] (conf: ${mem.confidence.toFixed(2)}, score: ${mem.score.toFixed(3)}) ${mem.content.slice(0, 80)}${mem.content.length > 80 ? '...' : ''}`);
  }

  // --- Introspection ---
  console.log('\n--- Brain stats ---');
  const stats = brain.introspect();
  console.log(` Episodic memories: ${stats.episodic}`);
  console.log(` Semantic principles: ${stats.semantic}`);
  console.log(` Procedural workflows: ${stats.procedural}`);
  console.log(` Causal links: ${stats.causalLinks}`);
  console.log(` Consolidation runs: ${stats.totalConsolidationRuns}`);
  console.log(` Dormant memories: ${stats.dormant}`);

  brain.close();

  // Cleanup demo data
  const { rmSync } = await import('node:fs');
  rmSync('./demo-data', { recursive: true, force: true });

  console.log('\n=== Demo complete ===');
}

// Exit non-zero on failure so scripts and CI can detect a broken demo
// (matches the error handling in the other example scripts).
demo().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
package/mcp-server/config.js
CHANGED
|
@@ -1,18 +1,47 @@
|
|
|
1
1
|
import { homedir } from 'node:os';
|
|
2
2
|
import { join } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
3
4
|
|
|
4
|
-
// Reported by the MCP server; keep in sync with package.json.
// The 0.17.0 release shipped this as '0.16.1', which under-reports the version.
export const VERSION = '0.17.0';
|
|
5
6
|
export const SERVER_NAME = 'audrey-memory';
|
|
6
7
|
export const DEFAULT_DATA_DIR = join(homedir(), '.audrey', 'data');
|
|
8
|
+
// Absolute filesystem path to the MCP server entrypoint, resolved relative to
// this module so installs work regardless of the caller's working directory.
export const MCP_ENTRYPOINT = fileURLToPath(new URL('./index.js', import.meta.url));
// Allowed values for AUDREY_EMBEDDING_PROVIDER and AUDREY_LLM_PROVIDER;
// anything else is rejected by assertValidProvider at resolve time.
const VALID_EMBEDDING_PROVIDERS = new Set(['mock', 'local', 'gemini', 'openai']);
const VALID_LLM_PROVIDERS = new Set(['mock', 'anthropic', 'openai']);
|
|
11
|
+
|
|
12
|
+
/**
 * Throws when `provider` is not in `validProviders`, naming the offending
 * env var so a typo surfaces at startup rather than at first API call.
 */
function assertValidProvider(provider, validProviders, envVar) {
  if (validProviders.has(provider)) return;
  throw new Error(`Unsupported ${envVar} value: ${provider}`);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Returns the default embedding dimensionality for a provider.
 * 'local' and any unrecognized provider fall back to 384.
 */
function defaultEmbeddingDimensions(provider) {
  const knownDimensions = new Map([
    ['mock', 64],
    ['openai', 1536],
    ['gemini', 3072],
  ]);
  return knownDimensions.get(provider) ?? 384;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Resolves the Audrey data directory: an explicit AUDREY_DATA_DIR wins,
 * otherwise the per-user default under ~/.audrey is used.
 */
export function resolveDataDir(env = process.env) {
  const override = env.AUDREY_DATA_DIR;
  return override ? override : DEFAULT_DATA_DIR;
}
|
|
7
35
|
|
|
8
36
|
/**
|
|
9
37
|
* Resolves which embedding provider to use.
|
|
10
38
|
* Priority: explicit config -> gemini (if GOOGLE_API_KEY exists) -> local
|
|
11
39
|
* OpenAI is NEVER auto-selected -- must be set explicitly via AUDREY_EMBEDDING_PROVIDER=openai.
|
|
12
40
|
*/
|
|
13
|
-
export function resolveEmbeddingProvider(env, explicit) {
|
|
41
|
+
export function resolveEmbeddingProvider(env, explicit = env.AUDREY_EMBEDDING_PROVIDER) {
|
|
14
42
|
if (explicit && explicit !== 'auto') {
|
|
15
|
-
|
|
43
|
+
assertValidProvider(explicit, VALID_EMBEDDING_PROVIDERS, 'AUDREY_EMBEDDING_PROVIDER');
|
|
44
|
+
const dims = defaultEmbeddingDimensions(explicit);
|
|
16
45
|
const apiKey = explicit === 'gemini'
|
|
17
46
|
? (env.GOOGLE_API_KEY || env.GEMINI_API_KEY)
|
|
18
47
|
: explicit === 'openai'
|
|
@@ -28,49 +57,77 @@ export function resolveEmbeddingProvider(env, explicit) {
|
|
|
28
57
|
return { provider: 'local', dimensions: 384, device: env.AUDREY_DEVICE || 'gpu' };
|
|
29
58
|
}
|
|
30
59
|
|
|
60
|
+
/**
 * Resolves which LLM provider to use.
 * An explicit setting (other than 'auto') is validated and honored as-is;
 * otherwise auto-detection prefers Anthropic over OpenAI based on which
 * API key is present, and returns null when no key is available.
 */
export function resolveLLMProvider(env, explicit = env.AUDREY_LLM_PROVIDER) {
  if (explicit && explicit !== 'auto') {
    assertValidProvider(explicit, VALID_LLM_PROVIDERS, 'AUDREY_LLM_PROVIDER');
    switch (explicit) {
      case 'anthropic':
        return { provider: 'anthropic', apiKey: env.ANTHROPIC_API_KEY };
      case 'openai':
        return { provider: 'openai', apiKey: env.OPENAI_API_KEY };
      default:
        return { provider: 'mock' };
    }
  }

  // Auto-detection: Anthropic takes precedence when both keys exist.
  if (env.ANTHROPIC_API_KEY) {
    return { provider: 'anthropic', apiKey: env.ANTHROPIC_API_KEY };
  }
  if (env.OPENAI_API_KEY) {
    return { provider: 'openai', apiKey: env.OPENAI_API_KEY };
  }
  return null;
}
|
|
80
|
+
|
|
31
81
|
/**
 * Assembles the Audrey constructor config from environment variables:
 * data directory, agent name, embedding provider, and (when resolvable)
 * an LLM provider section.
 */
export function buildAudreyConfig() {
  const env = process.env;

  const config = {
    dataDir: resolveDataDir(env),
    agent: env.AUDREY_AGENT || 'claude-code',
    embedding: resolveEmbeddingProvider(env, env.AUDREY_EMBEDDING_PROVIDER),
  };

  // Only attach an llm section when a provider could be resolved.
  const llm = resolveLLMProvider(env, env.AUDREY_LLM_PROVIDER);
  if (llm) {
    config.llm = llm;
  }

  return config;
}
|
|
51
96
|
|
|
52
97
|
/**
 * Builds the `claude mcp add` argument vector used to register the Audrey
 * MCP server, carrying the resolved environment through `-e KEY=VALUE`
 * pairs and launching the entrypoint with the current Node executable.
 */
export function buildInstallArgs(env = process.env) {
  // Deduplicated KEY=VALUE pairs; blank/absent values are skipped.
  const envPairs = new Map();
  const addEnv = (key, value) => {
    if (value === undefined || value === null || value === '') return;
    envPairs.set(key, `${key}=${value}`);
  };

  addEnv('AUDREY_DATA_DIR', resolveDataDir(env));

  const embedding = resolveEmbeddingProvider(env, env.AUDREY_EMBEDDING_PROVIDER);
  addEnv('AUDREY_EMBEDDING_PROVIDER', embedding.provider);
  switch (embedding.provider) {
    case 'local':
      addEnv('AUDREY_DEVICE', embedding.device || env.AUDREY_DEVICE || 'gpu');
      break;
    case 'gemini':
      addEnv('GOOGLE_API_KEY', embedding.apiKey);
      break;
    case 'openai':
      addEnv('OPENAI_API_KEY', embedding.apiKey);
      break;
  }

  const llm = resolveLLMProvider(env, env.AUDREY_LLM_PROVIDER);
  if (llm) {
    addEnv('AUDREY_LLM_PROVIDER', llm.provider);
    if (llm.provider === 'anthropic') {
      addEnv('ANTHROPIC_API_KEY', llm.apiKey);
    } else if (llm.provider === 'openai') {
      addEnv('OPENAI_API_KEY', llm.apiKey);
    }
  }

  // Shape: mcp add -s user <name> -e K=V ... -- <node> <entrypoint>
  const args = ['mcp', 'add', '-s', 'user', SERVER_NAME];
  for (const pair of envPairs.values()) {
    args.push('-e', pair);
  }
  args.push('--', process.execPath, MCP_ENTRYPOINT);

  return args;
}
|