npm - claude-mem-lite - Versions diffs - 3.2.0 → 3.3.0 - Mend

claude-mem-lite 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/.claude-plugin/marketplace.json +1 -1
package/.claude-plugin/plugin.json +1 -1
package/deep-search.mjs +105 -13
package/haiku-client.mjs +105 -1
package/mem-cli.mjs +3 -3
package/package.json +1 -1
package/server.mjs +3 -3

package/.claude-plugin/marketplace.json CHANGED Viewed

@@ -10,7 +10,7 @@
   "plugins": [
     {
       "name": "claude-mem-lite",
-      "version": "3.2.0",
+      "version": "3.3.0",
       "source": "./",
       "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark)."
     }

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-mem-lite",
-  "version": "3.2.0",
+  "version": "3.3.0",
   "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
   "author": {
     "name": "sdsrss"

package/deep-search.mjs CHANGED Viewed

@@ -68,15 +68,24 @@ export function hasEscalatableCorpus(db, project, min = AUTO_DEEP_MIN_CORPUS) {
 /**
  * Is a usable LLM available for AUTO escalation? True when a stub/real llm is
- * injected (tests), or a FAST provider key is set. The claude-CLI fallback is
- * deliberately excluded — spawning a subprocess per search is too slow for the
- * default (automatic) path; explicit deep=true may still use it.
+ * injected (tests), a FAST provider key is set, OR the claude-CLI fallback is
+ * enabled (D#40: default-on for CLI-auth users; kill switch
+ * CLAUDE_MEM_AUTO_DEEP_CLI=0). The CLI path is made safe for the long-lived
+ * server hot path by the async/fail-fast/throttled auto provider (deepSearch
+ * auto), not by being excluded as it was before D#40.
  * @param {object} [env=process.env]
  * @param {Function|undefined} [injectedLlm]
  * @returns {boolean}
  */
 export function autoDeepLlmReady(env = process.env, injectedLlm) {
-  return !!injectedLlm || !!(env.ANTHROPIC_API_KEY || env.OPENROUTER_API_KEY);
+  if (injectedLlm) return true;
+  if (env.ANTHROPIC_API_KEY || env.OPENROUTER_API_KEY) return true;
+  // No provider key → detectMode() would be 'cli'. CLI-auth users get auto
+  // escalation by default; the burst/latency cost is bounded by the auto
+  // provider (fail-fast + throttle) and a failed rewrite degrades to baseline.
+  // Kill switch honors the common disable spellings, not just the exact '0'.
+  const off = String(env.CLAUDE_MEM_AUTO_DEEP_CLI ?? '').trim().toLowerCase();
+  return !(off === '0' || off === 'false' || off === 'no' || off === 'off');
 }
 /**
@@ -189,12 +198,75 @@ export function assembleVariants(query, parsed, { max = MAX_VARIANTS } = {}) {
   return out;
 }
-// Default provider: pulled in lazily so importing deep-search.mjs (e.g. in tests
-// with an injected llm) never loads the LLM client. callModelJSON returns parsed
-// JSON or null, and never throws.
+// ─── Auto-escalation safety machinery (D#40) ─────────────────────────────────
+// The AUTO path can fire on every weak search across the long-lived MCP server,
+// so it must be fail-fast (short timeout, no retry), throttled (bound bursts),
+// and cached (skip repeat rewrites). The EXPLICIT deep=true path stays patient.
+export const AUTO_DEEP_TIMEOUT_MS = 5000;   // fail-fast budget for the auto path; no retry
+export const AUTO_DEEP_THROTTLE_MS = 3000;  // min gap between auto LLM rewrites, per process (bounds spawn rate)
+const REWRITE_CACHE_MAX = 256;              // LRU cap for the query→variants cache
+let _lastAutoLlmAt = 0;
+const _rewriteCache = new Map(); // normalized query → variants (string[]); successes only
+/** Reset auto-path throttle + cache. Test-only; production state is per-process. */
+export function _resetAutoDeepState() { _lastAutoLlmAt = 0; _rewriteCache.clear(); }
+function cacheGet(key) { // LRU read: returns a copy of the variants cached under `key`, or null on a miss
+  if (!_rewriteCache.has(key)) return null;
+  const v = _rewriteCache.get(key);
+  _rewriteCache.delete(key); _rewriteCache.set(key, v); // LRU bump: re-insert so this key becomes the newest Map entry
+  return v.slice(); // hand back a copy so callers cannot mutate the cached array
+}
+function cacheSet(key, variants) { // LRU write: store a copy of `variants` under `key`, capped at REWRITE_CACHE_MAX entries
+  if (_rewriteCache.has(key)) _rewriteCache.delete(key); // drop first so the re-insert lands at the newest position
+  _rewriteCache.set(key, variants.slice()); // store a copy, decoupled from the caller's array
+  if (_rewriteCache.size > REWRITE_CACHE_MAX) {
+    _rewriteCache.delete(_rewriteCache.keys().next().value); // evict oldest: Map iterates in insertion order, so the first key is the least recently used
+  }
+}
+/**
+ * Wrap an llm so it fires at most once per `intervalMs` per process. A throttled
+ * call resolves null → rewriteQuery degrades to baseline (never worse). Exported
+ * for tests. Throttle state is module-global (shared across deepSearch calls),
+ * so every wrapper built by this function reads and writes the same clock.
+ *
+ * The clock advances on every ACTUAL call — success OR failure — deliberately:
+ * the throttle bounds the subprocess SPAWN RATE, and a failed spawn still costs a
+ * subprocess + its timeout, so a broken provider that always fails must be rate-
+ * limited too (gating only on success would let a persistent failure spawn on
+ * every weak search). The interval is kept short so one failure suppresses
+ * escalation only briefly, not for a long window.
+ *
+ * The clock is stamped BEFORE the wrapped llm is invoked, so calls that arrive
+ * while an earlier call is still in flight (within `intervalMs`) are throttled.
+ *
+ * @param {Function} llm  underlying provider; invoked as llm(prompt) when not throttled
+ * @param {{intervalMs?: number}} [opts]  minimum gap in ms between actual calls (default AUTO_DEEP_THROTTLE_MS)
+ * @returns {Function}  async (prompt) => null when throttled, otherwise whatever llm(prompt) yields
+ */
+export function makeThrottled(llm, { intervalMs = AUTO_DEEP_THROTTLE_MS } = {}) {
+  return async (prompt) => {
+    const now = Date.now();
+    if (now - _lastAutoLlmAt < intervalMs) return null; // too soon after the last actual call: skip without invoking llm
+    _lastAutoLlmAt = now; // stamp first, so a failing or slow call still counts against the rate
+    return llm(prompt);
+  };
+}
+// Run one rewrite LLM call via the fully-async dispatcher (callModelJSONAsync):
+// every CLI invocation — cli-mode primary AND the post-provider-failure fallback
+// — is non-blocking, so an MCP request handler never blocks the event loop even
+// under a keyed-provider outage (D#40). Lazy import so tests with an injected llm
+// never load the LLM client.
+async function callRewriteLLM(prompt, { timeout }) { // `timeout` (ms) is forwarded unchanged to callModelJSONAsync
+  const { callModelJSONAsync } = await import('./haiku-client.mjs'); // deferred until a rewrite actually runs
+  return callModelJSONAsync(prompt, 'haiku', { timeout, maxTokens: 400 }); // always the 'haiku' model, 400-token cap
+}
+// Default (explicit deep=true) provider: patient timeout, no throttle/cache.
 async function defaultLLM(prompt) {
-  const { callModelJSON } = await import('./haiku-client.mjs');
-  return callModelJSON(prompt, 'haiku', { timeout: 12000, maxTokens: 400 });
+  return callRewriteLLM(prompt, { timeout: 12000 });
+}
+// Auto-path provider: fail-fast timeout + throttle. Built fresh per deepSearch
+// call; the throttle clock it reads is module-global (per-process).
+function makeAutoLlm() {
+  return makeThrottled((prompt) => callRewriteLLM(prompt, { timeout: AUTO_DEEP_TIMEOUT_MS }));
 }
 /**
@@ -205,11 +277,17 @@ async function defaultLLM(prompt) {
  * @param {object} [opts]
  * @param {(prompt: object) => Promise<object|null>} [opts.llm]
  * @param {number} [opts.retries=1]
+ * @param {boolean} [opts.cache=false]  memoize successful rewrites (auto path)
  * @returns {Promise<string[]>}
  */
-export async function rewriteQuery(query, { llm = defaultLLM, retries = 1 } = {}) {
+export async function rewriteQuery(query, { llm = defaultLLM, retries = 1, cache = false } = {}) {
   const original = String(query ?? '').trim();
   if (!original) return [];
+  const key = original.toLowerCase();
+  if (cache) {
+    const hit = cacheGet(key);
+    if (hit) return hit; // process-lifetime memo of a prior successful rewrite
+  }
   const prompt = buildRewritePrompt(original);
   for (let attempt = 0; attempt <= retries; attempt++) {
     let parsed;
@@ -219,7 +297,10 @@ export async function rewriteQuery(query, { llm = defaultLLM, retries = 1 } = {}
       parsed = null;
     }
     const variants = assembleVariants(original, parsed);
-    if (variants.length > 1) return variants; // got at least one real rewrite
+    if (variants.length > 1) { // got at least one real rewrite
+      if (cache) cacheSet(key, variants); // cache successes only — failures retry next time
+      return variants;
+    }
   }
   return [original]; // robust floor — single-query == baseline
 }
@@ -304,13 +385,24 @@ function defaultSearchFn(db, query, params) {
  * @param {(prompt:object)=>Promise<object|null>} [deps.llm]
  * @param {(db:Database, query:string, params:object)=>Array} [deps.searchFn]
  * @param {number} [deps.rrfK=RRF_K]
+ * @param {boolean} [deps.auto=false]  use the fail-fast/throttled/cached auto provider
  * @returns {Promise<{results: Array, variants: string[]}>}
  */
-export async function deepSearch(db, params, { llm = defaultLLM, searchFn = defaultSearchFn, rrfK = RRF_K } = {}) {
+export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn, rrfK = RRF_K, auto = false } = {}) {
   const query = String(params?.query ?? '').trim();
   if (!query) return { results: [], variants: [] };
-  const variants = await rewriteQuery(query, { llm });
+  // No injected llm: EXPLICIT deep=true uses the patient defaultLLM; the AUTO
+  // path uses a fail-fast + throttled provider with no retry and a process-
+  // lifetime rewrite cache (D#40). An injected llm (tests) is used verbatim.
+  let rewriteLlm = llm;
+  let retries = 1;
+  let cache = false;
+  if (!rewriteLlm) {
+    if (auto) { rewriteLlm = makeAutoLlm(); retries = 0; cache = true; }
+    else rewriteLlm = defaultLLM;
+  }
+  const variants = await rewriteQuery(query, { llm: rewriteLlm, retries, cache });
   const lists = variants.map((v, i) => {
     // variant[0] is the ORIGINAL query: let an engine error propagate exactly as
     // it does on the single-query baseline path, so "never worse than baseline"

package/haiku-client.mjs CHANGED Viewed

@@ -5,7 +5,7 @@
 // Model configurable via CLAUDE_MEM_MODEL (haiku|sonnet); OpenRouter slug
 // overridable via OPENROUTER_MODEL
-import { execFileSync } from 'child_process';
+import { execFileSync, spawn } from 'child_process';
 import { readFileSync } from 'fs';
 import { join } from 'path';
 import { randomUUID } from 'crypto';
@@ -247,6 +247,44 @@ export async function callModelJSON(prompt, model = 'haiku', opts) {
   return parseJsonFromLLM(result.text);
 }
+/**
+ * JSON-returning, FULLY-ASYNC model call for the long-lived server hot path
+ * (deep-search auto-escalation). Like callModelJSON, but every CLI invocation —
+ * cli-mode primary AND the post-provider-failure fallback — uses the
+ * non-blocking callModelCLIAsync, so a keyed-provider outage can never drop onto
+ * the blocking execFileSync path and freeze the MCP event loop (D#40). Never
+ * throws; returns parsed JSON or null.
+ *
+ * Latency note: the same `timeout` is handed first to the keyed-provider call
+ * and then again to the CLI fallback, so a provider failure followed by a slow
+ * CLI can take longer than a single `timeout` overall (NOTE(review): confirm how
+ * callModelAPI / callOpenRouterAPI apply it). `maxTokens` and `temperature` are
+ * only passed to the keyed providers, not to the CLI. An unrecognized `model`
+ * falls back to 'haiku'.
+ * @param {string|{system?:string,user:string}} prompt
+ * @param {'haiku'|'sonnet'} model
+ * @param {{timeout?:number,maxTokens?:number,temperature?:number}} [opts]
+ * @returns {Promise<object|null>}
+ */
+export async function callModelJSONAsync(prompt, model = 'haiku', { timeout = 15000, maxTokens = 1000, temperature = DEFAULT_LLM_TEMPERATURE } = {}) {
+  if (!prompt) return null; // nothing to send
+  const resolvedModel = MODEL_MAP[model] ? model : 'haiku'; // unknown model names fall back to 'haiku'
+  const mode = detectMode();
+  if (mode === 'cli') {
+    const res = await callModelCLIAsync(prompt, resolvedModel, { timeout });
+    return res?.text ? parseJsonFromLLM(res.text) : null;
+  }
+  // Keyed provider (api/openrouter): try it, then degrade to the ASYNC CLI on any
+  // failure — NOT the blocking execFileSync callModelCLI that callModelJSON uses.
+  let primary = null;
+  try {
+    primary = mode === 'api'
+      ? await callModelAPI(prompt, resolvedModel, { timeout, maxTokens, temperature })
+      : await callOpenRouterAPI(prompt, resolvedModel, { timeout, maxTokens, temperature });
+  } catch (e) {
+    debugCatch(e, `callModelJSONAsync:${mode}:${resolvedModel}`); // record and fall through to the CLI fallback
+  }
+  if (primary?.text) return parseJsonFromLLM(primary.text);
+  const res = await callModelCLIAsync(prompt, resolvedModel, { timeout }); // reached on a throw OR an empty/null provider result
+  return res?.text ? parseJsonFromLLM(res.text) : null;
+}
 async function callModelAPI(prompt, model, { timeout, maxTokens, temperature = DEFAULT_LLM_TEMPERATURE }) {
   const apiKey = process.env.ANTHROPIC_API_KEY;
   if (!apiKey) return null;
@@ -319,6 +357,72 @@ function callModelCLI(prompt, model, { timeout }) {
   }
 }
+/**
+ * Async, non-blocking sibling of callModelCLI for the long-lived MCP server hot
+ * path (deep-search auto-escalation, D#40). execFileSync blocks the event loop for
+ * the whole subprocess lifetime — acceptable in short-lived hook processes
+ * (callModelCLI), not inside an MCP request handler. Uses spawn + stdin so the
+ * untrusted query stays out of argv (ps-visible) and the boundary-marker model is
+ * preserved. Never rejects: resolves {text} on non-empty stdout, null on
+ * error/empty. On timeout it SIGKILLs the child with NO retry (fail-fast) and
+ * salvages a complete JSON payload from partial stdout (mirrors callModelCLI's
+ * catch-salvage; tolerant of Haiku's ```json fencing per #8605, which the upstream
+ * parseJsonFromLLM strips).
+ *
+ * Details visible in the body:
+ *  - The child's exit status is not inspected; on 'close' any non-empty trimmed
+ *    stdout is returned as-is, and it is not checked to be JSON here.
+ *  - On timeout, partial stdout is returned only if it starts with '{', ends with
+ *    '}' and passes JSON.parse; otherwise the result is null.
+ *  - stderr is read and discarded.
+ *  - `timeout` has no default in this function; callers must supply it.
+ *  - The child runs with cwd '/tmp' (NOTE(review): hard-coded POSIX path —
+ *    confirm behavior on platforms without /tmp).
+ * @param {string|{system?:string,user:string}} prompt
+ * @param {'haiku'|'sonnet'} model
+ * @param {{timeout:number}} opts  SIGKILL after `timeout` ms; no retry.
+ * @returns {Promise<{text:string}|null>}
+ */
+export function callModelCLIAsync(prompt, model, { timeout }) {
+  return new Promise((resolve) => {
+    const modelName = MODEL_MAP[model] ? model : 'haiku'; // unknown model names fall back to 'haiku'
+    let child;
+    try {
+      child = spawn(getClaudePath(), ['-p', '--model', modelName], {
+        env: { ...process.env, CLAUDE_MEM_HOOK_RUNNING: '1' },
+        cwd: '/tmp',
+        stdio: ['pipe', 'pipe', 'pipe'],
+      });
+    } catch (e) {
+      debugCatch(e, `${model}-cli-async`);
+      resolve(null); // synchronous spawn failure: resolve null, never reject
+      return;
+    }
+    let stdout = '';
+    let settled = false;
+    const done = (val) => { // settle exactly once; later timer/close/error events become no-ops
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      resolve(val);
+    };
+    const timer = setTimeout(() => {
+      try { child.kill('SIGKILL'); } catch { /* already gone */ }
+      const t = stdout.trim();
+      if (t.startsWith('{') && t.endsWith('}')) {
+        try { JSON.parse(t); done({ text: t }); return; } catch { /* not complete JSON */ }
+      }
+      done(null);
+    }, timeout);
+    child.stdout?.setEncoding('utf8'); // decode multi-byte UTF-8 (CJK) across chunk boundaries
+    child.stdout?.on('data', (d) => { stdout += d; });
+    child.stderr?.on('data', () => {}); // drain stderr so a chatty child can't block on a full pipe
+    child.on('error', (e) => { debugCatch(e, `${model}-cli-async`); done(null); });
+    child.on('close', () => { // normal completion: exit code is not checked, only whether stdout is non-empty
+      const t = stdout.trim();
+      done(t ? { text: t } : null);
+    });
+    // EPIPE guard: the child may exit before we finish writing stdin.
+    child.stdin?.on('error', () => {});
+    try {
+      child.stdin?.write(flattenForCLI(prompt));
+      child.stdin?.end();
+    } catch (e) {
+      debugCatch(e, `${model}-cli-async:stdin`); // promise still settles later via 'close', 'error', or the timer
+    }
+  });
+}
 // ─── API Mode ────────────────────────────────────────────────────────────────
 async function callHaikuAPI(prompt, { timeout, maxTokens, temperature = DEFAULT_LLM_TEMPERATURE }) {

package/mem-cli.mjs CHANGED Viewed

@@ -182,7 +182,7 @@ async function cmdSearch(db, args, { llm } = {}) {
       orFallbackFired: false,
     };
-    const runDeep = async () => {
+    const runDeep = async ({ auto = false } = {}) => {
       const ds = await deepSearch(db, {
         query,
         project: project || null,
@@ -194,7 +194,7 @@ async function cmdSearch(db, args, { llm } = {}) {
         epochTo: dateTo,
         limit: perSourceLimit,
         currentProject: project ? null : inferProject(),
-      }, llm ? { llm } : undefined);
+      }, llm ? { llm } : { auto });
       deepVariants = ds.variants;
       if (deepVariants.length > 1) {
         process.stderr.write(`[mem] Deep search: rewrote into ${deepVariants.length} query variants, RRF-fused\n`);
@@ -212,7 +212,7 @@ async function cmdSearch(db, args, { llm } = {}) {
       if (obsCtx.orFallbackFired) orFallbackFired = true;
       if (deepMode === 'auto' && autoDeepLlmReady(process.env, llm) && shouldEscalateToDeep(obsResults, obsCtx) && hasEscalatableCorpus(db, project || null)) {
         process.stderr.write(`[mem] auto-escalated to deep search (weak results: ${obsResults.length} hits)\n`);
-        obsResults = await runDeep();
+        obsResults = await runDeep({ auto: true });
         isDeep = true;
       }
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-mem-lite",
-  "version": "3.2.0",
+  "version": "3.3.0",
   "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
   "type": "module",
   "packageManager": "npm@10.9.2",

package/server.mjs CHANGED Viewed

@@ -370,7 +370,7 @@ async function runSearchPipeline(db, args, { llm } = {}) {
     let escalatedObsCount = 0;
     // Helper: run deepSearch and load results into the shared `results` array.
-    const runDeepInto = async () => {
+    const runDeepInto = async ({ auto = false } = {}) => {
       const { results: deepRows, variants } = await deepSearch(db, {
         query: args.query,
         project: args.project || null,
@@ -381,7 +381,7 @@ async function runSearchPipeline(db, args, { llm } = {}) {
         epochFrom, epochTo,
         limit: perSourceLimit,
         currentProject,
-      }, llm ? { llm } : undefined);
+      }, llm ? { llm } : { auto });
       // Safe to reset: sessions/prompts are pushed AFTER the obs block, so nothing is lost here.
       results.length = 0;
       results.push(...deepRows);
@@ -405,7 +405,7 @@ async function runSearchPipeline(db, args, { llm } = {}) {
         // filter makes the invariant explicit and robust to future reordering.
         const obsCountBeforeEscalation = results.length;
         if (deepMode === 'auto' && autoDeepLlmReady(process.env, llm) && shouldEscalateToDeep(results.filter(r => r.source === 'obs'), ctx) && hasEscalatableCorpus(db, args.project || null)) {
-          await runDeepInto();
+          await runDeepInto({ auto: true });
           isDeep = true;
           escalated = true;
           escalatedObsCount = obsCountBeforeEscalation;