npm - claude-mem-lite - Versions diffs - 3.3.0 → 3.4.0 - Mend

claude-mem-lite 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/.claude-plugin/marketplace.json +1 -1
package/.claude-plugin/plugin.json +1 -1
package/README.md +35 -0
package/deep-search.mjs +79 -5
package/haiku-client.mjs +86 -11
package/mem-cli.mjs +21 -3
package/package.json +2 -1
package/rerank.mjs +78 -0
package/server.mjs +22 -8
package/source-files.mjs +1 -1
package/tool-schemas.mjs +1 -0

package/.claude-plugin/marketplace.json CHANGED Viewed

@@ -10,7 +10,7 @@
   "plugins": [
     {
       "name": "claude-mem-lite",
-      "version": "3.3.0",
+      "version": "3.4.0",
       "source": "./",
       "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark)."
     }

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-mem-lite",
-  "version": "3.3.0",
+  "version": "3.4.0",
   "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
   "author": {
     "name": "sdsrss"

package/README.md CHANGED Viewed

@@ -644,6 +644,41 @@ Benchmarked on 200 observations across 30 queries (standard + hard-negative cate
 The benchmark suite runs as a CI gate (`npm run benchmark:gate`) to prevent search quality regressions.
+### Recall on LongMemEval (standard benchmark)
+Beyond the in-repo micro-benchmark above, claude-mem-lite is measured on
+[LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) — a
+500-question long-term-memory benchmark — so its recall is comparable to the
+field, not just to itself. Metric is **recall_any@k**: is a gold evidence session
+in the top *k* retrieved? Corpus is user-turns-only (the standard raw-baseline
+rule). Runners: `benchmark/longmemeval.mjs` (lexical) and
+`benchmark/longmemeval-rerank.mjs` (rerank).
+| Retriever (zero embeddings) | @1 | @5 | @10 |
+|---|---|---|---|
+| Lexical hybrid — FTS5 + TF-IDF + RRF | 76.8% | 90.6% | 95.2% |
+| + one top-20 LLM rerank pass | **92.8%** | **96.8%** | **97.4%** |
+*n = 500 questions; 99.8% JSON parse-rate at concurrency 3.* The rerank pass
+hands the top 20 lexical candidates to a single Haiku call (~1.4 s/query) that
+reorders them. A failed rerank is **never worse than the lexical baseline by
+construction** — any LLM or parse failure falls back to the original candidate
+order; a successfully parsed rerank is applied as returned.
+**On embeddings, honestly.** With no LLM in the loop, dense-embedding retrieval
+still wins on raw recall — a dense-embedding baseline reports ~96.6% @5 on this
+split, versus our 90.6%. The rerank row's point is that a *single cheap LLM call
+closes that gap*: a zero-embedding lexical stack reaches 96.8% @5, edging the
+embedding raw number, because the lexical candidate set is already rich enough
+(recall@20 = 97.8%) that ranking — not recall — is the bottleneck. An
+embedding-plus-rerank stack still leads when both sides spend an LLM call; the
+takeaway is that claude-mem-lite needs **no vector model, no Python, and no
+external service** to reach embedding-competitive precision.
+Per-category @5 (lexical → +rerank): knowledge-update 98.7 → 100.0 ·
+single-session-user 91.4 → 98.6 · temporal-reasoning 89.5 → 97.7 · multi-session
+95.5 → 97.7 · single-session-assistant 83.9 → 94.6 · single-session-preference
+63.3 → 80.0. Every category improves; none regress.
 ## Development
 ```bash

package/deep-search.mjs CHANGED Viewed

@@ -34,10 +34,19 @@
 import { searchObservationsHybrid } from './search-engine.mjs';
 import { sanitizeFtsQuery } from './utils.mjs';
 import { RRF_K } from './tfidf.mjs';
+import { llmRerankOrder, defaultRerankLLM } from './rerank.mjs';
 // original + up to 3 rewrites (keyword / concept-expansion / HyDE).
 export const MAX_VARIANTS = 4;
+// How many RRF-fused candidates the opt-in rerank stage hands to the LLM. The
+// LongMemEval rerank benchmark (benchmark/longmemeval-rerank.mjs) measured the
+// lexical candidate set as rich enough at 20 (recall@20 = 97.8%) that reranking
+// the top-20 captures nearly all of that ceiling (96.8%@5); matching it here keeps
+// the shipped behaviour aligned with the measured number. Module-internal — callers
+// override per-call via deps.rerankTopK; export it if a config surface ever needs it.
+const RERANK_TOPK = 20;
 // ─── Auto-escalation (opt-in adaptive deep search) ──────────────────────────
 // Result-count floor below which a normal search is "weak" enough to auto-escalate
 // to deepSearch. Calibrated against the deep-search benchmark fixtures; 3 is the
@@ -371,7 +380,33 @@ function defaultSearchFn(db, query, params) {
 }
 /**
- * Opt-in deep search: rewrite → per-variant hybrid search → RRF fusion.
+ * Build the candidate text the opt-in rerank stage shows the LLM. Prefers each
+ * observation's full `narrative` (the field the LongMemEval rerank benchmark
+ * scored); falls back to title / subtitle / snippet / lesson when narrative is
+ * unavailable or the db can't be read (injected rows / null db in unit tests).
+ * @param {Database|null} db
+ * @param {Array<object>} rows  fused candidate rows (already sliced to top-K)
+ * @returns {Map<any,string>} id → candidate text
+ */
+function defaultRerankText(db, rows) {
+  // Join whichever parts are present with the same separator everywhere.
+  const joinParts = (parts) => parts.filter(Boolean).join(' — ');
+  // Text derived purely from the fused row (no db read needed).
+  const rowText = (row) => joinParts([row.title, row.subtitle, row.snippet, row.lesson_learned]);
+  const fromRowsOnly = () => new Map(rows.map((row) => [row.id, rowText(row)]));
+
+  if (!db) return fromRowsOnly();
+
+  try {
+    const ids = rows.map((row) => row.id);
+    const placeholders = ids.map(() => '?').join(',');
+    const sql = `SELECT id, narrative, title, subtitle FROM observations WHERE id IN (${placeholders})`;
+    // Stored text per id: the narrative when present, else title/subtitle.
+    const stored = new Map();
+    for (const obs of db.prepare(sql).all(...ids)) {
+      stored.set(obs.id, obs.narrative || joinParts([obs.title, obs.subtitle]));
+    }
+    // Rows the query did not return (or returned empty) keep their row-derived text.
+    return new Map(rows.map((row) => [row.id, stored.get(row.id) || rowText(row)]));
+  } catch {
+    // Any db failure degrades to the row-derived text rather than aborting the rerank.
+    return fromRowsOnly();
+  }
+}
+/**
+ * Opt-in deep search: rewrite → per-variant hybrid search → RRF fusion → opt-in rerank.
  * @param {Database} db open better-sqlite3 handle
  * @param {object} params
  * @param {string} params.query  The user query.
@@ -386,11 +421,15 @@ function defaultSearchFn(db, query, params) {
  * @param {(db:Database, query:string, params:object)=>Array} [deps.searchFn]
  * @param {number} [deps.rrfK=RRF_K]
  * @param {boolean} [deps.auto=false]  use the fail-fast/throttled/cached auto provider
- * @returns {Promise<{results: Array, variants: string[]}>}
+ * @param {boolean} [deps.rerank=false]  opt-in: LLM-rerank the fused top-K (never on the auto path)
+ * @param {(prompt:object)=>Promise<any>} [deps.rerankLlm]  rerank provider (default: lazy haiku)
+ * @param {number} [deps.rerankTopK=RERANK_TOPK]  how many fused candidates to rerank
+ * @param {(db:Database, rows:Array)=>Map} [deps.rerankTextFn]  id→text builder for the rerank prompt
+ * @returns {Promise<{results: Array, variants: string[], reranked: boolean}>}
  */
-export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn, rrfK = RRF_K, auto = false } = {}) {
+export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn, rrfK = RRF_K, auto = false, rerank = false, rerankLlm, rerankTopK = RERANK_TOPK, rerankTextFn = defaultRerankText } = {}) {
   const query = String(params?.query ?? '').trim();
-  if (!query) return { results: [], variants: [] };
+  if (!query) return { results: [], variants: [], reranked: false };
   // No injected llm: EXPLICIT deep=true uses the patient defaultLLM; the AUTO
   // path uses a fail-fast + throttled provider with no retry and a process-
@@ -418,5 +457,40 @@ export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn,
   const fused = rrfFuseN(lists, rrfK);
   const limit = params.limit ?? 10;
-  return { results: fused.slice(0, limit), variants };
+  // Opt-in rerank stage (option C): reorder the fused top-K by an LLM relevance
+  // read, using the same core the LongMemEval benchmark measures (rerank.mjs) so
+  // the shipped algorithm == the measured one. Strictly opt-in — the AUTO
+  // escalation path never reranks, so no default search behaviour changes and the
+  // hot path stays a single LLM call. "Never worse than the fused order" by
+  // construction: a failed/unparseable rerank leaves the fused order untouched.
+  // The candidate set fed here is RICHER than the benchmark's single-query top-20
+  // (it is multi-query RRF), so the measured 96.8%@5 is a conservative floor.
+  let ordered = fused;
+  let reranked = false;
+  if (rerank && fused.length > 1) {
+    const k = Math.min(rerankTopK, fused.length);
+    const top = fused.slice(0, k);
+    const text = rerankTextFn(db, top);
+    const cand = top.map((r) => ({ sid: r.id, text: text.get(r.id) || '' }));
+    const { order, parsed } = await llmRerankOrder(query, cand, rerankLlm || defaultRerankLLM);
+    if (parsed) {
+      const byId = new Map(top.map((r) => [r.id, r]));
+      const head = order.map((id) => byId.get(id)).filter(Boolean);
+      // Re-stamp scores so `score` stays monotonic with the rerank order, reusing
+      // the top-K's OWN values ascending (best = most-negative first): the reranked
+      // block keeps the K best scores so it stays ahead of the fused tail, and orders
+      // within itself by rerank rank. This keeps the shared CLI↔MCP `score` ordering
+      // contract (#8217) consistent with the array order, so a consumer that re-sorts
+      // by score reproduces the rerank order instead of restoring the RRF order.
+      // (server.mjs also skips its context re-rank/re-sort when reranked, so the LLM
+      // judgement is the final order — the re-stamp keeps score honest regardless.)
+      const scores = top.map((r) => r.score).sort((a, b) => a - b);
+      head.forEach((r, i) => { r.score = scores[i]; r.rrfScore = -scores[i]; });
+      ordered = [...head, ...fused.slice(k)];
+      reranked = true;
+    }
+  }
+  return { results: ordered.slice(0, limit), variants, reranked };
 }

package/haiku-client.mjs CHANGED Viewed

@@ -6,12 +6,84 @@
 // overridable via OPENROUTER_MODEL
 import { execFileSync, spawn } from 'child_process';
+import http from 'node:http';
+import https from 'node:https';
+import tls from 'node:tls';
 import { readFileSync } from 'fs';
 import { join } from 'path';
 import { randomUUID } from 'crypto';
 import { debugLog, debugCatch, parseJsonFromLLM } from './utils.mjs';
 import { DB_DIR } from './schema.mjs';
+// ─── Proxy support (native fetch ignores HTTP(S)_PROXY) ──────────────────────
+//
+// Node's global fetch (undici) does NOT honour HTTP(S)_PROXY env vars, and
+// undici's ProxyAgent isn't importable without adding a dependency. In an env
+// that requires a local proxy to reach external APIs (e.g.
+// HTTPS_PROXY=http://127.0.0.1:PORT), a direct fetch to openrouter.ai
+// hangs/times out. We tunnel HTTPS through the HTTP CONNECT proxy using built-ins
+// only. No proxy var (or a NO_PROXY host) → null → callers keep native fetch,
+// unchanged (zero behaviour change when no proxy is configured).
+/**
+ * Resolve the HTTP CONNECT proxy to use for `targetUrl`, or null for a direct request.
+ *
+ * Reads HTTPS_PROXY / https_proxy / HTTP_PROXY / http_proxy (first non-empty wins)
+ * and honours NO_PROXY / no_proxy. A NO_PROXY entry bypasses the proxy when it is
+ * `*`, equals the target host, or is a `.suffix` matching the host on a label
+ * boundary (".example.com" covers "example.com" and "api.example.com", but not
+ * "notexample.com").
+ *
+ * @param {string} targetUrl absolute URL about to be requested
+ * @returns {string|null} the proxy URL, or null when no usable proxy applies
+ */
+function httpConnectProxyFor(targetUrl) {
+  const env = process.env;
+  const proxy = env.HTTPS_PROXY || env.https_proxy || env.HTTP_PROXY || env.http_proxy;
+  if (!proxy || !/^https?:\/\//.test(proxy)) return null; // socks5 ALL_PROXY not supported here
+
+  let host;
+  try {
+    host = new URL(targetUrl).hostname.toLowerCase();
+  } catch {
+    return null; // unparseable target → let the caller's native fetch report it
+  }
+
+  const entries = (env.NO_PROXY || env.no_proxy || '')
+    .split(',')
+    .map((s) => s.trim().toLowerCase())
+    .filter(Boolean);
+  const bypass = entries.some((entry) => {
+    if (entry === '*' || entry === host) return true;
+    if (!entry.startsWith('.')) return false;
+    // Keep the leading dot in the suffix test so the match lands on a label
+    // boundary; the bare domain itself is still covered explicitly.
+    return host === entry.slice(1) || host.endsWith(entry);
+  });
+  return bypass ? null : proxy;
+}
+// fetch-compatible (subset) POST over an HTTP CONNECT tunnel: returns
+// { ok, status, json(), text() }. Rejects on connect/timeout/socket error so the
+// caller's try/catch degrades to the CLI exactly as a failed fetch would.
+/**
+ * POST `body` to an https `url` through an HTTP CONNECT proxy, using built-ins only.
+ *
+ * Resolves with a fetch-like subset `{ ok, status, json(), text() }` (note that
+ * `json()` / `text()` return values synchronously; `await` works on both).
+ * Rejects on CONNECT failure, timeout, socket error, or a response that closes
+ * before its body is complete, so the promise always settles.
+ *
+ * @param {string} proxy   proxy URL; `user:pass@` credentials are sent as Basic Proxy-Authorization
+ * @param {string} url     target https URL
+ * @param {object} [opts]
+ * @param {object} [opts.headers={}]     request headers for the tunnelled POST
+ * @param {string} [opts.body='']        request body
+ * @param {number} [opts.timeout=20000]  socket inactivity timeout (ms) for CONNECT and for the POST
+ * @returns {Promise<{ok:boolean,status:number,json:Function,text:Function}>}
+ */
+function postViaConnectProxy(proxy, url, { headers = {}, body = '', timeout = 20000 } = {}) {
+  return new Promise((resolve, reject) => {
+    const p = new URL(proxy);
+    const t = new URL(url);
+    const port = Number(t.port) || 443;
+    const authority = `${t.hostname}:${port}`;
+    let settled = false;
+    // Settle exactly once; later events from either request are ignored.
+    const finish = (fn, arg) => { if (!settled) { settled = true; fn(arg); } };
+
+    const connectHeaders = { Host: authority };
+    if (p.username) {
+      // URL components are percent-encoded; the Basic credential is not.
+      const credential = `${decodeURIComponent(p.username)}:${decodeURIComponent(p.password)}`;
+      connectHeaders['Proxy-Authorization'] = `Basic ${Buffer.from(credential).toString('base64')}`;
+    }
+
+    const connReq = http.request({
+      host: p.hostname,
+      port: Number(p.port) || 80,
+      method: 'CONNECT',
+      path: authority,
+      headers: connectHeaders,
+    });
+    connReq.setTimeout(timeout, () => connReq.destroy(new Error('proxy CONNECT timeout')));
+    connReq.on('error', (e) => finish(reject, e));
+    connReq.on('connect', (res, socket) => {
+      if (res.statusCode !== 200) {
+        socket.destroy();
+        return finish(reject, new Error(`proxy CONNECT ${res.statusCode}`));
+      }
+      // Run TLS over the established tunnel; servername drives SNI/cert checks.
+      const req = https.request(
+        url,
+        { method: 'POST', headers, createConnection: () => tls.connect({ socket, servername: t.hostname }) },
+        (resp) => {
+          let data = '';
+          resp.setEncoding('utf8');
+          resp.on('data', (c) => (data += c));
+          resp.on('error', (e) => finish(reject, e));
+          // 'close' also fires after a normal 'end'; only an incomplete body is a failure.
+          resp.on('close', () => {
+            if (!resp.complete) finish(reject, new Error('proxy response closed before completion'));
+          });
+          resp.on('end', () => finish(resolve, {
+            ok: resp.statusCode >= 200 && resp.statusCode < 300,
+            status: resp.statusCode,
+            json: () => JSON.parse(data),
+            text: () => data,
+          }));
+        }
+      );
+      req.setTimeout(timeout, () => req.destroy(new Error('proxy request timeout')));
+      req.on('error', (e) => finish(reject, e));
+      req.end(body);
+    });
+    connReq.end();
+  });
+}
 // ─── Model Resolution ────────────────────────────────────────────────────────
 // CLI name → API model ID mapping
@@ -493,17 +565,20 @@ async function callOpenRouterAPI(prompt, tier, { timeout, maxTokens, temperature
     if (system) messages.push({ role: 'system', content: system });
     messages.push({ role: 'user', content: user });
-    const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-        'Authorization': `Bearer ${apiKey}`,
-        // Optional OpenRouter attribution headers (ignored by the API if absent).
-        'X-Title': 'claude-mem-lite',
-      },
-      body: JSON.stringify({ model, max_tokens: maxTokens, temperature, messages }),
-      signal: controller.signal,
-    });
+    const url = 'https://openrouter.ai/api/v1/chat/completions';
+    const reqHeaders = {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${apiKey}`,
+      // Optional OpenRouter attribution headers (ignored by the API if absent).
+      'X-Title': 'claude-mem-lite',
+    };
+    const reqBody = JSON.stringify({ model, max_tokens: maxTokens, temperature, messages });
+    // Native fetch ignores HTTP(S)_PROXY; when a proxy is configured, tunnel the
+    // request through it — a direct fetch to openrouter.ai times out behind one.
+    const proxy = httpConnectProxyFor(url);
+    const res = proxy
+      ? await postViaConnectProxy(proxy, url, { headers: reqHeaders, body: reqBody, timeout })
+      : await fetch(url, { method: 'POST', headers: reqHeaders, body: reqBody, signal: controller.signal });
     if (!res.ok) {
       debugLog('WARN', `${tier}-openrouter`, `HTTP ${res.status}`);

package/mem-cli.mjs CHANGED Viewed

@@ -52,7 +52,7 @@ async function cmdSearch(db, args, { llm } = {}) {
   const { positional, flags } = parseArgs(args);
   const query = positional.join(' ');
   if (!query) {
-    fail('[mem] Usage: claude-mem-lite search <query> [--type TYPE] [--source SOURCE] [--limit N] [--project P] [--from DATE] [--to DATE] [--importance N] [--branch B] [--offset N] [--sort relevance|time|importance] [--include-noise] [--deep] [--no-deep]');
+    fail('[mem] Usage: claude-mem-lite search <query> [--type TYPE] [--source SOURCE] [--limit N] [--project P] [--from DATE] [--to DATE] [--importance N] [--branch B] [--offset N] [--sort relevance|time|importance] [--include-noise] [--deep] [--no-deep] [--rerank]');
     return;
   }
@@ -109,6 +109,15 @@ async function cmdSearch(db, args, { llm } = {}) {
     : ((flags['no-deep'] === true || flags['no-deep'] === 'true') ? false : undefined);
   const deepMode = resolveDeepMode(explicitDeep, { surface: 'cli' });
+  // --rerank: opt-in LLM rerank of the fused top-20 (option C, deep-search.mjs).
+  // One extra Haiku call (~1.4s); only meaningful on the explicit --deep path,
+  // never on auto-escalation. Same rerank core the LongMemEval benchmark measures.
+  const rerankFlag = flags.rerank === true || flags.rerank === 'true';
+  const rerank = rerankFlag && deepMode === 'deep';
+  if (rerankFlag && deepMode !== 'deep') {
+    process.stderr.write('[mem] Note: --rerank requires --deep (it reranks deep-search candidates); ignored\n');
+  }
   if (source && !['observations', 'sessions', 'prompts'].includes(source)) {
     fail(`[mem] Invalid --source "${source}". Use: observations, sessions, prompts`);
     return;
@@ -160,6 +169,7 @@ async function cmdSearch(db, args, { llm } = {}) {
   let orFallbackFired = false;
   let deepVariants = null;
+  let isReranked = false;
   let isDeep = deepMode === 'deep';
   // Search observations — shared engine with server.mjs (#8198/#8212 paired-path fix)
@@ -194,13 +204,19 @@ async function cmdSearch(db, args, { llm } = {}) {
         epochTo: dateTo,
         limit: perSourceLimit,
         currentProject: project ? null : inferProject(),
-      }, llm ? { llm } : { auto });
+      }, llm ? { llm, rerank: rerank && !auto } : { auto, rerank: rerank && !auto });
       deepVariants = ds.variants;
+      isReranked = ds.reranked;
       if (deepVariants.length > 1) {
         process.stderr.write(`[mem] Deep search: rewrote into ${deepVariants.length} query variants, RRF-fused\n`);
       } else {
         process.stderr.write('[mem] Deep search: rewrite returned no usable variants; used original query only\n');
       }
+      if (rerank && !auto) {
+        process.stderr.write(ds.reranked
+          ? '[mem] Deep search: LLM-reranked the fused top-20\n'
+          : '[mem] Deep search: rerank produced no usable order; kept fused order\n');
+      }
       return ds.results;
     };
@@ -270,7 +286,9 @@ async function cmdSearch(db, args, { llm } = {}) {
   if (obsResults.length > 0) {
     // reRankWithContext/markSuperseded expect source='obs' — alias _source for compatibility
     for (const r of obsResults) r.source = 'obs';
-    reRankWithContext(db, obsResults, project || inferProject());
+    // Explicit LLM rerank order is final — skip file-context re-rank when reranked
+    // (paired-path with mem_search; markSuperseded still runs for stale-tagging).
+    if (!isReranked) reRankWithContext(db, obsResults, project || inferProject());
     markSuperseded(obsResults);
     if (isCrossSource) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-mem-lite",
-  "version": "3.3.0",
+  "version": "3.4.0",
   "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
   "type": "module",
   "packageManager": "npm@10.9.2",
@@ -31,6 +31,7 @@
     "server-internals.mjs",
     "search-engine.mjs",
     "deep-search.mjs",
+    "rerank.mjs",
     "hook.mjs",
     "hook-shared.mjs",
     "hook-llm.mjs",

package/rerank.mjs ADDED Viewed

@@ -0,0 +1,78 @@
+// Shared LLM-rerank core: reorder a top-K candidate list by an LLM relevance read.
+//
+// Used by BOTH the production deep-search rerank stage (deep-search.mjs) and the
+// LongMemEval rerank benchmark (benchmark/longmemeval-rerank.mjs), so the measured
+// lift number reflects the EXACT algorithm that ships. "Never worse than the input
+// candidate order" by construction: any LLM/parse failure returns the original order.
+//
+// The LLM is dependency-injected by every caller, so this module is unit-tested with
+// deterministic stubs and never statically imports the native-heavy LLM client (the
+// default provider is pulled in lazily on first real call).
+import { parseJsonFromLLM } from './utils.mjs';
+// Module-internal: only buildRerankPrompt (below) consumes these. Kept un-exported
+// so the module's public surface is just the three functions callers actually import.
+// System instruction for the rerank call; assembled from fragments for line length only.
+const RERANK_SYSTEM = [
+  'You rerank search results. Given a QUERY and numbered candidate session snippets, ',
+  'decide which sessions most likely contain the answer to the query. ',
+  'Return ONLY JSON {"ranked":[<candidate numbers, most relevant first, each number once>]}. No prose, no markdown.',
+].join('');
+/**
+ * Build the { system, user } prompt for one rerank call.
+ * Each snippet is flattened to a single line, capped at 400 characters and
+ * numbered from 1, so the model answers with candidate numbers.
+ * @param {string} query
+ * @param {Array<any>} snippets candidate texts, in candidate order
+ * @returns {{system: string, user: string}}
+ */
+function buildRerankPrompt(query, snippets) {
+  const toLine = (snippet, index) => {
+    const flattened = String(snippet).replace(/\s+/g, ' ').slice(0, 400);
+    return `${index + 1}. ${flattened}`;
+  };
+  const listing = snippets.map(toLine).join('\n');
+  const user = `QUERY: ${query}\n\nCANDIDATES:\n${listing}\n\nReturn {"ranked":[...]} over 1..${snippets.length}, best first.`;
+  return { system: RERANK_SYSTEM, user };
+}
+// Extract a 1-based ranking array from whatever the LLM returned: a {ranked:[...]}
+// object (stub / clean JSON), a bare array (clean OR prose-wrapped [..]), or a
+// {text} envelope from callLLMWithModel. The bare-array path is what lifts the
+// real parse-rate: claude-haiku often answers "[3,1,5]" instead of {"ranked":..},
+// and parseJsonFromLLM's leading JSON.parse returns that as an array (no .ranked),
+// which the old object-only check silently dropped. null → nothing recoverable.
+/**
+ * Pull a ranking array out of whatever the LLM call returned.
+ * Accepts an array, a { ranked: [...] } object, a raw string, or a { text } envelope;
+ * strings are tried as JSON first, then scanned for a bracketed list of integers.
+ * @param {any} raw
+ * @returns {Array|null} the ranking as returned (not validated), or null when none is recoverable
+ */
+export function extractRanked(raw) {
+  if (raw == null) return null;
+  if (Array.isArray(raw)) return raw;
+  if (typeof raw === 'object' && Array.isArray(raw.ranked)) return raw.ranked;
+
+  let text = '';
+  if (typeof raw === 'string') {
+    text = raw;
+  } else if (typeof raw.text === 'string') {
+    text = raw.text;
+  }
+  if (!text) return null;
+
+  const parsed = parseJsonFromLLM(text);
+  if (Array.isArray(parsed)) return parsed; // bare array [3,1,5]
+  if (parsed && Array.isArray(parsed.ranked)) return parsed.ranked; // {"ranked":[...]}
+
+  // Last resort: the first bracketed integer list embedded in prose.
+  const bracketed = text.match(/\[\s*\d+(?:\s*,\s*\d+)*\s*\]/);
+  if (!bracketed) return null;
+  try {
+    const list = JSON.parse(bracketed[0]);
+    return Array.isArray(list) ? list : null;
+  } catch {
+    return null;
+  }
+}
+// Reorder candidate session ids per the LLM's chosen 1-based order; any failure →
+// original order ("never worse than baseline"). { order: sid[], parsed: bool }.
+/**
+ * Reorder candidates by the LLM's 1-based ranking.
+ *
+ * Valid, in-range, first-seen indices are taken in the LLM's order; candidates the
+ * LLM omitted are appended in their original order. If the LLM call throws, nothing
+ * can be extracted, or the ranking has no usable index, the original order is
+ * returned with `parsed: false`, so callers only report a rerank that changed the basis
+ * of the order.
+ *
+ * @param {string} query
+ * @param {Array<{sid:any,text:string}>} cand candidates in their current order
+ * @param {(prompt:{system:string,user:string})=>Promise<any>} llm injected provider
+ * @returns {Promise<{order: any[], parsed: boolean}>}
+ */
+export async function llmRerankOrder(query, cand /* [{sid,text}] */, llm) {
+  const original = cand.map((c) => c.sid);
+  // Nothing to rank: skip the LLM call entirely.
+  if (cand.length === 0) return { order: original, parsed: false };
+
+  const prompt = buildRerankPrompt(query, cand.map((c) => c.text));
+  let raw;
+  try { raw = await llm(prompt); } catch { raw = null; } // provider failure == no ranking
+  const ranking = extractRanked(raw);
+  if (!ranking) return { order: original, parsed: false };
+
+  const seen = new Set();
+  const out = [];
+  for (const n of ranking) {
+    const idx = Number(n) - 1;
+    if (Number.isInteger(idx) && idx >= 0 && idx < cand.length && !seen.has(idx)) {
+      seen.add(idx);
+      out.push(cand[idx].sid);
+    }
+  }
+  // An empty or all-invalid ranking carries no LLM judgement: report it as unparsed
+  // instead of claiming the untouched order was reranked.
+  if (seen.size === 0) return { order: original, parsed: false };
+
+  cand.forEach((c, i) => { if (!seen.has(i)) out.push(c.sid); }); // append omitted, original order
+  return { order: out, parsed: true };
+}
+// Default provider — lazy import so stub-injected callers never load the client.
+// Uses callLLMWithModel (returns {text}) rather than callModelJSONAsync (which
+// JSON-parses internally and nulls on any non-{...} output) so extractRanked can
+// recover bare-array answers the strict JSON parse drops.
+/**
+ * Default rerank provider. The client module is imported on first use so callers
+ * that inject their own provider never load it.
+ * @param {{system:string,user:string}} prompt
+ * @returns {Promise<any>} whatever callLLMWithModel resolves with
+ */
+export async function defaultRerankLLM(prompt) {
+  const { callLLMWithModel: callModel } = await import('./haiku-client.mjs');
+  const options = { timeout: 20000, maxTokens: 300 };
+  return callModel(prompt, 'haiku', options);
+}

package/server.mjs CHANGED Viewed

@@ -320,11 +320,11 @@ function formatSearchOutput(paginatedResults, args, ftsQuery, totalCount, orFall
 // NOTE: resolveProject() inside runSearchPipeline closes over the module-level `db`,
 // not the injected one. Tests that pass a project: arg via this seam will trigger
 // resolveProject() against the real (module) DB, not the test DB.
-export async function handleSearchForTest(db, args, { llm } = {}) {
-  return runSearchPipeline(db, args, { llm });
+export async function handleSearchForTest(db, args, { llm, rerankLlm } = {}) {
+  return runSearchPipeline(db, args, { llm, rerankLlm });
 }
-async function runSearchPipeline(db, args, { llm } = {}) {
+async function runSearchPipeline(db, args, { llm, rerankLlm } = {}) {
     if (args.project) args = { ...args, project: resolveProject(args.project) };
     const limit = args.limit ?? 20;
     const offset = args.offset ?? 0;
@@ -349,6 +349,9 @@ async function runSearchPipeline(db, args, { llm } = {}) {
     // Resolve tri-state deep mode. MCP defaults to 'auto' (escalate on weak results)
     // unless explicitly overridden via args.deep or CLAUDE_MEM_AUTO_DEEP env flag.
     const deepMode = resolveDeepMode(args.deep, { surface: 'mcp' });
+    // Opt-in LLM rerank (D#43): explicit-deep only — never on AUTO escalation — so
+    // no default search behaviour changes. Parity with CLI `search --deep --rerank`.
+    const rerank = args.rerank === true && deepMode === 'deep';
     // Early return when query was provided but sanitized to nothing (all FTS5
     // keywords/special chars). Skipped for deep/auto — deep's LLM rewrite may
@@ -365,13 +368,14 @@ async function runSearchPipeline(db, args, { llm } = {}) {
     const ctx = { db, ftsQuery, searchType: effectiveType, args, epochFrom, epochTo, perSourceLimit, perSourceOffset, currentProject, limit };
     const results = [];
     let deepVariants = null;
+    let deepReranked = false;
     let isDeep = deepMode === 'deep';
     let escalated = false;
     let escalatedObsCount = 0;
     // Helper: run deepSearch and load results into the shared `results` array.
     const runDeepInto = async ({ auto = false } = {}) => {
-      const { results: deepRows, variants } = await deepSearch(db, {
+      const { results: deepRows, variants, reranked } = await deepSearch(db, {
         query: args.query,
         project: args.project || null,
         type: args.obs_type || null,
@@ -381,11 +385,12 @@ async function runSearchPipeline(db, args, { llm } = {}) {
         epochFrom, epochTo,
         limit: perSourceLimit,
         currentProject,
-      }, llm ? { llm } : { auto });
+      }, llm ? { llm, rerank: rerank && !auto, rerankLlm } : { auto, rerank: rerank && !auto, rerankLlm });
       // Safe to reset: sessions/prompts are pushed AFTER the obs block, so nothing is lost here.
       results.length = 0;
       results.push(...deepRows);
       deepVariants = variants;
+      deepReranked = reranked;
     };
     if (!effectiveType || effectiveType === 'observations') {
@@ -460,9 +465,13 @@ async function runSearchPipeline(db, args, { llm } = {}) {
     // empty-ftsQuery deep path we tag-but-don't-reorder (keep RRF order).
     if ((ftsQuery || isDeep) && results.some(r => r.source === 'obs')) {
       const obsResults = results.filter(r => r.source === 'obs');
-      if (ftsQuery) reRankWithContext(db, obsResults, currentProject);
+      // When the deep candidates were explicitly LLM-reranked, that order is final:
+      // skip the file-context re-rank + re-sort (they would perturb the rerank order
+      // via score multiplication / score-sort). markSuperseded is pure stale-tagging
+      // and still runs. (D#43 — parity with the CLI deep path, which keeps array order.)
+      if (ftsQuery && !deepReranked) reRankWithContext(db, obsResults, currentProject);
       markSuperseded(obsResults);
-      if (ftsQuery) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
+      if (ftsQuery && !deepReranked) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
     }
     // Tier post-filter: batch-lookup full rows and classify (shared with CLI).
@@ -512,9 +521,14 @@ async function runSearchPipeline(db, args, { llm } = {}) {
         ? `\n\n[deep search: rewrote into ${deepVariants.length} variants — ${deepVariants.slice(1).map(v => JSON.stringify(v)).join(', ')}]`
         : '\n\n[deep search: rewrite produced no usable variants; searched the original query only (== baseline)]';
     }
+    // Discoverability signal for the opt-in rerank (D#43): tell the calling agent the
+    // candidates were LLM-reranked — parity with the CLI stderr note.
+    if (deepReranked && output.content?.[0]?.type === 'text') {
+      output.content[0].text += '\n\n[deep search: LLM-reranked the top candidates by relevance]';
+    }
     // Return an object that exposes structured fields for tests + the MCP content blob.
-    return { ...output, results: paginatedResults, total: totalBeforePagination, escalated, variants: deepVariants };
+    return { ...output, results: paginatedResults, total: totalBeforePagination, escalated, variants: deepVariants, reranked: deepReranked };
 }
 server.registerTool(

package/source-files.mjs CHANGED Viewed

@@ -6,7 +6,7 @@
 export const SOURCE_FILES = [
   // Entry points and top-level modules
-  'cli.mjs', 'cli-path.mjs', 'server.mjs', 'server-internals.mjs', 'search-engine.mjs', 'deep-search.mjs', 'tool-schemas.mjs',
+  'cli.mjs', 'cli-path.mjs', 'server.mjs', 'server-internals.mjs', 'search-engine.mjs', 'deep-search.mjs', 'rerank.mjs', 'tool-schemas.mjs',
   'hook.mjs', 'hook-shared.mjs', 'hook-llm.mjs', 'hook-memory.mjs', 'skip-tools.mjs',
   'hook-semaphore.mjs', 'hook-episode.mjs', 'hook-context.mjs', 'hook-handoff.mjs',
   'hook-update.mjs', 'hook-optimize.mjs', 'hook-precompact.mjs',

package/tool-schemas.mjs CHANGED Viewed

@@ -94,6 +94,7 @@ export const memSearchSchema = {
   include_noise: z.boolean().optional().describe('Include hook-llm fallback titles ("Modified X", "Worked on X", raw error logs) — hidden by default as they have ~3% access rate'),
   or: coerceBool.optional().describe('Force OR semantics between query terms from the start (default: AND with automatic OR-fallback when AND returns 0). Aligns with CLI --or.'),
   deep: coerceBool.optional().describe('Tri-state LLM multi-query/HyDE deep search (observations-only). true=force; false=never; omit=AUTO (default ON for mem_search): a normal search that returns weak/few results auto-escalates with ONE Haiku call (query rewritten to keyword/concept/HyDE variants, RRF-fused). Set CLAUDE_MEM_AUTO_DEEP=0 to disable AUTO. Passive recall stays single-query.'),
+  rerank: coerceBool.optional().describe('Opt-in: LLM-rerank the deep-search candidates for ranking precision (one extra Haiku call, ~1.4s). Requires deep=true (no effect on AUTO/normal). Reserve for hard, ranking-sensitive queries where the right memory is likely retrieved but mis-ranked — skip for routine search. Default off.'),
 };
 export const memRecentSchema = {