claude-mem-lite 3.3.1 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,7 @@
10
10
  "plugins": [
11
11
  {
12
12
  "name": "claude-mem-lite",
13
- "version": "3.3.1",
13
+ "version": "3.4.0",
14
14
  "source": "./",
15
15
  "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark)."
16
16
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-mem-lite",
3
- "version": "3.3.1",
3
+ "version": "3.4.0",
4
4
  "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
5
5
  "author": {
6
6
  "name": "sdsrss"
package/README.md CHANGED
@@ -644,6 +644,41 @@ Benchmarked on 200 observations across 30 queries (standard + hard-negative cate
644
644
 
645
645
  The benchmark suite runs as a CI gate (`npm run benchmark:gate`) to prevent search quality regressions.
646
646
 
647
+ ### Recall on LongMemEval (standard benchmark)
648
+
649
+ Beyond the in-repo micro-benchmark above, claude-mem-lite is measured on
650
+ [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) — a
651
+ 500-question long-term-memory benchmark — so its recall is comparable to the
652
+ field, not just to itself. Metric is **recall_any@k**: is a gold evidence session
653
+ in the top *k* retrieved? Corpus is user-turns-only (the standard raw-baseline
654
+ rule). Runners: `benchmark/longmemeval.mjs` (lexical) and
655
+ `benchmark/longmemeval-rerank.mjs` (rerank).
656
+
657
+ | Retriever (zero embeddings) | @1 | @5 | @10 |
658
+ |---|---|---|---|
659
+ | Lexical hybrid — FTS5 + TF-IDF + RRF | 76.8% | 90.6% | 95.2% |
660
+ | + one top-20 LLM rerank pass | **92.8%** | **96.8%** | **97.4%** |
661
+
662
+ *n = 500 questions; 99.8% JSON parse-rate at concurrency 3.* The rerank pass
663
+ hands the top 20 lexical candidates to a single Haiku call (~1.4 s/query) that
664
+ reorders them. A failed rerank is **never worse than the lexical baseline by construction** —
665
+ any LLM or parse failure falls back to the original candidate order.
666
+
667
+ **On embeddings, honestly.** With no LLM in the loop, dense-embedding retrieval
668
+ still wins on raw recall — a dense-embedding baseline reports ~96.6% @5 on this
669
+ split, versus our 90.6%. The rerank row's point is that a *single cheap LLM call
670
+ closes that gap*: a zero-embedding lexical stack reaches 96.8% @5, edging the
671
+ embedding raw number, because the lexical candidate set is already rich enough
672
+ (recall@20 = 97.8%) that ranking — not recall — is the bottleneck. An
673
+ embedding-plus-rerank stack still leads when both sides spend an LLM call; the
674
+ takeaway is that claude-mem-lite needs **no vector model, no Python, and no
675
+ external service** to reach embedding-competitive top-5 recall.
676
+
677
+ Per-category @5 (lexical → +rerank): knowledge-update 98.7 → 100.0 ·
678
+ single-session-user 91.4 → 98.6 · temporal-reasoning 89.5 → 97.7 · multi-session
679
+ 95.5 → 97.7 · single-session-assistant 83.9 → 94.6 · single-session-preference
680
+ 63.3 → 80.0. Every category improves; none regress.
681
+
647
682
  ## Development
648
683
 
649
684
  ```bash
package/deep-search.mjs CHANGED
@@ -34,10 +34,19 @@
34
34
  import { searchObservationsHybrid } from './search-engine.mjs';
35
35
  import { sanitizeFtsQuery } from './utils.mjs';
36
36
  import { RRF_K } from './tfidf.mjs';
37
+ import { llmRerankOrder, defaultRerankLLM } from './rerank.mjs';
37
38
 
38
39
  // original + up to 3 rewrites (keyword / concept-expansion / HyDE).
39
40
  export const MAX_VARIANTS = 4;
40
41
 
42
+ // How many RRF-fused candidates the opt-in rerank stage hands to the LLM. The
43
+ // LongMemEval rerank benchmark (benchmark/longmemeval-rerank.mjs) measured the
44
+ // lexical candidate set as rich enough at 20 (recall@20 = 97.8%) that reranking
45
+ // the top-20 captures nearly all of that ceiling (96.8%@5); matching it here keeps
46
+ // the shipped behaviour aligned with the measured number. Module-internal — callers
47
+ // override per-call via deps.rerankTopK; export it if a config surface ever needs it.
48
+ const RERANK_TOPK = 20;
49
+
41
50
  // ─── Auto-escalation (opt-in adaptive deep search) ──────────────────────────
42
51
  // Result-count floor below which a normal search is "weak" enough to auto-escalate
43
52
  // to deepSearch. Calibrated against the deep-search benchmark fixtures; 3 is the
@@ -371,7 +380,33 @@ function defaultSearchFn(db, query, params) {
371
380
  }
372
381
 
373
382
  /**
374
- * Opt-in deep search: rewrite per-variant hybrid search RRF fusion.
383
+ * Build the candidate text the opt-in rerank stage shows the LLM. Prefers each
384
+ * observation's full `narrative` (the field the LongMemEval rerank benchmark
385
+ * scored); falls back to title / subtitle / snippet / lesson when narrative is
386
+ * unavailable or the db can't be read (injected rows / null db in unit tests).
387
+ * @param {Database|null} db
388
+ * @param {Array<object>} rows fused candidate rows (already sliced to top-K)
389
+ * @returns {Map<any,string>} id → candidate text
390
+ */
391
+ function defaultRerankText(db, rows) {
392
+ const fallback = (r) => [r.title, r.subtitle, r.snippet, r.lesson_learned].filter(Boolean).join(' — ');
393
+ if (!db) return new Map(rows.map((r) => [r.id, fallback(r)]));
394
+ try {
395
+ const ids = rows.map((r) => r.id);
396
+ const ph = ids.map(() => '?').join(',');
397
+ const found = new Map(
398
+ db.prepare(`SELECT id, narrative, title, subtitle FROM observations WHERE id IN (${ph})`)
399
+ .all(...ids)
400
+ .map((o) => [o.id, o.narrative || [o.title, o.subtitle].filter(Boolean).join(' — ')]),
401
+ );
402
+ return new Map(rows.map((r) => [r.id, found.get(r.id) || fallback(r)]));
403
+ } catch {
404
+ return new Map(rows.map((r) => [r.id, fallback(r)]));
405
+ }
406
+ }
407
+
408
+ /**
409
+ * Opt-in deep search: rewrite → per-variant hybrid search → RRF fusion → opt-in rerank.
375
410
  * @param {Database} db open better-sqlite3 handle
376
411
  * @param {object} params
377
412
  * @param {string} params.query The user query.
@@ -386,11 +421,15 @@ function defaultSearchFn(db, query, params) {
386
421
  * @param {(db:Database, query:string, params:object)=>Array} [deps.searchFn]
387
422
  * @param {number} [deps.rrfK=RRF_K]
388
423
  * @param {boolean} [deps.auto=false] use the fail-fast/throttled/cached auto provider
389
- * @returns {Promise<{results: Array, variants: string[]}>}
424
+ * @param {boolean} [deps.rerank=false] opt-in: LLM-rerank the fused top-K (never on the auto path)
425
+ * @param {(prompt:object)=>Promise<any>} [deps.rerankLlm] rerank provider (default: lazy haiku)
426
+ * @param {number} [deps.rerankTopK=RERANK_TOPK] how many fused candidates to rerank
427
+ * @param {(db:Database, rows:Array)=>Map} [deps.rerankTextFn] id→text builder for the rerank prompt
428
+ * @returns {Promise<{results: Array, variants: string[], reranked: boolean}>}
390
429
  */
391
- export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn, rrfK = RRF_K, auto = false } = {}) {
430
+ export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn, rrfK = RRF_K, auto = false, rerank = false, rerankLlm, rerankTopK = RERANK_TOPK, rerankTextFn = defaultRerankText } = {}) {
392
431
  const query = String(params?.query ?? '').trim();
393
- if (!query) return { results: [], variants: [] };
432
+ if (!query) return { results: [], variants: [], reranked: false };
394
433
 
395
434
  // No injected llm: EXPLICIT deep=true uses the patient defaultLLM; the AUTO
396
435
  // path uses a fail-fast + throttled provider with no retry and a process-
@@ -418,5 +457,40 @@ export async function deepSearch(db, params, { llm, searchFn = defaultSearchFn,
418
457
 
419
458
  const fused = rrfFuseN(lists, rrfK);
420
459
  const limit = params.limit ?? 10;
421
- return { results: fused.slice(0, limit), variants };
460
+
461
+ // Opt-in rerank stage (option C): reorder the fused top-K by an LLM relevance
462
+ // read, using the same core the LongMemEval benchmark measures (rerank.mjs) so
463
+ // the shipped algorithm == the measured one. Strictly opt-in — the AUTO
464
+ // escalation path never reranks, so no default search behaviour changes and the
465
+ // hot path stays a single LLM call. "Never worse than the fused order" by
466
+ // construction: a failed/unparseable rerank leaves the fused order untouched.
467
+ // The candidate set fed here is RICHER than the benchmark's single-query top-20
468
+ // (it is multi-query RRF), so the measured 96.8%@5 is a conservative floor.
469
+ let ordered = fused;
470
+ let reranked = false;
471
+ if (rerank && fused.length > 1) {
472
+ const k = Math.min(rerankTopK, fused.length);
473
+ const top = fused.slice(0, k);
474
+ const text = rerankTextFn(db, top);
475
+ const cand = top.map((r) => ({ sid: r.id, text: text.get(r.id) || '' }));
476
+ const { order, parsed } = await llmRerankOrder(query, cand, rerankLlm || defaultRerankLLM);
477
+ if (parsed) {
478
+ const byId = new Map(top.map((r) => [r.id, r]));
479
+ const head = order.map((id) => byId.get(id)).filter(Boolean);
480
+ // Re-stamp scores so `score` stays monotonic with the rerank order, reusing
481
+ // the top-K's OWN values ascending (best = most-negative first): the reranked
482
+ // block keeps the K best scores so it stays ahead of the fused tail, and orders
483
+ // within itself by rerank rank. This keeps the shared CLI↔MCP `score` ordering
484
+ // contract (#8217) consistent with the array order, so a consumer that re-sorts
485
+ // by score reproduces the rerank order instead of restoring the RRF order.
486
+ // (server.mjs also skips its context re-rank/re-sort when reranked, so the LLM
487
+ // judgement is the final order — the re-stamp keeps score honest regardless.)
488
+ const scores = top.map((r) => r.score).sort((a, b) => a - b);
489
+ head.forEach((r, i) => { r.score = scores[i]; r.rrfScore = -scores[i]; });
490
+ ordered = [...head, ...fused.slice(k)];
491
+ reranked = true;
492
+ }
493
+ }
494
+
495
+ return { results: ordered.slice(0, limit), variants, reranked };
422
496
  }
package/mem-cli.mjs CHANGED
@@ -52,7 +52,7 @@ async function cmdSearch(db, args, { llm } = {}) {
52
52
  const { positional, flags } = parseArgs(args);
53
53
  const query = positional.join(' ');
54
54
  if (!query) {
55
- fail('[mem] Usage: claude-mem-lite search <query> [--type TYPE] [--source SOURCE] [--limit N] [--project P] [--from DATE] [--to DATE] [--importance N] [--branch B] [--offset N] [--sort relevance|time|importance] [--include-noise] [--deep] [--no-deep]');
55
+ fail('[mem] Usage: claude-mem-lite search <query> [--type TYPE] [--source SOURCE] [--limit N] [--project P] [--from DATE] [--to DATE] [--importance N] [--branch B] [--offset N] [--sort relevance|time|importance] [--include-noise] [--deep] [--no-deep] [--rerank]');
56
56
  return;
57
57
  }
58
58
 
@@ -109,6 +109,15 @@ async function cmdSearch(db, args, { llm } = {}) {
109
109
  : ((flags['no-deep'] === true || flags['no-deep'] === 'true') ? false : undefined);
110
110
  const deepMode = resolveDeepMode(explicitDeep, { surface: 'cli' });
111
111
 
112
+ // --rerank: opt-in LLM rerank of the fused top-20 (option C, deep-search.mjs).
113
+ // One extra Haiku call (~1.4s); only meaningful on the explicit --deep path,
114
+ // never on auto-escalation. Same rerank core the LongMemEval benchmark measures.
115
+ const rerankFlag = flags.rerank === true || flags.rerank === 'true';
116
+ const rerank = rerankFlag && deepMode === 'deep';
117
+ if (rerankFlag && deepMode !== 'deep') {
118
+ process.stderr.write('[mem] Note: --rerank requires --deep (it reranks deep-search candidates); ignored\n');
119
+ }
120
+
112
121
  if (source && !['observations', 'sessions', 'prompts'].includes(source)) {
113
122
  fail(`[mem] Invalid --source "${source}". Use: observations, sessions, prompts`);
114
123
  return;
@@ -160,6 +169,7 @@ async function cmdSearch(db, args, { llm } = {}) {
160
169
  let orFallbackFired = false;
161
170
 
162
171
  let deepVariants = null;
172
+ let isReranked = false;
163
173
  let isDeep = deepMode === 'deep';
164
174
 
165
175
  // Search observations — shared engine with server.mjs (#8198/#8212 paired-path fix)
@@ -194,13 +204,19 @@ async function cmdSearch(db, args, { llm } = {}) {
194
204
  epochTo: dateTo,
195
205
  limit: perSourceLimit,
196
206
  currentProject: project ? null : inferProject(),
197
- }, llm ? { llm } : { auto });
207
+ }, llm ? { llm, rerank: rerank && !auto } : { auto, rerank: rerank && !auto });
198
208
  deepVariants = ds.variants;
209
+ isReranked = ds.reranked;
199
210
  if (deepVariants.length > 1) {
200
211
  process.stderr.write(`[mem] Deep search: rewrote into ${deepVariants.length} query variants, RRF-fused\n`);
201
212
  } else {
202
213
  process.stderr.write('[mem] Deep search: rewrite returned no usable variants; used original query only\n');
203
214
  }
215
+ if (rerank && !auto) {
216
+ process.stderr.write(ds.reranked
217
+ ? '[mem] Deep search: LLM-reranked the fused top-20\n'
218
+ : '[mem] Deep search: rerank produced no usable order; kept fused order\n');
219
+ }
204
220
  return ds.results;
205
221
  };
206
222
 
@@ -270,7 +286,9 @@ async function cmdSearch(db, args, { llm } = {}) {
270
286
  if (obsResults.length > 0) {
271
287
  // reRankWithContext/markSuperseded expect source='obs' — alias _source for compatibility
272
288
  for (const r of obsResults) r.source = 'obs';
273
- reRankWithContext(db, obsResults, project || inferProject());
289
+ // Explicit LLM rerank order is final — skip file-context re-rank when reranked
290
+ // (paired-path with mem_search; markSuperseded still runs for stale-tagging).
291
+ if (!isReranked) reRankWithContext(db, obsResults, project || inferProject());
274
292
  markSuperseded(obsResults);
275
293
  if (isCrossSource) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
276
294
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-mem-lite",
3
- "version": "3.3.1",
3
+ "version": "3.4.0",
4
4
  "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. A lighter, lower-cost alternative to claude-mem (episode batching + a smaller model; cost savings are an internal estimate, not a measured benchmark).",
5
5
  "type": "module",
6
6
  "packageManager": "npm@10.9.2",
@@ -31,6 +31,7 @@
31
31
  "server-internals.mjs",
32
32
  "search-engine.mjs",
33
33
  "deep-search.mjs",
34
+ "rerank.mjs",
34
35
  "hook.mjs",
35
36
  "hook-shared.mjs",
36
37
  "hook-llm.mjs",
package/rerank.mjs ADDED
@@ -0,0 +1,78 @@
1
+ // Shared LLM-rerank core: reorder a top-K candidate list by an LLM relevance read.
2
+ //
3
+ // Used by BOTH the production deep-search rerank stage (deep-search.mjs) and the
4
+ // LongMemEval rerank benchmark (benchmark/longmemeval-rerank.mjs), so the measured
5
+ // lift number reflects the EXACT algorithm that ships. "Never worse than the input
6
+ // candidate order" by construction: any LLM/parse failure returns the original order.
7
+ //
8
+ // The LLM is dependency-injected by every caller, so this module is unit-tested with
9
+ // deterministic stubs and never statically imports the native-heavy LLM client (the
10
+ // default provider is pulled in lazily on first real call).
11
+ import { parseJsonFromLLM } from './utils.mjs';
12
+
13
+ // Module-internal: only buildRerankPrompt (below) consumes these. Kept un-exported
14
+ // so the module's public surface is just the three functions callers actually import.
15
+ const RERANK_SYSTEM =
16
+ 'You rerank search results. Given a QUERY and numbered candidate session snippets, ' +
17
+ 'decide which sessions most likely contain the answer to the query. ' +
18
+ 'Return ONLY JSON {"ranked":[<candidate numbers, most relevant first, each number once>]}. No prose, no markdown.';
19
+
20
+ function buildRerankPrompt(query, snippets) {
21
+ const lines = snippets.map((s, i) => `${i + 1}. ${String(s).replace(/\s+/g, ' ').slice(0, 400)}`);
22
+ return {
23
+ system: RERANK_SYSTEM,
24
+ user: `QUERY: ${query}\n\nCANDIDATES:\n${lines.join('\n')}\n\nReturn {"ranked":[...]} over 1..${snippets.length}, best first.`,
25
+ };
26
+ }
27
+
28
+ // Extract a 1-based ranking array from whatever the LLM returned: a {ranked:[...]}
29
+ // object (stub / clean JSON), a bare array (clean OR prose-wrapped [..]), or a
30
+ // {text} envelope from callLLMWithModel. The bare-array path is what lifts the
31
+ // real parse-rate: claude-haiku often answers "[3,1,5]" instead of {"ranked":..},
32
+ // and parseJsonFromLLM's leading JSON.parse returns that as an array (no .ranked),
33
+ // which the old object-only check silently dropped. null → nothing recoverable.
34
+ export function extractRanked(raw) {
35
+ if (raw === null || raw === undefined) return null;
36
+ if (Array.isArray(raw)) return raw;
37
+ if (typeof raw === 'object' && Array.isArray(raw.ranked)) return raw.ranked;
38
+ const text = typeof raw === 'string' ? raw : typeof raw.text === 'string' ? raw.text : '';
39
+ if (!text) return null;
40
+ const obj = parseJsonFromLLM(text);
41
+ if (Array.isArray(obj)) return obj; // bare array [3,1,5]
42
+ if (obj && Array.isArray(obj.ranked)) return obj.ranked; // {"ranked":[...]}
43
+ const m = text.match(/\[\s*\d+(?:\s*,\s*\d+)*\s*\]/); // prose-wrapped [..]
44
+ if (m) {
45
+ try { const a = JSON.parse(m[0]); if (Array.isArray(a)) return a; } catch { /* fall through */ }
46
+ }
47
+ return null;
48
+ }
49
+
50
+ // Reorder candidate session ids per the LLM's chosen 1-based order; any failure →
51
+ // original order ("never worse than baseline"). { order: sid[], parsed: bool }.
52
+ export async function llmRerankOrder(query, cand /* [{sid,text}] */, llm) {
53
+ const prompt = buildRerankPrompt(query, cand.map((c) => c.text));
54
+ let raw;
55
+ try { raw = await llm(prompt); } catch { raw = null; }
56
+ const order = extractRanked(raw);
57
+ if (!order) return { order: cand.map((c) => c.sid), parsed: false };
58
+ const seen = new Set();
59
+ const out = [];
60
+ for (const n of order) {
61
+ const idx = Number(n) - 1;
62
+ if (Number.isInteger(idx) && idx >= 0 && idx < cand.length && !seen.has(idx)) {
63
+ seen.add(idx);
64
+ out.push(cand[idx].sid);
65
+ }
66
+ }
67
+ cand.forEach((c, i) => { if (!seen.has(i)) out.push(c.sid); }); // append omitted, original order
68
+ return { order: out, parsed: true };
69
+ }
70
+
71
+ // Default provider — lazy import so stub-injected callers never load the client.
72
+ // Uses callLLMWithModel (returns {text}) rather than callModelJSONAsync (which
73
+ // JSON-parses internally and nulls on any non-{...} output) so extractRanked can
74
+ // recover bare-array answers the strict JSON parse drops.
75
+ export async function defaultRerankLLM(prompt) {
76
+ const { callLLMWithModel } = await import('./haiku-client.mjs');
77
+ return callLLMWithModel(prompt, 'haiku', { timeout: 20000, maxTokens: 300 });
78
+ }
package/server.mjs CHANGED
@@ -320,11 +320,11 @@ function formatSearchOutput(paginatedResults, args, ftsQuery, totalCount, orFall
320
320
  // NOTE: resolveProject() inside runSearchPipeline closes over the module-level `db`,
321
321
  // not the injected one. Tests that pass a project: arg via this seam will trigger
322
322
  // resolveProject() against the real (module) DB, not the test DB.
323
- export async function handleSearchForTest(db, args, { llm } = {}) {
324
- return runSearchPipeline(db, args, { llm });
323
+ export async function handleSearchForTest(db, args, { llm, rerankLlm } = {}) {
324
+ return runSearchPipeline(db, args, { llm, rerankLlm });
325
325
  }
326
326
 
327
- async function runSearchPipeline(db, args, { llm } = {}) {
327
+ async function runSearchPipeline(db, args, { llm, rerankLlm } = {}) {
328
328
  if (args.project) args = { ...args, project: resolveProject(args.project) };
329
329
  const limit = args.limit ?? 20;
330
330
  const offset = args.offset ?? 0;
@@ -349,6 +349,9 @@ async function runSearchPipeline(db, args, { llm } = {}) {
349
349
  // Resolve tri-state deep mode. MCP defaults to 'auto' (escalate on weak results)
350
350
  // unless explicitly overridden via args.deep or CLAUDE_MEM_AUTO_DEEP env flag.
351
351
  const deepMode = resolveDeepMode(args.deep, { surface: 'mcp' });
352
+ // Opt-in LLM rerank (D#43): explicit-deep only — never on AUTO escalation — so
353
+ // no default search behaviour changes. Parity with CLI `search --deep --rerank`.
354
+ const rerank = args.rerank === true && deepMode === 'deep';
352
355
 
353
356
  // Early return when query was provided but sanitized to nothing (all FTS5
354
357
  // keywords/special chars). Skipped for deep/auto — deep's LLM rewrite may
@@ -365,13 +368,14 @@ async function runSearchPipeline(db, args, { llm } = {}) {
365
368
  const ctx = { db, ftsQuery, searchType: effectiveType, args, epochFrom, epochTo, perSourceLimit, perSourceOffset, currentProject, limit };
366
369
  const results = [];
367
370
  let deepVariants = null;
371
+ let deepReranked = false;
368
372
  let isDeep = deepMode === 'deep';
369
373
  let escalated = false;
370
374
  let escalatedObsCount = 0;
371
375
 
372
376
  // Helper: run deepSearch and load results into the shared `results` array.
373
377
  const runDeepInto = async ({ auto = false } = {}) => {
374
- const { results: deepRows, variants } = await deepSearch(db, {
378
+ const { results: deepRows, variants, reranked } = await deepSearch(db, {
375
379
  query: args.query,
376
380
  project: args.project || null,
377
381
  type: args.obs_type || null,
@@ -381,11 +385,12 @@ async function runSearchPipeline(db, args, { llm } = {}) {
381
385
  epochFrom, epochTo,
382
386
  limit: perSourceLimit,
383
387
  currentProject,
384
- }, llm ? { llm } : { auto });
388
+ }, llm ? { llm, rerank: rerank && !auto, rerankLlm } : { auto, rerank: rerank && !auto, rerankLlm });
385
389
  // Safe to reset: sessions/prompts are pushed AFTER the obs block, so nothing is lost here.
386
390
  results.length = 0;
387
391
  results.push(...deepRows);
388
392
  deepVariants = variants;
393
+ deepReranked = reranked;
389
394
  };
390
395
 
391
396
  if (!effectiveType || effectiveType === 'observations') {
@@ -460,9 +465,13 @@ async function runSearchPipeline(db, args, { llm } = {}) {
460
465
  // empty-ftsQuery deep path we tag-but-don't-reorder (keep RRF order).
461
466
  if ((ftsQuery || isDeep) && results.some(r => r.source === 'obs')) {
462
467
  const obsResults = results.filter(r => r.source === 'obs');
463
- if (ftsQuery) reRankWithContext(db, obsResults, currentProject);
468
+ // When the deep candidates were explicitly LLM-reranked, that order is final:
469
+ // skip the file-context re-rank + re-sort (they would perturb the rerank order
470
+ // via score multiplication / score-sort). markSuperseded is pure stale-tagging
471
+ // and still runs. (D#43 — parity with the CLI deep path, which keeps array order.)
472
+ if (ftsQuery && !deepReranked) reRankWithContext(db, obsResults, currentProject);
464
473
  markSuperseded(obsResults);
465
- if (ftsQuery) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
474
+ if (ftsQuery && !deepReranked) results.sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
466
475
  }
467
476
 
468
477
  // Tier post-filter: batch-lookup full rows and classify (shared with CLI).
@@ -512,9 +521,14 @@ async function runSearchPipeline(db, args, { llm } = {}) {
512
521
  ? `\n\n[deep search: rewrote into ${deepVariants.length} variants — ${deepVariants.slice(1).map(v => JSON.stringify(v)).join(', ')}]`
513
522
  : '\n\n[deep search: rewrite produced no usable variants; searched the original query only (== baseline)]';
514
523
  }
524
+ // Discoverability signal for the opt-in rerank (D#43): tell the calling agent the
525
+ // candidates were LLM-reranked — parity with the CLI stderr note.
526
+ if (deepReranked && output.content?.[0]?.type === 'text') {
527
+ output.content[0].text += '\n\n[deep search: LLM-reranked the top candidates by relevance]';
528
+ }
515
529
 
516
530
  // Return an object that exposes structured fields for tests + the MCP content blob.
517
- return { ...output, results: paginatedResults, total: totalBeforePagination, escalated, variants: deepVariants };
531
+ return { ...output, results: paginatedResults, total: totalBeforePagination, escalated, variants: deepVariants, reranked: deepReranked };
518
532
  }
519
533
 
520
534
  server.registerTool(
package/source-files.mjs CHANGED
@@ -6,7 +6,7 @@
6
6
 
7
7
  export const SOURCE_FILES = [
8
8
  // Entry points and top-level modules
9
- 'cli.mjs', 'cli-path.mjs', 'server.mjs', 'server-internals.mjs', 'search-engine.mjs', 'deep-search.mjs', 'tool-schemas.mjs',
9
+ 'cli.mjs', 'cli-path.mjs', 'server.mjs', 'server-internals.mjs', 'search-engine.mjs', 'deep-search.mjs', 'rerank.mjs', 'tool-schemas.mjs',
10
10
  'hook.mjs', 'hook-shared.mjs', 'hook-llm.mjs', 'hook-memory.mjs', 'skip-tools.mjs',
11
11
  'hook-semaphore.mjs', 'hook-episode.mjs', 'hook-context.mjs', 'hook-handoff.mjs',
12
12
  'hook-update.mjs', 'hook-optimize.mjs', 'hook-precompact.mjs',
package/tool-schemas.mjs CHANGED
@@ -94,6 +94,7 @@ export const memSearchSchema = {
94
94
  include_noise: z.boolean().optional().describe('Include hook-llm fallback titles ("Modified X", "Worked on X", raw error logs) — hidden by default as they have ~3% access rate'),
95
95
  or: coerceBool.optional().describe('Force OR semantics between query terms from the start (default: AND with automatic OR-fallback when AND returns 0). Aligns with CLI --or.'),
96
96
  deep: coerceBool.optional().describe('Tri-state LLM multi-query/HyDE deep search (observations-only). true=force; false=never; omit=AUTO (default ON for mem_search): a normal search that returns weak/few results auto-escalates with ONE Haiku call (query rewritten to keyword/concept/HyDE variants, RRF-fused). Set CLAUDE_MEM_AUTO_DEEP=0 to disable AUTO. Passive recall stays single-query.'),
97
+ rerank: coerceBool.optional().describe('Opt-in: LLM-rerank the deep-search candidates for ranking precision (one extra Haiku call, ~1.4s). Requires deep=true (no effect on AUTO/normal). Reserve for hard, ranking-sensitive queries where the right memory is likely retrieved but mis-ranked — skip for routine search. Default off.'),
97
98
  };
98
99
 
99
100
  export const memRecentSchema = {