npm - sweet-search - Versions diffs - 2.5.2 → 2.5.4 - Mend

sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

package/core/cli.js +24 -3
package/core/graph/graph-expansion.js +215 -36
package/core/graph/graph-extractor.js +196 -11
package/core/graph/graph-search.js +395 -92
package/core/graph/hcgs-generator.js +2 -1
package/core/graph/index.js +2 -0
package/core/graph/repo-map.js +28 -6
package/core/graph/structural-answer-cues.js +168 -0
package/core/graph/structural-callsite-hints.js +40 -0
package/core/graph/structural-context-format.js +40 -0
package/core/graph/structural-context.js +450 -0
package/core/graph/structural-forward-push.js +156 -0
package/core/graph/structural-header-context.js +19 -0
package/core/graph/structural-importance.js +148 -0
package/core/graph/structural-pagerank.js +197 -0
package/core/graph/summary-manager.js +13 -9
package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
package/core/incremental-indexing/application/file-watcher.mjs +197 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
package/core/incremental-indexing/application/operator-cli.mjs +554 -0
package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
package/core/incremental-indexing/application/reconciler.mjs +477 -0
package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
package/core/indexing/admission-policy.js +139 -0
package/core/indexing/artifact-builder.js +29 -12
package/core/indexing/ast-chunker.js +107 -30
package/core/indexing/dedup/exemplar-selector.js +19 -1
package/core/indexing/gitignore-filter.js +223 -0
package/core/indexing/incremental-tracker.js +99 -30
package/core/indexing/index-codebase-v21.js +6 -5
package/core/indexing/index-maintainer.mjs +698 -6
package/core/indexing/indexer-ann.js +99 -15
package/core/indexing/indexer-build.js +158 -45
package/core/indexing/indexer-empty-baseline.js +80 -0
package/core/indexing/indexer-manifest.js +66 -0
package/core/indexing/indexer-phases.js +56 -23
package/core/indexing/indexer-sparse-gram.js +54 -13
package/core/indexing/indexer-utils.js +26 -208
package/core/indexing/indexing-file-policy.js +32 -7
package/core/indexing/maintainer-launcher.mjs +137 -0
package/core/indexing/merkle-tracker.js +251 -244
package/core/indexing/model-pool.js +46 -5
package/core/infrastructure/code-graph-repository.js +758 -6
package/core/infrastructure/code-graph-visibility.js +157 -0
package/core/infrastructure/codebase-repository.js +100 -13
package/core/infrastructure/config/search.js +1 -1
package/core/infrastructure/db-utils.js +118 -0
package/core/infrastructure/dedup-hashing.js +10 -13
package/core/infrastructure/hardware-capability.js +17 -7
package/core/infrastructure/index.js +8 -2
package/core/infrastructure/language-patterns/maps.js +4 -1
package/core/infrastructure/language-patterns/registry-core.js +56 -17
package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
package/core/infrastructure/language-patterns.js +69 -0
package/core/infrastructure/model-registry.js +20 -0
package/core/infrastructure/native-inference.js +7 -12
package/core/infrastructure/native-resolver.js +52 -37
package/core/infrastructure/native-sparse-gram.js +261 -20
package/core/infrastructure/native-tokenizer.js +6 -15
package/core/infrastructure/simd-distance.js +10 -16
package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
package/core/infrastructure/structural-alias-resolver.js +122 -0
package/core/infrastructure/structural-candidate-ranker.js +34 -0
package/core/infrastructure/structural-context-repository.js +472 -0
package/core/infrastructure/structural-context-utils.js +51 -0
package/core/infrastructure/structural-graph-signals.js +121 -0
package/core/infrastructure/structural-qualified-resolution.js +15 -0
package/core/infrastructure/structural-source-definitions.js +100 -0
package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
package/core/infrastructure/tree-sitter-provider.js +811 -37
package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
package/core/query/query-router.js +55 -5
package/core/ranking/file-kind-ranking.js +2192 -15
package/core/ranking/late-interaction-index.js +87 -12
package/core/search/cli-decoration.js +290 -0
package/core/search/context-expander.js +988 -78
package/core/search/index.js +1 -0
package/core/search/output-policy.js +275 -0
package/core/search/search-anchor.js +499 -0
package/core/search/search-boost.js +93 -1
package/core/search/search-cli.js +61 -204
package/core/search/search-hybrid.js +250 -10
package/core/search/search-pattern-chunks.js +57 -8
package/core/search/search-pattern-planner.js +68 -9
package/core/search/search-pattern-prefilter.js +30 -10
package/core/search/search-pattern-ripgrep.js +40 -4
package/core/search/search-pattern-sparse-overlay.js +256 -0
package/core/search/search-pattern.js +117 -29
package/core/search/search-postprocess.js +479 -5
package/core/search/search-read-semantic.js +260 -23
package/core/search/search-read.js +82 -64
package/core/search/search-reader-pin.js +71 -0
package/core/search/search-rrf.js +279 -0
package/core/search/search-semantic.js +110 -5
package/core/search/search-server.js +130 -57
package/core/search/search-trace.js +107 -0
package/core/search/server-identity.js +93 -0
package/core/search/session-daemon-prewarm.mjs +33 -10
package/core/search/sweet-search.js +399 -7
package/core/skills/sweet-index/SKILL.md +8 -6
package/core/vector-store/binary-hnsw-index.js +194 -30
package/core/vector-store/float-vector-store.js +96 -6
package/core/vector-store/hnsw-index.js +220 -49
package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
package/eval/agent-read-workflows/bin/ss-find +15 -0
package/eval/agent-read-workflows/bin/ss-grep +12 -0
package/eval/agent-read-workflows/bin/ss-read +14 -0
package/eval/agent-read-workflows/bin/ss-search +18 -0
package/eval/agent-read-workflows/bin/ss-semantic +12 -0
package/eval/agent-read-workflows/bin/ss-trace +11 -0
package/mcp/read-tool.js +109 -0
package/mcp/server.js +55 -15
package/mcp/tool-handlers.js +14 -124
package/mcp/trace-tool.js +81 -0
package/package.json +25 -10
package/scripts/hooks/intercept-read.mjs +55 -0
package/scripts/hooks/remind-tools.mjs +40 -0
package/scripts/init.js +698 -54
package/scripts/inject-agent-instructions.js +431 -0
package/scripts/install-prompt-reminders.js +188 -0
package/scripts/install-tool-enforcement.js +220 -0
package/scripts/smoke-test.js +12 -9
package/scripts/uninstall.js +276 -18
package/scripts/write-claude-rules.js +110 -0

package/core/search/search-read.js CHANGED Viewed

@@ -1,32 +1,15 @@
 /**
- * sweet-search read — filesystem-grounded file reader.
- *
- * Returns exact bytes from disk. The vectors index may attach symbol/chunk
- * metadata for indexed files, but the returned `text` always comes from
- * `node:fs`, never from the (truncated) DB column.
- *
- * Design notes:
- *   - Filesystem is ground truth. Never return DB-stored text as content.
- *   - Batch up to 20 files; per-file errors do not fail the batch.
- *   - Warm-process cache keyed by `path|size|mtimeMs` avoids re-reading hot
- *     files; line-offset table lets line-range reads avoid materialising the
- *     whole content for large files.
- *
- * DDD: this module lives in the search/ application layer (allowed to import
- * infrastructure for filesystem grounding and chunk metadata).
+ * sweet-search read — filesystem-grounded file reader. Returns exact bytes from
+ * disk; the vectors index may attach symbol/chunk metadata, but the returned
+ * `text` always comes from node:fs, never from the (truncated) DB column.
  */
-import { promises as fs, statSync } from 'node:fs';
+import { promises as fs, realpathSync, statSync } from 'node:fs';
 import path from 'node:path';
 import { CodebaseRepository } from '../infrastructure/codebase-repository.js';
-import { DB_PATHS } from '../infrastructure/config/index.js';
-// ---------------------------------------------------------------------------
-// Cache — keyed by absolutePath|size|mtimeMs (any change invalidates).
-// Bounded LRU. Entries hold either the full text + line-offset table, or just
-// the line-offset table for very large files where we deliberately avoid
-// caching the whole content.
-// ---------------------------------------------------------------------------
+import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
+import { withPinnedRead } from './search-reader-pin.js';
+import { emitToolIdentityAuto } from './cli-decoration.js';
 const CACHE_MAX_ENTRIES = 64;
 const CACHE_LARGE_FILE_BYTES = 4 * 1024 * 1024; // 4MB — switch to range-read mode
@@ -45,22 +28,22 @@ function _cacheTouch(key, value) {
   }
 }
-// ---------------------------------------------------------------------------
-// Repository singleton — lazy and tolerant of a missing/empty DB.
-// ---------------------------------------------------------------------------
-let _repo = null;
-function _getRepo() {
-  if (_repo === null) {
-    try { _repo = new CodebaseRepository(DB_PATHS.codebase); }
-    catch { _repo = false; }
+const _repos = new Map();
+function _getRepo(projectRoot) {
+  const dbPath = _codebasePathForProject(projectRoot);
+  if (!_repos.has(dbPath)) {
+    try { _repos.set(dbPath, new CodebaseRepository(dbPath)); }
+    catch { _repos.set(dbPath, false); }
   }
-  return _repo || null;
+  return _repos.get(dbPath) || null;
 }
-// ---------------------------------------------------------------------------
-// Path resolution helpers
-// ---------------------------------------------------------------------------
+/**
+ * Resolve the codebase DB path for a project root. The default PROJECT_ROOT
+ * gets the configured DB_PATHS.codebase unchanged; any other root gets
+ * `<root>/<state dir name>/codebase.db`, reusing the parent directory name of
+ * the configured path.
+ */
+function _codebasePathForProject(projectRoot) {
+  const requestedRoot = path.resolve(projectRoot || process.cwd());
+  const defaultRoot = path.resolve(PROJECT_ROOT || process.cwd());
+  if (requestedRoot === defaultRoot) {
+    return DB_PATHS.codebase;
+  }
+  const configuredDb = DB_PATHS.codebase || '.sweet-search/codebase.db';
+  const stateDirName = path.basename(path.dirname(configuredDb));
+  return path.join(requestedRoot, stateDirName, 'codebase.db');
+}
 function _resolvePath(p, projectRoot) {
   if (!p) throw new Error('path is required');
@@ -70,10 +53,24 @@ function _resolvePath(p, projectRoot) {
 function _projectRelative(absPath, projectRoot) {
   const root = projectRoot || process.cwd();
-  const rel = path.relative(root, absPath);
-  // Inside the project root → use relative form (matches vectors.file_path).
-  // Outside → keep the absolute path (no chunks will match anyway).
-  return rel.startsWith('..') || path.isAbsolute(rel) ? absPath : rel;
+  const normalized = _normalizeRelativePath(path.relative(root, absPath));
+  if (normalized) return normalized;
+  try {
+    return _normalizeRelativePath(
+      path.relative(realpathSync.native(root), realpathSync.native(absPath)),
+    ) || absPath;
+  } catch {
+    return absPath;
+  }
+}
+/**
+ * Normalize a `path.relative()` result into forward-slash, project-relative
+ * form.
+ *
+ * @param {string} rel - output of path.relative(root, absPath)
+ * @returns {string|null} the normalized relative path, or null when `rel` is
+ *   empty, absolute, or escapes the root (`..` or `../…`).
+ */
+function _normalizeRelativePath(rel) {
+  const normalized = rel.replace(/\\/g, '/').replace(/^\.\//, '');
+  if (!normalized || path.isAbsolute(normalized)) return null;
+  // A bare `..` (target is the root's parent) escapes the root just like
+  // `../x`. Compare whole segments so names such as `..config` stay valid.
+  if (normalized === '..' || normalized.startsWith('../')) return null;
+  return normalized;
+}
 // ---------------------------------------------------------------------------
@@ -123,7 +120,6 @@ async function _readFromDisk(absPath) {
   const isLarge = stat.size > CACHE_LARGE_FILE_BYTES;
   const entry = {
     text: isLarge ? null : buf.toString('utf8'),
-    bufferRef: isLarge ? null : null, // not held — text is the canonical form
     lineOffsets,
     size: stat.size,
     mtimeMs: stat.mtimeMs,
@@ -217,8 +213,8 @@ function _metaEndLine(meta) {
       : null;
 }
-function _attachIndexMetadata(filePathRel) {
-  const repo = _getRepo();
+function _attachIndexMetadata(filePathRel, projectRoot) {
+  const repo = _getRepo(projectRoot);
   if (!repo) return { indexed: false, chunks: [], language: null };
   const rows = repo.getChunksByFilePath(filePathRel);
@@ -258,7 +254,7 @@ function _attachIndexMetadata(filePathRel) {
  * @param {boolean} [req.includeMetadata=true] - attach index chunks/language
  * @returns {Promise<Object>}
  */
-export async function readFile(req) {
+async function _readFileUnpinned(req) {
   const t0 = performance.now();
   const projectRoot = req.projectRoot || process.cwd();
   const absPath = _resolvePath(req.path, projectRoot);
@@ -291,7 +287,7 @@ export async function readFile(req) {
   let chunks = [];
   let indexed = false;
   if (req.includeMetadata !== false) {
-    const meta = _attachIndexMetadata(relForIndex);
+    const meta = _attachIndexMetadata(relForIndex, projectRoot);
     indexed = meta.indexed;
     chunks = meta.chunks;
     language = meta.language;
@@ -323,6 +319,14 @@ export async function readFile(req) {
   };
 }
+/**
+ * Read one file under a reader pin for the project (via withPinnedRead).
+ * `projectRoot` defaults to the current working directory and is forwarded to
+ * the unpinned reader together with the rest of `req`.
+ */
+export async function readFile(req) {
+  const projectRoot = req?.projectRoot || process.cwd();
+  const pinOptions = {
+    projectRoot,
+    meta: { tool: 'read', path: req?.path ?? null, count: 1 },
+  };
+  const readUnpinned = () => _readFileUnpinned({ ...req, projectRoot });
+  return withPinnedRead(pinOptions, readUnpinned);
+}
 /**
  * Batch read — up to 20 files in parallel. Per-file failures are returned
  * inline; the batch never throws unless `files` is malformed.
@@ -340,15 +344,18 @@ export async function readFiles(files, opts = {}) {
   if (files.length > 20) {
     throw new Error(`read accepts at most 20 files; got ${files.length}`);
   }
-  const t0 = performance.now();
-  const results = await Promise.all(files.map(f => readFile({
-    path: f.path,
-    startLine: f.startLine,
-    endLine: f.endLine,
-    projectRoot: opts.projectRoot,
-    includeMetadata: opts.includeMetadata !== false,
-  })));
-  return { files: results, totalMs: +(performance.now() - t0).toFixed(2) };
+  const projectRoot = opts.projectRoot || process.cwd();
+  return withPinnedRead({ projectRoot, meta: { tool: 'read', count: files.length } }, async () => {
+    const t0 = performance.now();
+    const results = await Promise.all(files.map(f => _readFileUnpinned({
+      path: f.path,
+      startLine: f.startLine,
+      endLine: f.endLine,
+      projectRoot,
+      includeMetadata: opts.includeMetadata !== false,
+    })));
+    return { files: results, totalMs: +(performance.now() - t0).toFixed(2) };
+  });
 }
 // ---------------------------------------------------------------------------
@@ -385,12 +392,6 @@ export function formatReadResults(results, format = 'agent') {
 // ---------------------------------------------------------------------------
 // CLI handler
-// Usage:
-//   sweet-search read path/to/file.ts
-//   sweet-search read path/to/file.ts --lines 45-92
-//   sweet-search read a.ts b.ts c.ts
-//   sweet-search read path/to/file.ts --json
-//   sweet-search read path/to/file.ts --raw
 // ---------------------------------------------------------------------------
 function _parseLineRange(spec) {
@@ -409,13 +410,21 @@ function _parseArgs(args) {
   let startLine = null;
   let endLine = null;
   let includeMetadata = true;
+  let plain = false;
+  let noBanner = false;
   for (let i = 0; i < args.length; i++) {
     const a = args[i];
     if (a === '--json') format = 'json';
     else if (a === '--raw') format = 'raw';
     else if (a === '--agent') format = 'agent';
     else if (a === '--no-metadata') includeMetadata = false;
-    else if (a === '--lines') {
+    else if (a === '--no-banner') noBanner = true;
+    else if (a === '--format' || a.startsWith('--format=')) {
+      const v = a === '--format' ? args[++i] : a.slice('--format='.length);
+      if (v === 'json' || v === 'raw' || v === 'agent') format = v;
+      else if (v === 'plain') plain = true;
+      else throw new Error(`unknown --format value: ${v}`);
+    } else if (a === '--lines') {
       const [s, e] = _parseLineRange(args[++i]);
       startLine = s; endLine = e;
     } else if (a === '--help' || a === '-h') {
@@ -427,7 +436,7 @@ function _parseArgs(args) {
       positional.push(a);
     }
   }
-  return { positional, format, startLine, endLine, includeMetadata };
+  return { positional, format, startLine, endLine, includeMetadata, plain, noBanner };
 }
 function _printHelp() {
@@ -443,6 +452,8 @@ function _printHelp() {
     '  --json            Emit JSON (machine-readable)',
     '  --raw             Emit raw text only (no fences/headers)',
     '  --agent           Default — markdown fenced block + symbol hints',
+    '  --format <fmt>    json | raw | agent | plain (plain = no identity line)',
+    '  --no-banner       Suppress the identity line',
     '  --no-metadata     Skip index metadata attachment',
     '',
   ].join('\n'));
@@ -467,6 +478,10 @@ export async function handleReadCli(args) {
     endLine: wantsRange ? parsed.endLine : undefined,
   }));
   const out = await readFiles(files, { includeMetadata: parsed.includeMetadata });
+  if (parsed.format !== 'json') {
+    const detail = files.length === 1 ? files[0].path : `${files.length} files`;
+    emitToolIdentityAuto('read', detail, { plain: parsed.plain, noBanner: parsed.noBanner });
+  }
   process.stdout.write(formatReadResults(out, parsed.format));
   if (parsed.format !== 'json') process.stdout.write('\n');
   // Non-zero exit if every file failed (so shell pipelines see the error).
@@ -477,5 +492,8 @@ export async function handleReadCli(args) {
 // Test-only export — clears caches between unit tests.
 export function __resetReadCachesForTests() {
   _cache.clear();
-  _repo = null;
+  for (const repo of _repos.values()) repo?.close?.();
+  _repos.clear();
 }
+export const __testing = { projectRelative: _projectRelative, codebasePathForProject: _codebasePathForProject };

package/core/search/search-reader-pin.js ADDED Viewed

@@ -0,0 +1,71 @@
+import path from 'node:path';
+import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
+import { readManifest } from '../incremental-indexing/infrastructure/manifest.mjs';
+import { beginRead, endRead } from '../incremental-indexing/infrastructure/reader-heartbeat.mjs';
+// Name of the state directory, taken from the parent directory of the
+// configured codebase DB path. Falls back to '.sweet-search' when that path
+// is unset or has no named parent directory.
+function dataDirName() {
+  const configuredDb = DB_PATHS.codebase || '';
+  const parentName = path.basename(path.dirname(configuredDb));
+  if (!parentName || parentName === '.') {
+    return '.sweet-search';
+  }
+  return parentName;
+}
+/**
+ * Directory holding search state for a project root.
+ *
+ * The default PROJECT_ROOT gets the directory of the configured
+ * DB_PATHS.codebase. Any other root gets `<root>/<dataDirName()>`.
+ *
+ * @param {string} [projectRoot=process.cwd()]
+ * @returns {string}
+ */
+export function searchStateDir(projectRoot = process.cwd()) {
+  const root = path.resolve(projectRoot || process.cwd());
+  const isDefaultRoot = root === path.resolve(PROJECT_ROOT || process.cwd());
+  // dataDirName() tolerates an unset DB_PATHS.codebase; do the same here
+  // rather than letting path.dirname(undefined) throw a TypeError.
+  if (isDefaultRoot && DB_PATHS.codebase) {
+    return path.dirname(DB_PATHS.codebase);
+  }
+  return path.join(root, dataDirName());
+}
+// Negative cache for stateDirs known to have no reconcile-manifest.json.
+// 1s TTL bounds staleness if reconcile starts publishing after first probe.
+// Cleared per-stateDir whenever a manifest is observed.
+const _manifestAbsentAt = new Map();
+const MANIFEST_ABSENT_TTL_MS = 1000;
+// Drops every negative-cache entry, so the next beginPinnedRead() without an
+// explicit epoch goes back to readManifest() for any stateDir.
+export function _resetManifestAbsentCache() {
+  _manifestAbsentAt.clear();
+}
+/**
+ * Start a pinned read for a state directory.
+ *
+ * Returns null (no pin) when `epoch` is explicitly null, when no state
+ * directory can be resolved, or when no integer epoch is available. Otherwise
+ * returns `{ stateDir, epoch, manifest, record }`, where `record` is the value
+ * returned by beginRead() and `manifest` is null if the caller supplied the
+ * epoch.
+ */
+export function beginPinnedRead({ projectRoot, stateDir, epoch, meta } = {}) {
+  // An explicit null means the caller already checked and found no pinned
+  // epoch, so there is nothing to register with the heartbeat.
+  if (epoch === null) return null;
+  let resolvedStateDir = stateDir;
+  if (!resolvedStateDir) {
+    resolvedStateDir = projectRoot ? searchStateDir(projectRoot) : null;
+  }
+  if (!resolvedStateDir) return null;
+  const callerSuppliedEpoch = Number.isInteger(epoch);
+  let manifest = null;
+  let pinnedEpoch = epoch;
+  if (!callerSuppliedEpoch) {
+    // Negative cache: skip the manifest read if it was absent very recently.
+    const absentAt = _manifestAbsentAt.get(resolvedStateDir);
+    const recentlyAbsent =
+      absentAt !== undefined && Date.now() - absentAt < MANIFEST_ABSENT_TTL_MS;
+    if (recentlyAbsent) return null;
+    manifest = readManifest(resolvedStateDir);
+    pinnedEpoch = manifest?.epoch;
+  }
+  if (!Number.isInteger(pinnedEpoch)) {
+    _manifestAbsentAt.set(resolvedStateDir, Date.now());
+    return null;
+  }
+  _manifestAbsentAt.delete(resolvedStateDir);
+  return {
+    stateDir: resolvedStateDir,
+    epoch: pinnedEpoch,
+    manifest,
+    record: beginRead(resolvedStateDir, pinnedEpoch, meta || {}),
+  };
+}
+// Release a pin obtained from beginPinnedRead(); a missing pin is a no-op.
+export function endPinnedRead(pin) {
+  if (pin) {
+    endRead(pin.stateDir, pin.record);
+  }
+}
+/**
+ * Run `fn` under a pinned read and always release the pin afterwards, whether
+ * `fn` resolves or throws. `fn` receives the pinned epoch (null when no pin
+ * was taken) and the pin object itself; its awaited result is returned.
+ */
+export async function withPinnedRead(options, fn) {
+  const pin = beginPinnedRead(options);
+  try {
+    return await fn(pin?.epoch ?? null, pin);
+  } finally {
+    endPinnedRead(pin);
+  }
+}

package/core/search/search-rrf.js ADDED Viewed

@@ -0,0 +1,279 @@
+/**
+ * Multi-query parallel BM25F + Reciprocal Rank Fusion (RRF) tail fallback.
+ *
+ * Applies when the normal hybrid pipeline (lexical + semantic + CC fusion
+ * + IAR + post-fusion boosts + demotions + MMR + existing rewrite-retry)
+ * still leaves results weak — empty, low-confidence top-1, or no source
+ * file in top-3.
+ *
+ * Why this design:
+ *   - Long natural-language queries get tokenized by FTS5's sanitizer
+ *     into AND-of-many-tokens (`"how" "does" "Fastify" "compile" ...`),
+ *     and no chunk has all those tokens. Result: zero hits.
+ *   - SOTA in 2025-2026 (Cognition SWE-grep, Polarity Omnigrep, Cody
+ *     Deep Search, T2-RAGBench) is multi-query parallel retrieval with
+ *     RRF fusion — fire one BM25 per content keyword, fuse by rank.
+ *   - RRF (Cormack 2009) is corpus-agnostic and avoids the per-keyword
+ *     score-normalization trap. A chunk that ranks high in MULTIPLE
+ *     per-keyword queries floats up; a chunk that only matches one
+ *     noisy keyword (e.g. "time" → setTimeout) stays mid-pack because
+ *     it has a single 1/(k+rank) contribution.
+ *
+ * Why NOT a hand-curated stopword list:
+ *   The earlier draft (Proposal C v1) added "time", "data", "value" etc.
+ *   to a stopword list because Q4 ("registration time") matched
+ *   `setTimeout`. That's the Clever Hans / corpus-overfit anti-pattern
+ *   per the Mitra & Craswell neural-IR survey and Vespa's WAND article.
+ *   RRF handles this structurally: "time" matches noisy chunks at
+ *   rank 1, but those chunks DON'T also match "compile" + "schemas",
+ *   so their RRF score stays low compared to a chunk that hits all
+ *   three.
+ *
+ * Disable via `ablations: new Set(['no-rrf-fallback'])`.
+ *
+ * References:
+ *   - Cormack et al. "Reciprocal Rank Fusion outperforms Condorcet and
+ *     individual rank learning methods", SIGIR 2009.
+ *   - Cognition SWE-grep blog (Oct 2025).
+ *   - T2-RAGBench multi-query+RRF, EACL 2026.
+ */
+import { detectFileKind } from '../ranking/file-kind-ranking.js';
+// Question-scaffolding stopwords ONLY. Generic English nouns (time, data,
+// state, mode, value, etc.) intentionally NOT in this list — they may be
+// the actual concept the user is asking about, and RRF naturally demotes
+// them when they're noise (one rank-1 hit can't beat two rank-mid hits).
+const QUERY_SCAFFOLD_STOPWORDS = new Set([
+  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'could',
+  'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have',
+  'how', 'in', 'into', 'is', 'it', 'its', 'of', 'on', 'or', 'should',
+  'so', 'than', 'that', 'the', 'their', 'them', 'this', 'those',
+  'to', 'too', 'use', 'using', 'was', 'were', 'what', 'when', 'where',
+  'whether', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
+  'would', 'you', 'your',
+]);
+// Standard RRF k constant from Cormack 2009. Higher k flattens the
+// rank-position curve (less weight for top hits); lower k sharpens it.
+// 60 is the published default and what most production systems use.
+const RRF_K = 60;
+const DEFAULT_PER_KEYWORD_LIMIT = 30;
+// NOTE(review): not referenced by any code visible in this diff — the
+// standalone low-confidence trigger was removed. Confirm whether this can go.
+const DEFAULT_CONFIDENCE_FLOOR = 0.35;
+// RRF scores are tiny (~0.01-0.05). Map to a [base, base+range] band so
+// fallback candidates compete mid-pack without overwhelming a strong
+// fused top-1 from the encoder (typically 0.4-0.86).
+const FALLBACK_BASE = 0.40;
+const FALLBACK_RANGE = 0.20;
+/**
+ * Pull candidate content keywords out of a free-text query.
+ *
+ * Tokens are ASCII identifier-like runs. Tokens shorter than three characters
+ * and scaffold stopwords are dropped; duplicates are removed
+ * case-insensitively, keeping the first spelling and the original order.
+ *
+ * @param {string} query
+ * @returns {string[]}
+ */
+export function extractContentKeywords(query) {
+  if (!query) return [];
+  const candidates = String(query).match(/[A-Za-z_][A-Za-z0-9_]+/g) ?? [];
+  const seenLower = new Set();
+  return candidates.filter((token) => {
+    if (token.length < 3) return false;
+    const key = token.toLowerCase();
+    if (QUERY_SCAFFOLD_STOPWORDS.has(key) || seenLower.has(key)) return false;
+    seenLower.add(key);
+    return true;
+  });
+}
+/**
+ * Decide whether the keyword fallback should run. Tightened (2026-05-05)
+ * after a 20-query probe found the original triggers fired too eagerly.
+ *
+ * The earlier two-clause trigger (`low_confidence` ∨ `no_source_in_top3`)
+ * caused regressions on queries where the encoder DID produce a real
+ * named source symbol just below the score floor (e.g. `getServerInstance`
+ * at score 0.32 lost to a 1-line `[typeAlias: HttpKeys]` injected by RRF).
+ *
+ * New rule: RRF fires only when top-3 has NO "good source candidate" —
+ * defined as a chunk whose file kind is 'implementation' AND that carries a
+ * non-blank name. That captures the genuine "retrieval is lost" case (only
+ * docs / tests / unlabelled chunks) without sacrificing borderline-confidence
+ * wins.
+ *
+ *   - results empty or not an array          → fire, reason 'empty'
+ *   - no good source candidate in the top 3  → fire, reason
+ *                                              'no_good_source_in_top3'
+ *   - otherwise                              → don't fire (null)
+ *
+ * The previous standalone `low_confidence` trigger (top-1 score < floor)
+ * was removed — encoder scores below 0.35 are common on long NL queries
+ * even when the answer IS the encoder's top-1.
+ *
+ * @param {Array} results - ranked candidates; only the first three are inspected
+ * @param {object} [opts] - accepted for call-site symmetry; not read here
+ * @returns {'empty'|'no_good_source_in_top3'|null} reason to fire, or null
+ */
+export function shouldRunFallback(results, opts = {}) {
+  if (!Array.isArray(results) || results.length === 0) return 'empty';
+  const window = results.slice(0, Math.min(3, results.length));
+  const hasGoodSource = window.some(r => {
+    const file = r.metadata?.file || r.file || r.file_path || '';
+    if (detectFileKind(file) !== 'implementation') return false;
+    const name = r.metadata?.name || r.name;
+    return name && String(name).trim().length > 0;
+  });
+  if (!hasGoodSource) return 'no_good_source_in_top3';
+  return null;
+}
+// Identity key for a result chunk: `file|startLine|endLine`, preferring the
+// values under `metadata` and falling back to the top-level fields.
+function chunkKey(r) {
+  const meta = r.metadata || {};
+  const filePath = meta.file || r.file || r.file_path;
+  const startLine = meta.startLine ?? r.startLine;
+  const endLine = meta.endLine ?? r.endLine;
+  return `${filePath}|${startLine}|${endLine}`;
+}
+/**
+ * Compute Reciprocal Rank Fusion across per-keyword BM25 result lists.
+ *
+ * For each chunk, RRF score = sum over all keywords k of 1 / (RRF_K + rank_k)
+ * where rank_k is the chunk's 1-indexed position in keyword k's results.
+ * Chunks not present in a keyword's results contribute 0 for that keyword.
+ *
+ * This naturally rewards chunks that appear in MULTIPLE per-keyword queries
+ * over chunks that only appear at rank 1 of a single noisy keyword.
+ *
+ * @param {Array<Array>} perKeywordResults - one array of BM25 hits per keyword;
+ *   a missing (null/undefined) list is treated as empty
+ * @returns {Map<string, { result: object, rrf: number, keywordsHit: Set<number> }>}
+ *   keyed by chunkKey(); `result` is the first hit seen for that key and
+ *   `keywordsHit` holds the indices (into perKeywordResults) of the lists that
+ *   contained the chunk
+ */
+export function fuseRRF(perKeywordResults) {
+  const acc = new Map();
+  for (let kIdx = 0; kIdx < perKeywordResults.length; kIdx++) {
+    const hits = perKeywordResults[kIdx] || [];
+    for (let pos = 0; pos < hits.length; pos++) {
+      const item = hits[pos];
+      const key = chunkKey(item);
+      let entry = acc.get(key);
+      if (!entry) {
+        entry = { result: item, rrf: 0, keywordsHit: new Set() };
+        acc.set(key, entry);
+      }
+      // Rank is 1-indexed, hence pos + 1.
+      entry.rrf += 1 / (RRF_K + pos + 1);
+      entry.keywordsHit.add(kIdx);
+    }
+  }
+  return acc;
+}
+/**
+ * Run multi-query BM25F + RRF fallback against the existing fused list.
+ *
+ * Fires when shouldRunFallback returns a reason. Extracts content
+ * keywords, fires one BM25F query per keyword via the existing
+ * `graphSearch.bm25SearchRaw` (which already uses the 4-column FTS5
+ * with weighted BM25), fuses with RRF, normalizes RRF scores to a
+ * mid-pack band, and merges into the existing candidate set.
+ *
+ * Returns `fused` itself (same array, unsorted) when the fallback is ablated,
+ * not triggered, has fewer than two keywords, has no usable
+ * `bm25SearchRaw`, or produces nothing to inject or boost.
+ *
+ * Side effect: an entry of `fused` that also appears in the RRF top list and
+ * scores below its fallback score is mutated in place (`score`,
+ * `_rrfBoosted`, `_rrfHits`).
+ *
+ * @param {Array} fused - current candidate list
+ * @param {string} query
+ * @param {object} opts
+ * @param {Set<string>|string[]} [opts.ablations] - 'no-rrf-fallback' disables the fallback
+ * @param {object} [opts.searcher] - must expose `graphSearch.bm25SearchRaw(keyword, limit)`
+ * @param {number} [opts.perKeywordLimit] - per-keyword hit limit, clamped to [10, 50]
+ * @param {number} [opts.injectCap] - max fused chunks considered, clamped to [5, 30]
+ * @returns {Promise<{ results: Array, stats: object }>} `stats` carries
+ *   `reason`, `keywords`, `injected`, `boosted` and `fusedCount`
+ */
+export async function runRRFFallback(fused, query, opts = {}) {
+  const ablations = opts.ablations;
+  if (ablations && (ablations instanceof Set ? ablations.has('no-rrf-fallback') : Array.isArray(ablations) && ablations.includes('no-rrf-fallback'))) {
+    return { results: fused, stats: { reason: null, keywords: [], injected: 0, boosted: 0, fusedCount: 0 } };
+  }
+  const reason = shouldRunFallback(fused, opts);
+  if (!reason) {
+    return { results: fused, stats: { reason: null, keywords: [], injected: 0, boosted: 0, fusedCount: 0 } };
+  }
+  const keywords = extractContentKeywords(query);
+  if (keywords.length < 2) {
+    return { results: fused, stats: { reason, keywords, injected: 0, boosted: 0, fusedCount: 0 } };
+  }
+  const searcher = opts.searcher;
+  const graphSearch = searcher?.graphSearch;
+  if (!graphSearch || typeof graphSearch.bm25SearchRaw !== 'function') {
+    return { results: fused, stats: { reason, keywords, injected: 0, boosted: 0, fusedCount: 0 } };
+  }
+  // Fire BM25F per keyword in parallel. The existing bm25SearchRaw
+  // handles the AND/prefix/trigram cascade for each individual keyword,
+  // and uses the 4-column BM25F (`bm25(entities_fts, 10.0, 4.0, 5.0, 1.0)`).
+  const perKeywordLimit = Math.max(10, Math.min(50, opts.perKeywordLimit ?? DEFAULT_PER_KEYWORD_LIMIT));
+  const perKeyword = await Promise.all(
+    keywords.map(async (kw) => {
+      try {
+        const r = await graphSearch.bm25SearchRaw(kw, perKeywordLimit);
+        return r?.results || [];
+      } catch {
+        // Best effort: a failing keyword query contributes no hits.
+        return [];
+      }
+    })
+  );
+  const fusedMap = fuseRRF(perKeyword);
+  if (fusedMap.size === 0) {
+    return { results: fused, stats: { reason, keywords, injected: 0, boosted: 0, fusedCount: 0 } };
+  }
+  // Sort by RRF score descending; cap the number we inject to avoid
+  // flooding the candidate set when many chunks have small RRF scores.
+  const ranked = [...fusedMap.values()].sort((a, b) => b.rrf - a.rrf);
+  const injectCap = Math.max(5, Math.min(30, opts.injectCap ?? 20));
+  const top = ranked.slice(0, injectCap);
+  // Normalize RRF scores to [FALLBACK_BASE, FALLBACK_BASE + FALLBACK_RANGE]
+  const maxRrf = top[0]?.rrf || 0;
+  if (maxRrf <= 0) {
+    return { results: fused, stats: { reason, keywords, injected: 0, boosted: 0, fusedCount: fusedMap.size } };
+  }
+  const existingByKey = new Map();
+  for (const r of fused) existingByKey.set(chunkKey(r), r);
+  let injected = 0;
+  let boosted = 0;
+  const additions = [];
+  for (const { result, rrf, keywordsHit } of top) {
+    const key = chunkKey(result);
+    const norm = rrf / maxRrf;
+    const fallbackScore = FALLBACK_BASE + FALLBACK_RANGE * norm;
+    const exists = existingByKey.get(key);
+    if (exists) {
+      // Already a candidate: only ever raise its score, never lower it.
+      if ((exists.score || 0) < fallbackScore) {
+        exists.score = fallbackScore;
+        exists._rrfBoosted = true;
+        exists._rrfHits = keywordsHit.size;
+        boosted++;
+      }
+      continue;
+    }
+    additions.push({
+      ...result,
+      searchPath: 'rrf-fallback',
+      score: fallbackScore,
+      _rrfFallback: true,
+      _rrfHits: keywordsHit.size,
+      _rrfRaw: rrf,
+    });
+    injected++;
+  }
+  if (injected === 0 && boosted === 0) {
+    return { results: fused, stats: { reason, keywords, injected: 0, boosted: 0, fusedCount: fusedMap.size } };
+  }
+  // New array, re-sorted by score descending (missing scores count as 0).
+  const merged = [...fused, ...additions].sort((a, b) => (b.score || 0) - (a.score || 0));
+  return {
+    results: merged,
+    stats: { reason, keywords, injected, boosted, fusedCount: fusedMap.size },
+  };
+}