sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -1,32 +1,15 @@
1
1
  /**
2
- * sweet-search read — filesystem-grounded file reader.
3
- *
4
- * Returns exact bytes from disk. The vectors index may attach symbol/chunk
5
- * metadata for indexed files, but the returned `text` always comes from
6
- * `node:fs`, never from the (truncated) DB column.
7
- *
8
- * Design notes:
9
- * - Filesystem is ground truth. Never return DB-stored text as content.
10
- * - Batch up to 20 files; per-file errors do not fail the batch.
11
- * - Warm-process cache keyed by `path|size|mtimeMs` avoids re-reading hot
12
- * files; line-offset table lets line-range reads avoid materialising the
13
- * whole content for large files.
14
- *
15
- * DDD: this module lives in the search/ application layer (allowed to import
16
- * infrastructure for filesystem grounding and chunk metadata).
2
+ * sweet-search read — filesystem-grounded file reader. Returns exact bytes from
3
+ * disk; the vectors index may attach symbol/chunk metadata, but the returned
4
+ * `text` always comes from node:fs, never from the (truncated) DB column.
17
5
  */
18
6
 
19
- import { promises as fs, statSync } from 'node:fs';
7
+ import { promises as fs, realpathSync, statSync } from 'node:fs';
20
8
  import path from 'node:path';
21
9
  import { CodebaseRepository } from '../infrastructure/codebase-repository.js';
22
- import { DB_PATHS } from '../infrastructure/config/index.js';
23
-
24
- // ---------------------------------------------------------------------------
25
- // Cache — keyed by absolutePath|size|mtimeMs (any change invalidates).
26
- // Bounded LRU. Entries hold either the full text + line-offset table, or just
27
- // the line-offset table for very large files where we deliberately avoid
28
- // caching the whole content.
29
- // ---------------------------------------------------------------------------
10
+ import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
11
+ import { withPinnedRead } from './search-reader-pin.js';
12
+ import { emitToolIdentityAuto } from './cli-decoration.js';
30
13
 
31
14
  const CACHE_MAX_ENTRIES = 64;
32
15
  const CACHE_LARGE_FILE_BYTES = 4 * 1024 * 1024; // 4MB — switch to range-read mode
@@ -45,22 +28,22 @@ function _cacheTouch(key, value) {
45
28
  }
46
29
  }
47
30
 
48
- // ---------------------------------------------------------------------------
49
- // Repository singleton — lazy and tolerant of a missing/empty DB.
50
- // ---------------------------------------------------------------------------
51
-
52
- let _repo = null;
53
- function _getRepo() {
54
- if (_repo === null) {
55
- try { _repo = new CodebaseRepository(DB_PATHS.codebase); }
56
- catch { _repo = false; }
31
+ const _repos = new Map();
32
+ function _getRepo(projectRoot) {
33
+ const dbPath = _codebasePathForProject(projectRoot);
34
+ if (!_repos.has(dbPath)) {
35
+ try { _repos.set(dbPath, new CodebaseRepository(dbPath)); }
36
+ catch { _repos.set(dbPath, false); }
57
37
  }
58
- return _repo || null;
38
+ return _repos.get(dbPath) || null;
59
39
  }
60
40
 
61
- // ---------------------------------------------------------------------------
62
- // Path resolution helpers
63
- // ---------------------------------------------------------------------------
41
+ function _codebasePathForProject(projectRoot) {
42
+ const root = path.resolve(projectRoot || process.cwd());
43
+ if (root === path.resolve(PROJECT_ROOT || process.cwd())) return DB_PATHS.codebase;
44
+ const stateDir = path.basename(path.dirname(DB_PATHS.codebase || '.sweet-search/codebase.db'));
45
+ return path.join(root, stateDir, 'codebase.db');
46
+ }
64
47
 
65
48
  function _resolvePath(p, projectRoot) {
66
49
  if (!p) throw new Error('path is required');
@@ -70,10 +53,24 @@ function _resolvePath(p, projectRoot) {
70
53
 
71
54
  function _projectRelative(absPath, projectRoot) {
72
55
  const root = projectRoot || process.cwd();
73
- const rel = path.relative(root, absPath);
74
- // Inside the project root → use relative form (matches vectors.file_path).
75
- // Outside → keep the absolute path (no chunks will match anyway).
76
- return rel.startsWith('..') || path.isAbsolute(rel) ? absPath : rel;
56
+ const normalized = _normalizeRelativePath(path.relative(root, absPath));
57
+ if (normalized) return normalized;
58
+ try {
59
+ return _normalizeRelativePath(
60
+ path.relative(realpathSync.native(root), realpathSync.native(absPath)),
61
+ ) || absPath;
62
+ } catch {
63
+ return absPath;
64
+ }
65
+ }
66
+
67
+ function _normalizeRelativePath(rel) {
68
+ const normalized = rel.replace(/\\/g, '/').replace(/^\.\//, '');
69
+ return (
70
+ normalized && !normalized.startsWith('../') && !path.isAbsolute(normalized)
71
+ ? normalized
72
+ : null
73
+ );
77
74
  }
78
75
 
79
76
  // ---------------------------------------------------------------------------
@@ -123,7 +120,6 @@ async function _readFromDisk(absPath) {
123
120
  const isLarge = stat.size > CACHE_LARGE_FILE_BYTES;
124
121
  const entry = {
125
122
  text: isLarge ? null : buf.toString('utf8'),
126
- bufferRef: isLarge ? null : null, // not held — text is the canonical form
127
123
  lineOffsets,
128
124
  size: stat.size,
129
125
  mtimeMs: stat.mtimeMs,
@@ -217,8 +213,8 @@ function _metaEndLine(meta) {
217
213
  : null;
218
214
  }
219
215
 
220
- function _attachIndexMetadata(filePathRel) {
221
- const repo = _getRepo();
216
+ function _attachIndexMetadata(filePathRel, projectRoot) {
217
+ const repo = _getRepo(projectRoot);
222
218
  if (!repo) return { indexed: false, chunks: [], language: null };
223
219
 
224
220
  const rows = repo.getChunksByFilePath(filePathRel);
@@ -258,7 +254,7 @@ function _attachIndexMetadata(filePathRel) {
258
254
  * @param {boolean} [req.includeMetadata=true] - attach index chunks/language
259
255
  * @returns {Promise<Object>}
260
256
  */
261
- export async function readFile(req) {
257
+ async function _readFileUnpinned(req) {
262
258
  const t0 = performance.now();
263
259
  const projectRoot = req.projectRoot || process.cwd();
264
260
  const absPath = _resolvePath(req.path, projectRoot);
@@ -291,7 +287,7 @@ export async function readFile(req) {
291
287
  let chunks = [];
292
288
  let indexed = false;
293
289
  if (req.includeMetadata !== false) {
294
- const meta = _attachIndexMetadata(relForIndex);
290
+ const meta = _attachIndexMetadata(relForIndex, projectRoot);
295
291
  indexed = meta.indexed;
296
292
  chunks = meta.chunks;
297
293
  language = meta.language;
@@ -323,6 +319,14 @@ export async function readFile(req) {
323
319
  };
324
320
  }
325
321
 
322
+ export async function readFile(req) {
323
+ const projectRoot = req?.projectRoot || process.cwd();
324
+ return withPinnedRead(
325
+ { projectRoot, meta: { tool: 'read', path: req?.path ?? null, count: 1 } },
326
+ () => _readFileUnpinned({ ...req, projectRoot }),
327
+ );
328
+ }
329
+
326
330
  /**
327
331
  * Batch read — up to 20 files in parallel. Per-file failures are returned
328
332
  * inline; the batch never throws unless `files` is malformed.
@@ -340,15 +344,18 @@ export async function readFiles(files, opts = {}) {
340
344
  if (files.length > 20) {
341
345
  throw new Error(`read accepts at most 20 files; got ${files.length}`);
342
346
  }
343
- const t0 = performance.now();
344
- const results = await Promise.all(files.map(f => readFile({
345
- path: f.path,
346
- startLine: f.startLine,
347
- endLine: f.endLine,
348
- projectRoot: opts.projectRoot,
349
- includeMetadata: opts.includeMetadata !== false,
350
- })));
351
- return { files: results, totalMs: +(performance.now() - t0).toFixed(2) };
347
+ const projectRoot = opts.projectRoot || process.cwd();
348
+ return withPinnedRead({ projectRoot, meta: { tool: 'read', count: files.length } }, async () => {
349
+ const t0 = performance.now();
350
+ const results = await Promise.all(files.map(f => _readFileUnpinned({
351
+ path: f.path,
352
+ startLine: f.startLine,
353
+ endLine: f.endLine,
354
+ projectRoot,
355
+ includeMetadata: opts.includeMetadata !== false,
356
+ })));
357
+ return { files: results, totalMs: +(performance.now() - t0).toFixed(2) };
358
+ });
352
359
  }
353
360
 
354
361
  // ---------------------------------------------------------------------------
@@ -385,12 +392,6 @@ export function formatReadResults(results, format = 'agent') {
385
392
 
386
393
  // ---------------------------------------------------------------------------
387
394
  // CLI handler
388
- // Usage:
389
- // sweet-search read path/to/file.ts
390
- // sweet-search read path/to/file.ts --lines 45-92
391
- // sweet-search read a.ts b.ts c.ts
392
- // sweet-search read path/to/file.ts --json
393
- // sweet-search read path/to/file.ts --raw
394
395
  // ---------------------------------------------------------------------------
395
396
 
396
397
  function _parseLineRange(spec) {
@@ -409,13 +410,21 @@ function _parseArgs(args) {
409
410
  let startLine = null;
410
411
  let endLine = null;
411
412
  let includeMetadata = true;
413
+ let plain = false;
414
+ let noBanner = false;
412
415
  for (let i = 0; i < args.length; i++) {
413
416
  const a = args[i];
414
417
  if (a === '--json') format = 'json';
415
418
  else if (a === '--raw') format = 'raw';
416
419
  else if (a === '--agent') format = 'agent';
417
420
  else if (a === '--no-metadata') includeMetadata = false;
418
- else if (a === '--lines') {
421
+ else if (a === '--no-banner') noBanner = true;
422
+ else if (a === '--format' || a.startsWith('--format=')) {
423
+ const v = a === '--format' ? args[++i] : a.slice('--format='.length);
424
+ if (v === 'json' || v === 'raw' || v === 'agent') format = v;
425
+ else if (v === 'plain') plain = true;
426
+ else throw new Error(`unknown --format value: ${v}`);
427
+ } else if (a === '--lines') {
419
428
  const [s, e] = _parseLineRange(args[++i]);
420
429
  startLine = s; endLine = e;
421
430
  } else if (a === '--help' || a === '-h') {
@@ -427,7 +436,7 @@ function _parseArgs(args) {
427
436
  positional.push(a);
428
437
  }
429
438
  }
430
- return { positional, format, startLine, endLine, includeMetadata };
439
+ return { positional, format, startLine, endLine, includeMetadata, plain, noBanner };
431
440
  }
432
441
 
433
442
  function _printHelp() {
@@ -443,6 +452,8 @@ function _printHelp() {
443
452
  ' --json Emit JSON (machine-readable)',
444
453
  ' --raw Emit raw text only (no fences/headers)',
445
454
  ' --agent Default — markdown fenced block + symbol hints',
455
+ ' --format <fmt> json | raw | agent | plain (plain = no identity line)',
456
+ ' --no-banner Suppress the identity line',
446
457
  ' --no-metadata Skip index metadata attachment',
447
458
  '',
448
459
  ].join('\n'));
@@ -467,6 +478,10 @@ export async function handleReadCli(args) {
467
478
  endLine: wantsRange ? parsed.endLine : undefined,
468
479
  }));
469
480
  const out = await readFiles(files, { includeMetadata: parsed.includeMetadata });
481
+ if (parsed.format !== 'json') {
482
+ const detail = files.length === 1 ? files[0].path : `${files.length} files`;
483
+ emitToolIdentityAuto('read', detail, { plain: parsed.plain, noBanner: parsed.noBanner });
484
+ }
470
485
  process.stdout.write(formatReadResults(out, parsed.format));
471
486
  if (parsed.format !== 'json') process.stdout.write('\n');
472
487
  // Non-zero exit if every file failed (so shell pipelines see the error).
@@ -477,5 +492,8 @@ export async function handleReadCli(args) {
477
492
  // Test-only export — clears caches between unit tests.
478
493
  export function __resetReadCachesForTests() {
479
494
  _cache.clear();
480
- _repo = null;
495
+ for (const repo of _repos.values()) repo?.close?.();
496
+ _repos.clear();
481
497
  }
498
+
499
+ export const __testing = { projectRelative: _projectRelative, codebasePathForProject: _codebasePathForProject };
@@ -0,0 +1,71 @@
1
+ import path from 'node:path';
2
+ import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
3
+ import { readManifest } from '../incremental-indexing/infrastructure/manifest.mjs';
4
+ import { beginRead, endRead } from '../incremental-indexing/infrastructure/reader-heartbeat.mjs';
5
+
6
+ function dataDirName() {
7
+ const dir = path.basename(path.dirname(DB_PATHS.codebase || ''));
8
+ return dir && dir !== '.' ? dir : '.sweet-search';
9
+ }
10
+
11
+ export function searchStateDir(projectRoot = process.cwd()) {
12
+ const root = path.resolve(projectRoot || process.cwd());
13
+ if (root === path.resolve(PROJECT_ROOT || process.cwd())) {
14
+ return path.dirname(DB_PATHS.codebase);
15
+ }
16
+ return path.join(root, dataDirName());
17
+ }
18
+
19
+ // Negative cache for stateDirs known to have no reconcile-manifest.json.
20
+ // 1s TTL bounds staleness if reconcile starts publishing after first probe.
21
+ // Cleared per-stateDir whenever a manifest is observed.
22
+ const _manifestAbsentAt = new Map();
23
+ const MANIFEST_ABSENT_TTL_MS = 1000;
24
+
25
+ export function _resetManifestAbsentCache() {
26
+ _manifestAbsentAt.clear();
27
+ }
28
+
29
+ export function beginPinnedRead({ projectRoot, stateDir, epoch, meta } = {}) {
30
+ // Caller signaled "I already checked and there is no pinned epoch".
31
+ // Heartbeat has no GC contract to honor without an epoch — no-op.
32
+ if (epoch === null) return null;
33
+ const resolvedStateDir = stateDir || (projectRoot ? searchStateDir(projectRoot) : null);
34
+ if (!resolvedStateDir) return null;
35
+ // Skip readManifest when we recently observed it was absent at this path.
36
+ if (!Number.isInteger(epoch)) {
37
+ const absentAt = _manifestAbsentAt.get(resolvedStateDir);
38
+ if (absentAt !== undefined && Date.now() - absentAt < MANIFEST_ABSENT_TTL_MS) {
39
+ return null;
40
+ }
41
+ }
42
+ const manifest = Number.isInteger(epoch) ? null : readManifest(resolvedStateDir);
43
+ const manifestEpoch = Number.isInteger(epoch)
44
+ ? epoch
45
+ : manifest?.epoch;
46
+ if (!Number.isInteger(manifestEpoch)) {
47
+ _manifestAbsentAt.set(resolvedStateDir, Date.now());
48
+ return null;
49
+ }
50
+ _manifestAbsentAt.delete(resolvedStateDir);
51
+ return {
52
+ stateDir: resolvedStateDir,
53
+ epoch: manifestEpoch,
54
+ manifest,
55
+ record: beginRead(resolvedStateDir, manifestEpoch, meta || {}),
56
+ };
57
+ }
58
+
59
+ export function endPinnedRead(pin) {
60
+ if (!pin) return;
61
+ endRead(pin.stateDir, pin.record);
62
+ }
63
+
64
+ export async function withPinnedRead(options, fn) {
65
+ const pin = beginPinnedRead(options);
66
+ try {
67
+ return await fn(pin?.epoch ?? null, pin);
68
+ } finally {
69
+ endPinnedRead(pin);
70
+ }
71
+ }
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Multi-query parallel BM25F + Reciprocal Rank Fusion (RRF) tail fallback.
3
+ *
4
+ * Applies when the normal hybrid pipeline (lexical + semantic + CC fusion
5
+ * + IAR + post-fusion boosts + demotions + MMR + existing rewrite-retry)
6
+ * still leaves results weak — empty, low-confidence top-1, or no source
7
+ * file in top-3.
8
+ *
9
+ * Why this design:
10
+ * - Long natural-language queries get tokenized by FTS5's sanitizer
11
+ * into AND-of-many-tokens (`"how" "does" "Fastify" "compile" ...`),
12
+ * and no chunk has all those tokens. Result: zero hits.
13
+ * - SOTA in 2025-2026 (Cognition SWE-grep, Polarity Omnigrep, Cody
14
+ * Deep Search, T2-RAGBench) is multi-query parallel retrieval with
15
+ * RRF fusion — fire one BM25 per content keyword, fuse by rank.
16
+ * - RRF (Cormack 2009) is corpus-agnostic and avoids the per-keyword
17
+ * score-normalization trap. A chunk that ranks high in MULTIPLE
18
+ * per-keyword queries floats up; a chunk that only matches one
19
+ * noisy keyword (e.g. "time" → setTimeout) stays mid-pack because
20
+ * it has a single 1/(k+rank) contribution.
21
+ *
22
+ * Why NOT a hand-curated stopword list:
23
+ * The earlier draft (Proposal C v1) added "time", "data", "value" etc.
24
+ * to a stopword list because Q4 ("registration time") matched
25
+ * `setTimeout`. That's the Clever Hans / corpus-overfit anti-pattern
26
+ * per the Mitra & Craswell neural-IR survey and Vespa's WAND article.
27
+ * RRF handles this structurally: "time" matches noisy chunks at
28
+ * rank 1, but those chunks DON'T also match "compile" + "schemas",
29
+ * so their RRF score stays low compared to a chunk that hits all
30
+ * three.
31
+ *
32
+ * Disable via `ablations: new Set(['no-rrf-fallback'])`.
33
+ *
34
+ * References:
35
+ * - Cormack et al. "Reciprocal Rank Fusion outperforms Condorcet and
36
+ * individual rank learning methods", SIGIR 2009.
37
+ * - Cognition SWE-grep blog (Oct 2025).
38
+ * - T2-RAGBench multi-query+RRF, EACL 2026.
39
+ */
40
+
41
+ import { detectFileKind } from '../ranking/file-kind-ranking.js';
42
+
43
+ // Question-scaffolding stopwords ONLY. Generic English nouns (time, data,
44
+ // state, mode, value, etc.) intentionally NOT in this list — they may be
45
+ // the actual concept the user is asking about, and RRF naturally demotes
46
+ // them when they're noise (one rank-1 hit can't beat two rank-mid hits).
47
+ const QUERY_SCAFFOLD_STOPWORDS = new Set([
48
+ 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'could',
49
+ 'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have',
50
+ 'how', 'in', 'into', 'is', 'it', 'its', 'of', 'on', 'or', 'should',
51
+ 'so', 'than', 'that', 'the', 'their', 'them', 'this', 'those',
52
+ 'to', 'too', 'use', 'using', 'was', 'were', 'what', 'when', 'where',
53
+ 'whether', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
54
+ 'would', 'you', 'your',
55
+ ]);
56
+
57
+ // Standard RRF k constant from Cormack 2009. Higher k flattens the
58
+ // rank-position curve (less weight for top hits); lower k sharpens it.
59
+ // 60 is the published default and what most production systems use.
60
+ const RRF_K = 60;
61
+
62
+ const DEFAULT_PER_KEYWORD_LIMIT = 30;
63
+ const DEFAULT_CONFIDENCE_FLOOR = 0.35;
64
+
65
+ // RRF scores are tiny (~0.01-0.05). Map to a [base, base+range] band so
66
+ // fallback candidates compete mid-pack without overwhelming a strong
67
+ // fused top-1 from the encoder (typically 0.4-0.86).
68
+ const FALLBACK_BASE = 0.40;
69
+ const FALLBACK_RANGE = 0.20;
70
+
71
+ export function extractContentKeywords(query) {
72
+ if (!query) return [];
73
+ const tokens = String(query).match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];
74
+ const out = [];
75
+ const seen = new Set();
76
+ for (const tok of tokens) {
77
+ if (tok.length < 3) continue;
78
+ const lower = tok.toLowerCase();
79
+ if (QUERY_SCAFFOLD_STOPWORDS.has(lower)) continue;
80
+ if (seen.has(lower)) continue;
81
+ seen.add(lower);
82
+ out.push(tok);
83
+ }
84
+ return out;
85
+ }
86
+
87
+ /**
88
+ * Decide whether the keyword fallback should run. Tightened (2026-05-05)
89
+ * after a 20-query probe found the original triggers fired too eagerly.
90
+ *
91
+ * The earlier two-clause trigger (`low_confidence` ∨ `no_source_in_top3`)
92
+ * caused regressions on queries where the encoder DID produce a real
93
+ * named source symbol just below the score floor (e.g. `getServerInstance`
94
+ * at score 0.32 lost to a 1-line `[typeAlias: HttpKeys]` injected by RRF).
95
+ *
96
+ * New rule: RRF fires only when top-3 has NO "good source candidate" —
97
+ * defined as an implementation-file chunk with a real named entity. That
98
+ * captures the genuine "retrieval is lost" case (only docs / tests /
99
+ * unlabelled chunks) without sacrificing borderline-confidence wins.
100
+ *
101
+ * - empty → fire (always)
102
+ * - top-1 in docs/tests AND no good source candidate → fire
103
+ * - all top-3 are unlabelled chunks (no symbol name) → fire
104
+ * - otherwise → don't fire
105
+ *
106
+ * The previous standalone `low_confidence` trigger (top-1 score < floor)
107
+ * was removed — encoder scores below 0.35 are common on long NL queries
108
+ * even when the answer IS the encoder's top-1.
109
+ */
110
+ export function shouldRunFallback(results, opts = {}) {
111
+ if (!Array.isArray(results) || results.length === 0) return 'empty';
112
+ const window = results.slice(0, Math.min(3, results.length));
113
+ const hasGoodSource = window.some(r => {
114
+ const file = r.metadata?.file || r.file || r.file_path || '';
115
+ if (detectFileKind(file) !== 'implementation') return false;
116
+ const name = r.metadata?.name || r.name;
117
+ return name && String(name).trim().length > 0;
118
+ });
119
+ if (!hasGoodSource) return 'no_good_source_in_top3';
120
+ return null;
121
+ }
122
+
123
+ function chunkKey(r) {
124
+ const m = r.metadata || {};
125
+ const file = m.file || r.file || r.file_path;
126
+ const sl = m.startLine ?? r.startLine;
127
+ const el = m.endLine ?? r.endLine;
128
+ return `${file}|${sl}|${el}`;
129
+ }
130
+
131
/**
 * Compute Reciprocal Rank Fusion (RRF) across per-keyword BM25 result lists.
 *
 * For each chunk, RRF score = sum over all keywords k of 1 / (RRF_K + rank_k),
 * where rank_k is the chunk's 1-indexed position in keyword k's results.
 * Chunks not present in a keyword's results contribute 0 for that keyword.
 *
 * This rewards chunks that appear in MULTIPLE per-keyword lists over chunks
 * that only appear near the top of a single keyword's list.
 *
 * A chunk is credited at most once per keyword: if the same chunk key shows
 * up several times in one list, only its first (best-ranked) occurrence
 * counts, so duplicates inside a single list cannot mimic multi-keyword
 * agreement.
 *
 * @param {Array<Array<object>>} perKeywordResults - one array of BM25 hits per
 *   keyword; non-array entries and null/undefined hits are ignored
 * @returns {Map<string, { result: object, rrf: number, keywordsHit: Set<number> }>}
 *   entries keyed by `chunkKey(hit)`; `result` is the first hit seen for the
 *   key and `keywordsHit` holds the indexes of the keyword lists containing it
 */
export function fuseRRF(perKeywordResults) {
  const acc = new Map();
  if (!Array.isArray(perKeywordResults)) return acc;

  for (let kIdx = 0; kIdx < perKeywordResults.length; kIdx++) {
    const list = perKeywordResults[kIdx];
    if (!Array.isArray(list)) continue;

    for (let pos = 0; pos < list.length; pos++) {
      const item = list[pos];
      // A missing hit still occupies its rank slot; it just earns nothing.
      if (item == null) continue;

      const key = chunkKey(item);
      let entry = acc.get(key);
      if (!entry) {
        entry = { result: item, rrf: 0, keywordsHit: new Set() };
        acc.set(key, entry);
      }

      // Already credited for this keyword at a better rank: skip the duplicate.
      if (entry.keywordsHit.has(kIdx)) continue;

      const rank = pos + 1; // 1-indexed
      entry.rrf += 1 / (RRF_K + rank);
      entry.keywordsHit.add(kIdx);
    }
  }
  return acc;
}
167
+
168
/**
 * Run the multi-query BM25 + RRF fallback against the existing fused list.
 *
 * The fallback only fires when `shouldRunFallback` returns a reason and it is
 * not disabled via the 'no-rrf-fallback' ablation. It extracts content
 * keywords from the query, issues one `graphSearch.bm25SearchRaw` call per
 * keyword in parallel, fuses the per-keyword lists with `fuseRRF`, maps the
 * RRF scores onto [FALLBACK_BASE, FALLBACK_BASE + FALLBACK_RANGE], and merges
 * the top entries into the candidate set.
 *
 * Side effect: a candidate already present in `fused` whose score is lower
 * than its fallback score is updated IN PLACE (`score`, `_rrfBoosted`,
 * `_rrfHits`); the array itself is never mutated.
 *
 * @param {Array} fused - current candidate list; a non-array value is treated
 *   as an empty list when merging
 * @param {string} query - the user query keywords are extracted from
 * @param {object} [opts]
 * @param {Set<string>|string[]} [opts.ablations] - 'no-rrf-fallback' disables the fallback
 * @param {object} [opts.searcher] - must expose `graphSearch.bm25SearchRaw(keyword, limit)`
 * @param {number} [opts.perKeywordLimit] - hits requested per keyword, clamped to [10, 50]
 * @param {number} [opts.injectCap=20] - max fused entries considered, clamped to [5, 30]
 * @returns {Promise<{ results: Array, stats: { reason: (string|null), keywords: string[], injected: number, boosted: number, fusedCount: number } }>}
 */
export async function runRRFFallback(fused, query, opts = {}) {
  // Every early exit hands the caller's list back untouched with zeroed counters.
  const passthrough = (reason, keywords = [], fusedCount = 0) => ({
    results: fused,
    stats: { reason, keywords, injected: 0, boosted: 0, fusedCount },
  });

  const { ablations } = opts;
  const isAblated = ablations instanceof Set
    ? ablations.has('no-rrf-fallback')
    : Array.isArray(ablations) && ablations.includes('no-rrf-fallback');
  if (isAblated) return passthrough(null);

  const reason = shouldRunFallback(fused, opts);
  if (!reason) return passthrough(null);

  // A single keyword cannot benefit from cross-keyword fusion.
  const keywords = extractContentKeywords(query);
  if (keywords.length < 2) return passthrough(reason, keywords);

  const graphSearch = opts.searcher?.graphSearch;
  if (!graphSearch || typeof graphSearch.bm25SearchRaw !== 'function') {
    return passthrough(reason, keywords);
  }

  // Fire one BM25 query per keyword in parallel. Per the original author's
  // note, bm25SearchRaw handles the AND/prefix/trigram cascade per keyword and
  // uses the 4-column weighted BM25F.
  const perKeywordLimit = Math.max(10, Math.min(50, opts.perKeywordLimit ?? DEFAULT_PER_KEYWORD_LIMIT));
  const perKeyword = await Promise.all(
    keywords.map(async (kw) => {
      try {
        const response = await graphSearch.bm25SearchRaw(kw, perKeywordLimit);
        return response?.results || [];
      } catch {
        // Deliberate best-effort: one failing keyword query must not sink the
        // whole fallback, so it simply contributes no hits.
        return [];
      }
    })
  );

  const fusedMap = fuseRRF(perKeyword);
  if (fusedMap.size === 0) return passthrough(reason, keywords);

  // Sort by RRF score descending; cap the number we consider to avoid
  // flooding the candidate set when many chunks have small RRF scores.
  const ranked = [...fusedMap.values()].sort((a, b) => b.rrf - a.rrf);
  const injectCap = Math.max(5, Math.min(30, opts.injectCap ?? 20));
  const top = ranked.slice(0, injectCap);

  // The best entry anchors the normalization to the top of the fallback band.
  const maxRrf = top[0]?.rrf || 0;
  if (maxRrf <= 0) return passthrough(reason, keywords, fusedMap.size);

  // shouldRunFallback reports 'empty' for a missing or non-array list, so the
  // fallback can legitimately reach this point without an iterable `fused`.
  const candidates = Array.isArray(fused) ? fused : [];
  const existingByKey = new Map();
  for (const candidate of candidates) {
    existingByKey.set(chunkKey(candidate), candidate);
  }

  let injected = 0;
  let boosted = 0;
  const additions = [];

  for (const { result, rrf, keywordsHit } of top) {
    const fallbackScore = FALLBACK_BASE + FALLBACK_RANGE * (rrf / maxRrf);
    const existing = existingByKey.get(chunkKey(result));

    if (existing) {
      // Only ever raise an existing candidate's score, never lower it.
      if ((existing.score || 0) < fallbackScore) {
        existing.score = fallbackScore;
        existing._rrfBoosted = true;
        existing._rrfHits = keywordsHit.size;
        boosted++;
      }
      continue;
    }

    additions.push({
      ...result,
      searchPath: 'rrf-fallback',
      score: fallbackScore,
      _rrfFallback: true,
      _rrfHits: keywordsHit.size,
      _rrfRaw: rrf,
    });
    injected++;
  }

  if (injected === 0 && boosted === 0) {
    return passthrough(reason, keywords, fusedMap.size);
  }

  const merged = [...candidates, ...additions].sort((a, b) => (b.score || 0) - (a.score || 0));
  return {
    results: merged,
    stats: { reason, keywords, injected, boosted, fusedCount: fusedMap.size },
  };
}