sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -1,3 +1,6 @@
1
+ import { readFileSync } from 'fs';
2
+ import path from 'path';
3
+
1
4
  /**
2
5
  * Intent-aware file-kind ranking (conservative variant).
3
6
  *
@@ -40,8 +43,62 @@
40
43
  */
41
44
 
42
45
  const DOCS_RE = /\.md$|\.mdx$|\.rst$|(?:^|\/)docs?\//i;
43
- const TESTS_RE = /(?:^|\/)tests?\/|(?:^|\/)spec\/|\.test\.[a-z0-9]+$|_test\.[a-z0-9]+$|\.spec\.[a-z0-9]+$|_spec\.[a-z0-9]+$/i;
46
+ // Optional leading underscore covers Go's `_examples/` build-excluded
47
+ // convention (go-chi, gin, gorilla, etc. — `go build` ignores any path
48
+ // component starting with `_`). The non-underscore form catches the
49
+ // generic `examples/`/`example/` directory shape across all languages.
50
+ // Verified across the AST-tester repo set: only `_examples/` directories
51
+ // follow the examples-pollution pattern; other leading-underscore dirs
52
+ // (Sphinx `_static`, Rust `_typeshed`, test fixture `_unrelated`, etc.)
53
+ // fall under existing docs / test demotion or are legitimate sources.
54
+ const EXAMPLES_RE = /(?:^|\/)_?examples?\//i;
55
+ // Tests directory patterns. Includes the standard tests?/spec/__tests__/__mocks__
56
+ // plus integration/, e2e/, fixtures?/, cypress/, playwright/ — common test-fixture
57
+ // directory conventions across JS/Python/Rust/Go that shipped without TESTS_RE
58
+ // catching them (e.g. fastify integration/server.js was mis-classified as
59
+ // 'implementation' until this update 2026-05-07).
60
+ // 2026-05-13: added `(?:^|/)test\.(c|cpp|cc|h|hpp|py|rb|go|rs|js|ts|java|kt|scala|php|lua|zig|dart|ex|exs)$`
61
+ // to catch bare `test.<ext>` files at repo root or any directory level
62
+ // (hiredis convention — C-005 in ast-tester probes). Restricted to code
63
+ // extensions only so `test.html`, `test.json`, `test.yaml` (legitimate
64
+ // fixtures) aren't misclassified.
65
+ const TESTS_RE = /(?:^|\/)(?:tests?|spec|integration|e2e|fixtures?|__tests__|__mocks__|cypress|playwright)\/|\.test\.[a-z0-9]+$|_test\.[a-z0-9]+$|\.spec\.[a-z0-9]+$|_spec\.[a-z0-9]+$|\.e2e\.[a-z0-9]+$|_e2e\.[a-z0-9]+$|(?:^|\/)test\.(?:c|cpp|cc|cxx|h|hpp|hh|py|rb|go|rs|js|ts|jsx|tsx|java|kt|scala|php|lua|zig|dart|ex|exs|swift|mjs|cjs|cs)$/i;
44
66
  const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i;
67
+ // Ancillary files: configuration, lockfiles, CI manifests, container build
68
+ // definitions. 2026-05-07 added Dockerfile / Containerfile / .dockerignore
69
+ // after FreshStack uv UV-FLOW-2 surfaced a Dockerfile as top-1 for "what
70
+ // happens end-to-end when I run uv sync". Containerfile descriptors are not
71
+ // implementation code; demote consistently with .yaml/.toml/Cargo.lock
72
+ // siblings.
73
+ //
74
+ // NOTE: Deliberately NOT including `Makefile` / `GNUmakefile` here even
75
+ // though they are also build-orchestration. Probe S6-Q6 (gin) regressed
76
+ // PASS→PARTIAL when gin's `Makefile` was demoted: classifying it shifted
77
+ // the file-kind window's `demotableCount`, which cascaded through the
78
+ // rerank into a different gin.go top-1 chunk pick. Treating Makefile as
79
+ // implementation is the safer default — it rarely competes with real source
80
+ // for top-1 anyway. Re-evaluate if a future probe shows Makefile actually
81
+ // poisoning a top-1 result.
82
+ const ANCILLARY_RE = /(?:^|\/)\.(?:github|gitlab|circleci|vscode|cursor)\/|(?:^|\/)(?:package(?:-lock)?\.json|pnpm-lock\.yaml|yarn\.lock|Cargo\.lock|Gemfile\.lock|Dockerfile(?:\.[\w.-]+)?|Containerfile|\.dockerignore)$|\.(?:ya?ml|jsonc?|toml|ini|cfg|conf|lock|xml|csv|dockerfile)$/i;
83
// Matches a line that introduces a declaration: a declaration keyword
// (function/class/struct/interface/enum/trait), a named `fn`/`def`, a
// k-prefixed constant (`const kMaxSize`), or a `type X =` alias.
// The k-constant alternative consumes the rest of the identifier (`\w*`):
// with the shared trailing `\b` directly after `[A-Z]`, any constant longer
// than two characters (`kMaxSize`) could never match.
const DECLARATION_RE = /\b(function|class|struct|interface|enum|trait|fn\s+\w+|def\s+\w+|const\s+k[A-Z]\w*)\b|\btype\s+\w+\s*=/;
84
+ const EXECUTABLE_DECLARATION_RE = /\b(function|class|struct|interface|enum|trait|fn\s+\w+|def\s+\w+|func\s+\w+)\b/;
85
+ const STOPWORDS = new Set([
86
+ 'and', 'are', 'does', 'for', 'from', 'how', 'into', 'is', 'the', 'this',
87
+ 'that', 'what', 'when', 'where', 'which', 'with', 'why',
88
+ ]);
89
+ const LANG_KEYWORDS = new Set([
90
+ 'class', 'const', 'def', 'enum', 'fn', 'function', 'impl', 'import',
91
+ 'interface', 'let', 'package', 'pub', 'struct', 'trait', 'type', 'use',
92
+ ]);
93
+
94
+ const ENTITY_KIND_KEYWORDS = {
95
+ enum: ['enum'],
96
+ struct: ['struct'],
97
+ interface: ['interface', 'trait'],
98
+ trait: ['trait'],
99
+ class: ['class'],
100
+ type: ['type', 'typeAlias', 'enum', 'struct', 'trait', 'class', 'interface'],
101
+ };
45
102
 
46
103
  // Strong implementation-seeking signals. A query that fires one of these is
47
104
  // confidently asking for source code; anything else is treated as `'unknown'`.
@@ -50,16 +107,20 @@ const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i;
50
107
  const IMPL_INTENT_RE = new RegExp(
51
108
  '\\b(' + [
52
109
  // English wh-questions about location/behaviour
53
- 'where', 'how does', 'how do',
110
+ 'where', 'what', 'how does', 'how do',
111
+ 'when',
54
112
  // Definition / implementation phrasing
55
113
  'implements?', 'implementation', 'defines?', 'definition', 'declared?',
114
+ 'decides?',
56
115
  // Code-structure nouns
57
116
  'function', 'functions', 'method', 'methods', 'class', 'classes',
58
117
  'constructor', 'module', 'library', 'crate', 'package',
59
118
  // Verbs that strongly signal a code unit
60
119
  'dispatch(?:es|er)?', 'handles?', 'handler', 'handlers',
61
- 'parses?', 'parser', 'parsers',
120
+ 'bind(?:s|ing)?',
121
+ 'parses?', 'parsed', 'parser', 'parsers',
62
122
  'router?', 'routes?', 'routing',
123
+ 'redirect(?:s|ed|ing)?',
63
124
  'register(?:s|ed|ing)?',
64
125
  'builds?', 'builder', 'builders',
65
126
  'generat(?:es?|or|ors|ed|ing)',
@@ -98,18 +159,31 @@ const IMPL_INTENT_RE = new RegExp(
98
159
 
99
160
  const DOCS_INTENT_RE = /\b(doc|docs|documentation|readme|guide|tutorial|reference|example)\b/i;
100
161
  const TESTS_INTENT_RE = /\b(test|tests|spec|specs|fixture|fixtures|mock|mocks)\b/i;
101
- const TYPES_INTENT_RE = /\b(type|types|interface|declaration|signature|typings|typedef)\b/i;
162
+ const TYPES_INTENT_RE = /\b(types|interface|declaration|signature|typings|typedef)\b|\btype\s+(?:alias|declaration|definition|interface|signature)\b/i;
163
+ const ANCILLARY_INTENT_RE = /\b(config|configuration|manifest|workflow|ci|github action|labeler|toml|lockfile|package\.json)\b/i;
102
164
 
103
165
  /**
104
166
  * Detect the file kind from a result path.
105
- * @returns {'docs'|'tests'|'types'|'implementation'}
167
+ * @returns {'docs'|'examples'|'tests'|'types'|'ancillary'|'implementation'}
106
168
  */
107
export function detectFileKind(filePath, opts) {
  if (!filePath || typeof filePath !== 'string') return 'implementation';

  // Optional per-call memo keyed by path: a given path always classifies the
  // same way, so repeated lookups within one result set reuse the verdict.
  const memo = opts && opts._fileKindCache;
  if (memo && memo.has(filePath)) return memo.get(filePath);

  // First matching rule wins; the order below is the precedence.
  const classify = () => {
    if (DOCS_RE.test(filePath)) return 'docs';
    if (EXAMPLES_RE.test(filePath)) return 'examples';
    if (TESTS_RE.test(filePath)) return 'tests';
    if (isTestSupportFile(filePath)) return 'tests';
    if (TYPES_RE.test(filePath)) return 'types';
    if (ANCILLARY_RE.test(filePath)) return 'ancillary';
    return 'implementation';
  };

  const kind = classify();
  if (memo) memo.set(filePath, kind);
  return kind;
}
114
188
 
115
189
  /**
@@ -118,7 +192,7 @@ export function detectFileKind(filePath) {
118
192
  * `'unknown'`, and the helper treats `'unknown'` as a no-op (just like the
119
193
  * docs/tests/types intents).
120
194
  *
121
- * @returns {'docs'|'tests'|'types'|'implementation'|'unknown'}
195
+ * @returns {'docs'|'tests'|'types'|'ancillary'|'implementation'|'unknown'}
122
196
  */
123
197
  export function classifyFileKindIntent(query) {
124
198
  const q = (query || '').toLowerCase();
@@ -127,6 +201,7 @@ export function classifyFileKindIntent(query) {
127
201
  if (TYPES_INTENT_RE.test(q)) return 'types';
128
202
  if (DOCS_INTENT_RE.test(q)) return 'docs';
129
203
  if (TESTS_INTENT_RE.test(q)) return 'tests';
204
+ if (ANCILLARY_INTENT_RE.test(q)) return 'ancillary';
130
205
  if (IMPL_INTENT_RE.test(q)) return 'implementation';
131
206
  return 'unknown';
132
207
  }
@@ -141,6 +216,2083 @@ function resolveFilePath(r) {
141
216
  || '';
142
217
  }
143
218
 
219
/**
 * Estimate how many lines a result covers.
 *
 * Uses the recorded start/end line range when both ends are finite and
 * ordered; otherwise counts the lines of the inline text. Returns `Infinity`
 * when neither source is available.
 *
 * @param {object} r - Search result (range fields may live on `metadata`).
 * @returns {number}
 */
function inferLineCount(r) {
  const meta = r?.metadata || {};
  const first = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const last = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line;

  const hasRange = Number.isFinite(first) && Number.isFinite(last) && last >= first;
  if (hasRange) return last - first + 1;

  const body = r?.text || r?.content || r?.code || r?.snippet || '';
  if (typeof body !== 'string' || body.length === 0) return Infinity;
  return body.split(/\r?\n/).length;
}
234
+
235
/**
 * Read a result's source lines from disk, plus up to two lines of leading
 * context.
 *
 * Returns '' when there is no project root, no file path, no usable line
 * range, when the resolved path falls outside the project root, or when the
 * read fails.
 *
 * @param {object} r - Search result carrying a file path and line range.
 * @param {{projectRoot?: string}} [opts]
 * @returns {string}
 */
function readResultSpan(r, opts = {}) {
  const { projectRoot } = opts;
  if (!projectRoot) return '';

  const file = resolveFilePath(r);
  if (!file) return '';

  const meta = r?.metadata || {};
  const firstLine = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const lastLine = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? firstLine;
  const rangeIsUsable = Number.isFinite(firstLine) && Number.isFinite(lastLine) && lastLine >= firstLine;
  if (!rangeIsUsable) return '';

  try {
    const abs = path.resolve(projectRoot, file);
    const root = path.resolve(projectRoot);
    // Refuse paths that resolve outside the project root.
    const insideRoot = abs === root || abs.startsWith(root + path.sep);
    if (!insideRoot) return '';

    const fileLines = readFileSync(abs, 'utf8').split('\n');
    // Line numbers are 1-based; include two lines before the span.
    const contextStart = Math.max(1, firstLine - 2);
    return fileLines.slice(contextStart - 1, lastLine).join('\n');
  } catch {
    // Best-effort read: any filesystem error yields an empty span.
    return '';
  }
}
254
+
255
/**
 * Get the text of a result: inline content when present, otherwise the span
 * read from disk.
 *
 * When `opts._resultTextCache` is supplied, disk reads are memoized per
 * `file|start|end` so the several callers that need the same chunk text
 * within one demotion pass share a single read.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - May carry `projectRoot` and `_resultTextCache`.
 * @returns {string}
 */
function resolveResultText(r, opts = {}) {
  const inline = r?.content || r?.text || r?.code || r?.snippet;
  if (inline) return inline;

  const memo = opts._resultTextCache;
  if (!memo) return readResultSpan(r, opts);

  const file = resolveFilePath(r);
  const meta = r?.metadata || {};
  const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;

  // Without a file and a numeric start there is no stable key; read uncached.
  if (!file || !Number.isFinite(start)) return readResultSpan(r, opts);

  const key = `${file}|${start}|${Number.isFinite(end) ? end : start}`;
  if (memo.has(key)) return memo.get(key);

  const text = readResultSpan(r, opts);
  memo.set(key, text);
  return text;
}
280
+
281
/**
 * Symbol name of a result: `metadata.name` first, then the top-level `name`,
 * else ''.
 */
function resolveResultName(r) {
  const fromMetadata = r?.metadata?.name;
  if (fromMetadata) return fromMetadata;
  return r?.name || '';
}
284
+
285
/**
 * Recorded entity type of a result: `metadata.type` first, then the
 * top-level `type`, else ''.
 */
function resolveResultType(r) {
  const fromMetadata = r?.metadata?.type;
  if (fromMetadata) return fromMetadata;
  return r?.type || '';
}
288
+
289
/** Lower-case a type label; any falsy input normalizes to ''. */
function normalizeType(type) {
  const label = type ? String(type) : '';
  return label.toLowerCase();
}
292
+
293
/**
 * Whether the named ablation switch is enabled. Accepts a Set or an Array of
 * names; any other container counts as "not enabled".
 */
function hasAblation(ablations, name) {
  if (ablations instanceof Set) return ablations.has(name);
  if (Array.isArray(ablations)) return ablations.includes(name);
  return false;
}
296
+
297
+ // Removed (2026-05-05): the standalone tiny-ancillary-chunk floor became
298
+ // redundant once cAST sibling-merge was confirmed in tree-sitter-provider.js
299
+ // (recursiveChunk merges adjacent siblings up to MAX_CHUNK_SIZE so tiny
300
+ // chunks don't enter the index as standalone retrieval units), and the
301
+ // range-preservation invariant in applyResultDemotions stopped entity
302
+ // adoption from shrinking already-merged chunks. Kept the per-ancillary-file
303
+ // hard tiny factor (`tinyAncillaryFactor` in applyFileKindRanking) since
304
+ // that's a sub-rule of doc/test demotion, not a general size penalty.
305
+
306
/**
 * Decide whether a result is test code, memoizing the verdict per chunk.
 *
 * When `opts._isTestChunkCache` is supplied and the result has a file path
 * and a numeric start line, the boolean is cached under `file|start|end`.
 * Otherwise the verdict is computed on every call.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - Demotion options and per-call caches.
 * @returns {boolean}
 */
export function isTestChunk(r, opts = {}) {
  const filePath = resolveFilePath(r);
  const memo = opts._isTestChunkCache;

  let key = null;
  if (memo) {
    const meta = r?.metadata || {};
    const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
    const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;
    if (filePath && Number.isFinite(start)) {
      key = `${filePath}|${start}|${Number.isFinite(end) ? end : start}`;
    }
  }

  if (key !== null && memo.has(key)) return memo.get(key);

  const verdict = isTestChunkUncached(r, opts, filePath);
  if (key !== null) memo.set(key, verdict);
  return verdict;
}
327
+
328
/**
 * Uncached test-chunk decision. Checks, in order:
 *   1. the file kind derived from the path,
 *   2. the test-support heuristic for the file (skipped when the
 *      'no-test-support-detection' ablation is on),
 *   3. test markers in the chunk text,
 *   4. test-style naming of the chunk.
 *
 * @param {object} r - Search result.
 * @param {object} opts - Demotion options and per-call caches.
 * @param {string} filePath - Already-resolved path of `r`.
 * @returns {boolean}
 */
function isTestChunkUncached(r, opts, filePath) {
  if (detectFileKind(filePath, opts) === 'tests') return true;

  const supportDetectionEnabled = !hasAblation(opts.ablations, 'no-test-support-detection');
  if (supportDetectionEnabled) {
    // The support verdict is memoized per file path so the content scan runs
    // at most once per file when a cache is supplied.
    const memo = opts._isTestSupportCache;
    const alreadyKnown = Boolean(memo) && memo.has(filePath);
    let isSupport;
    if (alreadyKnown) {
      isSupport = memo.get(filePath);
    } else {
      // Content is passed as a thunk so nothing is read when the path rules
      // already settle the answer.
      const loadContent = () => resolveFullFileText(r, opts) || resolveResultText(r, opts);
      isSupport = isTestSupportFile(filePath, loadContent);
      if (memo) memo.set(filePath, isSupport);
    }
    if (isSupport) return true;
  }

  // One combined pattern covers the Rust attribute, Go `func Test…`,
  // Python `def test_…` and JS/TS it/test/describe forms.
  if (TEST_CHUNK_BODY_RE.test(resolveResultText(r, opts))) return true;

  return TEST_CHUNK_NAME_RE.test(resolveResultName(r));
}
370
+
371
// Line-anchored markers of test code inside a chunk body (multiline mode):
//   #[cfg(test)] / #[test]              Rust attributes
//   func Test<X>                        Go testing
//   def test_<x>                        Python unittest/pytest
//   it( / test( / describe( + quote     JS/TS suite frameworks
const TEST_CHUNK_BODY_RE = /^\s*(?:#\[(?:cfg\s*\(\s*test\s*\)|test)\]|func\s+Test[A-Z]|def\s+test_|(?:it|test|describe)\s*\(\s*['"])/m;

// Test-style symbol names: `test_` prefix, `Test<X>` prefix, or `_test` suffix.
const TEST_CHUNK_NAME_RE = /^(?:test_|Test[A-Z])|_test$/;
373
+
374
/**
 * Read the whole file behind a result, relative to `opts.projectRoot`.
 *
 * Returns '' when there is no project root or file path, when the resolved
 * path falls outside the root, or when the read fails. With
 * `opts._fullFileTextCache` supplied, the outcome (including '') is memoized
 * by file path.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - May carry `projectRoot` and `_fullFileTextCache`.
 * @returns {string}
 */
function resolveFullFileText(r, opts = {}) {
  if (!opts.projectRoot) return '';
  const file = resolveFilePath(r);
  if (!file) return '';

  const memo = opts._fullFileTextCache;
  if (memo && memo.has(file)) return memo.get(file);

  let contents = '';
  try {
    const root = path.resolve(opts.projectRoot);
    const abs = path.resolve(root, file);
    // Only read paths that stay inside the project root.
    const insideRoot = abs === root || abs.startsWith(root + path.sep);
    if (insideRoot) contents = readFileSync(abs, 'utf8');
  } catch {
    // Best-effort read: unreadable files count as empty.
    contents = '';
  }

  if (memo) memo.set(file, contents);
  return contents;
}
400
+
401
// Path shapes that mark a file as test support regardless of its content.
// Hoisted so the patterns are built once rather than on every call.
const TEST_SUPPORT_PATH_RULES = [
  /(^|\/)(testutil|test_util|test_utils|test_helper|test_helpers|testing_support|spec_helper)\.[a-z]+$/i,
  /(^|\/)(test|tests|spec|__tests__|__mocks__)\/[^/]*(util|helper|fixture|mock|stub|setup|harness)/i,
  /(^|\/)(testdata|fixtures|__fixtures__|test_data)\//i,
  /(^|\/)conftest\.py$/i,
  /\.test-d\.[tj]sx?$/i,
];

// Assertion vocabulary used for the density heuristic. A word boundary is
// applied only where the alternative ends in a word character: a shared
// trailing `\b` can never match after `!`, `(` or `)` when the next character
// is also punctuation, which hid `assert!(…)`, `assert_eq!(…)`,
// `t.Helper()` and `expect('…')` from the count.
const TEST_SUPPORT_CORE_ASSERTION_RE =
  /\b(?:assert(?:_eq|_ne)?!|assertEquals?\b|t\.(?:Errorf|Fatalf)\b|t\.Helper\(\))/;
// Superset used only when the file shows JS test context; `expect(`,
// `require.x` and `assert.x` are too common in production code otherwise.
const TEST_SUPPORT_JS_ASSERTION_RE =
  /\b(?:assert(?:_eq|_ne)?!|expect\(|assertEquals?\b|t\.(?:Errorf|Fatalf)\b|t\.Helper\(\)|(?:require|assert)\.\w+)/;

/**
 * Decide whether a file is test support code (helpers, fixtures, harnesses)
 * rather than implementation.
 *
 * Path rules are checked first. Only when none match is the content
 * consulted: a crate-level `#![cfg(test)]` attribute, or — for files with at
 * least 8 non-blank lines — more than 30% of non-blank lines containing an
 * assertion.
 *
 * @param {string} filePath - Path of the file being classified.
 * @param {string|(() => string)} [content=''] - File text, or a thunk that
 *   returns it. The thunk is invoked only when the path rules are
 *   inconclusive, so callers can defer the file read.
 * @returns {boolean}
 */
export function isTestSupportFile(filePath, content = '') {
  if (!filePath) return false;
  if (TEST_SUPPORT_PATH_RULES.some((rule) => rule.test(filePath))) return true;

  const text = typeof content === 'function' ? content() : content;
  if (!text) return false;
  if (/^\s*#!\[cfg\s*\(\s*test\s*\)/m.test(text)) return true;

  const lines = text.split('\n').filter((line) => line.trim());
  // Too little code for the ratio below to be meaningful.
  if (lines.length < 8) return false;

  const hasJsTestContext = /(^|\/)(test|tests|spec|__tests__)\//i.test(filePath)
    || /^\s*(describe|it|test)\s*\(/m.test(text);
  const assertionRe = hasJsTestContext
    ? TEST_SUPPORT_JS_ASSERTION_RE
    : TEST_SUPPORT_CORE_ASSERTION_RE;

  const assertLines = lines.filter((line) => assertionRe.test(line)).length;
  return assertLines / lines.length > 0.30;
}
429
+
430
/**
 * Normalize query tokens to a Set.
 *
 * A Set is returned as-is; an Array is lower-cased element-wise; otherwise
 * the query string is lower-cased, split on underscores and non-word
 * characters, and tokens shorter than 3 characters are dropped.
 *
 * @param {string} query
 * @param {Set<string>|string[]} [queryTokens]
 * @returns {Set<string>}
 */
function queryTokenSet(query, queryTokens) {
  if (queryTokens instanceof Set) return queryTokens;

  if (Array.isArray(queryTokens)) {
    const lowered = queryTokens.map((tok) => String(tok).toLowerCase());
    return new Set(lowered);
  }

  const words = String(query || '').toLowerCase().split(/[_\W]+/);
  return new Set(words.filter((word) => word.length >= 3));
}
435
+
436
/**
 * Fraction of a result's name tokens that also appear in the query.
 *
 * The name is split on camelCase humps, letter→digit transitions,
 * underscores and non-word characters; tokens shorter than 3 characters are
 * ignored. The split runs BEFORE lower-casing: lower-casing first erases the
 * case information the camelCase rule depends on, leaving names such as
 * `testParseConfig` as one unmatched token.
 *
 * @param {object} r - Search result whose name is inspected.
 * @param {Set<string>|string[]} queryTokens - Lower-case query tokens. An
 *   Array is accepted and lower-cased; any other non-Set value yields 0.
 * @returns {number} Value in [0, 1]; 0 when the name has no usable tokens.
 */
export function testNameQueryOverlap(r, queryTokens) {
  const rawName = resolveResultName(r);
  if (!rawName) return 0;

  let wanted;
  if (queryTokens instanceof Set) {
    wanted = queryTokens;
  } else if (Array.isArray(queryTokens)) {
    wanted = new Set(queryTokens.map((t) => String(t).toLowerCase()));
  } else {
    return 0;
  }

  const nameTokens = String(rawName)
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .replace(/([a-zA-Z])(\d)/g, '$1 $2')
    .toLowerCase()
    .split(/[_\W]+/)
    .filter((t) => t.length >= 3);
  if (nameTokens.length === 0) return 0;

  const hits = nameTokens.filter((token) => wanted.has(token)).length;
  return hits / nameTokens.length;
}
452
+
453
/**
 * Pick the entity-kind bucket a query asks for, if any.
 *
 * Buckets of ENTITY_KIND_KEYWORDS are scanned in declaration order and the
 * first bucket with a keyword appearing as a whole word in the query wins.
 *
 * @param {string} query
 * @returns {string|null} Bucket name, or null when no keyword is present.
 */
export function entityKindPreferenceFromQuery(query) {
  const q = String(query || '').toLowerCase();
  const mentions = (keyword) => new RegExp(`\\b${keyword.toLowerCase()}\\b`, 'i').test(q);

  const match = Object.entries(ENTITY_KIND_KEYWORDS)
    .find(([, keywords]) => keywords.some((keyword) => mentions(keyword)));
  return match ? match[0] : null;
}
462
+
463
/**
 * Collect identifier-like tokens from a query that could name a symbol.
 *
 * A token is kept when it is at least 3 characters long, is not a language
 * keyword (case-sensitive) or stopword (case-insensitive), and either
 * contains an uppercase letter or is at least 4 characters long.
 *
 * @param {string} query
 * @returns {Set<string>} Tokens in their original casing, in query order.
 */
export function extractNameHints(query) {
  const candidates = String(query || '').match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];

  const looksLikeName = (token) => {
    if (token.length < 3) return false;
    if (LANG_KEYWORDS.has(token)) return false;
    if (STOPWORDS.has(token.toLowerCase())) return false;
    return /[A-Z]/.test(token) || token.length >= 4;
  };

  return new Set(candidates.filter(looksLikeName));
}
474
+
475
/**
 * Break an identifier into lower-case words, splitting at lower/digit→upper
 * camelCase humps, underscores and non-word characters. Empty pieces are
 * dropped.
 *
 * @param {string} name
 * @returns {string[]}
 */
function splitIdentifierName(name) {
  const spaced = String(name || '').replace(/([a-z0-9])([A-Z])/g, '$1 $2');
  const words = [];
  for (const piece of spaced.split(/[_\W]+/)) {
    const lowered = piece.toLowerCase();
    if (lowered) words.push(lowered);
  }
  return words;
}
482
+
483
/**
 * Resolve the code entity that a result belongs to.
 *
 * Lookup order:
 *   1. `opts.codeGraphRepo.findEnclosingEntity` for the full span, then for
 *      the start line alone;
 *   2. `opts.codeGraphRepo.findFirstEntityInRange` when available;
 *   3. a `{ type }` object inferred from the result text.
 * Repository errors are ignored and fall through to step 3.
 *
 * With `opts._entityKindCache` supplied, the outcome (including null) is
 * memoized per `file|start|end`.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - May carry `codeGraphRepo` and `_entityKindCache`.
 * @returns {object|null} Entity with a `type`, or null when undetermined.
 */
function resolveEntityKindInfo(r, opts = {}) {
  const file = resolveFilePath(r);
  const meta = r?.metadata || {};
  const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;
  const hasLocation = Boolean(file) && Number.isFinite(start);
  const spanEnd = Number.isFinite(end) ? end : start;

  const memo = opts._entityKindCache;
  const cacheKey = memo && hasLocation ? `${file}|${start}|${spanEnd}` : null;
  if (cacheKey !== null && memo.has(cacheKey)) return memo.get(cacheKey);

  let info = null;
  const repo = opts.codeGraphRepo;
  if (repo && hasLocation) {
    try {
      const enclosing = repo.findEnclosingEntity(file, start, spanEnd)
        || repo.findEnclosingEntity(file, start, start);
      if (enclosing?.type) {
        info = enclosing;
      } else if (typeof repo.findFirstEntityInRange === 'function' && Number.isFinite(end)) {
        const first = repo.findFirstEntityInRange(file, start, end);
        if (first?.type) info = first;
      }
    } catch {
      // Graph lookup is best-effort; fall back to inference from the text.
    }
  }

  if (!info) {
    const inferred = inferEntityKindFromText(resolveResultText(r, opts));
    info = inferred ? { type: inferred } : null;
  }

  if (cacheKey !== null) memo.set(cacheKey, info);
  return info;
}
521
+
522
+ // Boost magnitudes are env-tunable so we can ablate without re-deploying.
523
+ // Defaults softened (2026-05-05) from (1.25, 0.85, 1.20, 1.05) to
524
+ // (1.10, 0.90, 1.10, 1.03) after a 16-query 3-config ablation showed
525
+ // 15 of 16 top-1 results unchanged at the lower magnitudes — less
526
+ // leverage = less interaction risk with name-precision and other
527
+ // signals, with no observed quality loss. The stronger old values
528
+ // remain reachable via env vars if a future probe shows they help.
529
/**
 * Read a strictly positive, finite number from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} dflt - Value returned when the variable is unset, empty,
 *   not numeric, non-finite, or not greater than zero.
 * @returns {number} Parsed value or `dflt`.
 */
function envFloat(name, dflt) {
  const raw = process.env[name];
  if (raw === undefined || raw === null || raw === '') return dflt;
  const parsed = Number(raw);
  if (!Number.isFinite(parsed) || parsed <= 0) return dflt;
  return parsed;
}
535
+
536
/**
 * Score multiplier based on whether a result's entity kind matches the kind
 * the query prefers.
 *
 * The result's recorded type wins unless it is empty or one of the generic
 * labels `code` / `chunk`; in that case the type resolved by
 * `resolveEntityKindInfo` is used instead.
 *
 * @param {object} r - Search result.
 * @param {string} preferred - Preferred kind (key of ENTITY_KIND_KEYWORDS); falsy disables.
 * @param {object} [opts] - Forwarded to `resolveEntityKindInfo`.
 * @returns {number} SWEET_SEARCH_KIND_BOOST (default 1.10) on a match,
 *   SWEET_SEARCH_KIND_DEMOTE (default 0.90) for impl/method/function results
 *   when something other than `function` was preferred, otherwise 1.
 */
function entityKindMultiplier(r, preferred, opts = {}) {
  if (!preferred) return 1;
  const boost = envFloat('SWEET_SEARCH_KIND_BOOST', 1.10);
  const demote = envFloat('SWEET_SEARCH_KIND_DEMOTE', 0.90);
  const wanted = new Set((ENTITY_KIND_KEYWORDS[preferred] || []).map(normalizeType));

  const inferredType = resolveEntityKindInfo(r, opts)?.type || '';
  const recordedType = normalizeType(resolveResultType(r));
  const recordedIsSpecific = Boolean(recordedType)
    && recordedType !== 'code'
    && recordedType !== 'chunk';
  const kind = recordedIsSpecific ? recordedType : normalizeType(inferredType);

  // A `typealias` entity satisfies a query that asked for a `type`.
  const matchesPreferred = wanted.has(kind) || (kind === 'typealias' && preferred === 'type');
  if (matchesPreferred) return boost;

  const isCallableKind = kind === 'impl' || kind === 'method' || kind === 'function';
  return isCallableKind && preferred !== 'function' ? demote : 1;
}
548
+
549
/**
 * Score multiplier rewarding results whose entity name matches a name hint
 * from the query, applied only when the result's kind is the preferred one.
 *
 * @param {object} r - Search result.
 * @param {string} preferred - Preferred kind (key of ENTITY_KIND_KEYWORDS); falsy disables.
 * @param {Set<string>} nameHintsLower - Lowercased name hints; empty disables.
 * @param {object} [opts] - Forwarded to `resolveEntityKindInfo`.
 * @returns {number} SWEET_SEARCH_NAME_EXACT_BOOST (default 1.10) when the
 *   whole name equals a hint, SWEET_SEARCH_NAME_SUBSTR_BOOST (default 1.03)
 *   when one of the name's word parts equals a hint, otherwise 1.
 */
function namePrecisionMultiplier(r, preferred, nameHintsLower, opts = {}) {
  if (!preferred || nameHintsLower.size === 0) return 1;
  const exactBoost = envFloat('SWEET_SEARCH_NAME_EXACT_BOOST', 1.10);
  const partBoost = envFloat('SWEET_SEARCH_NAME_SUBSTR_BOOST', 1.03);
  const wanted = new Set((ENTITY_KIND_KEYWORDS[preferred] || []).map(normalizeType));

  const entityInfo = resolveEntityKindInfo(r, opts);
  const recordedType = normalizeType(resolveResultType(r));
  const recordedIsSpecific = Boolean(recordedType)
    && recordedType !== 'code'
    && recordedType !== 'chunk';
  const kind = recordedIsSpecific ? recordedType : normalizeType(entityInfo?.type);

  // Name precision only matters for results that are already the right kind.
  const kindMatches = wanted.has(kind) || (kind === 'typealias' && preferred === 'type');
  if (!kindMatches) return 1;

  const name = resolveResultName(r) || entityInfo?.name || '';
  if (!name) return 1;
  if (nameHintsLower.has(name.toLowerCase())) return exactBoost;

  const nameParts = splitIdentifierName(name);
  const partMatched = [...nameHintsLower].some((hint) => nameParts.includes(hint));
  return partMatched ? partBoost : 1;
}
570
+
571
/**
 * Look up a code-graph entity of the preferred kind whose name equals one of
 * the query's name hints and which lives in the same file as the result.
 *
 * Uses the repo's case-insensitive lookup when it exists, otherwise the
 * case-sensitive `findEntitiesByNames` with the original-cased hints. Both
 * lookups are capped at 16 rows. Repo errors are swallowed and reported as
 * "no entity".
 *
 * @param {object} r - Search result.
 * @param {string} preferred - Preferred kind (key of ENTITY_KIND_KEYWORDS).
 * @param {Iterable<string>} nameHints - Hints with original casing.
 * @param {Set<string>} nameHintsLower - Lowercased hints.
 * @param {object} [opts] - Must carry `codeGraphRepo` for a lookup to happen.
 * @returns {object|null} Matching entity or null.
 */
function exactNamedEntityForResult(r, preferred, nameHints, nameHintsLower, opts = {}) {
  const repo = opts.codeGraphRepo;
  if (!repo || !preferred || nameHintsLower.size === 0) return null;
  const file = resolveFilePath(r);
  if (!file) return null;

  const lookupOptions = {
    types: ENTITY_KIND_KEYWORDS[preferred] || [],
    limit: 16,
  };
  try {
    let found;
    if (typeof repo.findEntitiesByNamesCaseInsensitive === 'function') {
      found = repo.findEntitiesByNamesCaseInsensitive([...nameHintsLower], lookupOptions);
    } else {
      found = repo.findEntitiesByNames([...nameHints], lookupOptions);
    }
    const entities = found || [];
    const inSameFile = entities.find((entity) => {
      const entityFile = entity.filePath || entity.file;
      if (entityFile !== file) return false;
      return nameHintsLower.has(String(entity.name || '').toLowerCase());
    });
    return inSameFile || null;
  } catch {
    // Graph lookups are best-effort; treat failures as "not found".
    return null;
  }
}
594
+
595
/**
 * Guess an entity kind from the declaration keywords found at the start of
 * any line in a chunk of source text.
 *
 * Patterns are tried in a fixed order and the first hit wins, so a chunk that
 * contains both a struct and an enum reports `enum`.
 *
 * @param {string} text - Chunk source text.
 * @returns {string} One of 'enum', 'struct', 'trait', 'impl', 'class',
 *   'interface', 'typealias', or '' when nothing matches / text is empty.
 */
function inferEntityKindFromText(text) {
  if (!text) return '';
  const declarationPatterns = [
    ['enum', /^\s*(?:pub(?:\([^)]*\))?\s+)?enum\s+\w+/m],
    ['struct', /^\s*(?:pub(?:\([^)]*\))?\s+)?struct\s+\w+/m],
    ['trait', /^\s*(?:pub(?:\([^)]*\))?\s+)?trait\s+\w+/m],
    ['impl', /^\s*impl(?:\s*<[^>]+>)?\s+\w+/m],
    ['class', /^\s*(?:export\s+)?(?:abstract\s+)?class\s+\w+/m],
    ['interface', /^\s*(?:export\s+)?interface\s+\w+/m],
    ['typealias', /^\s*(?:export\s+)?type\s+\w+\s*=/m],
  ];
  for (const [kind, pattern] of declarationPatterns) {
    if (pattern.test(text)) return kind;
  }
  return '';
}
606
+
607
+ // Declarative / doc-string-heavy chunk demotion (added 2026-05-05).
608
+ //
609
+ // Three narrow, independent content-shape triggers — each catches a specific
610
+ // failure shape observed in the May-05 novel-probe analysis:
611
+ //
612
+ // T1. Declarative-entity demotion. When the chunk's primary entity type is
613
+ // `namespace`, `interface`, or `typeAlias`, the chunk is by definition
614
+ // a declaration block — signatures / property decls without behaviour.
615
+ // Such chunks should not outrank `function`/`impl` chunks for
616
+ // procedural queries. Catches the .d.ts namespace / interface case.
617
+ //
618
+ // T2. Raw-string-dominant impl. When > 50 % of an `impl` chunk's non-blank
619
+ // characters live inside Rust raw-string literals (`r#"..."#`,
620
+ // `r"..."`), the chunk is mostly documentation. Catches clap-style
621
+ // flag impls whose `doc_long()` returns a 30-line description (e.g.
622
+ // `impl Flag for SearchZip`).
623
+ //
624
+ // T3. Stub-impl. Multiple `fn` definitions in an `impl` chunk with avg
625
+ // body line count < 4. Catches clap-style impls whose individual
626
+ // `doc_long()` is small enough to escape T2 but whose methods are
627
+ // still mostly 1-line literal returns (e.g. `impl Flag for
628
+ // CaseSensitive`).
629
+ //
630
+ // All three triggers are intent-gated to `implementation` queries, so a
631
+ // phrasing like "what is the FastifyInstance interface" — which legitimately
632
+ // wants a declaration — is unaffected. T2/T3 are also restricted to chunks
633
+ // whose primary entity type is `impl` to avoid touching anything outside
634
+ // the Rust idiom we're targeting.
635
+ //
636
+ // Defaults are conservative. An earlier "execution density" heuristic
637
+ // (penalise any chunk with low control-flow ratio) over-fired on data-
638
+ // declaration chunks like `lib/errors.js` constant tables, which are the
639
+ // genuinely-correct answer for "how does Fastify handle errors". The
640
+ // triggers here are shape-specific instead of density-specific.
641
+ //
642
+ // Disable everything with `ablations: 'no-body-density'` or
643
+ // SWEET_SEARCH_BODY_DENSITY=0; per-trigger overrides via
644
+ // SWEET_SEARCH_DECLARATIVE_FACTOR / SWEET_SEARCH_RAWSTRING_FACTOR /
645
+ // SWEET_SEARCH_STUB_FACTOR.
646
// Normalized entity types treated as pure declaration blocks.
const DECLARATIVE_ENTITY_TYPES = new Set(['namespace', 'interface', 'typealias']);

/**
 * Read a factor in the closed range [0, 1] from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} dflt - Value returned when the variable is unset, empty,
 *   not numeric, or outside [0, 1].
 * @returns {number} Parsed value or `dflt`.
 */
function envFloatRange(name, dflt) {
  const raw = process.env[name];
  if (raw === undefined || raw === null || raw === '') return dflt;
  const parsed = Number(raw);
  const inUnitRange = Number.isFinite(parsed) && parsed >= 0 && parsed <= 1;
  return inUnitRange ? parsed : dflt;
}
654
+
655
+ /**
656
+ * Detect whether a Rust `impl` chunk is a "stub impl" — fn definitions with
657
+ * no real body. Catches two patterns:
658
+ *
659
+ * (A) MULTI-METHOD stubs (original ac280d4 case): clap-style flag-arg impls
660
+ * where every method is a 1-line literal return (e.g. `impl Flag for
661
+ * CaseSensitive` whose 6 methods total ~6 body lines), independent of
662
+ * whether `doc_long` carries a big raw-string description.
663
+ *
664
+ * (B) SINGLE-METHOD trivial-body stubs (added 2026-05-07 — FreshStack uv
665
+ * UV-FLOW-8 diagnosis): derive-equivalent impls like
666
+ * `impl Clone for X { fn clone(&self) -> Self { Self {...} } }` with a
667
+ * body of < 2 substantive lines. The original rule required ≥2 fns and
668
+ * missed these single-method derive-style impls. Worth being conservative
669
+ * here — Display::fmt is usually 3+ lines, From::from sometimes IS 1
670
+ * line and is genuinely trivial. The 1.5-line cutoff fires only on
671
+ * truly stub-grade single-fns (closer to derive macros than real impls).
672
+ *
673
+ * Returns the estimated average body line count, or `Infinity` if the chunk
674
+ * contains no fn definitions. Lower = more stub-like.
675
+ */
676
+ export function avgFnBodyLines(text) {
677
+ if (typeof text !== 'string' || text.length === 0) return Infinity;
678
+ const fnRe = /^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+|const\s+|unsafe\s+)*fn\s+\w+/gm;
679
+ const matches = [];
680
+ let m;
681
+ while ((m = fnRe.exec(text)) !== null) matches.push(m.index);
682
+ if (matches.length === 0) return Infinity;
683
+ let totalBodyLines = 0;
684
+ let counted = 0;
685
+ for (const startIdx of matches) {
686
+ // Find the opening `{` after this fn signature.
687
+ const openIdx = text.indexOf('{', startIdx);
688
+ if (openIdx === -1) continue;
689
+ // Walk braces to find the matching close.
690
+ let depth = 1;
691
+ let j = openIdx + 1;
692
+ let inString = false;
693
+ let stringTerm = null;
694
+ while (j < text.length && depth > 0) {
695
+ const ch = text[j];
696
+ if (inString) {
697
+ if (ch === '\\') { j += 2; continue; }
698
+ if (ch === stringTerm) inString = false;
699
+ } else {
700
+ if (ch === '"' || ch === "'") { inString = true; stringTerm = ch; }
701
+ else if (ch === '{') depth++;
702
+ else if (ch === '}') depth--;
703
+ }
704
+ j++;
705
+ }
706
+ if (depth !== 0) continue;
707
+ const body = text.slice(openIdx + 1, j - 1);
708
+ const bodyLines = body.split('\n').filter(l => l.trim().length > 0).length;
709
+ totalBodyLines += bodyLines;
710
+ counted++;
711
+ }
712
+ if (counted === 0) return Infinity;
713
+ // Single-fn impls with ≤1.5 substantive body lines (1 trivial line plus
714
+ // the closing brace, or a 1-line `Self { ... }` body) are derive-equivalent
715
+ // stubs (UV-FLOW-8 case: `impl Clone for X { fn clone(&self) -> Self { Self {...} } }`).
716
+ // Multi-fn impls keep the original average-body rule.
717
+ if (counted === 1) {
718
+ return totalBodyLines <= 1.5 ? totalBodyLines : Infinity;
719
+ }
720
+ return totalBodyLines / counted;
721
+ }
722
+
723
+ /**
724
+ * Estimate the fraction of a chunk's characters that live inside Rust
725
+ * raw-string literals. Returns a number in [0, 1].
726
+ *
727
+ * Heuristic: scan the text once tracking entry into `r#"`/`r"` regions and
728
+ * exit at the matching `"#`/`"`. Counts only the inner payload chars.
729
+ */
730
+ export function rawStringDensity(text) {
731
+ if (typeof text !== 'string' || text.length === 0) return 0;
732
+ let i = 0;
733
+ let inside = 0;
734
+ let total = 0;
735
+ const len = text.length;
736
+ while (i < len) {
737
+ if (!/\s/.test(text[i])) total++;
738
+ // Detect `r#*"` opener.
739
+ if (text[i] === 'r' && (text[i + 1] === '"' || text[i + 1] === '#')) {
740
+ let j = i + 1;
741
+ let hashCount = 0;
742
+ while (text[j] === '#') { hashCount++; j++; }
743
+ if (text[j] === '"') {
744
+ // We're inside a raw string. Find the matching close.
745
+ const closeNeedle = '"' + '#'.repeat(hashCount);
746
+ const closeAt = text.indexOf(closeNeedle, j + 1);
747
+ if (closeAt === -1) {
748
+ // unterminated — count rest of file as inside
749
+ for (let k = j + 1; k < len; k++) {
750
+ if (!/\s/.test(text[k])) { inside++; total++; }
751
+ }
752
+ return total === 0 ? 0 : inside / total;
753
+ }
754
+ for (let k = j + 1; k < closeAt; k++) {
755
+ if (!/\s/.test(text[k])) { inside++; total++; }
756
+ }
757
+ i = closeAt + closeNeedle.length;
758
+ continue;
759
+ }
760
+ }
761
+ i++;
762
+ }
763
+ return total === 0 ? 0 : inside / total;
764
+ }
765
+
766
+ /**
767
+ * Mega-chunk size penalty (added 2026-05-07 — 60-probe diagnosis).
768
+ *
769
+ * Long candidate chunks (entire 1500-line classes, 700-line module
770
+ * functions) systematically outscore precise 30-line chunks even when the
771
+ * latter contain the actual answer. The dense bi-encoder doesn't penalise
772
+ * length the way BM25's `b` parameter does, so a mega-chunk that touches
773
+ * many topics earns a moderate similarity to many queries.
774
+ *
775
+ * SOTA precedent: BM25 length normalization (Robertson & Zaragoza 2009),
776
+ * subsequently incorporated as length penalties in dense rerankers
777
+ * (ColBERTv2 token-budget caps, MS-MARCO-tuned cross-encoders). Soft
778
+ * piecewise-linear here rather than `1/(1 + b·L/L_avg)` because (a) we
779
+ * lack a per-corpus L_avg estimate at query time and (b) BM25-style
780
+ * normalization is too aggressive for long behavioural-flow chunks where
781
+ * length carries some signal.
782
+ *
783
+ * Tuning floor/slope to be PERMISSIVE — ONLY truly mega chunks lose score:
784
+ * - L ≤ 500 lines → factor 1.0 (no penalty — every reasonable function chunk)
785
+ * - L = 800 → ~0.91 (typical large class)
786
+ * - L = 1000 → ~0.85
787
+ * - L ≥ 1500 → 0.80 (floor — entire-file chunks)
788
+ *
789
+ * Tightened cutoff from 200 → 500 after S6-Q6 gin regression: a 40-line
790
+ * `New` function had been the right top-1, but penalising 200+ chunks
791
+ * shifted the within-file ranking. 500-line cutoff exempts every legit
792
+ * function/method chunk and only demotes whole-class megachunks.
793
+ *
794
+ * Override via env: SWEET_SEARCH_MEGA_CHUNK_CUTOFF (default 500),
795
+ * SWEET_SEARCH_MEGA_CHUNK_SLOPE (default 0.0003 per-line),
796
+ * SWEET_SEARCH_MEGA_CHUNK_FLOOR (default 0.80). Disable via
797
+ * SWEET_SEARCH_MEGA_CHUNK_FLOOR=1 (no-op) or
798
+ * `ablations: ['no-mega-chunk-penalty']`.
799
+ *
800
+ * Diagnosed cases (60-probe new-set):
801
+ * - S5-Q10 flask: 1516-line `class Flask` chunk beat 30-line `abort` fn
802
+ * - S4-Q2 fastify: 735-line `function fastify` chunk beat 1-line
803
+ * `kRouteContext` symbol declaration
804
+ */
805
+ /**
806
+ * Symbol-exact-match boost for definition-style queries.
807
+ *
808
+ * Added 2026-05-07 — both diagnoses (FreshStack uv #1, 60-probe new-set #1)
809
+ * converged on this as the highest-impact fix. When a query has the shape
810
+ * "show me X struct/enum/class/function/...", chunks where the symbol name
811
+ * EQUALS X (case-insensitive, after stemming s/es/ing suffixes) should
812
+ * dominate the lexical-collision sibling chunk that the encoder happens
813
+ * to score nearby.
814
+ *
815
+ * Diagnosed cases (combined): Cache vs CacheArgs (UV-DEF-1), Resolver vs
816
+ * Resolution (UV-DEF-4), ContentTypeParser vs ContentType (S6-Q2),
817
+ * Flask vs App (S6-Q9), buildErrorHandler vs setErrorHeaders (S6-Q3),
818
+ * Set method vs Value method (S3-Q6), get_send_file_max_age vs
819
+ * send_static_file (S3-Q9). 8+ failures in the new-probe set, 4 in
820
+ * FreshStack — strong evidence of a real systematic gap.
821
+ *
822
+ * SOTA precedent: BM25F field-weighted boosting on the symbol field
823
+ * (canonical IR move when one field carries decisive signal); ColBERTv2
824
+ * "expansion-aware reranking" with identifier prior; Sourcegraph Cody's
825
+ * "hint" tokens that bias toward exact symbol matches in graph-aware
826
+ * retrieval (Cody arXiv 2408.05344).
827
+ *
828
+ * Trigger pattern (conservative — only fires on UNAMBIGUOUS definition
829
+ * queries):
830
+ * /\b(show|give|find|describe|display|fetch).+?(?:the\s+)?(\w+)\s+
831
+ * (struct|enum|class|fn|function|method|trait|type|interface|impl|
832
+ * definition|signature|prototype|constructor)\b/i
833
+ *
834
+ * Plus a "WHAT IS X TYPE" alternate trigger:
835
+ * /\bwhat\s+(?:is|does)\s+(?:the\s+)?(\w+)\s+
836
+ * (struct|enum|class|function|method|type)\b/i
837
+ *
838
+ * Boost: 1.30× when chunk.symbol case-insensitive-equals the captured
839
+ * identifier. Capped at 1.30 (mild — definition queries account for ≤25%
840
+ * of probe traffic so a stronger boost risks breaking non-DEF queries).
841
+ *
842
+ * Override env: SWEET_SEARCH_SYMBOL_EXACT_BOOST (default 1.30, set to 1.0
843
+ * to disable). `ablations: ['no-symbol-exact-boost']` also disables.
844
+ */
845
+ // Lazy quantifier on the prefix so the capture greedily prefers an
846
+ // identifier-like noun (buildErrorHandler) over a keyword that happens
847
+ // to also be in the trailing list (function/definition). Verified
848
+ // 2026-05-07: greedy version captured "function" for
849
+ // "show me the buildErrorHandler function definition in full",
850
+ // missing the contained-entity boost on S6-Q3. But lazy also fails on
851
+ // "show me the full Engine struct" (captures "the"/"full") — which is
852
+ // why extractSymbolDefinitionTarget tries lazy first and falls back to
853
+ // greedy when the lazy capture is a stopword.
854
// Verb-anchored definition query, LAZY prefix: up to five filler words after
// the verb are consumed reluctantly, so the capture lands on the earliest
// word that still lets the trailing kind keyword match.
const SYMBOL_DEFN_QUERY_RE = new RegExp(
  [
    String.raw`\b(?:show|give|find|describe|display|fetch|see)`,
    String.raw`(?:\s+\w+){0,5}?\s+`,
    String.raw`(?:the\s+)?`,
    String.raw`(\w+)`,
    String.raw`(?:\s+\w+)?\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);

// Same pattern with a GREEDY prefix: filler words are consumed eagerly, so
// the capture lands on the latest word that still lets the pattern match.
const SYMBOL_DEFN_QUERY_RE_GREEDY = new RegExp(
  [
    String.raw`\b(?:show|give|find|describe|display|fetch|see)`,
    String.raw`(?:\s+\w+){0,5}\s+`,
    String.raw`(?:the\s+)?`,
    String.raw`(\w+)`,
    String.raw`(?:\s+\w+)?\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);

// "what is/does/are [the] X <kind>" — the kind list also accepts
// architecture nouns (renderer, handler, component, ...).
const SYMBOL_WHATIS_QUERY_RE = new RegExp(
  [
    String.raw`\bwhat\s+(?:is|does|are)\s+(?:the\s+)?`,
    String.raw`(\w+)\s+`,
    String.raw`(?:struct|enum|class|function|method|type|trait|interface|`,
    String.raw`renderer|handler|component|service|module|controller|provider|builder)\b`,
  ].join(''),
  'i'
);

// "where is/does [the] X <kind>" — covers probe-style queries such as
// "where is the Default function..." and "where is the Set method on
// Context...", which the verb-anchored patterns above do not match.
const SYMBOL_WHERE_QUERY_RE = new RegExp(
  [
    String.raw`\bwhere\s+(?:is|does)\s+(?:the\s+)?`,
    String.raw`(\w+)\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);
892
+
893
+ // Identifier-shape heuristic: code identifiers across all languages
894
+ // commonly use one of: uppercase letters (PascalCase / camelCase),
895
+ // underscores (snake_case), hyphens (kebab-case), or digits. Plain
896
+ // English adjectives / determiners ("the", "complete", "every") fall
897
+ // outside this shape. This is more principled than a curated stopword
898
+ // list — it generalizes to non-English languages, avoids removing
899
+ // real lowercase identifiers like Rust `lock` / Python `commit` (which
900
+ // stay as final-fallback when no identifier-shape candidate exists),
901
+ // and doesn't require maintaining a word list. Long-term, swap for a
902
+ // small POS classifier if false-positive identifier captures appear.
903
/**
 * Loose identifier-shape test: at least 3 characters long and containing an
 * uppercase ASCII letter, underscore, hyphen, or digit.
 *
 * @param {string} name - Candidate word.
 * @returns {boolean} True when the word has identifier shape.
 */
function looksLikeIdentifier(name) {
  const tooShort = !name || name.length < 3;
  return tooShort ? false : /[A-Z_\-0-9]/.test(name);
}
907
+
908
/**
 * Pull the symbol a definition-style query is asking about.
 *
 * Each trigger pattern (lazy verb-anchored, greedy verb-anchored, "what is",
 * "where is") contributes its first capture group when that capture is at
 * least 3 characters long. An identifier-shaped capture is preferred;
 * otherwise the first capture is returned, so all-lowercase identifiers
 * still work.
 *
 * @param {string} query - Raw user query.
 * @returns {string|null} Captured symbol, or null when the query is not a
 *   non-empty string or no pattern yields a usable capture.
 */
function extractSymbolDefinitionTarget(query) {
  if (!query || typeof query !== 'string') return null;
  const patterns = [
    SYMBOL_DEFN_QUERY_RE,
    SYMBOL_DEFN_QUERY_RE_GREEDY,
    SYMBOL_WHATIS_QUERY_RE,
    SYMBOL_WHERE_QUERY_RE,
  ];
  const captures = patterns
    .map((re) => query.match(re)?.[1])
    .filter((capture) => capture && capture.length >= 3);
  if (captures.length === 0) return null;
  return captures.find((capture) => looksLikeIdentifier(capture)) ?? captures[0];
}
923
+
924
+ /**
925
+ * Strict identifier-shape filter for query-token-to-symbol-name matching.
926
+ *
927
+ * Distinguishes "code identifier" from "English word that happens to start
928
+ * uppercase". A token is strict-identifier-shaped if it is ≥4 chars AND
929
+ * has one of:
930
+ * - digit (Vec128, AVX2, std::int32_t)
931
+ * - underscore (HWY_DLLEXPORT, snake_case_thing, _Alignas)
932
+ * - internal uppercase beyond the first character (FunctionCache,
933
+ * AlignedDeleter, DetectTargets — true camelCase / PascalCase)
934
+ *
935
+ * This excludes single-cap English nouns: "Type", "Class", "Vector", "Map",
936
+ * "Set", "Function", "Method", "Component" — none have a digit,
937
+ * underscore, or internal uppercase, so they fail the structural check.
938
+ * Excludes 3-char tokens like "SSE", "x86", "AVX" — these are domain
939
+ * acronyms that risk false-matching unrelated short symbols. The 1.15× boost
940
+ * trades off vs the 1.30× of the verb-anchored extractor: lower precision,
941
+ * higher recall on noun-anchored probe queries.
942
+ */
943
/**
 * Strict identifier-shape test: at least 4 characters long and containing a
 * digit, an underscore, or an uppercase ASCII letter anywhere after the
 * first character. A leading capital alone does not qualify.
 *
 * @param {string} token - Candidate word.
 * @returns {boolean} True when the token has strict identifier shape.
 */
function looksLikeStrictIdentifier(token) {
  if (!token || token.length < 4) return false;
  if (/[\d_]/.test(token)) return true;
  // Internal uppercase: scan every position except index 0.
  for (let i = token.length - 1; i >= 1; i -= 1) {
    const ch = token[i];
    if (ch >= 'A' && ch <= 'Z') return true;
  }
  return false;
}
955
+
956
+ // =============================================================================
957
+ // F9 (2026-05-12): additional_symbols re-anchoring for cAST sibling-merged
958
+ // chunks. JS/TS-gated. Pure ranking-time metadata fix — no chunk regeneration,
959
+ // no reindex required.
960
+ //
961
+ // Motivation: cAST sibling-merge collapses ≥2 top-level boundaries into one
962
+ // chunk and attributes the chunk to the FIRST boundary's name. Probe failures
963
+ // like TS-006 (chunk named `SlashCommand` but expected `slashCommands`) and
964
+ // TS-008 (chunk named `regularPrompt` but expected `systemPrompt`) are
965
+ // structurally PARTIAL — file is correct, symbol is the wrong sibling.
966
+ //
967
+ // The chunker already records secondary boundary names in
968
+ // `metadata.additional_symbols` (tree-sitter-provider.js:928-934). F9
969
+ // promotes the best-matching sibling to the chunk's primary label when the
970
+ // query references it more strongly than the original primary.
971
+ //
972
+ // SOTA references:
973
+ // - Sourcegraph BM25F (2025): "treat symbols as a multi-valued field"
974
+ // - Supermemory code-chunk: explicit scope-tree carries secondary entities
975
+ // per chunk, not just the head boundary
976
+ // - cAST (arXiv 2506.15655): acknowledges the sibling-merge attribution
977
+ // gap as a known limitation
978
+ //
979
+ // Pilot scope: JS/TS/TSX/JSX only. The mechanism generalizes to every
980
+ // language but per-language gating limits the validation surface for the
981
+ // initial rollout. Promote to additional languages once probes confirm gain
982
+ // on JS/TS with zero regressions on JS/TS + GCSN.
983
+ // =============================================================================
984
+
985
// Language tags and file extensions treated as JavaScript / TypeScript.
const JSTS_LANGS = new Set(['javascript', 'typescript', 'tsx', 'jsx']);
const JSTS_EXTENSIONS = new Set(['.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx', '.mts', '.cts']);

/**
 * Decide whether a search result comes from a JavaScript/TypeScript file.
 *
 * `metadata.language` is trusted when it names a JS/TS language. Otherwise
 * the first available path (`file_path`, `file`, `metadata.file`,
 * `metadata.path`, `filePath`) is checked by its lowercased extension, so
 * results that carry no language tag can still be classified.
 *
 * @param {object} result - Search result (may be null/undefined).
 * @returns {boolean} True for JS/TS results.
 */
function isJsTsResult(result) {
  const meta = result?.metadata ?? {};
  if (meta.language && JSTS_LANGS.has(meta.language)) return true;

  const pathCandidates = [
    result?.file_path,
    result?.file,
    meta.file,
    meta.path,
    result?.filePath,
  ];
  const fp = pathCandidates.find(Boolean) || '';
  if (!fp) return false;

  const dot = fp.lastIndexOf('.');
  return dot >= 0 && JSTS_EXTENSIONS.has(fp.slice(dot).toLowerCase());
}
999
+
1000
/**
 * Break a camelCase / PascalCase / snake_case / kebab-case identifier into
 * lowercase sub-tokens, dropping `$` characters and any fragment shorter
 * than two characters.
 *
 *   slashCommands          → ['slash', 'commands']
 *   SlashCommand           → ['slash', 'command']
 *   entitlementsByUserType → ['entitlements', 'by', 'user', 'type']
 *   $ZodTypeInternals      → ['zod', 'type', 'internals']
 *   HTTPSConnection        → ['https', 'connection']
 *
 * @param {*} name - Identifier; falsy values yield an empty array.
 * @returns {string[]} Lowercased sub-tokens in source order.
 */
function splitCamelCaseTokens(name) {
  if (!name) return [];
  let spaced = String(name).replace(/\$/g, '');              // drop `$` (zod-style prefix)
  spaced = spaced.replace(/([a-z0-9])([A-Z])/g, '$1 $2');    // fooBar → foo Bar
  spaced = spaced.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2'); // HTTPSConn → HTTPS Conn
  spaced = spaced.replace(/[_\-]/g, ' ');                    // snake / kebab separators

  const tokens = [];
  for (const fragment of spaced.toLowerCase().split(/\s+/)) {
    if (fragment.length >= 2) tokens.push(fragment);
  }
  return tokens;
}
1021
+
1022
/**
 * Decide whether an identifier sub-token and a query word refer to the same
 * word. Equal strings always match; otherwise the shorter of the two must be
 * at least 4 characters long and be a prefix of the longer one, which covers
 * plural/singular and similar suffix variants without aliasing short words.
 *
 *   tokenMatches('error', 'errors') → true   (prefix, shorter ≥ 4)
 *   tokenMatches('type', 'type')    → true   (exact)
 *   tokenMatches('by', 'by')        → true   (exact, short is fine)
 *   tokenMatches('to', 'tokenize')  → false  (prefix, but shorter is too short)
 *   tokenMatches('try', 'tried')    → false  (shorter < 4)
 *
 * @param {string} token - Lowercased identifier sub-token.
 * @param {string} queryWord - Lowercased query word.
 * @returns {boolean} True when the two match.
 */
function tokenMatches(token, queryWord) {
  if (!token || !queryWord) return false;
  if (token === queryWord) return true;
  const [shorter, longer] = token.length <= queryWord.length
    ? [token, queryWord]
    : [queryWord, token];
  // Requiring 4+ characters on the shorter side also rules out every pair in
  // which either side is shorter than 3.
  if (shorter.length < 4) return false;
  return longer.startsWith(shorter);
}
1041
+
1042
/**
 * Grade how strongly a candidate entity name is referenced by the query.
 *
 *   tier 2 (literal): the full lowercased name — with or without its `$`
 *                     characters — is itself one of the query words.
 *   tier 1 (tokens):  every camelCase sub-token of the name is matched by
 *                     some query word (prefix/plural tolerant, see
 *                     `tokenMatches`).
 *   tier 0:           anything else; callers should abstain.
 *
 * The tier is reported separately from the token count so callers can insist
 * on a strict tier win: within one tier, raw token counts are too noisy to
 * express which of two mentioned names the user actually means.
 *
 * @param {string} name - Candidate entity name.
 * @param {string[]} queryWordsArr - Lowercased query words, in order.
 * @param {Set<string>} queryWordsSet - Same words as a set for literal lookup.
 * @returns {{tier: number, tokens: number}} `tokens` is the name's sub-token
 *   count for tiers 1-2 and 0 for tier 0.
 */
function scoreNameMatchTiered(name, queryWordsArr, queryWordsSet) {
  if (!name) return { tier: 0, tokens: 0 };
  const nameTokens = splitCamelCaseTokens(name);
  if (nameTokens.length === 0) return { tier: 0, tokens: 0 };

  // Tier 2: try the `$`-preserving spelling first so a query that mentions
  // "$ZodType" literal-matches `$ZodType` itself.
  const lowered = String(name).toLowerCase();
  const withoutDollar = lowered.replace(/\$/g, '');
  const isLiteralMention = queryWordsSet.has(lowered) || queryWordsSet.has(withoutDollar);
  if (isLiteralMention) return { tier: 2, tokens: nameTokens.length };

  // Tier 1: every sub-token must be covered by at least one query word.
  const fullyCovered = nameTokens.every(
    (part) => queryWordsArr.some((word) => tokenMatches(part, word))
  );
  return fullyCovered
    ? { tier: 1, tokens: nameTokens.length }
    : { tier: 0, tokens: 0 };
}
1081
+
1082
/**
 * Find a sibling entity inside a chunk's line range that the query
 * references more strongly than the chunk's current primary name.
 *
 * Background: a sibling-merged chunk is labelled with the FIRST boundary's
 * name even when the query is about a later sibling. This looks up every
 * entity the code graph records in the chunk's range, grades each name with
 * `scoreNameMatchTiered`, and proposes the best one as the new label.
 *
 * Applies to every language (there is no JS/TS gate). Returns null unless
 * ALL of the following hold:
 *   - `opts.codeGraphRepo.findEntitiesInRange` exists;
 *   - the result has a file path and finite start/end lines;
 *   - the range holds at least two entities;
 *   - the chunk already has a primary name (unnamed chunks are left alone so
 *     a later labelling step can name them);
 *   - the best sibling has strict-identifier shape and its match tier is
 *     STRICTLY higher than the primary's. Same-tier ties keep the primary.
 *
 * Entity lists are memoised in `opts._entityNameCache` (when provided) under
 * a `file|start|end|f9-entities` key. Repo errors are treated as "no entities".
 *
 * @param {object} result - Search result / chunk.
 * @param {string[]} queryWordsArr - Lowercased query words.
 * @param {Set<string>} queryWordsSet - Same words as a set.
 * @param {object} opts - Carries `codeGraphRepo` and optional `_entityNameCache`.
 * @returns {object|null} The code-graph entity to adopt as label, or null.
 */
function findAdditionalSymbolRelabel(result, queryWordsArr, queryWordsSet, opts) {
  const meta = result?.metadata ?? {};
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.findEntitiesInRange !== 'function') {
    return null;
  }

  const fp = resolveFilePath(result);
  const sl = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const el = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  const hasRange = Boolean(fp) && Number.isFinite(sl) && Number.isFinite(el);
  if (!hasRange) return null;

  // Entities for a (file, range) pair do not change within one query call.
  const cache = opts._entityNameCache;
  const entitiesCacheKey = cache ? `${fp}|${sl}|${el}|f9-entities` : null;
  let entities;
  if (entitiesCacheKey && cache.has(entitiesCacheKey)) {
    entities = cache.get(entitiesCacheKey);
  } else {
    try {
      entities = repo.findEntitiesInRange(fp, sl, el);
    } catch {
      entities = [];
    }
    if (entitiesCacheKey) cache.set(entitiesCacheKey, entities);
  }
  if (!Array.isArray(entities) || entities.length < 2) return null;

  // Only chunks that already carry an indexer-attributed name are eligible.
  const primaryName = meta.symbol || meta.name || result?.name || result?.symbol || null;
  if (!primaryName) return null;
  const primaryNameLower = String(primaryName).toLowerCase();
  const primaryMatch = scoreNameMatchTiered(primaryName, queryWordsArr, queryWordsSet);

  let bestEntity = null;
  let bestMatch = { tier: 0, tokens: 0 };
  for (const candidate of entities) {
    if (!candidate?.name) continue;
    const isPrimaryItself = Boolean(primaryNameLower)
      && String(candidate.name).toLowerCase() === primaryNameLower;
    if (isPrimaryItself) continue;
    // Never relabel to a plain English-word entity name.
    if (!looksLikeStrictIdentifier(candidate.name)) continue;

    const match = scoreNameMatchTiered(candidate.name, queryWordsArr, queryWordsSet);
    const higherTier = match.tier > bestMatch.tier;
    const sameTierMoreTokens = match.tier === bestMatch.tier && match.tokens > bestMatch.tokens;
    if (higherTier || sameTierMoreTokens) {
      bestMatch = match;
      bestEntity = candidate;
    }
  }

  // The sibling must beat the primary on tier, not merely tie it.
  const siblingWins = bestEntity !== null
    && bestMatch.tier !== 0
    && bestMatch.tier > primaryMatch.tier;
  return siblingWins ? bestEntity : null;
}
1170
+
1171
+ /**
1172
+ * Extract strict-identifier-shaped tokens from a query. Used by
1173
+ * identifierMentionBoost as a complement to extractSymbolDefinitionTarget:
1174
+ * the verb-anchored extractor catches "show me X struct" patterns; this
1175
+ * noun-anchored extractor catches "X with Y characteristic" probe-style
1176
+ * phrasings ("Vec128 SSE vector class template", "AlignedDeleter RAII class",
1177
+ * "FunctionCache template struct").
1178
+ *
1179
+ * Returns a Set of tokens. Returns null if no strict-identifier tokens found.
1180
+ */
1181
/**
 * Collect identifier-shaped mentions from a free-text query.
 *
 * @param {string} query - Raw query text.
 * @returns {Set<string>|null} Mentions in first-seen order (single tokens
 *   first, then dotted compounds), or null when the query is not a non-empty
 *   string or nothing qualifies.
 */
function extractIdentifierMentions(query) {
  if (typeof query !== 'string' || query.length === 0) return null;

  // A leading `$` is kept as part of the token so `$`-prefixed identifiers
  // (e.g. `$ZodType`) survive tokenization and can be matched verbatim.
  const singleTokens = query.match(/\$?[A-Za-z_][A-Za-z0-9_]*\b/g) ?? [];
  const found = new Set(
    singleTokens.filter((candidate) => looksLikeStrictIdentifier(candidate)),
  );

  // Dotted compounds (`tablex.deepcopy`, `os.path.exists`) are added whole and
  // are not passed through looksLikeStrictIdentifier: the single-token pass
  // above only ever sees their individual segments.
  const dottedTokens = query.match(/\b[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+\b/g) ?? [];
  dottedTokens.forEach((compound) => found.add(compound));

  return found.size === 0 ? null : found;
}
1208
+
1209
+ /**
1210
+ * Mild symbol-name boost triggered by strict-identifier mentions in the
1211
+ * query (complement to symbolExactMatchBoost which uses the verb-anchored
1212
+ * `extractSymbolDefinitionTarget`). Same BM25F field-weighted principle:
1213
+ * when the query explicitly names an identifier, prefer chunks whose
1214
+ * symbol matches that name.
1215
+ *
1216
+ * Calibrated lower (×1.15 default vs ×1.30 for the verb path) because the
1217
+ * noun-anchored trigger has lower precision: a query mentioning "Vec128"
1218
+ * may or may not be asking about that specific symbol. Verb-anchored
1219
+ * queries like "show me X struct" are unambiguous; noun-anchored mentions
1220
+ * are weaker signals.
1221
+ *
1222
+ * Skips the verb-anchored target to avoid double-counting: if
1223
+ * extractSymbolDefinitionTarget already returned "FunctionCache" and the
1224
+ * symbol matches that, symbolExactMatchBoost handles it; this boost
1225
+ * only fires on DIFFERENT mentions, or on mentions where the verb path
1226
+ * did not fire (target was null).
1227
+ *
1228
+ * Format-gated via the caller (only fires when isAgentFormat=true).
1229
+ */
1230
/**
 * Score multiplier for results that match an identifier mentioned in the query.
 *
 * @param {object} result - Search result; the symbol is read from `name`,
 *   `metadata.name`, `entity.name` or `symbol` (first truthy wins).
 * @param {Set<string>|null} mentions - Identifier mentions from the query.
 * @param {object} [opts]
 * @param {number} [opts.identifierMentionBoostFactor=1.15] - Boost factor; a
 *   SWEET_SEARCH_IDENTIFIER_MENTION_BOOST env value within [1.0, 2.0] wins.
 * @param {string} [opts._symbolExactTarget] - Mention to skip (compared
 *   case-insensitively) so it is not boosted a second time here.
 * @param {object} [opts.codeGraphRepo] - Enables the contained-entity fallback
 *   when it exposes `findEntityWithNameInRange(file, start, end, name)`.
 * @param {Map} [opts._entityNameCache] - Optional cache for fallback lookups.
 * @returns {number} The boost factor on a match, otherwise 1.0.
 */
function identifierMentionBoost(result, mentions, opts = {}) {
  if (!mentions || mentions.size === 0) return 1.0;
  const raw = process.env.SWEET_SEARCH_IDENTIFIER_MENTION_BOOST;
  let boost = opts.identifierMentionBoostFactor ?? 1.15;
  if (raw != null && raw !== '') {
    const n = Number.parseFloat(raw);
    // Out-of-range or unparsable overrides are ignored, not clamped.
    if (Number.isFinite(n) && n >= 1.0 && n <= 2.0) boost = n;
  }
  if (boost === 1.0) return 1.0;

  const symbol = result?.name
    || result?.metadata?.name
    || result?.entity?.name
    || result?.symbol
    || '';
  const skipTarget = opts._symbolExactTarget ? String(opts._symbolExactTarget).toLowerCase() : '';
  // Separator-insensitive form: `aligned_deleter` and `AlignedDeleter` compare equal.
  const norm = (s) => s.replace(/[_-]/g, '').toLowerCase();

  // Path 1 — direct comparison of the result's own symbol with each mention.
  if (symbol) {
    const symLower = String(symbol).toLowerCase();
    const symNorm = norm(symLower);
    for (const mention of mentions) {
      const mLower = mention.toLowerCase();
      if (skipTarget && mLower === skipTarget) continue;
      if (symLower === mLower || symNorm === norm(mLower)) {
        return boost;
      }
    }
    // No direct match: fall through so a sibling entity contained in the
    // result's line range can still earn the boost.
  }

  // Path 2 — code-graph fallback. Reached when the result has no symbol or
  // its symbol matched no mention; asks the repo whether an entity named like
  // a mention lies inside the result's line range.
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.findEntityWithNameInRange !== 'function') {
    return 1.0;
  }
  const file = resolveFilePath(result);
  const meta = result?.metadata ?? {};
  // Accept `line_start` / `line_end` metadata too, matching how sibling
  // scoring rules in this file resolve a result's range; without it such
  // results silently skipped the fallback.
  const sl = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const el = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!file || !Number.isFinite(sl) || !Number.isFinite(el)) return 1.0;

  const cache = opts._entityNameCache;
  for (const mention of mentions) {
    const mLower = mention.toLowerCase();
    if (skipTarget && mLower === skipTarget) continue;
    // Cached per (file, startLine, endLine, mention); misses are cached too.
    const cacheKey = cache ? `${file}|${sl}|${el}|mention:${mention}` : null;
    let resolved;
    if (cacheKey && cache.has(cacheKey)) {
      resolved = cache.get(cacheKey);
    } else {
      try {
        resolved = repo.findEntityWithNameInRange(file, sl, el, mention);
      } catch {
        // Best-effort signal: a failing lookup counts as "no entity found".
        resolved = null;
      }
      if (cacheKey) cache.set(cacheKey, resolved);
    }
    if (resolved) return boost;
  }
  return 1.0;
}
1310
+
1311
+ /**
1312
+ * Path-token boost (added 2026-05-07 — 60-probe diagnosis NEW pattern).
1313
+ *
1314
+ * When a query mentions a crate / module / package name (e.g. "in globset",
1315
+ * "in render package", "from binding/json"), boost candidates whose file
1316
+ * path contains that token. Same Sourcegraph BM25F principle as the
1317
+ * symbol boost: filename matches are a strong field-level signal that
1318
+ * dense embedding alone underweights.
1319
+ *
1320
+ * SOTA: BM25F filename field weighting (Sourcegraph "Keeping it boring..."
1321
+ * April 2025). Quote: "we should be able to use these indexes to reward
1322
+ * symbol and FILENAME matches... think of contents, symbols, and filenames
1323
+ * as different 'fields' within a file." See docs/SOTA_RESEARCH_2026_FIXES.md.
1324
+ *
1325
+ * Diagnosed cases (60-probe new-set #4): ripgrep S6-Q8 (two `Glob` structs
1326
+ * in different crates — symbol-exact alone CANNOT disambiguate; the query
1327
+ * said "in globset" so paths containing /globset/ should win).
1328
+ *
1329
+ * Trigger pattern: extract bare path-like tokens after a path preposition
1330
+ * /\b(?:in|from|inside|under|within|of)\s+([a-z][\w-]*(?:[\/-][\w-]+)*)\b/gi
1331
+ *
1332
+ * Only fires on tokens of length ≥ 4 (avoid trivial "in"/"on") and not
1333
+ * common English stopwords. Boost: 1.20× when the token appears as a
1334
+ * separator-bounded path component (case-insensitive). Mild magnitude
1335
+ * because path tokens are softer signals than symbol-exact matches.
1336
+ *
1337
+ * Override env: SWEET_SEARCH_PATH_TOKEN_BOOST (default 1.20). Disable
1338
+ * with `ablations: ['no-path-token-boost']`.
1339
+ */
1340
+ const PATH_TOKEN_QUERY_RE = /\b(?:in|from|inside|under|within|of)\s+([a-z][\w-]*(?:[\/-][\w-]+)*)\b/gi;
1341
+ const PATH_TOKEN_STOPWORDS = new Set([
1342
+ 'the', 'this', 'that', 'these', 'those', 'them', 'their', 'they',
1343
+ 'when', 'while', 'where', 'with', 'without', 'have', 'been', 'each',
1344
+ 'and', 'but', 'for', 'all', 'any', 'some', 'can', 'will', 'would',
1345
+ 'fact', 'case', 'order', 'time', 'turn', 'fact',
1346
+ ]);
1347
+
1348
+ function extractPathTokens(query) {
1349
+ if (!query || typeof query !== 'string') return [];
1350
+ const tokens = [];
1351
+ let m;
1352
+ PATH_TOKEN_QUERY_RE.lastIndex = 0;
1353
+ while ((m = PATH_TOKEN_QUERY_RE.exec(query)) !== null) {
1354
+ const tok = m[1];
1355
+ if (!tok || tok.length < 4) continue;
1356
+ if (PATH_TOKEN_STOPWORDS.has(tok.toLowerCase())) continue;
1357
+ tokens.push(tok.toLowerCase());
1358
+ }
1359
+ return tokens;
1360
+ }
1361
+
1362
/**
 * Score multiplier for results whose file path contains a query path token.
 *
 * @param {object} result - Search result; path read from `file` or `metadata.file`.
 * @param {string[]} pathTokens - Lower-cased tokens to look for.
 * @param {object} [opts]
 * @param {number} [opts.pathTokenBoost=1.20] - Boost factor; a
 *   SWEET_SEARCH_PATH_TOKEN_BOOST env value within [1.0, 2.0] wins.
 * @returns {number} The boost factor when any token matches, otherwise 1.0.
 */
function pathTokenBoost(result, pathTokens, opts = {}) {
  if (!pathTokens || pathTokens.length === 0) return 1.0;

  const envValue = process.env.SWEET_SEARCH_PATH_TOKEN_BOOST;
  const envFactor = envValue != null && envValue !== ''
    ? Number.parseFloat(envValue)
    : Number.NaN;
  const envUsable = Number.isFinite(envFactor) && envFactor >= 1.0 && envFactor <= 2.0;
  const factor = envUsable ? envFactor : (opts.pathTokenBoost ?? 1.20);
  if (factor === 1.0) return 1.0;

  const lowerPath = String(result?.file || result?.metadata?.file || '').toLowerCase();
  if (lowerPath === '') return 1.0;

  // The token must sit between separators (or the string ends) so that e.g.
  // "iter" does not match inside "literator".
  const asComponentPattern = (token) => {
    const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`(^|[/_-])${escaped}($|[/_.-])`);
  };

  for (const token of pathTokens) {
    if (asComponentPattern(token).test(lowerPath)) return factor;
  }
  return 1.0;
}
1381
+
1382
/**
 * Score multiplier for results matching the query's symbol-definition target.
 *
 * A result matches when its own symbol equals the target (case-insensitive,
 * ignoring `_` / `-`), or when the code graph reports an entity with that
 * name inside the result's line range.
 *
 * @param {object} result - Search result; the symbol is read from `name`,
 *   `metadata.name`, `entity.name` or `symbol` (first truthy wins).
 * @param {string|null|undefined} target - Target symbol; falsy disables the boost.
 * @param {object} [opts]
 * @param {number} [opts.symbolExactBoost=1.30] - Boost factor; a
 *   SWEET_SEARCH_SYMBOL_EXACT_BOOST env value within [1.0, 2.0] wins.
 * @param {object} [opts.codeGraphRepo] - Enables the contained-entity check
 *   when it exposes `hasEntityWithNameInRange(file, start, end, name)`.
 * @returns {number} The boost factor on a match, otherwise 1.0.
 */
function symbolExactMatchBoost(result, target, opts = {}) {
  if (!target) return 1.0;
  const raw = process.env.SWEET_SEARCH_SYMBOL_EXACT_BOOST;
  let boost = opts.symbolExactBoost ?? 1.30;
  if (raw != null && raw !== '') {
    const n = Number.parseFloat(raw);
    // Out-of-range or unparsable overrides are ignored, not clamped.
    if (Number.isFinite(n) && n >= 1.0 && n <= 2.0) boost = n;
  }
  if (boost === 1.0) return 1.0;

  const symbol = result?.name
    || result?.metadata?.name
    || result?.entity?.name
    || result?.symbol
    || '';
  // Coerce defensively so a truthy non-string target cannot throw here.
  const tLower = String(target).toLowerCase();
  const norm = (s) => s.replace(/[_-]/g, '').toLowerCase();
  if (symbol) {
    const sLower = String(symbol).toLowerCase();
    if (sLower === tLower) return boost;
    if (norm(sLower) === norm(tLower)) return boost;
  }

  // The result's label may name a different entity while the target lives
  // inside the same line range (several entities merged into one chunk).
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.hasEntityWithNameInRange !== 'function') return 1.0;

  const filePath = resolveFilePath(result);
  const meta = result?.metadata ?? {};
  // Accept `line_start` / `line_end` metadata too, matching how sibling
  // scoring rules in this file resolve a result's range; without it such
  // results silently skipped the contained-entity check.
  const startLine = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const endLine = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!filePath || !Number.isFinite(startLine) || !Number.isFinite(endLine)) return 1.0;

  try {
    if (repo.hasEntityWithNameInRange(filePath, startLine, endLine, target)) {
      return boost;
    }
  } catch {
    // Best-effort signal: a failing lookup simply means no boost.
  }
  return 1.0;
}
1426
+
1427
+ /**
1428
+ * Demote anomalous chunks: anonymous (symbol==null) AND symbolType==='code',
1429
+ * AND ANY of:
1430
+ * (a) file-header — startLine===1 (e.g. file-imports leak)
1431
+ * (b) tiny span — endLine-startLine<5 (e.g. bare impl-header text)
1432
+ * (c) preprocessor-dense — mid-file anonymous chunk where >50% of non-blank
1433
+ * lines are preprocessor directives (#include/#define/#ifdef/etc.) and
1434
+ * no real declaration appears. Covers C/C++ "include cluster" + "macro
1435
+ * wall" chunks that the (a)/(b) predicates miss because they sit in the
1436
+ * middle of a header and span 10-100 lines (CPP-002 / CPP-003 / CPP-008
1437
+ * root cause: anonymous code chunks dominated by HWY_ATTAINABLE_*,
1438
+ * HWY_HAVE_RUNTIME_DISPATCH_*, #include "hwy/base.h" etc., winning
1439
+ * NL queries on token density).
1440
+ *
1441
+ * These chunks bypass the entity DB (sparse/grep fallback or chunker leak)
1442
+ * and shouldn't surface as top-1.
1443
+ *
1444
+ * Predicate (a)+(b) verified 2026-05-07 against live probe + FreshStack
1445
+ * PARTIALs: legitimate symbol-mislabel cases (S3-Q2, S4-Q1, S6-Q4, S3-Q8)
1446
+ * all have span >20 lines and startLine deep in file — they pass through
1447
+ * unaffected by (a)+(b). Predicate (c) is narrower: requires zero real
1448
+ * declarations AND ≥50% preprocessor lines, so a real code chunk with a
1449
+ * couple of #includes at the top is NOT affected.
1450
+ *
1451
+ * Demote (×0.10) rather than filter so a single-anomalous-result fallback
1452
+ * still surfaces the chunk if nothing else matches.
1453
+ */
1454
// A line that opens a C-preprocessor directive (leading whitespace and
// whitespace after `#` allowed).
const PREPROC_LINE_RE = /^\s*#\s*(?:include|define|undef|ifdef|ifndef|if|else|elif|endif|pragma|error|warning|line)\b/;

/**
 * Decide whether a result's text is dominated by preprocessor directives.
 *
 * @param {object} result - Search result whose text is resolved via resolveResultText.
 * @param {object} opts - Options forwarded to resolveResultText.
 * @returns {boolean} True when the text has at least 50 characters and 5
 *   non-blank lines, no non-directive line matches DECL_KEYWORD_RE, and at
 *   least half of the non-blank lines are directive (or continuation) lines.
 */
function isPreprocDenseAnonymousChunk(result, opts) {
  const body = resolveResultText(result, opts);
  if (!body || body.length < 50) return false;

  let meaningful = 0;
  let directives = 0;
  // True while the previous directive line ended in `\`, so the current line
  // is still part of that directive.
  let continuing = false;

  for (const rawLine of body.split(/\r?\n/)) {
    const stripped = rawLine.trim();
    if (stripped === '') {
      // A blank line ends any pending continuation and is not counted.
      continuing = false;
      continue;
    }
    meaningful += 1;

    const partOfDirective = continuing || PREPROC_LINE_RE.test(rawLine);
    if (partOfDirective) {
      directives += 1;
    } else if (DECL_KEYWORD_RE.test(stripped)) {
      // One real declaration is enough to rule the chunk out.
      return false;
    }
    continuing = partOfDirective && rawLine.trimEnd().endsWith('\\');
  }

  if (meaningful < 5) return false;
  return directives / meaningful >= 0.5;
}
1492
+
1493
/**
 * Score multiplier that demotes anomalous anonymous-code and macro-wall chunks.
 *
 * @param {object} result - Search result (type, symbol and line range are
 *   read from the result itself, then from `metadata`).
 * @param {object} [opts]
 * @param {boolean} [opts._isAgentFormat] - The rule only applies when truthy.
 * @param {*} [opts.ablations] - Checked for 'no-anomalous-chunk-demotion'.
 * @param {number} [opts.anomalousChunkFactor=0.10] - Factor returned on demotion.
 * @returns {number} The demotion factor, or 1.0 when the rule does not apply.
 */
function anomalousChunkDemotion(result, opts = {}) {
  // Kill switches first, then the format gate.
  if (process.env.SWEET_SEARCH_NO_ANOMALOUS_CHUNK_DEMOTION === '1') return 1.0;
  if (hasAblation(opts.ablations, 'no-anomalous-chunk-demotion')) return 1.0;
  if (!opts._isAgentFormat) return 1.0;

  const meta = result?.metadata ?? {};
  const kind = result?.symbolType ?? result?.type ?? meta.type ?? null;
  const firstLine = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const lastLine = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!(Number.isFinite(firstLine) && Number.isFinite(lastLine))) return 1.0;

  const span = lastLine - firstLine;
  const demoted = () => opts.anomalousChunkFactor ?? 0.10;

  // Anonymous code chunks: demote file headers, tiny spans, and
  // preprocessor-dense bodies. The text scan only runs when the two cheap
  // checks fail.
  const label = result?.symbol ?? meta.symbol ?? meta.name ?? null;
  const unlabeled = label == null || label === '';
  if (unlabeled && kind === 'code') {
    const anomalous = firstLine === 1
      || span < 5
      || isPreprocDenseAnonymousChunk(result, opts);
    return anomalous ? demoted() : 1.0;
  }

  // Macro chunks: only walls (span >= 5) that are preprocessor-dense are
  // demoted; small single macros are left alone.
  if (kind === 'macro' && span >= 5 && isPreprocDenseAnonymousChunk(result, opts)) {
    return demoted();
  }

  return 1.0;
}
1538
+
1539
+ /**
1540
+ * Mega-entity penalty (F1, 2026-05-07): when a chunk's enclosing entity
1541
+ * (e.g. function fastify @ 735 lines, Flask App class @ 1516 lines) exceeds
1542
+ * a configurable cap, demote the chunk's score. The fix targets the post-
1543
+ * retrieval envelope-bloat pattern from the taxonomy: small chunks score
1544
+ * highly because they're packed with token-dense surfaces from a mega-fn,
1545
+ * and presentation later expands them into a 700+ line envelope.
1546
+ *
1547
+ * Format-gated (agent only): GCSN single-function NL queries shouldn't
1548
+ * be affected by entity envelope sizes.
1549
+ *
1550
+ * Defaults to a 500-line cap; override via SWEET_SEARCH_MAX_ENVELOPE_LINES
1551
+ * env var or opts.maxEnvelopeLines (a non-positive env value disables the rule).
1552
+ */
1553
+ // Loop-invariant resolution of the env-controlled envelope cap. Computed
1554
+ // once per applyResultDemotions call (see ruleOpts setup) and stashed on
1555
+ // opts._megaEnvelopeMax to avoid the env+parseInt+default lookup per
1556
+ // result. Resolver returns -1 to mean "skip the rule entirely" (when the
1557
+ // env var is set to a non-positive/non-finite value).
1558
/**
 * Resolve the entity-size cap (in lines) used by the mega-entity penalty.
 *
 * Precedence: a numeric `opts._megaEnvelopeMax` (pre-resolved value), then the
 * SWEET_SEARCH_MAX_ENVELOPE_LINES env var, then `opts.maxEnvelopeLines`, then 500.
 *
 * @param {object} opts - Rule options.
 * @returns {number} The cap, or -1 when the env var is set but does not parse
 *   to a positive integer.
 */
function resolveMaxEnvelopeLines(opts) {
  const precomputed = opts._megaEnvelopeMax;
  if (typeof precomputed === 'number') return precomputed;

  const override = process.env.SWEET_SEARCH_MAX_ENVELOPE_LINES;
  const overrideUnset = override == null || override === '';
  if (overrideUnset) return opts.maxEnvelopeLines ?? 500;

  // A set-but-unusable override yields -1 instead of falling back to the default.
  const parsed = Number.parseInt(override, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : -1;
}
1572
+
1573
/**
 * Score multiplier that demotes chunks whose enclosing entity is very large.
 *
 * @param {object} result - Search result.
 * @param {object} [opts]
 * @param {boolean} [opts._isAgentFormat] - The rule only applies when truthy.
 * @param {*} [opts.ablations] - Checked for 'no-mega-entity-penalty'.
 * @param {object} [opts.codeGraphRepo] - Must expose `findEnclosingEntity`.
 * @param {number} [opts.megaEntityFactor=0.85] - Factor returned on demotion.
 * @returns {number} The factor when the entity spans more lines than the
 *   resolved cap, otherwise 1.0.
 */
function megaEntityPenalty(result, opts = {}) {
  if (!opts._isAgentFormat) return 1.0;
  if (hasAblation(opts.ablations, 'no-mega-entity-penalty')) return 1.0;

  // Anything other than a positive finite cap disables the rule.
  const cap = resolveMaxEnvelopeLines(opts);
  if (!(Number.isFinite(cap) && cap > 0)) return 1.0;

  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.findEnclosingEntity !== 'function') return 1.0;

  // The entity comes from resolveEntityKindInfo; only its line range is used here.
  const enclosing = resolveEntityKindInfo(result, opts);
  const hasRange = Boolean(enclosing)
    && Number.isFinite(enclosing.startLine)
    && Number.isFinite(enclosing.endLine);
  if (!hasRange) return 1.0;

  const entityLines = enclosing.endLine - enclosing.startLine + 1;
  return entityLines > cap ? (opts.megaEntityFactor ?? 0.85) : 1.0;
}
1591
+
1592
+ /**
1593
+ * Doc-comment-only chunk demotion (F6, 2026-05-07).
1594
+ *
1595
+ * Detects chunks whose content is predominantly doc-comments without any
1596
+ * executable type/function declarations. Diagnosed from S3-Q8 ripgrep
1597
+ * (walk.rs:434-469): the chunker split WalkBuilder's 48-line docstring
1598
+ * across two chunks; the docstring-only chunk lexically matched
1599
+ * "WalkBuilder" + "directory iterator" and out-ranked the chunk that
1600
+ * actually contained the `pub struct WalkBuilder` declaration.
1601
+ *
1602
+ * Predicate: doc-comment lines / total non-blank lines > 0.85 AND no
1603
+ * declaration keywords (pub struct/fn/impl/enum/trait/class/def/function).
1604
+ *
1605
+ * Format-gated to agent: GCSN-style queries don't reliably target docs vs
1606
+ * code, and over-demoting comment-heavy chunks could regress. Format-gated
1607
+ * keeps it safe per the CLAUDE.md format-gating principle.
1608
+ */
1609
// Line that starts (after optional whitespace) with a doc/comment marker.
const DOC_COMMENT_LINE_RE = /^\s*(?:\/\/[\/!]|\/\*\*?|\*\s|"""|'''|#'\s|#\s|--\s|--\|)/;
// Line that contains a declaration keyword (struct/enum/trait/impl/mod, fn,
// class, def, function, interface).
const DECL_KEYWORD_RE = /\b(?:pub\s+)?(?:struct|enum|trait|impl|mod)\b|\bfn\s+\w|\bclass\s+\w|\bdef\s+\w|\bfunction\s+\w|\binterface\s+\w|^\s*(?:export\s+)?(?:async\s+)?function\b/;

/**
 * Score multiplier that demotes chunks consisting almost entirely of doc comments.
 *
 * @param {object} result - Search result whose text is resolved via resolveResultText.
 * @param {object} [opts]
 * @param {boolean} [opts._isAgentFormat] - The rule only applies when truthy.
 * @param {*} [opts.ablations] - Checked for 'no-doc-comment-demote'.
 * @param {number} [opts.docCommentOnlyFactor=0.70] - Factor returned on demotion.
 * @returns {number} The factor when the text has at least 80 characters and 5
 *   non-blank lines, at least 85% of them are doc-comment lines, and no
 *   non-comment line matches DECL_KEYWORD_RE; otherwise 1.0.
 */
function docCommentOnlyDemotion(result, opts = {}) {
  if (!opts._isAgentFormat) return 1.0;
  if (hasAblation(opts.ablations, 'no-doc-comment-demote')) return 1.0;

  const body = resolveResultText(result, opts);
  if (!body || body.length < 80) return 1.0;

  let commentCount = 0;
  let meaningful = 0;
  for (const rawLine of body.split(/\r?\n/)) {
    const stripped = rawLine.trim();
    if (stripped === '') continue;
    meaningful += 1;

    if (DOC_COMMENT_LINE_RE.test(rawLine)) {
      commentCount += 1;
      continue;
    }
    // A declaration on a non-comment line means the chunk carries real code.
    if (DECL_KEYWORD_RE.test(stripped)) return 1.0;
  }

  if (meaningful < 5) return 1.0;
  if (commentCount / meaningful < 0.85) return 1.0;
  return opts.docCommentOnlyFactor ?? 0.70;
}
1636
+
1637
/**
 * Linear size penalty for very long chunks.
 *
 * Each tunable is read from its env var when that parses to a value in range,
 * otherwise from `opts`, otherwise from the built-in default:
 *   SWEET_SEARCH_MEGA_CHUNK_FLOOR  / opts.megaChunkFloor  (0..1,    default 0.80)
 *   SWEET_SEARCH_MEGA_CHUNK_CUTOFF / opts.megaChunkCutoff (> 0,     default 500)
 *   SWEET_SEARCH_MEGA_CHUNK_SLOPE  / opts.megaChunkSlope  (0..0.01, default 0.0003)
 *
 * @param {object} result - Search result; its size comes from inferLineCount.
 * @param {object} [opts]
 * @returns {number} 1.0 at or below the cutoff (or when the floor is >= 1),
 *   otherwise `1 - slope * (lines - cutoff)`, never lower than the floor.
 */
function megaChunkSizePenalty(result, opts = {}) {
  const readTunable = (envName, parse, accepts, fallback) => {
    const raw = process.env[envName];
    if (raw == null || raw === '') return fallback;
    const value = parse(raw);
    return Number.isFinite(value) && accepts(value) ? value : fallback;
  };
  const parseDecimal = (text) => Number.parseFloat(text);
  const parseInteger = (text) => Number.parseInt(text, 10);

  const minFactor = readTunable(
    'SWEET_SEARCH_MEGA_CHUNK_FLOOR',
    parseDecimal,
    (v) => v >= 0 && v <= 1,
    opts.megaChunkFloor ?? 0.80,
  );
  // A floor of 1 (or more) leaves no room for a penalty: rule disabled.
  if (minFactor >= 1.0) return 1.0;

  const threshold = readTunable(
    'SWEET_SEARCH_MEGA_CHUNK_CUTOFF',
    parseInteger,
    (v) => v > 0,
    opts.megaChunkCutoff ?? 500,
  );
  const perLine = readTunable(
    'SWEET_SEARCH_MEGA_CHUNK_SLOPE',
    parseDecimal,
    (v) => v >= 0 && v <= 0.01,
    opts.megaChunkSlope ?? 0.0003,
  );

  const lines = inferLineCount(result);
  if (!Number.isFinite(lines) || lines <= threshold) return 1.0;
  return Math.max(minFactor, 1.0 - perLine * (lines - threshold));
}
1662
+
1663
/**
 * Score multiplier that mildly demotes declaration-like or low-substance
 * chunks when the query asks for an implementation.
 *
 * Up to three factors (each 0.85 by default, read through envFloatRange) are
 * multiplied together:
 *   1. the chunk's type is in DECLARATIVE_ENTITY_TYPES;
 *   2. an `impl` chunk whose rawStringDensity exceeds the threshold (default 0.50);
 *   3. an `impl` chunk whose avgFnBodyLines is below the stub limit (default 4.0).
 * Factors 2 and 3 are only evaluated for text longer than 200 characters.
 *
 * @param {object} result - Search result.
 * @param {object} [opts]
 * @param {string} [opts.intent] - Query intent; derived from `opts.query` when absent.
 * @param {string} [opts.query] - Query text used to classify the intent.
 * @returns {number} Combined multiplier; 1 when disabled via
 *   SWEET_SEARCH_BODY_DENSITY ('0' / 'false') or the intent is not 'implementation'.
 */
function bodyDensityMultiplier(result, opts = {}) {
  const toggle = process.env.SWEET_SEARCH_BODY_DENSITY;
  if (toggle === '0' || toggle === 'false') return 1;

  // Only implementation-seeking queries should penalize declaration chunks.
  const queryIntent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (queryIntent !== 'implementation') return 1;

  // Prefer the recorded type; generic 'code' / 'chunk' labels (or none) fall
  // back to the entity resolved by resolveEntityKindInfo.
  const labelledType = normalizeType(resolveResultType(result));
  const labelIsSpecific = Boolean(labelledType)
    && labelledType !== 'code'
    && labelledType !== 'chunk';
  const entityType = labelIsSpecific
    ? labelledType
    : normalizeType(resolveEntityKindInfo(result, opts)?.type);

  let factor = 1;
  if (DECLARATIVE_ENTITY_TYPES.has(entityType)) {
    factor *= envFloatRange('SWEET_SEARCH_DECLARATIVE_FACTOR', 0.85);
  }

  // The text-derived signals below apply to `impl` chunks only.
  if (entityType !== 'impl') return factor;

  const body = resolveResultText(result, opts);
  if (!(body && body.length > 200)) return factor;

  // Raw-string-dominant impl.
  const density = rawStringDensity(body);
  const densityLimit = envFloatRange('SWEET_SEARCH_RAWSTRING_THRESHOLD', 0.50);
  if (density > densityLimit) {
    factor *= envFloatRange('SWEET_SEARCH_RAWSTRING_FACTOR', 0.85);
  }

  // Stub impl: short average function body.
  const meanBodyLines = avgFnBodyLines(body);
  const stubLimit = envFloatRange('SWEET_SEARCH_STUB_MAX_LINES', 4.0);
  if (meanBodyLines < stubLimit) {
    factor *= envFloatRange('SWEET_SEARCH_STUB_FACTOR', 0.85);
  }

  return factor;
}
1729
+
1730
+ // Reference-count boost (added 2026-05-05). Aider-style behavioural-graph
1731
+ // signal: chunks whose primary entity is invoked from many call sites get
1732
+ // a small log-scaled boost, capped low enough that it can't dominate
1733
+ // embedding scores.
1734
+ //
1735
+ // Why this matters. The bi-encoder ranks `lib/decorate.js`'s `decorate` fn
1736
+ // purely on text similarity, where doc-rich `.d.ts` namespace blocks or
1737
+ // generic helpers can outrank it. The call graph encodes that `decorate`
1738
+ // is invoked 41 times across the codebase while the namespace declaration
1739
+ // is referenced almost exclusively from imports (4 hits). That's a strong
1740
+ // behavioural signal: this entity is structurally important.
1741
+ //
1742
+ // Restrictions:
1743
+ // - Only fires on `function` / `method` / `impl` entities. Declarative
1744
+ // types are handled by T1 above and shouldn't compete on call count.
1745
+ // - Only fires under `intent='implementation'`. Asking "what is the
1746
+ // ConfigError type" should not promote a fn just because it's called
1747
+ // a lot.
1748
+ // - Counts `type='calls'` only — not `imports`/`uses`/`extends`. Imports
1749
+ // are noisy (every file imports a few standards) and don't reflect
1750
+ // behavioural invocation.
1751
+ // - Boost is `1 + alpha · log(1 + count)` capped at REF_BOOST_CAP. With
1752
+ // alpha=0.025 and cap=1.10, 30 calls yields ~1.085, 1000 calls hits
1753
+ // the cap. So a heavily-tested helper can't run away with the ranking.
1754
+ // - Skipped on chunks larger than REF_BOOST_LARGE_LINES (default 80) to
1755
+ // avoid worsening Cluster B (oversized parent chunks like a 700-line
1756
+ // factory function whose graph degree is naturally high).
1757
+ //
1758
+ // Disable with `ablations: 'no-ref-count-boost'` or
1759
+ // SWEET_SEARCH_REF_BOOST_ALPHA=0. Suffix aggregation is homonym-gated in
1760
+ // CodeGraphRepository (`SWEET_SEARCH_REF_SUFFIX_AGG_FANOUT_MAX`, default 12).
1761
// Entity kinds eligible for the reference-count boost.
const REF_BOOSTABLE_TYPES = new Set(['function', 'method', 'impl']);

/**
 * Small log-scaled boost for chunks whose entity has many recorded references.
 *
 * @param {object} result - Search result.
 * @param {Map<string, number>} refCounts - Reference count per entity name.
 * @param {object} [opts]
 * @param {string} [opts.intent] - Query intent; derived from `opts.query` when absent.
 * @param {string} [opts.query] - Query text used to classify the intent.
 * @returns {number} `min(cap, 1 + alpha * ln(1 + count))`, or 1 when the rule
 *   does not apply (no counts, alpha env set to '0', non-implementation
 *   intent, ineligible type, chunk above the large-lines threshold, name
 *   shorter than 3 characters, or a non-positive count).
 */
function referenceCountBoost(result, refCounts, opts = {}) {
  if (!refCounts || refCounts.size === 0) return 1;
  if (process.env.SWEET_SEARCH_REF_BOOST_ALPHA === '0') return 1;

  const queryIntent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (queryIntent !== 'implementation') return 1;

  // Prefer the recorded type; generic 'code' / 'chunk' labels (or none) fall
  // back to the entity resolved by resolveEntityKindInfo.
  const labelledType = normalizeType(resolveResultType(result));
  const labelIsSpecific = Boolean(labelledType)
    && labelledType !== 'code'
    && labelledType !== 'chunk';
  const entityType = labelIsSpecific
    ? labelledType
    : normalizeType(resolveEntityKindInfo(result, opts)?.type);
  if (!REF_BOOSTABLE_TYPES.has(entityType)) return 1;

  // Chunks above the large-lines threshold (default 80) get no boost. The
  // check is skipped when the line range is not numeric.
  const meta = result?.metadata || {};
  const firstLine = result?.startLine ?? meta.startLine;
  const lastLine = result?.endLine ?? meta.endLine;
  if (Number.isFinite(firstLine) && Number.isFinite(lastLine)) {
    const chunkLines = Math.max(1, lastLine - firstLine + 1);
    const largeLimit = Number(process.env.SWEET_SEARCH_REF_BOOST_LARGE_LINES || 80);
    if (chunkLines > largeLimit) return 1;
  }

  const entityName = resolveResultName(result) || resolveEntityKindInfo(result, opts)?.name;
  if (!entityName || entityName.length < 3) return 1;

  const references = refCounts.get(entityName) || 0;
  if (references <= 0) return 1;

  const alpha = envFloatRange('SWEET_SEARCH_REF_BOOST_ALPHA', 0.025);

  // Cap: env override accepted only within [1.0, 1.5]; otherwise 1.10.
  let ceiling = 1.10;
  const ceilingRaw = process.env.SWEET_SEARCH_REF_BOOST_CAP;
  if (ceilingRaw != null && ceilingRaw !== '') {
    const parsed = Number(ceilingRaw);
    if (Number.isFinite(parsed) && parsed >= 1.0 && parsed <= 1.5) ceiling = parsed;
  }

  return Math.min(ceiling, 1 + alpha * Math.log(1 + references));
}
1801
+
1802
/**
 * Pre-compute incoming-call counts for ALL boostable candidate names in one
 * repository call. Without this, the multiplier function would do one query
 * per candidate, which adds 100-200 ms in practice.
 *
 * An empty Map is returned (i.e. the boost is disabled for the whole query)
 * when:
 *   - `opts.codeGraphRepo` lacks `countIncomingCallsByNames`;
 *   - the intent is not 'implementation';
 *   - `SWEET_SEARCH_REF_BOOST_ALPHA` is the literal string '0';
 *   - no candidate has a boostable type and a name of at least 3 characters;
 *   - any candidate name exceeds the homonym fan-out ceiling;
 *   - the repository throws or returns something that is not Map-like.
 *
 * @param {Iterable<Object>} results - candidate search results.
 * @param {Object} [opts]
 * @param {Object} [opts.codeGraphRepo] - repository exposing
 *   `countIncomingCallsByNames(names)` and optionally
 *   `relationshipBareFanout(name)`.
 * @param {string} [opts.intent] - explicit intent override.
 * @param {string} [opts.query] - raw query, used only to infer the intent.
 * @returns {Map<string, number>} incoming-call count keyed by entity name.
 */
function buildRefCountMap(results, opts = {}) {
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.countIncomingCallsByNames !== 'function') return new Map();
  const intent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (intent !== 'implementation') return new Map();
  if (process.env.SWEET_SEARCH_REF_BOOST_ALPHA === '0') return new Map();

  // A Set keeps first-seen order while dropping repeats, so the same symbol
  // surfacing in several chunks triggers one fan-out check and one count
  // lookup instead of several.
  const uniqueNames = new Set();
  for (const r of results ?? []) {
    const recordedType = normalizeType(resolveResultType(r));
    const inferredType = recordedType && recordedType !== 'code' && recordedType !== 'chunk'
      ? recordedType
      : normalizeType(resolveEntityKindInfo(r, opts)?.type);
    if (!REF_BOOSTABLE_TYPES.has(inferredType)) continue;
    const name = resolveResultName(r) || resolveEntityKindInfo(r, opts)?.name;
    if (name && name.length >= 3) uniqueNames.add(name);
  }
  if (uniqueNames.size === 0) return new Map();
  const names = [...uniqueNames];

  try {
    // Default: skip ref-boost for the whole query when any boostable candidate
    // bare name has >12 distinct call-graph targets (dense single-fun corpora).
    // Opt out with SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE=0; tighten for
    // eval with =2..=8 (lifts GCSN, may trim monorepo boosts — see probes).
    const rawTh = process.env.SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE;
    const parsed = Number.parseInt(rawTh != null && rawTh !== '' ? rawTh : '12', 10);
    const homonymCeil = rawTh === '0'
      ? Infinity
      : (Number.isFinite(parsed) && parsed > 0 ? parsed : 12);
    if (typeof repo.relationshipBareFanout === 'function'
        && homonymCeil < Infinity
        && names.some((n) => repo.relationshipBareFanout(n) > homonymCeil)) {
      return new Map();
    }
    // Callers use `.size` and `.get`, so anything that is not Map-like is
    // treated as "no counts available".
    const counts = repo.countIncomingCallsByNames(names);
    return counts && typeof counts.get === 'function' ? counts : new Map();
  } catch {
    // Deliberate best-effort: a failing graph lookup only disables the boost.
    return new Map();
  }
}
1843
+
1844
+ // Removed (2026-05-05): file-header chunk detection became redundant
1845
+ // once cAST sibling-merge was confirmed. With cAST, a chunk starting at
1846
+ // line 1 of a source file naturally merges the package decl + imports
1847
+ // with the first executable declaration(s), so a "lines 1-N: imports
1848
+ // only" chunk shouldn't normally win retrieval. Cases where it still
1849
+ // does are rare enough that the cost of the false-positive demotion
1850
+ // (e.g. a `types.go` consisting purely of type aliases) outweighs the
1851
+ // benefit. The per-doc `tinyAncillaryFactor` in applyFileKindRanking
1852
+ // still catches tiny doc/test/example top-1 results.
1853
+
1854
/**
 * Apply content-aware result demotions/boosts before top-k truncation.
 * Catches inline test functions and explicit entity-kind queries that
 * path-only demotion cannot see. Tiny-chunk and file-header rules were
 * removed once cAST sibling-merge made them structurally redundant.
 *
 * Each result inside the analysis window gets the product of all rule
 * multipliers applied to its score and may additionally be relabelled with
 * the name/type (and, when the entity is at least as large as the chunk, the
 * range) of an entity resolved for it. The window is then re-sorted by score
 * and any results beyond the window are appended untouched.
 *
 * @param {Array<Object>} results - ranked results (best first).
 * @param {Object} [opts]
 * @param {string} [opts.query] - raw query text.
 * @param {*} [opts.queryTokens] - forwarded to `queryTokenSet`.
 * @param {*} [opts.ablations] - rule switches checked through `hasAblation`.
 * @param {string} [opts.format] - the 'agent*' formats enable the
 *   symbol-exact, identifier-mention, path-token and additional-symbol rules
 *   (SWEET_SEARCH_FORCE_BM25F_BOOSTS=1 forces them on).
 * @param {number} [opts.window] - number of leading results to analyse;
 *   clamped to an integer in [0, results.length], defaults to all results.
 * @param {number} [opts.testNameOverlapThreshold=0.5]
 * @param {number} [opts.testNameOverlapFactor=0.40]
 * @param {Object} [opts.codeGraphRepo] - graph repository used for entity
 *   lookups and reference counts.
 * @param {Map} [opts._entityKindCache] - optional shared memo caches; the
 *   same applies to _entityNameCache, _resultTextCache, _fullFileTextCache,
 *   _isTestSupportCache, _isTestChunkCache and _fileKindCache.
 * @returns {Array<Object>} the original `results` array when nothing changed
 *   (or the input is not a non-empty array), otherwise a new array whose
 *   changed entries carry `_resultDemotionOrigScore`, `_resultDemotionMult`
 *   and `_resultDemotionDetails`.
 */
export function applyResultDemotions(results, opts = {}) {
  if (!Array.isArray(results) || results.length === 0) return results;

  const ablations = opts.ablations;
  // Checked before anything is allocated: a fully ablated call is a no-op.
  if (hasAblation(ablations, 'no-result-demotions')) return results;

  // Attach intra-call (and optionally cross-call) memoization for the hot
  // lookups inside the demotion sub-rules:
  //   - _entityKindCache : enclosing/contained entity from SQLite
  //   - _entityNameCache : findEntityWithNameInRange (symbol-target adopt)
  //   - _resultTextCache : readFileSync source span — biggest win, since
  //                        5+ rules call resolveResultText per result and
  //                        each cache-miss fires a full readFileSync.
  // Caller may pass pre-allocated Maps via opts to share across both
  // applyResultDemotions calls in the same search() invocation.
  opts = {
    ...opts,
    _entityKindCache: opts._entityKindCache instanceof Map ? opts._entityKindCache : new Map(),
    _entityNameCache: opts._entityNameCache instanceof Map ? opts._entityNameCache : new Map(),
    _resultTextCache: opts._resultTextCache instanceof Map ? opts._resultTextCache : new Map(),
    _fullFileTextCache: opts._fullFileTextCache instanceof Map ? opts._fullFileTextCache : new Map(),
    _isTestSupportCache: opts._isTestSupportCache instanceof Map ? opts._isTestSupportCache : new Map(),
    _isTestChunkCache: opts._isTestChunkCache instanceof Map ? opts._isTestChunkCache : new Map(),
    _fileKindCache: opts._fileKindCache instanceof Map ? opts._fileKindCache : new Map(),
  };

  // Ablation flags are loop-invariant: resolve each exactly once.
  const skipTestName = hasAblation(ablations, 'no-test-name-overlap');
  const skipBodyDensity = hasAblation(ablations, 'no-body-density');
  const skipMegaChunk = hasAblation(ablations, 'no-mega-chunk-penalty');
  const skipRefCount = hasAblation(ablations, 'no-ref-count-boost');
  const skipNamePrecision = hasAblation(ablations, 'no-name-precision');
  const skipEntityKindPref = hasAblation(ablations, 'no-entity-kind-pref');

  const query = opts.query || '';
  const qTokens = queryTokenSet(query, opts.queryTokens);
  const preferredKind = skipEntityKindPref ? null : entityKindPreferenceFromQuery(query);
  const nameHints = skipNamePrecision ? new Set() : extractNameHints(query);
  const nameHintsLower = skipNamePrecision
    ? new Set()
    : new Set([...nameHints].map((s) => s.toLowerCase()));

  // Pre-compute incoming-call counts in a single batched query so the
  // per-result loop doesn't make N round trips to SQLite.
  const refCounts = skipRefCount ? new Map() : buildRefCountMap(results, opts);

  // Symbol-exact-match target + path-token targets — extracted ONCE per
  // query (not per-result). BM25F SOTA pattern (Sourcegraph BM25F blog
  // April 2025, +20% on code search; Pérez-Iglesias et al. arXiv
  // 0911.5046; Robertson & Zaragoza 2009).
  //
  // CRITICAL — gated on opts.format === 'agent' (or env override) to
  // avoid −0.07pp regression on GCSN heldout MRR. GCSN-style NL queries
  // ("Sort an array of integers", "Find the index of an element") trip
  // the path-token "of X" pattern with non-path tokens like "integers"
  // / "ascending", and lightly poison ranking. The boosts are designed
  // for agent queries with explicit identifier/path hints ("show me X
  // struct", "in globset"), not for benchmark NL traffic. Probes use
  // format='agent', so their behaviour is preserved; GCSN bench uses
  // mode='auto' without format, so boosts are skipped — restoring the
  // 85.99% MRR heldout baseline.
  //
  // See docs/SOTA_RESEARCH_2026_FIXES.md for full rationale.
  const isAgentFormat = opts.format === 'agent'
    || opts.format === 'agent_full'
    || opts.format === 'agent_full_xl'
    || opts.format === 'agent_preview'
    || process.env.SWEET_SEARCH_FORCE_BM25F_BOOSTS === '1';
  const symbolExactTarget = isAgentFormat && !hasAblation(ablations, 'no-symbol-exact-boost')
    ? extractSymbolDefinitionTarget(query)
    : null;
  const pathTokens = isAgentFormat && !hasAblation(ablations, 'no-path-token-boost')
    ? extractPathTokens(query)
    : [];
  // Identifier-mention boost (complements verb-anchored symbolExactTarget):
  // fires on noun-anchored probe phrasings where the gold symbol appears in
  // the query without a "show me/find/where is" prefix. Format-gated; opts
  // passes through `_symbolExactTarget` so this boost skips mentions already
  // boosted by the higher-precision verb path.
  const identifierMentions = isAgentFormat && !hasAblation(ablations, 'no-identifier-mention-boost')
    ? extractIdentifierMentions(query)
    : null;
  // Loop-invariant options for identifierMentionBoost, built once instead of
  // once per result.
  const mentionOpts = identifierMentions
    ? { ...opts, _symbolExactTarget: symbolExactTarget }
    : null;

  // F9 (2026-05-12): pre-compute query word tokens once for additional_symbols
  // re-anchoring (see findAdditionalSymbolRelabel docstring). Format-gated;
  // skipped entirely when isAgentFormat=false so GCSN benchmark traffic is
  // untouched (same gate as the other BM25F-family signals above).
  //
  // F10 (2026-05-12): the extraction regex preserves a leading `$` so
  // identifiers like $ZodType / $ZodTypeInternals (zod v4/core public-API
  // convention — the structural interfaces are $-prefixed while the runtime
  // classes are not) round-trip through tokenization. `\b[A-Za-z0-9_]+\b`
  // would silently strip the `$` (since `$` is non-word) and make
  // `$ZodType` indistinguishable from plain `ZodType`, costing TSL-004/8
  // (chunk relabel picked classic/ZodType over core/$ZodType because the
  // tier-A literal match was ambiguous).
  const f9QueryWordsArr = (isAgentFormat && !hasAblation(ablations, 'no-addsym-relabel'))
    ? (query.match(/\$?[A-Za-z_][A-Za-z0-9_]*/g) || []).map((w) => w.toLowerCase())
    : null;
  const f9QueryWordsSet = f9QueryWordsArr ? new Set(f9QueryWordsArr) : null;

  // F8 (2026-05-07): when the query has an explicit symbol target
  // (extractSymbolDefinitionTarget) AND the chunk contains an entity matching
  // that name, prefer THAT entity for labeling over kind-preference /
  // name-precision heuristics. Targets cases like S3-Q4 (chunk labeled
  // "Binding" but contains the Default function the user asked for) and
  // parallels F7's contained-entity boost (which only changes ranking, not
  // symbol attribution). Format-gated through symbolExactTarget, which is set
  // only when isAgentFormat.
  const canLookupExactTarget = !!(symbolExactTarget
    && opts.codeGraphRepo
    && typeof opts.codeGraphRepo.findEntityWithNameInRange === 'function');
  // Memoized lookup of the entity named `symbolExactTarget` inside a result's
  // line range; repository errors are treated as "no entity found".
  const lookupExactTargetEntity = (result) => {
    const fp = resolveFilePath(result);
    const meta = result?.metadata ?? {};
    const sl = Number(result?.startLine ?? meta.startLine);
    const el = Number(result?.endLine ?? meta.endLine);
    if (!fp || !Number.isFinite(sl) || !Number.isFinite(el)) return null;
    const cache = opts._entityNameCache;
    const cacheKey = `${fp}|${sl}|${el}|${symbolExactTarget}`;
    if (cache.has(cacheKey)) return cache.get(cacheKey);
    let resolved = null;
    try {
      resolved = opts.codeGraphRepo.findEntityWithNameInRange(fp, sl, el, symbolExactTarget);
    } catch {
      resolved = null;
    }
    cache.set(cacheKey, resolved);
    return resolved;
  };

  let changed = false;
  // Clamp the window to an integer in [0, results.length]; a negative,
  // fractional or NaN length would make `new Array(...)` throw a RangeError.
  const requestedWindow = Number(opts.window ?? results.length);
  const windowSize = Number.isNaN(requestedWindow)
    ? results.length
    : Math.max(0, Math.min(Math.floor(requestedWindow), results.length));

  // Per-rule timers — accumulator pattern, no object allocation per call.
  // No-op in production; only fires when profile-search-stages.mjs sets
  // globalThis.__stageTimings. Adds ~1ms overhead per call when profiling
  // (12 rules × 100 results × 2 performance.now() calls), acceptable for
  // the diagnostic.
  const __profOn = !!globalThis.__stageTimings;
  const __ruleTime = __profOn ? new Float64Array(12) : null;
  let __ruleT0 = 0;

  // ruleOpts: a single spread reused across the three callsites that need it
  // (anomalous, docComment, megaEntity). The envelope cap is pre-resolved here
  // once; resolveMaxEnvelopeLines does an env-var lookup + parseInt + default
  // fallback and would otherwise run per result inside megaEntityPenalty.
  const ruleOpts = {
    ...opts,
    ablations,
    _isAgentFormat: isAgentFormat,
    _megaEnvelopeMax: resolveMaxEnvelopeLines(opts),
  };
  const testNameOverlapThreshold = opts.testNameOverlapThreshold ?? 0.5;
  const testNameOverlapFactor = opts.testNameOverlapFactor ?? 0.40;
  // Report the factor that is actually applied, not a hard-coded default.
  const testNameDetail = `test-name:${Number(testNameOverlapFactor).toFixed(2)}`;
  // The kind→keywords list never changes during the loop.
  const preferredKindKeywordSet = preferredKind
    ? new Set((ENTITY_KIND_KEYWORDS[preferredKind] || []).map(normalizeType))
    : null;
  const canResolveContained = !!(opts.codeGraphRepo
    && typeof opts.codeGraphRepo.findFirstEntityInRange === 'function');

  // For-loop with a pre-allocated array. `details` is allocated lazily so a
  // result with zero rule hits costs no array; unchanged results still get a
  // shallow copy because the caller expects new references (cascade scoring
  // writes back r.score).
  const adjusted = new Array(windowSize);
  for (let index = 0; index < windowSize; index++) {
    const result = results[index];
    let mult = 1;
    let details = null;

    if (!skipTestName) {
      if (__profOn) __ruleT0 = performance.now();
      if (isTestChunk(result, opts)) {
        const overlap = testNameQueryOverlap(result, qTokens);
        if (overlap >= testNameOverlapThreshold) {
          mult *= testNameOverlapFactor;
          (details ||= []).push(testNameDetail);
        }
      }
      if (__profOn) __ruleTime[0] += performance.now() - __ruleT0;
    }

    if (__profOn) __ruleT0 = performance.now();
    const kindMult = entityKindMultiplier(result, preferredKind, opts);
    if (__profOn) __ruleTime[1] += performance.now() - __ruleT0;
    if (kindMult !== 1) {
      mult *= kindMult;
      (details ||= []).push(`kind-pref:${kindMult.toFixed(2)}`);
    }

    if (__profOn) __ruleT0 = performance.now();
    const nameMult = namePrecisionMultiplier(result, preferredKind, nameHintsLower, opts);
    if (__profOn) __ruleTime[2] += performance.now() - __ruleT0;
    if (nameMult !== 1) {
      mult *= nameMult;
      (details ||= []).push(`name-precision:${nameMult.toFixed(2)}`);
    }

    if (!skipBodyDensity) {
      if (__profOn) __ruleT0 = performance.now();
      const bodyMult = bodyDensityMultiplier(result, opts);
      if (__profOn) __ruleTime[3] += performance.now() - __ruleT0;
      if (bodyMult !== 1) {
        mult *= bodyMult;
        (details ||= []).push(`body-density:${bodyMult.toFixed(2)}`);
      }
    }

    if (!skipMegaChunk) {
      if (__profOn) __ruleT0 = performance.now();
      const megaMult = megaChunkSizePenalty(result, opts);
      if (__profOn) __ruleTime[4] += performance.now() - __ruleT0;
      if (megaMult !== 1) {
        mult *= megaMult;
        (details ||= []).push(`mega-chunk:${megaMult.toFixed(2)}`);
      }
    }

    if (__profOn) __ruleT0 = performance.now();
    const anomMult = anomalousChunkDemotion(result, ruleOpts);
    if (__profOn) __ruleTime[5] += performance.now() - __ruleT0;
    if (anomMult !== 1) {
      mult *= anomMult;
      (details ||= []).push(`anomalous-chunk:${anomMult.toFixed(2)}`);
    }

    if (__profOn) __ruleT0 = performance.now();
    const docMult = docCommentOnlyDemotion(result, ruleOpts);
    if (__profOn) __ruleTime[6] += performance.now() - __ruleT0;
    if (docMult !== 1) {
      mult *= docMult;
      (details ||= []).push(`doc-comment-only:${docMult.toFixed(2)}`);
    }

    if (__profOn) __ruleT0 = performance.now();
    const entMult = megaEntityPenalty(result, ruleOpts);
    if (__profOn) __ruleTime[7] += performance.now() - __ruleT0;
    if (entMult !== 1) {
      mult *= entMult;
      (details ||= []).push(`mega-entity:${entMult.toFixed(2)}`);
    }

    if (symbolExactTarget) {
      if (__profOn) __ruleT0 = performance.now();
      const symbolMult = symbolExactMatchBoost(result, symbolExactTarget, opts);
      if (__profOn) __ruleTime[8] += performance.now() - __ruleT0;
      if (symbolMult !== 1) {
        mult *= symbolMult;
        (details ||= []).push(`symbol-exact:${symbolMult.toFixed(2)}`);
      }
    }

    if (identifierMentions) {
      if (__profOn) __ruleT0 = performance.now();
      const mentionMult = identifierMentionBoost(result, identifierMentions, mentionOpts);
      // Shares the symbol-exact timing slot; there is no dedicated label.
      if (__profOn) __ruleTime[8] += performance.now() - __ruleT0;
      if (mentionMult !== 1) {
        mult *= mentionMult;
        (details ||= []).push(`identifier-mention:${mentionMult.toFixed(2)}`);
      }
    }

    if (pathTokens.length > 0) {
      if (__profOn) __ruleT0 = performance.now();
      const pathMult = pathTokenBoost(result, pathTokens, opts);
      if (__profOn) __ruleTime[9] += performance.now() - __ruleT0;
      if (pathMult !== 1) {
        mult *= pathMult;
        (details ||= []).push(`path-token:${pathMult.toFixed(2)}`);
      }
    }

    if (!skipRefCount) {
      if (__profOn) __ruleT0 = performance.now();
      const refMult = referenceCountBoost(result, refCounts, opts);
      if (__profOn) __ruleTime[10] += performance.now() - __ruleT0;
      if (refMult !== 1) {
        mult *= refMult;
        (details ||= []).push(`ref-count:${refMult.toFixed(2)}`);
      }
    }

    const baseScore = typeof result.score === 'number' ? result.score : 0;
    if (__profOn) __ruleT0 = performance.now();
    const exactSymbolTargetEntity = canLookupExactTarget
      ? lookupExactTargetEntity(result)
      : null;
    // F9 (2026-05-12): when F8 (verb-anchored explicit target) did not fire,
    // try additional_symbols re-anchoring. JS/TS/TSX/JSX-gated inside the
    // helper. Pure label adoption — no score change. See helper docstring.
    const additionalSymbolRelabelEntity = !exactSymbolTargetEntity && f9QueryWordsArr
      ? findAdditionalSymbolRelabel(result, f9QueryWordsArr, f9QueryWordsSet, opts)
      : null;
    const exactEntity = exactSymbolTargetEntity
      || additionalSymbolRelabelEntity
      || (!skipNamePrecision
        ? exactNamedEntityForResult(result, preferredKind, nameHints, nameHintsLower, opts)
        : null);
    const preferredEntity = exactEntity || (preferredKind && !skipEntityKindPref
      ? resolveEntityKindInfo(result, opts)
      : null);
    const preferredType = normalizeType(preferredEntity?.type);
    // F8 (continued): when the chunk contains an entity matching the explicit
    // symbol target (function name from "show me X function" queries), bypass
    // the kind-keyword gate. Functions/methods aren't in ENTITY_KIND_KEYWORDS
    // (which is struct/enum/class/interface/trait/type), so without bypass the
    // relabel path was gated off for "show me X function" queries — defeating
    // the purpose of having SYMBOL_DEFN_QUERY_RE recognise "function".
    const shouldAdoptViaExactTarget = !!(exactSymbolTargetEntity
      && exactSymbolTargetEntity.name
      && exactSymbolTargetEntity.startLine
      && exactSymbolTargetEntity.endLine);
    // F9: noun-anchored addSym path also bypasses the kind-keyword gate.
    // Variables/typeAliases captured as additional_symbols (TS-006/8 scope)
    // would otherwise fail the gate (variable/typeAlias not in keyword set).
    const shouldAdoptViaAddSym = !!(additionalSymbolRelabelEntity
      && additionalSymbolRelabelEntity.name
      && additionalSymbolRelabelEntity.startLine
      && additionalSymbolRelabelEntity.endLine);
    const shouldAdoptEntity = shouldAdoptViaExactTarget
      || shouldAdoptViaAddSym
      || !!(preferredEntity?.startLine
        && preferredEntity?.endLine
        && preferredKindKeywordSet
        && preferredKindKeywordSet.has(preferredType));
    const containedEntity = !shouldAdoptEntity && canResolveContained
      ? resolveEntityKindInfo(result, opts)
      : null;
    const shouldAdoptContained = !!(containedEntity?.name
      && containedEntity?.startLine
      && containedEntity?.endLine);
    let entityToAdopt = null;
    if (shouldAdoptEntity) entityToAdopt = preferredEntity;
    else if (shouldAdoptContained) entityToAdopt = containedEntity;
    if (__profOn) __ruleTime[11] += performance.now() - __ruleT0;

    if (mult === 1 && !entityToAdopt) {
      // Unchanged: shallow copy preserves the caller-expected new-reference
      // shape (downstream cascade scoring writes back r.score) without the
      // redundant _resultDemotionOrigIndex field — V8 Array.sort is stable
      // since ES2019, so the in-place index-order tie-break is implicit.
      adjusted[index] = { ...result };
      continue;
    }
    changed = true;

    // Range-preservation invariant: adopting an entity is a *labeling*
    // operation (it tells the caller what symbol the chunk is about); it
    // must not SHRINK a well-formed retrieval chunk to a per-symbol entity
    // boundary. The cAST/sibling-merged chunk is the right unit for the
    // agent to read; the entity name + type are added as annotations.
    //
    // Concretely: a Go file's bsonBinding has a 1-line typeAlias entity
    // at line 14, but the LI chunk is lines 1-31 (typeAlias + 3 methods,
    // all merged by cAST). Adopting the entity's range used to drop 30
    // lines of content; now we keep the chunk range and just adopt the
    // name/type as labels. Range adoption only fires when the entity
    // is at least as large as the chunk (e.g. expanding a partial
    // chunk to its enclosing symbol — which is the legitimate use case).
    const chunkStart = result.metadata?.startLine ?? result.startLine ?? null;
    const chunkEnd = result.metadata?.endLine ?? result.endLine ?? null;
    const chunkRange = (chunkStart != null && chunkEnd != null)
      ? Math.max(0, chunkEnd - chunkStart + 1)
      : 0;
    const entityRange = entityToAdopt
      ? Math.max(0, (entityToAdopt.endLine || 0) - (entityToAdopt.startLine || 0) + 1)
      : 0;
    const adoptRange = !!entityToAdopt && entityRange >= chunkRange;
    const adoptedFile = entityToAdopt
      ? (entityToAdopt.file || entityToAdopt.filePath || resolveFilePath(result))
      : null;
    const baseMetadata = result.metadata || {};
    const nextMetadata = entityToAdopt
      ? {
        ...baseMetadata,
        ...(shouldAdoptEntity
          ? { name: entityToAdopt.name || baseMetadata.name || result.name || null }
          : { name: entityToAdopt.name }),
        type: entityToAdopt.type,
        ...(adoptRange ? {
          file: adoptedFile,
          startLine: entityToAdopt.startLine,
          endLine: entityToAdopt.endLine,
        } : {}),
      }
      : baseMetadata;
    adjusted[index] = {
      ...result,
      ...(entityToAdopt ? {
        name: shouldAdoptEntity
          ? (entityToAdopt.name || result.name)
          : entityToAdopt.name,
        type: entityToAdopt.type,
        ...(adoptRange ? {
          startLine: entityToAdopt.startLine,
          endLine: entityToAdopt.endLine,
        } : {}),
      } : {}),
      ...(nextMetadata ? { metadata: nextMetadata } : {}),
      score: baseScore * mult,
      _resultDemotionOrigScore: baseScore,
      _resultDemotionMult: mult,
      _resultDemotionDetails: details ?? [],
    };
  }

  // Dump per-rule timings to globalThis.__stageTimings (set by the profiler).
  // No-op in production. Labels mirror the rule names so the profiler's flat
  // table reads cleanly.
  if (__profOn && __ruleTime) {
    const labels = [
      'rule:testName', 'rule:entityKind', 'rule:namePrec', 'rule:body',
      'rule:megaChunk', 'rule:anomalous', 'rule:docComment', 'rule:megaEntity',
      'rule:symbolExact', 'rule:pathToken', 'rule:refCount', 'rule:adoptEntity',
    ];
    const buf = globalThis.__stageTimings;
    for (let i = 0; i < labels.length; i++) {
      (buf[labels[i]] = buf[labels[i]] || []).push(__ruleTime[i]);
    }
  }

  if (!changed) return results;

  // V8 Array.sort is stable (ES2019) — same-score results retain their
  // original-window order without needing the explicit _origIndex tiebreak
  // the prior implementation carried.
  adjusted.sort((a, b) => (b.score || 0) - (a.score || 0));
  return windowSize === results.length
    ? adjusted
    : adjusted.concat(results.slice(windowSize));
}
2295
+
144
2296
  function envOff() {
145
2297
  return process.env.SWEET_SEARCH_FILE_KIND_RANKING === '0'
146
2298
  || process.env.SWEET_SEARCH_FILE_KIND_RANKING === 'false';
@@ -169,7 +2321,7 @@ const DEFAULT_WINDOW = 30;
169
2321
  *
170
2322
  * Demotion fires only when:
171
2323
  * - intent === 'implementation' (confident, NOT 'unknown'), AND
172
- * - the top-N window contains at least one docs/tests/types candidate, AND
2324
+ * - the top-N window contains at least one demotable candidate, AND
173
2325
  * - the top-N window contains at least one implementation candidate.
174
2326
  *
175
2327
  * In every other case the original `results` array is returned unchanged
@@ -182,11 +2334,15 @@ const DEFAULT_WINDOW = 30;
182
2334
  * @param {Object} [opts]
183
2335
  * @param {string} [opts.query] - raw query (used to infer intent
184
2336
  * if opts.intent isn't supplied)
185
- * @param {'docs'|'tests'|'types'|'implementation'|'unknown'} [opts.intent]
2337
+ * @param {'docs'|'tests'|'types'|'ancillary'|'implementation'|'unknown'} [opts.intent]
186
2338
  * - explicit intent override
187
2339
  * @param {number} [opts.docFactor] - default from env / 0.85
2340
+ * @param {number} [opts.exampleFactor] - default from docFactor
188
2341
  * @param {number} [opts.testFactor] - default from env / 0.85
189
2342
  * @param {number} [opts.typeFactor] - default from env / 0.85
2343
+ * @param {number} [opts.ancillaryFactor] - default from env / 0.85
2344
+ * @param {number} [opts.tinyAncillaryFactor]
2345
+ * @param {number} [opts.tinyLineThreshold]
190
2346
  * @param {number} [opts.window] - top-N window for analysis +
191
2347
  * bounded re-sort (default 30)
192
2348
  * @returns {Array} either the original `results` (no-op) or a new array
@@ -209,14 +2365,22 @@ export function applyFileKindRanking(results, opts = {}) {
209
2365
  : envWindow('SWEET_SEARCH_FILE_KIND_WINDOW', DEFAULT_WINDOW);
210
2366
  const windowSize = Math.min(window, results.length);
211
2367
 
2368
+ // Per-call file-kind cache: detectFileKind is invoked for every result
2369
+ // here AND inside isTestChunk → the same file path can be classified
2370
+ // many times in one applyFileKindRanking + applyResultDemotions pass.
2371
+ // Caller may pass opts._fileKindCache to share with the demotion sites.
2372
+ const fileKindOpts = opts._fileKindCache instanceof Map
2373
+ ? opts
2374
+ : { ...opts, _fileKindCache: new Map() };
2375
+
212
2376
  // Walk the window once: classify kinds and check for competition.
213
2377
  const kinds = new Array(windowSize);
214
2378
  let demotableCount = 0;
215
2379
  let implCount = 0;
216
2380
  for (let i = 0; i < windowSize; i++) {
217
- const k = detectFileKind(resolveFilePath(results[i]));
2381
+ const k = detectFileKind(resolveFilePath(results[i]), fileKindOpts);
218
2382
  kinds[i] = k;
219
- if (k === 'docs' || k === 'tests' || k === 'types') demotableCount++;
2383
+ if (k === 'docs' || k === 'examples' || k === 'tests' || k === 'types' || k === 'ancillary') demotableCount++;
220
2384
  else if (k === 'implementation') implCount++;
221
2385
  }
222
2386
 
@@ -225,8 +2389,14 @@ export function applyFileKindRanking(results, opts = {}) {
225
2389
 
226
2390
  const factor = envFactor('SWEET_SEARCH_FILE_KIND_FACTOR', DEFAULT_FACTOR);
227
2391
  const docFactor = opts.docFactor != null ? opts.docFactor : factor;
2392
+ const exampleFactor = opts.exampleFactor != null ? opts.exampleFactor : docFactor;
228
2393
  const testFactor = opts.testFactor != null ? opts.testFactor : factor;
229
2394
  const typeFactor = opts.typeFactor != null ? opts.typeFactor : factor;
2395
+ const ancillaryFactor = opts.ancillaryFactor != null ? opts.ancillaryFactor : factor;
2396
+ const tinyAncillaryFactor = opts.tinyAncillaryFactor != null
2397
+ ? opts.tinyAncillaryFactor
2398
+ : ancillaryFactor;
2399
+ const tinyLineThreshold = opts.tinyLineThreshold != null ? opts.tinyLineThreshold : 3;
230
2400
 
231
2401
  const reranked = new Array(windowSize);
232
2402
  for (let i = 0; i < windowSize; i++) {
@@ -234,8 +2404,15 @@ export function applyFileKindRanking(results, opts = {}) {
234
2404
  const kind = kinds[i];
235
2405
  let mult = 1;
236
2406
  if (kind === 'docs') mult = docFactor;
2407
+ else if (kind === 'examples') mult = exampleFactor;
237
2408
  else if (kind === 'tests') mult = testFactor;
238
2409
  else if (kind === 'types') mult = typeFactor;
2410
+ else if (kind === 'ancillary') {
2411
+ const lineCount = inferLineCount(r);
2412
+ mult = lineCount <= tinyLineThreshold
2413
+ ? Math.min(ancillaryFactor, tinyAncillaryFactor)
2414
+ : ancillaryFactor;
2415
+ }
239
2416
  const baseScore = (typeof r.score === 'number') ? r.score : 0;
240
2417
  reranked[i] = {
241
2418
  ...r,