sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,499 @@
1
+ /**
2
+ * Identifier-Anchored Retrieval (IAR).
3
+ *
4
+ * Aider / Cursor / Cody / Greptile all couple dense retrieval with an
5
+ * exact-name symbol lookup so abstract natural-language queries that
6
+ * happen to mention a real entity name can land on that entity even
7
+ * when the encoder ranks something tangentially-similar higher.
8
+ *
9
+ * This module:
10
+ * 1. Extracts identifier-shaped tokens from the query (camelCase,
11
+ * multi-hump PascalCase, snake_case, kCamel, digit-bearing; ≥3 chars; optionally plain Titlecase). Filtering is by shape only — no stopword/keyword list is consulted.
12
+ * 2. Looks them up case-insensitively against the entities graph
13
+ * (any kind: function, method, struct, type, class, etc.).
14
+ * 3. Maps each matched entity to the cAST/LI chunk that covers it.
15
+ * 4. Injects those chunks into the candidate set with a baseline
16
+ * lexical-anchor score, deduped against existing fused results.
17
+ *
18
+ * The downstream pipeline (entity-kind preference, name precision,
19
+ * doc/test demotion, MMR) then ranks the augmented candidate set
20
+ * using its existing rules. IAR is purely additive — it can only
21
+ * surface entities that genuinely exist in the index.
22
+ *
23
+ * Disable via `ablations: new Set(['no-anchor-injection'])`.
24
+ */
25
+
26
+ import { extractNameHints } from '../ranking/file-kind-ranking.js';
27
+
28
/**
 * Collect identifier-shaped anchor names from a free-text query.
 *
 * This is intentionally stricter than a generic name-hint extractor: plain
 * English words such as "request", "config" or "default" must NOT become
 * anchors, because they share a lowercase spelling with real entities and
 * would pull those entities ahead of what the user is actually asking about.
 *
 * A token (matched by `[A-Za-z_][A-Za-z0-9_]+`, minimum 3 characters) is
 * accepted when either:
 *   - `isStrongIdentifierToken` accepts it — it contains an underscore, a
 *     lower→upper transition, two or more uppercase letters, or a digit
 *     ("kSchemaParams", "BindBody", "calculate_path", "HTTP", "sha256"); or
 *   - `opts.allowPlainTitlecase === true` and the token is plain Titlecase
 *     ("Downloads", "Fastify"). Without the opt-in such tokens are rejected
 *     as too ambiguous with sentence capitalisation.
 *
 * Tokens are returned with their original casing; the Set removes duplicates
 * while keeping first-seen order.
 *
 * @param {string} query - Raw user query (null/undefined is treated as '').
 * @param {{ allowPlainTitlecase?: boolean }} [opts]
 * @returns {Set<string>} Accepted anchor tokens.
 */
export function extractStrictAnchorNames(query, opts = {}) {
  const words = String(query || '').match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];
  const plainTitlecaseOk = opts.allowPlainTitlecase === true;

  const accepted = words.filter((word) => {
    // Length floor drops noise such as "is", "to", "by".
    if (word.length < 3) return false;
    if (isStrongIdentifierToken(word)) return true;
    return plainTitlecaseOk && isPlainTitlecase(word);
  });

  return new Set(accepted);
}
64
+
65
// Fallback for `opts.entityLimit`: max entities fetched from the graph per query.
const DEFAULT_PER_QUERY_ENTITY_LIMIT = 16;
// Score given to an anchor with zero counted hint matches (the floor).
const ANCHOR_BASELINE_SCORE = 0.5;
// Added to the baseline once per hint that overlaps the entity name.
const ANCHOR_PER_HINT_BONUS = 0.1;
// Hard ceiling for an anchor score — never beat a strong fused top-1.
const ANCHOR_MAX_SCORE = 0.85;
// Additive bump applied when the anchored chunk is already in the fused list.
const EXISTING_BOOST = 0.05;
70
+
71
// Entity types treated as "the user named THIS THING by writing its name".
// Membership gates both the score floor on already-fused chunks and the
// injection of brand-new candidates in injectAnchorCandidates.
//
// Function / method / component types are deliberately absent: per the
// original author's notes the dense ranker already handles those well, and
// an anchor floor stacked with the post-fusion definition-match boost pushes
// scores to ~1.0, bulldozing the more specific function the user wants on
// "prototype / property-of-X" style queries.
const CLASS_LIKE_ENTITY_TYPES = new Set([
  'class',
  'module',
  'interface',
  'trait',
  'struct',
  'record',
  'enum',
  'namespace',
]);
82
+
83
+ /**
84
+ * Uniqueness ceiling for anchor names: hints whose lowercase form matches
85
+ * MORE entities than this threshold are dropped before injection. KPR/SPAR
86
+ * pattern (arXiv 2507.03922, 2110.06918): entity-aware injection helps in
87
+ * proportion to rarity.
88
+ *
89
+ * **Default: 0 (gate DISABLED).** On the current 60-probe dev/held-out split
90
+ * (40/20, seed=42, stratified by repo) the gate at ceil=8 transfers
91
+ * asymmetrically — dev gains 2 PASS / loses 0, held-out gains 0 PASS / loses
92
+ * 1 (S3-Q3 fastify). One probe (S3-Q3) had a brittle pre-fix PASS that
93
+ * relied on IAR flooding + MMR diversity penalty rather than dense-ranking
94
+ * signal. The principle is sound but the eval set is too small (60 queries)
95
+ * to ship a non-zero default per the BEIR-grade methodology in CLAUDE.md
96
+ * §Benchmark Methodology — held-out regressions are non-negotiable.
97
+ *
98
+ * Opt in via `SWEET_SEARCH_IAR_UNIQUENESS_CEIL=N`. Aligned with the existing
99
+ * ref-count homonym ceiling (file-kind-ranking.js, env
100
+ * SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE, default 12); experiments
101
+ * suggest 8 for IAR. Set higher for less aggressive gating, 0 to disable.
102
+ */
103
+ // Default 0 = gate disabled. Held-out 60-probe eval (2026-05-07) showed no
104
+ // ceil value transfers: corpus stats lock dev/held-out probes together (the
105
+ // same hint Fastify=46 that helps dev S3-Q7+S4-Q2 hurts held-out S3-Q3).
106
+ // Re-evaluate when a >200-query post-cutoff (FreshStack-style) eval lands.
107
const DEFAULT_UNIQUENESS_CEIL = 0;

/**
 * Resolve the uniqueness ceiling used to drop over-common anchor hints.
 *
 * Precedence:
 *   1. `opts.uniquenessCeil`, when it is a finite number (returned as-is,
 *      without range checking).
 *   2. The `SWEET_SEARCH_IAR_UNIQUENESS_CEIL` environment variable, parsed
 *      as a base-10 integer; unset, empty, unparsable or negative values
 *      are ignored.
 *   3. `DEFAULT_UNIQUENESS_CEIL`.
 *
 * A result of 0 means "no gate".
 *
 * @param {{ uniquenessCeil?: number }|null|undefined} opts
 * @returns {number}
 */
function readUniquenessCeil(opts) {
  const explicit = opts?.uniquenessCeil;
  if (Number.isFinite(explicit)) return explicit;

  const envValue = process.env.SWEET_SEARCH_IAR_UNIQUENESS_CEIL;
  const parsed = envValue == null || envValue === ''
    ? Number.NaN
    : Number.parseInt(envValue, 10);

  return Number.isFinite(parsed) && parsed >= 0 ? parsed : DEFAULT_UNIQUENESS_CEIL;
}
119
+
120
/**
 * Find the late-interaction (LI) chunk that covers an entity's
 * (filePath, startLine, endLine) region.
 *
 * Performs one linear scan over `liIndex.documents` and keeps two candidates:
 *
 *   - containing chunk: same file, `startLine <= entity.startLine` and
 *     `endLine >= entity.endLine`. When several contain the entity the
 *     SMALLEST span wins (symbol-aligned chunk beats an enclosing parent).
 *   - header fallback: used only when no chunk fully contains the entity
 *     (e.g. a very large class whose body was split into several chunks).
 *     The chunk must start on exactly the entity's declaration line AND its
 *     `metadata.name` must equal the entity name case-insensitively. The
 *     line equality avoids grabbing a member chunk that merely lives inside
 *     the entity's range; the name equality avoids grabbing an adjacent
 *     declaration that happens to start on the same line.
 *
 * Returns null when the index or entity is missing, when the index exposes
 * no iterable `documents` collection, or when nothing qualifies.
 *
 * @param {object} liIndex - Object exposing `.documents`, an iterable of
 *   `[id, doc]` pairs (a Map in practice) where `doc.metadata` carries
 *   `file`, `startLine`, `endLine` and optionally `name`.
 * @param {{ filePath: string, startLine: number, endLine: number, name?: string }} entity
 * @returns {{ id: string, metadata: object, content?: string, text?: string }|null}
 *   The chunk's document fields spread over `{ id }` (a document-level `id`
 *   property, if present, takes precedence over the collection key).
 */
function findChunkForEntity(liIndex, entity) {
  if (!liIndex || !entity) return null;

  // An index that has not loaded its document table must behave like
  // "no chunk found" rather than throwing out of the caller's scoring loop.
  const documents = liIndex.documents;
  if (documents == null || typeof documents[Symbol.iterator] !== 'function') return null;

  let best = null;
  let bestSize = Infinity;
  let headerBest = null;
  let headerBestSize = Infinity;
  const entityNameLc = String(entity.name || '').toLowerCase();

  for (const [id, doc] of documents) {
    const m = doc?.metadata;
    if (!m || m.file !== entity.filePath) continue;

    const cs = m.startLine;
    const ce = m.endLine;
    if (cs == null || ce == null) continue;

    const size = ce - cs;
    if (cs <= entity.startLine && ce >= entity.endLine) {
      // Fully containing chunk — prefer the tightest one.
      if (size < bestSize) {
        best = { id, ...doc };
        bestSize = size;
      }
    } else if (
      entityNameLc
      && cs === entity.startLine
      && m.name
      && String(m.name).toLowerCase() === entityNameLc
    ) {
      // Header chunk of an oversized entity: same declaration line, same name.
      if (size < headerBestSize) {
        headerBest = { id, ...doc };
        headerBestSize = size;
      }
    }
  }

  return best || headerBest;
}
181
+
182
/**
 * Build the `file|startLine|endLine` identity string for a result or chunk.
 *
 * Each field is read from `r.metadata` first and falls back to the same
 * field on `r` itself. The file falls back on any falsy value; the line
 * numbers fall back only on null/undefined, so a line number of 0 is kept.
 * Missing fields render as the literal text "undefined".
 *
 * @param {{ metadata?: object, file?: string, startLine?: number, endLine?: number }} r
 * @returns {string}
 */
function chunkKey(r) {
  const meta = r.metadata || {};
  const fields = [
    meta.file || r.file,
    meta.startLine ?? r.startLine,
    meta.endLine ?? r.endLine,
  ];
  return fields.map((part) => `${part}`).join('|');
}
189
+
190
/**
 * Score an anchored entity by how many query hints overlap its name.
 *
 * A hint counts when, after lowercasing, it equals the entity name, is a
 * substring of it, or contains it. The score is
 * `ANCHOR_BASELINE_SCORE + ANCHOR_PER_HINT_BONUS * matches`, capped at
 * `ANCHOR_MAX_SCORE`.
 *
 * An entity without a name counts zero matches. Without that guard the
 * empty string is a substring of every hint, so an unnamed entity would
 * collect the bonus for every hint in the query.
 *
 * @param {{ name?: string }} entity
 * @param {Iterable<string>} hintsLower - Hints, already lowercased.
 * @returns {number} Score in [ANCHOR_BASELINE_SCORE, ANCHOR_MAX_SCORE].
 */
function scoreForAnchor(entity, hintsLower) {
  const nameLc = String(entity.name || '').toLowerCase();
  let matched = 0;
  if (nameLc !== '') {
    for (const h of hintsLower) {
      if (nameLc === h || nameLc.includes(h) || h.includes(nameLc)) matched++;
    }
  }
  return Math.min(ANCHOR_MAX_SCORE, ANCHOR_BASELINE_SCORE + ANCHOR_PER_HINT_BONUS * matched);
}
198
+
199
/**
 * True for a "plain Titlecase" word: exactly one leading uppercase ASCII
 * letter followed by one or more lowercase ASCII letters or digits
 * (e.g. "Downloads", "Fastify"). Anything with a second uppercase letter,
 * an underscore, or only a single character is rejected.
 *
 * @param {string} token
 * @returns {boolean}
 */
function isPlainTitlecase(token) {
  const plainTitlecaseShape = /^[A-Z][a-z0-9]+$/;
  return plainTitlecaseShape.test(token);
}
202
+
203
/**
 * True when a token has an unmistakably "programmer-chosen" shape, i.e. at
 * least one of:
 *   - contains an underscore            (snake_case, ALL_CAPS_CONST)
 *   - a lowercase letter directly followed by an uppercase one (camelCase)
 *   - two or more uppercase letters anywhere (PascalCase humps, acronyms)
 *   - contains a digit                  (sha256, v2Client)
 *
 * @param {string} token
 * @returns {boolean}
 */
function isStrongIdentifierToken(token) {
  if (token.includes('_')) return true;
  if (/[a-z][A-Z]/.test(token)) return true;
  if (/[A-Z].*[A-Z]/.test(token)) return true;
  return /\d/.test(token);
}
206
+
207
/**
 * Decide whether an entity's name overlaps at least one anchor hint.
 *
 * Two names "overlap" when they are equal or either one contains the other.
 * The comparison is case-insensitive for every hint except plain-Titlecase
 * hints that are not also strong identifiers (e.g. "Helpers"): those are
 * compared case-SENSITIVELY, so the capitalised word does not match a
 * lowercase `helpers` entity.
 *
 * Entities without a name never match.
 *
 * @param {{ name?: string }|null|undefined} entity
 * @param {Iterable<string>} hints - Hints in their original casing.
 * @returns {boolean}
 */
function entityMatchesAnchorHint(entity, hints) {
  const name = String(entity?.name || '');
  if (!name) return false;

  const nameLower = name.toLowerCase();
  const overlaps = (a, b) => a === b || a.includes(b) || b.includes(a);

  for (const hint of hints) {
    const caseSensitive = !isStrongIdentifierToken(hint) && isPlainTitlecase(hint);
    const matched = caseSensitive
      ? overlaps(name, hint)
      : overlaps(nameLower, hint.toLowerCase());
    if (matched) return true;
  }

  return false;
}
234
+
235
/**
 * Inject identifier-anchored candidates into a fused result list.
 *
 * The `fused` array itself is never reordered or extended — the full path
 * returns a sorted shallow copy — but result OBJECTS whose chunk matches an
 * anchor are mutated in place (`score`, `_anchorBoosted`, `_anchorEntity`),
 * so callers sharing those objects will observe the boost. On every
 * bail-out path the original `fused` reference is returned as-is.
 *
 * Pipeline:
 *   1. Extract strict anchor hints from the query.
 *   2. Optionally drop hints matching more entities than the uniqueness
 *      ceiling (disabled when the ceiling is 0).
 *   3. Look the surviving hints up in the code graph, with a per-hint quota
 *      when there is more than one hint.
 *   4. Map each entity to its covering LI chunk, then either boost the
 *      already-fused result for that chunk or (class-like entities only)
 *      append a new candidate.
 *   5. Re-sort by descending score.
 *
 * @param {Array} fused - Result list after CC/RRF fusion.
 * @param {string} query - The user's query.
 * @param {object} [opts]
 * @param {object} opts.codeGraphRepo - Must expose `findEntitiesByAnyName`;
 *   `countEntitiesByAnyName` is used for the uniqueness gate when present.
 * @param {object} opts.lateInteractionIndex - LateInteractionIndex.
 * @param {Set<string>|string[]} [opts.ablations] - Contains
 *   'no-anchor-injection' to disable this stage.
 * @param {number} [opts.entityLimit] - Total entity budget per query.
 * @param {number} [opts.uniquenessCeil] - Overrides the env/default ceiling.
 * @param {boolean} [opts.allowPlainTitlecase] - Defaults to true here.
 * @returns {{ results: Array, stats: { hintCount: number, entitiesFound: number,
 *   newCandidates: number, existingBoosted: number, hintsKept?: number,
 *   droppedCommon?: Array<{hint: string, count: number}>, uniquenessCeil?: number } }}
 *   `droppedCommon`/`uniquenessCeil` appear once the uniqueness gate has
 *   been evaluated; `hintsKept` only on the full (non-bail-out) path.
 */
export function injectAnchorCandidates(fused, query, opts = {}) {
  // Bail-out payload used before any hint has been extracted.
  const bareNoop = () => ({
    results: fused,
    stats: { hintCount: 0, entitiesFound: 0, newCandidates: 0, existingBoosted: 0 },
  });

  const ablations = opts.ablations;
  const anchorDisabled = ablations instanceof Set
    ? ablations.has('no-anchor-injection')
    : Array.isArray(ablations) && ablations.includes('no-anchor-injection');
  if (anchorDisabled) return bareNoop();

  const repo = opts.codeGraphRepo;
  const liIndex = opts.lateInteractionIndex;
  if (!repo || !liIndex || typeof repo.findEntitiesByAnyName !== 'function') {
    return bareNoop();
  }

  const allHints = [...extractStrictAnchorNames(query || '', {
    allowPlainTitlecase: opts.allowPlainTitlecase !== false,
  })];
  if (allHints.length === 0) return bareNoop();

  // Uniqueness gate: drop any hint whose lowercase form matches more
  // entities than the ceiling (IDF-gated injection — rare identifiers
  // benefit from anchoring, very common ones flood the candidate set).
  // The gate is OFF unless a positive ceiling is configured: the built-in
  // default is 0 (disabled); opt in via `opts.uniquenessCeil` or the
  // SWEET_SEARCH_IAR_UNIQUENESS_CEIL environment variable.
  const ceil = readUniquenessCeil(opts);
  let hints = allHints;
  const droppedCommon = [];
  if (ceil > 0 && typeof repo.countEntitiesByAnyName === 'function') {
    let countMap = null;
    try {
      countMap = repo.countEntitiesByAnyName(allHints);
    } catch {
      // Best-effort: a failed count lookup leaves every hint in place.
      countMap = null;
    }
    if (countMap) {
      const kept = [];
      for (const h of allHints) {
        const c = countMap.get(h.toLowerCase()) || 0;
        if (c <= ceil) {
          kept.push(h);
        } else {
          droppedCommon.push({ hint: h, count: c });
        }
      }
      hints = kept;
    }
  }

  // Bail-out payload used once the gate has run (carries gate diagnostics).
  const gatedNoop = () => ({
    results: fused,
    stats: {
      hintCount: allHints.length,
      entitiesFound: 0,
      newCandidates: 0,
      existingBoosted: 0,
      droppedCommon,
      uniquenessCeil: ceil,
    },
  });

  if (hints.length === 0) return gatedNoop();
  const hintsLower = hints.map(s => s.toLowerCase());

  let entities = [];
  try {
    const totalLimit = opts.entityLimit ?? DEFAULT_PER_QUERY_ENTITY_LIMIT;
    if (hints.length > 1) {
      // Per-hint quota: each hint gets up to ceil(totalLimit / hints.length)
      // entities, deduped by identity, so one very common hint cannot
      // consume the whole budget and crowd out rarer co-hints.
      const perHint = Math.max(1, Math.ceil(totalLimit / hints.length));
      const seen = new Set();
      const entityKey = (e) => e?.id != null
        ? `id:${e.id}`
        : `${e?.filePath || ''}|${e?.startLine ?? ''}|${e?.endLine ?? ''}|${e?.name || ''}`;
      for (const h of hints) {
        // Fetch a 3x window so the re-ranking below can prefer case-exact
        // names over case-folded homonyms — the user's capitalisation is
        // treated as intent ("Helpers" the module, not `helpers` the method).
        const wider = repo.findEntitiesByAnyName([h], { limit: perHint * 3 }) || [];
        // Stable sort: exact-case matches first, upstream order otherwise.
        wider.sort((a, b) => {
          const aExact = a.name === h ? 0 : 1;
          const bExact = b.name === h ? 0 : 1;
          return aExact - bExact;
        });
        let added = 0;
        for (const e of wider) {
          const key = entityKey(e);
          if (seen.has(key)) continue;
          entities.push(e);
          seen.add(key);
          added++;
          if (added >= perHint || entities.length >= totalLimit) break;
        }
        if (entities.length >= totalLimit) break;
      }
    } else {
      entities = repo.findEntitiesByAnyName(hints, { limit: totalLimit }) || [];
    }
  } catch {
    // Best-effort: a failing graph lookup degrades to "no anchors".
    return gatedNoop();
  }
  if (entities.length === 0) return gatedNoop();

  // Index existing fused results by chunk key for dedup and existing-boost.
  const fusedByKey = new Map();
  for (const r of fused) fusedByKey.set(chunkKey(r), r);

  let newCandidates = 0;
  let existingBoosted = 0;
  const out = fused.slice(); // copy — injections are appended to the copy

  // Per-chunk progress, so several entities resolving to the same chunk are
  // handled once and independently of entity order:
  //   'touched'  — a non-class entity already applied the additive boost
  //   'anchored' — class-level treatment (score floor or injection) applied
  // A chunk is only claimed when something was actually done to it. Claiming
  // it up-front would let a non-class entity (e.g. a method inside a small
  // class) silently block the class-like entity that shares its chunk.
  const chunkState = new Map();

  for (const entity of entities) {
    if (!entityMatchesAnchorHint(entity, hints)) continue;
    const chunk = findChunkForEntity(liIndex, entity);
    if (!chunk) continue;
    const key = chunkKey({ metadata: chunk.metadata });

    // Class-anchor gate: the score floor and new-candidate injection fire
    // only for class-like entities (class, module, interface, trait,
    // struct, record, enum, namespace). When a user types such a name they
    // almost always mean the type itself; for function/method names the
    // dense ranker is left in charge, because a name-only anchor tends to
    // over-promote the literal entity over the more specific function the
    // query is about. Non-class matches still get the small additive boost
    // and the `_anchorBoosted` marker when their chunk is already fused.
    const isClassLike = Boolean(entity?.type && CLASS_LIKE_ENTITY_TYPES.has(entity.type));

    const state = chunkState.get(key);
    if (state === 'anchored' || (state === 'touched' && !isClassLike)) continue;

    const anchorScore = scoreForAnchor(entity, hintsLower);

    const existing = fusedByKey.get(key);
    if (existing) {
      if (state === 'touched') {
        // The additive boost was already applied by a non-class entity;
        // upgrade to the class floor without boosting a second time.
        existing.score = Math.max(existing.score || 0, anchorScore);
      } else {
        const boosted = (existing.score || 0) + EXISTING_BOOST;
        existing.score = isClassLike ? Math.max(boosted, anchorScore) : boosted;
        existing._anchorBoosted = true;
        existingBoosted++;
      }
      existing._anchorEntity = entity.name;
      chunkState.set(key, isClassLike ? 'anchored' : 'touched');
      continue;
    }

    // New-injection path: class-like entities only (see gate rationale).
    // The key is intentionally left unclaimed for non-class entities.
    if (!isClassLike) continue;

    // Inject as a fresh candidate. Carry the LI chunk's metadata so the
    // downstream packager has the correct file/range/type.
    out.push({
      id: chunk.id,
      file: chunk.metadata?.file,
      startLine: chunk.metadata?.startLine,
      endLine: chunk.metadata?.endLine,
      name: chunk.metadata?.name || entity.name,
      type: chunk.metadata?.type || entity.type,
      content: chunk.content || chunk.text || '',
      metadata: { ...(chunk.metadata || {}) },
      score: anchorScore,
      searchPath: 'anchor',
      _anchorInjected: true,
      _anchorEntity: entity.name,
      _anchorEntityType: entity.type,
    });
    chunkState.set(key, 'anchored');
    newCandidates++;
  }

  // Re-sort by score so the augmented list is consistent for downstream
  // top-k truncation.
  out.sort((a, b) => (b.score || 0) - (a.score || 0));

  return {
    results: out,
    stats: {
      hintCount: allHints.length,
      hintsKept: hints.length,
      entitiesFound: entities.length,
      newCandidates,
      existingBoosted,
      droppedCommon,
      uniquenessCeil: ceil,
    },
  };
}
@@ -10,6 +10,13 @@
10
10
 
11
11
  import { SYMBOL_KIND_WEIGHTS, DEFINITION_TYPES } from '../infrastructure/constants.js';
12
12
 
13
// Common English and code-filler words. splitIdentifierTerms() drops any term
// found here, so these words never count as identifier agreement between a
// query and a candidate.
const IDENTIFIER_AGREEMENT_STOPWORDS = new Set([
  'and', 'are',
  'can',
  'does',
  'for', 'from',
  'get',
  'has', 'have', 'how',
  'into',
  'new', 'not',
  'other',
  'return', 'returns',
  'set', 'should',
  'that', 'the', 'this', 'true',
  'use', 'used', 'using',
  'was', 'were', 'what', 'when', 'where', 'which', 'with',
  'you',
]);
19
+
13
20
  // =============================================================================
14
21
  // BOOST_POLICY (static property on SweetSearch)
15
22
  // =============================================================================
@@ -109,12 +116,14 @@ export function getBoostIntent(routerMode, routerConfidence) {
109
116
  * NOTE: References SweetSearch.BOOST_POLICY — we import BOOST_POLICY locally
110
117
  * and reference it directly since the static property is wired separately.
111
118
  */
112
- export function applyPostFusionBoosts(fusedResults, query, routerMode, routerConfidence) {
119
+ export function applyPostFusionBoosts(fusedResults, query, routerMode, routerConfidence, options = {}) {
113
120
  const boostIntent = this.getBoostIntent(routerMode, routerConfidence);
114
121
  const policy = BOOST_POLICY[boostIntent] || BOOST_POLICY.general;
115
122
 
116
123
  const queryLower = query.toLowerCase().trim();
117
124
  const queryTokens = this.extractQueryTokens(query);
125
+ const agentFormats = new Set(['agent', 'agent_preview', 'agent_full', 'agent_full_xl']);
126
+ const allowIdentifierAgreement = !agentFormats.has(options.format);
118
127
 
119
128
  return fusedResults.map(result => {
120
129
  let totalBoost = 1.0;
@@ -140,6 +149,16 @@ export function applyPostFusionBoosts(fusedResults, query, routerMode, routerCon
140
149
  }
141
150
  }
142
151
 
152
+ // 2.5 Identifier agreement: prefer symbols/files whose meaningful
153
+ // identifier words are named by the natural-language query.
154
+ const idBoost = allowIdentifierAgreement
155
+ ? this.computeIdentifierAgreementBoost?.(result, query)
156
+ : 1.0;
157
+ if (idBoost > 1.0) {
158
+ totalBoost *= idBoost;
159
+ boostDetails.push(`id:${idBoost.toFixed(2)}`);
160
+ }
161
+
143
162
  // 3. Symbol Kind Hierarchy (always mild)
144
163
  if (policy.kindHierarchy) {
145
164
  const kindWeight = SYMBOL_KIND_WEIGHTS[result.type] || 0.5;
@@ -175,6 +194,78 @@ export function applyPostFusionBoosts(fusedResults, query, routerMode, routerCon
175
194
  }).sort((a, b) => b.score - a.score);
176
195
  }
177
196
 
197
/**
 * Read a floating-point setting from an environment variable.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Returned when the variable is unset, empty, not a
 *   finite number, or outside the inclusive range [min, max].
 * @param {number} [min=0] - Inclusive lower bound for an accepted value.
 * @param {number} [max=1] - Inclusive upper bound for an accepted value.
 * @returns {number} The parsed value, or `fallback`.
 */
function envFloat(name, fallback, min = 0, max = 1) {
  const text = process.env[name];
  if (text === undefined || text === null || text === '') {
    return fallback;
  }

  const value = Number.parseFloat(text);
  if (!Number.isFinite(value)) {
    return fallback;
  }
  if (value < min || value > max) {
    return fallback;
  }
  return value;
}
203
+
204
/**
 * Break an identifier, file name, or natural-language query into normalized
 * terms for identifier-agreement matching.
 *
 * Pipeline: drop a trailing `_` + 8-hex-digit suffix, drop a trailing file
 * extension, split camelCase boundaries, lowercase, split on non-alphanumeric
 * characters, stem each piece, then discard terms shorter than three
 * characters and stopwords.
 *
 * @param {*} value - Text to split; falsy values are treated as ''.
 * @returns {string[]} Normalized terms in order of appearance (may repeat).
 */
function splitIdentifierTerms(value) {
  return String(value || '')
    // The extension patterns exclude whitespace: this function is also fed
    // whole queries, and an unrestricted `\.[^.]+$` would delete every word
    // after the last dot (e.g. "config.js parse token" -> "config").
    .replace(/_[0-9a-f]{8}(?=\.[^.\s]+$|$)/gi, '')
    .replace(/\.[^.\s]+$/, '')
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .map(stemIdentifierTerm)
    .filter(term => term.length >= 3 && !IDENTIFIER_AGREEMENT_STOPWORDS.has(term));
}
214
+
215
/**
 * Reduce a lowercase term to a rough stem so that simple inflections of the
 * same word compare equal (e.g. "copies"/"copied" -> "copy").
 *
 * This is a small suffix stripper, not a full stemmer: the first applicable
 * rule wins and a suffix is only removed when more than three characters
 * remain afterwards.
 *
 * @param {string} term - Lowercase term to stem.
 * @returns {string} The stemmed term, or `term` unchanged when no rule applies.
 */
function stemIdentifierTerm(term) {
  // "-ies" and "-ied" both inflect a "-y" stem; restore the "y" for both so
  // "modifies" and "modified" agree with "modify".
  if (term.length > 4 && (term.endsWith('ies') || term.endsWith('ied'))) {
    return `${term.slice(0, -3)}y`;
  }
  for (const suffix of ['ing', 'ers', 'ed', 'es', 's']) {
    if (!term.endsWith(suffix) || term.length <= suffix.length + 3) continue;
    // A trailing "ss" is part of the word ("class", "process"), not a plural
    // marker; stripping it would make "class" disagree with "classes".
    if (suffix === 's' && term.endsWith('ss')) continue;
    return term.slice(0, -suffix.length);
  }
  return term;
}
224
+
225
/**
 * Boost candidates whose symbol/file identifier terms agree with query terms.
 *
 * Only the candidate's name and the base name of its file are consulted, so a
 * match can never come from chunk content or comments. The boost strength is
 * read from SWEET_SEARCH_IDENTIFIER_AGREEMENT_BOOST (default 0.40, accepted
 * range 0..1); a value of 0 disables the boost.
 *
 * @param {object} result - Candidate; reads `name`, `file`, `path`, and
 *   `metadata.name` / `metadata.file`.
 * @param {string} query - The user's query text.
 * @returns {number} Multiplier in [1.0, 1.0 + weight]; 1.0 means "no boost".
 */
export function computeIdentifierAgreementBoost(result, query) {
  const weight = envFloat('SWEET_SEARCH_IDENTIFIER_AGREEMENT_BOOST', 0.40, 0, 1);
  if (weight === 0) return 1.0;

  const queryTerms = new Set(splitIdentifierTerms(query));
  if (queryTerms.size === 0) return 1.0;

  // Take the base name using both separators: with '/' alone, a Windows-style
  // path is treated as one file name and its directory names leak into the
  // candidate terms, producing false agreement.
  const filePath = result.file || result.path || result.metadata?.file || '';
  const fileName = String(filePath).split(/[\\/]/).pop() || '';
  const candidateTerms = new Set([
    ...splitIdentifierTerms(result.name || result.metadata?.name || ''),
    ...splitIdentifierTerms(fileName),
  ]);
  if (candidateTerms.size === 0) return 1.0;

  let hits = 0;
  for (const queryTerm of queryTerms) {
    if (candidateTerms.has(queryTerm)) {
      hits++;
      continue;
    }
    // Substring agreement is only attempted for longer query terms.
    if (queryTerm.length < 5) continue;
    for (const candidateTerm of candidateTerms) {
      if (candidateTerm.includes(queryTerm) || queryTerm.includes(candidateTerm)) {
        hits++;
        break;
      }
    }
  }
  if (hits === 0) return 1.0;

  // Normalize by the smaller of the query-term count and the candidate-term
  // count (the latter floored at 2), then cap the agreement ratio at 1.
  const agreement = hits / Math.min(queryTerms.size, Math.max(2, candidateTerms.size));
  return 1.0 + weight * Math.min(1, agreement);
}
268
+
178
269
  /**
179
270
  * Compute definition boost (PHASE_1_FIXES helper)
180
271
  */
@@ -190,6 +281,7 @@ export function computeDefinitionBoost(result, queryLower, queryTokens) {
190
281
  const exactNameMatch = queryTokens.some(token => resultNameLower === token);
191
282
 
192
283
  if (filenameMatchesQuery && isDefinitionType) return 2.0;
284
+ if (filenameMatchesQuery) return 1.3;
193
285
  if (exactNameMatch && isDefinitionType) return 1.5;
194
286
  if (isDefinitionType) return 1.2;
195
287
  return 1.0;