sweet-search 2.4.2 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/core/cli.js +43 -5
  2. package/core/embedding/embedding-cache.js +266 -18
  3. package/core/embedding/embedding-service.js +45 -9
  4. package/core/graph/graph-expansion.js +52 -12
  5. package/core/graph/graph-extractor.js +30 -1
  6. package/core/indexing/ast-chunker.js +331 -16
  7. package/core/indexing/chunking/chunk-builder.js +34 -1
  8. package/core/indexing/index-codebase-v21.js +31 -2
  9. package/core/indexing/index.js +6 -3
  10. package/core/indexing/indexer-ann.js +45 -6
  11. package/core/indexing/indexer-build.js +9 -1
  12. package/core/indexing/indexer-phases.js +6 -4
  13. package/core/indexing/indexing-file-policy.js +140 -0
  14. package/core/indexing/li-skip-policy.js +11 -220
  15. package/core/infrastructure/codebase-repository.js +21 -0
  16. package/core/infrastructure/config/embedding.js +20 -1
  17. package/core/infrastructure/config/graph.js +2 -2
  18. package/core/infrastructure/config/ranking.js +10 -0
  19. package/core/infrastructure/config/vector-store.js +1 -1
  20. package/core/infrastructure/coreml-cascade.js +236 -30
  21. package/core/infrastructure/coreml-cascade.json +25 -0
  22. package/core/infrastructure/index.js +17 -0
  23. package/core/infrastructure/init-config.js +216 -0
  24. package/core/infrastructure/language-patterns/registry-core.js +18 -0
  25. package/core/infrastructure/model-registry.js +12 -0
  26. package/core/infrastructure/native-inference.js +143 -51
  27. package/core/infrastructure/tree-sitter-provider.js +92 -2
  28. package/core/ranking/cascaded-scorer.js +6 -2
  29. package/core/ranking/file-kind-ranking.js +264 -0
  30. package/core/ranking/late-interaction-index.js +10 -4
  31. package/core/ranking/late-interaction-policy.js +304 -0
  32. package/core/search/context-expander.js +267 -28
  33. package/core/search/index.js +4 -0
  34. package/core/search/search-cli.js +3 -1
  35. package/core/search/search-pattern.js +4 -3
  36. package/core/search/search-postprocess.js +189 -8
  37. package/core/search/search-read-semantic.js +734 -0
  38. package/core/search/search-read.js +481 -0
  39. package/core/search/search-server.js +153 -5
  40. package/core/search/sweet-search.js +133 -16
  41. package/core/start-server.js +13 -2
  42. package/mcp/server.js +41 -0
  43. package/mcp/tool-handlers.js +117 -6
  44. package/package.json +9 -7
  45. package/scripts/init.js +386 -5
  46. package/scripts/uninstall.js +152 -6
@@ -20,6 +20,276 @@ const MIN_CONTENT_LENGTH = 30;
20
20
  const MAX_PEEK_LINES = 3;
21
21
  const DEFAULT_MAX_REGEX_LINE_LENGTH = 4000;
22
22
 
23
+ // =============================================================================
24
+ // Embedding-text cap — RESEARCH / ABLATION INFRASTRUCTURE
25
+ // =============================================================================
26
+ //
27
+ // Production default: 2000, byte-identical to shipped v6.2.
28
+ //
29
+ // Set SWEET_SEARCH_EMBED_TEXT_CAP=N to raise/lower the slice ceiling on
30
+ // `embedding_text`, `li_text`, `li_greedy_text`, and the enriched form.
31
+ // The May-2026 chunk-overflow audit found 80.5% of overflowing chunks are
32
+ // header-pushed (raw content ≤2000, headers push the embedding text over),
33
+ // motivating an A/B vs slightly-larger caps. See eval/results/
34
+ // chunk-overflow-audit.md for the audit and eval/run_overflow_ablation.sh
35
+ // for the wiring. Does NOT alter the chunker max size — that lives behind
36
+ // SWEET_SEARCH_CHUNK_HEADER_OVERHEAD in tree-sitter-provider.js.
37
/**
 * Resolve the character ceiling used when slicing embedding / LI text.
 *
 * SWEET_SEARCH_EMBED_TEXT_CAP is parsed as a base-10 integer. Anything that
 * does not parse to a finite number, or that is below 500, is ignored and
 * MAX_CHUNK_SIZE is returned instead.
 *
 * @returns {number} the cap to pass to `String.prototype.slice`
 */
function getEmbedTextCap() {
  const raw = process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '';
  const requested = parseInt(raw, 10);
  if (!Number.isFinite(requested) || requested < 500) {
    return MAX_CHUNK_SIZE;
  }
  return requested;
}
41
+
42
/**
 * Remove a trailing `_<hex>` slug that sits directly before the file
 * extension of a path. The slug must be an underscore followed by at least
 * six lowercase hex characters, and the path must end in `.<ext>`.
 * Used by the Java family (Java/PHP/C#/Kotlin/Scala) per the v6.2
 * language-conditioned policy.
 *
 *   javascript/Semantic-UI/Table_366c13.js -> javascript/Semantic-UI/Table.js
 *   src/Button.tsx (no slug)               -> src/Button.tsx (unchanged)
 *
 * Exported for reuse in chunk-builder.js (document chunks) and tests.
 *
 * @param {string} relativePath - path to normalise; falsy values are returned as-is
 * @returns {string} the path with the slug removed, or the input when nothing matches
 */
export function normalizePathSlug(relativePath) {
  if (relativePath) {
    const trailingHexSlug = /_[0-9a-f]{6,}(\.[a-zA-Z0-9]+)$/;
    // Keep only the captured extension; the slug itself is dropped.
    return relativePath.replace(trailingHexSlug, (_match, extension) => extension);
  }
  return relativePath;
}
56
+
57
// Language tags that follow Java's slug-stripped-path policy for li_text.
// The LI input router (`pickLiInput` in core/indexing/indexer-ann.js)
// imports this same set, so the chunker's path policy and the routing
// decision are driven from one place.
export const JAVA_FAMILY = new Set([
  'java',
  'php',
  'csharp',
  'c#',
  'kotlin',
  'scala',
]);
66
+
67
+ // =============================================================================
68
+ // Embedding-text variant switch — RESEARCH / ABLATION INFRASTRUCTURE
69
+ // =============================================================================
70
+ //
71
+ // Production behavior is fixed at variant=current and is byte-identical to
72
+ // shipped v6.2. None of the alternative variants below are recommended for
73
+ // production; an isolated R1 ablation in May 2026 found no variant beats
74
+ // shipped on total MRR@10 (see docs/JS_CHUNK_BLEEDING_ANALYSIS.md for the
75
+ // full table). The switch and helpers are kept ONLY so future R1
76
+ // experiments can be re-run without re-implementing the scaffolding.
77
+ //
78
+ // Default: SWEET_SEARCH_EMBED_TEXT_VARIANT unset → 'current' →
79
+ // embedding_text + LI input both byte-identical to shipped.
80
+ //
81
+ // Available experiment variants (research use only):
82
+ // current shipped form (production default)
83
+ // no_path drop the # path line
84
+ // normalized_path slug-strip the trailing _<hex> in path
85
+ // no_language drop # Language: line
86
+ // parent_only path + parent only, drop function + language
87
+ // enriched identical to current here; also runs enrichment
88
+ // code_breadcrumb compact `# Parent.Symbol` or `# Parent::symbol`
89
+ // signature current + `# Signature: <multi-line-sig>` line
90
+ // between the symbol line and the language line.
91
+ // Signature is AST-extracted by tree-sitter (decl
92
+ // header up to body, whitespace-collapsed, capped
93
+ // at MAX_SIGNATURE_LENGTH). When no signature is
94
+ // available (regex fallback path, non-boundary
95
+ // chunks, or merged sibling buffers) this variant
96
+ // is byte-identical to `current`.
97
+ // signature_rbphp like `signature` but only for Ruby and PHP — the
98
+ // two languages where the May-2026 R1 ablation
99
+ // showed signatures helped (PHP +0.79 MRR@10,
100
+ // Ruby +0.58); other languages route to the same
101
+ // output as `current`. Motivated by per-language
102
+ // pareto front (matching v6.2 LI routing pattern).
103
+ //
104
+ // LI isolation: when a non-current variant is active, `chunk.li_greedy_text`
105
+ // still carries the shipped (variant=current) form, and pickLiInput() for
106
+ // non-Python/Java languages reads it — so an R1 experiment never
107
+ // contaminates the LI input. See enrichEmbeddingText() and pickLiInput().
108
// Variants for which post-build enrichment is allowed to overwrite
// `embedding_text` (see shouldRunEnrichment).
const ENRICHMENT_VARIANTS = new Set(['current', 'enriched']);

/**
 * Read the active embedding-text variant from
 * SWEET_SEARCH_EMBED_TEXT_VARIANT (case-insensitive).
 *
 * @returns {string} one of the recognised variant names; unset, empty or
 *   unrecognised values resolve to 'current'
 */
function getEmbedTextVariant() {
  const requested = (process.env.SWEET_SEARCH_EMBED_TEXT_VARIANT || 'current').toLowerCase();
  switch (requested) {
    case 'current':
    case 'no_path':
    case 'normalized_path':
    case 'no_language':
    case 'parent_only':
    case 'enriched':
    case 'code_breadcrumb':
    case 'signature':
    case 'signature_rbphp':
      return requested;
    default:
      return 'current';
  }
}

// Languages for which the `signature_rbphp` variant emits the
// `# Signature:` line (the May-2026 ablation reported PHP +0.79 and
// Ruby +0.58 R1 MRR@10 vs current). Every other language gets exactly
// the `current` output under that variant.
const SIGNATURE_HELP_LANGUAGES = new Set(['ruby', 'php']);

/**
 * Whether the active variant takes part in post-build enrichment.
 *
 * @returns {boolean} true when the active variant is in ENRICHMENT_VARIANTS
 */
export function shouldRunEnrichment() {
  const activeVariant = getEmbedTextVariant();
  return ENRICHMENT_VARIANTS.has(activeVariant);
}
128
+
129
/**
 * Separator placed between parent and symbol in a breadcrumb header:
 * `Foo::bar` for languages that conventionally use `::`, `Foo.bar`
 * otherwise. Used by the experimental code_breadcrumb variant only.
 *
 * @param {string} language - language tag of the chunk
 * @returns {string} '::' or '.'
 */
function breadcrumbSep(language) {
  switch (language) {
    case 'ruby':
    case 'php':
    case 'cpp':
    case 'c++':
    case 'rust':
      return '::';
    default:
      return '.';
  }
}
134
+
135
/**
 * Assemble `embedding_text` for a chunk: a run of `# ...` header lines
 * chosen by the active R1 variant, followed by the trimmed chunk content,
 * joined with newlines and sliced to getEmbedTextCap() characters.
 *
 * When `variant` is omitted the variant comes from the environment (which
 * defaults to 'current'). Passing `variant: 'current'` explicitly forces the
 * shipped header layout regardless of the environment; callers use that to
 * populate `chunk.li_greedy_text`. Unrecognised variant values fall back to
 * the 'current' layout.
 *
 * @param {object} args
 * @param {string} [args.variant] - explicit variant override
 * @param {string} args.content - raw chunk content (trimmed before use)
 * @param {string} [args.relativePath] - path shown in the `# <path>` header
 * @param {string} [args.language] - language tag; 'text' suppresses the language header
 * @param {string} [args.chunkType] - label used in the `# <type>: <symbol>` header
 * @param {string} [args.symbol] - symbol name; 'unknown' suppresses the symbol header
 * @param {object} [args.hierarchyInfo] - optional parentSymbol / parentType / signature
 * @returns {string} the capped embedding text
 */
function buildEmbeddingText({ variant: variantOverride, content, relativePath, language, chunkType, symbol, hierarchyInfo }) {
  const activeVariant = variantOverride || getEmbedTextVariant();
  const body = content.trim();

  // Normalise the "is this field usable" checks once; null means "omit".
  const namedSymbol = symbol && symbol !== 'unknown' ? symbol : null;
  const codeLanguage = language && language !== 'text' ? language : null;
  const parentSymbol = hierarchyInfo?.parentSymbol;

  const pathHeader = relativePath ? `# ${relativePath}` : null;
  const parentHeader = parentSymbol
    ? `# Parent: ${hierarchyInfo.parentType} ${parentSymbol}`
    : null;
  const symbolHeader = namedSymbol ? `# ${chunkType}: ${namedSymbol}` : null;
  const languageHeader = codeLanguage ? `# Language: ${codeLanguage}` : null;
  const signatureHeader = hierarchyInfo?.signature
    ? `# Signature: ${hierarchyInfo.signature}`
    : null;

  // Each branch lists its headers in output order; null entries are dropped below.
  let headers;
  if (activeVariant === 'no_path') {
    headers = [parentHeader, symbolHeader, languageHeader];
  } else if (activeVariant === 'normalized_path') {
    const slugFreePath = relativePath ? `# ${normalizePathSlug(relativePath)}` : null;
    headers = [slugFreePath, parentHeader, symbolHeader, languageHeader];
  } else if (activeVariant === 'no_language') {
    headers = [pathHeader, parentHeader, symbolHeader];
  } else if (activeVariant === 'parent_only') {
    headers = [pathHeader, parentHeader];
  } else if (activeVariant === 'code_breadcrumb') {
    const sep = breadcrumbSep(language);
    let crumb = null;
    if (parentSymbol && namedSymbol) {
      crumb = `# ${parentSymbol}${sep}${namedSymbol}`;
    } else if (namedSymbol) {
      crumb = `# ${namedSymbol}`;
    } else if (parentSymbol) {
      crumb = `# ${parentSymbol}`;
    }
    // This variant uses a bare `# <language>` line, not `# Language: ...`.
    headers = [pathHeader, crumb, codeLanguage ? `# ${codeLanguage}` : null];
  } else if (activeVariant === 'signature') {
    // Signature sits between the symbol and language lines; with no
    // signature available the output equals the 'current' layout.
    headers = [pathHeader, parentHeader, symbolHeader, signatureHeader, languageHeader];
  } else if (activeVariant === 'signature_rbphp') {
    // Signature is emitted only for languages in SIGNATURE_HELP_LANGUAGES.
    const gatedSignature = signatureHeader && SIGNATURE_HELP_LANGUAGES.has(language)
      ? signatureHeader
      : null;
    headers = [pathHeader, parentHeader, symbolHeader, gatedSignature, languageHeader];
  } else {
    // 'current', 'enriched', and any unrecognised value.
    headers = [pathHeader, parentHeader, symbolHeader, languageHeader];
  }

  // The body is always appended, even when it is the empty string.
  const lines = [...headers.filter((line) => line !== null), body];
  return lines.join('\n').slice(0, getEmbedTextCap());
}
233
+
234
/**
 * Choose the path header line for late-interaction (MaxSim) input.
 *
 * - No path, or language 'python': no path line (returns null).
 * - Languages in JAVA_FAMILY: path with the trailing `_<hex>` slug stripped.
 * - Anything else: the path unchanged.
 *
 * @param {string} relativePath - chunk path relative to the project root
 * @param {string} language - language tag of the chunk
 * @returns {string|null} `# <path>` header, or null when no path line applies
 */
function pathLineForLi(relativePath, language) {
  if (!relativePath || language === 'python') {
    return null;
  }
  const shownPath = JAVA_FAMILY.has(language)
    ? normalizePathSlug(relativePath)
    : relativePath;
  return `# ${shownPath}`;
}
253
+
254
/**
 * Build `li_text`, the late-interaction input produced by the chunker.
 *
 * Layout: an optional path line chosen by pathLineForLi(), then the
 * `# Parent:`, `# <chunkType>: <symbol>` and `# Language:` headers when
 * their inputs are usable, then the trimmed content. The result is joined
 * with newlines and sliced to getEmbedTextCap() characters. Only the path
 * line varies by language; the other headers are shared.
 *
 * Whether `li_text` or another field is actually fed to the LI stage is
 * decided elsewhere (`pickLiInput` in core/indexing/indexer-ann.js).
 *
 * @param {object} args
 * @param {string} args.content - raw chunk content (trimmed before use)
 * @param {string} [args.relativePath] - chunk path relative to the project root
 * @param {string} [args.language] - language tag; 'text' suppresses the language header
 * @param {string} [args.chunkType] - label used in the symbol header
 * @param {string} [args.symbol] - symbol name; 'unknown' suppresses the symbol header
 * @param {object} [args.hierarchyInfo] - optional parentSymbol / parentType
 * @returns {string} the capped LI text
 */
function buildLiText({ content, relativePath, language, chunkType, symbol, hierarchyInfo }) {
  const body = content.trim();
  const pathHeader = pathLineForLi(relativePath, language);

  const parentHeader = hierarchyInfo?.parentSymbol
    ? `# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}`
    : null;
  const symbolHeader = symbol && symbol !== 'unknown'
    ? `# ${chunkType}: ${symbol}`
    : null;
  const languageHeader = language && language !== 'text'
    ? `# Language: ${language}`
    : null;

  // Drop absent headers; the body is always kept, even when empty.
  const headers = [pathHeader, parentHeader, symbolHeader, languageHeader]
    .filter((line) => Boolean(line));
  return [...headers, body].join('\n').slice(0, getEmbedTextCap());
}
292
+
23
293
  /**
24
294
  * AST-like semantic code chunker supporting 35+ languages.
25
295
  * Uses regex boundary patterns from core/language-patterns.js registry.
@@ -100,6 +370,7 @@ export class ASTChunker {
100
370
  parentChunkId: chunk.parentChunkId,
101
371
  parentSymbol: chunk.parentSymbol,
102
372
  parentType: chunk.parentType,
373
+ signature: chunk.signature || null,
103
374
  }
104
375
  )
105
376
  );
@@ -606,19 +877,43 @@ export class ASTChunker {
606
877
  const hash = createHash('sha256').update(content).digest('hex').slice(0, 16);
607
878
  const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
608
879
 
609
- // Build contextualized embedding text
610
- const embeddingParts = [];
611
- embeddingParts.push(`# ${relativePath}`);
612
- if (hierarchyInfo?.parentSymbol) {
613
- embeddingParts.push(`# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}`);
614
- }
615
- if (symbol && symbol !== 'unknown') {
616
- embeddingParts.push(`# ${chunkType}: ${symbol}`);
617
- }
618
- if (language && language !== 'text') {
619
- embeddingParts.push(`# Language: ${language}`);
620
- }
621
- embeddingParts.push(content.trim());
880
+ // Production embedding text. With the default variant (current) this
881
+ // is byte-identical to shipped v6.2; under a research-only variant
882
+ // (SWEET_SEARCH_EMBED_TEXT_VARIANT) it produces the experimental form.
883
+ const embedding_text = buildEmbeddingText({
884
+ content,
885
+ relativePath,
886
+ language,
887
+ chunkType,
888
+ symbol,
889
+ hierarchyInfo,
890
+ });
891
+
892
+ // Research isolation: always build the shipped (current-variant)
893
+ // form alongside, so the LI stage can read a canonical input even
894
+ // when an R1 experiment is active on embedding_text. In production
895
+ // (variant=current) this is the same content as embedding_text and
896
+ // pickLiInput's preference for li_greedy_text changes nothing.
897
+ const li_greedy_text = buildEmbeddingText({
898
+ variant: 'current',
899
+ content,
900
+ relativePath,
901
+ language,
902
+ chunkType,
903
+ symbol,
904
+ hierarchyInfo,
905
+ });
906
+
907
+ // Build LI text via the v6.2 builder (language-conditioned path
908
+ // policy). See buildLiText() docstring for rationale.
909
+ const li_text = buildLiText({
910
+ content,
911
+ relativePath,
912
+ language,
913
+ chunkType,
914
+ symbol,
915
+ hierarchyInfo,
916
+ });
622
917
 
623
918
  const metadata = {
624
919
  type: 'codebase',
@@ -647,15 +942,26 @@ export class ASTChunker {
647
942
  return {
648
943
  text: content.trim(),
649
944
  content: content.trim(),
650
- embedding_text: embeddingParts.join('\n').slice(0, 2000),
945
+ embedding_text,
946
+ li_greedy_text,
947
+ li_text,
651
948
  metadata,
652
949
  tags: ['codebase', language, this.inferProjectTag(filePath)]
653
950
  };
654
951
  }
655
952
 
656
953
  /**
657
- * Enrich a chunk's embedding_text with scope chain and import information.
954
+ * Enrich a chunk's embedding_text with scope chain and import info.
658
955
  * Called after initial chunking when scope info is available.
956
+ *
957
+ * Production path (variant=current): updates BOTH `embedding_text`
958
+ * and `li_greedy_text` to the same enriched string — byte-identical
959
+ * to shipped v6.2.
960
+ *
961
+ * Research path (variant != current/enriched): only updates
962
+ * `li_greedy_text`. The variant's `embedding_text` is preserved so
963
+ * an R1 ablation can compare its embedding form against shipped
964
+ * without losing the LI side's enriched text.
659
965
  */
660
966
  static enrichEmbeddingText(chunk, scopeChain, imports) {
661
967
  const parts = [];
@@ -681,7 +987,16 @@ export class ASTChunker {
681
987
  }
682
988
 
683
989
  parts.push(chunk.content);
684
- chunk.embedding_text = parts.join('\n').slice(0, 2000);
990
+ const enriched = parts.join('\n').slice(0, getEmbedTextCap());
991
+
992
+ // Always update the byte-stable LI passthrough form.
993
+ chunk.li_greedy_text = enriched;
994
+
995
+ // Only update embedding_text when the active variant participates
996
+ // in post-build enrichment.
997
+ if (shouldRunEnrichment()) {
998
+ chunk.embedding_text = enriched;
999
+ }
685
1000
  return chunk;
686
1001
  }
687
1002
 
@@ -5,10 +5,19 @@
5
5
  import { createHash } from 'crypto';
6
6
  import path from 'path';
7
7
  import { detectProjectBoundary } from '../../infrastructure/project-detector.js';
8
+ import { JAVA_FAMILY, normalizePathSlug } from '../ast-chunker.js';
8
9
 
9
10
  const MAX_CHUNK_SIZE = 2000; // chars — matches ast-chunker.js
10
11
  const MIN_CHUNK_SIZE = 30; // chars — matches ast-chunker.js threshold
11
12
 
13
+ // Embedding-text cap mirrors ast-chunker.js:getEmbedTextCap. Default 2000,
14
+ // byte-identical to shipped. Override via SWEET_SEARCH_EMBED_TEXT_CAP for
15
+ // the May-2026 budget-alignment ablation.
16
/**
 * Character cap for document-chunk embedding / LI text.
 *
 * Honors SWEET_SEARCH_EMBED_TEXT_CAP when it parses (base 10) to a finite
 * integer of at least 500; otherwise returns MAX_CHUNK_SIZE.
 *
 * @returns {number} the slice ceiling
 */
function getEmbedTextCap() {
  const override = Number.parseInt(process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '', 10);
  const usable = Number.isFinite(override) && override >= 500;
  return usable ? override : MAX_CHUNK_SIZE;
}
20
+
12
21
  /** Default config for markdown chunker */
13
22
  const MD_DEFAULTS = {
14
23
  maxChunkSize: MAX_CHUNK_SIZE,
@@ -59,10 +68,34 @@ function buildDocChunk(content, filePath, language, chunkType, symbol, lineStart
59
68
  }
60
69
  embeddingParts.push(trimmed);
61
70
 
71
+ // Build LI text via v6.2 — language-conditioned path strategy.
72
+ // Document chunks are typically markdown/text and fall under the
73
+ // "other" branch (full raw path). Code-language chunks reuse the
74
+ // shared JAVA_FAMILY set from ast-chunker.js so the path policy
75
+ // here cannot drift from the AST chunker.
76
+ const liParts = [];
77
+ if (relativePath) {
78
+ if (language === 'python') {
79
+ // skip path line for Python
80
+ } else if (JAVA_FAMILY.has(language)) {
81
+ liParts.push(`# ${normalizePathSlug(relativePath)}`);
82
+ } else {
83
+ liParts.push(`# ${relativePath}`);
84
+ }
85
+ }
86
+ if (symbol && symbol !== 'unknown') {
87
+ liParts.push(`# ${chunkType}: ${symbol}`);
88
+ }
89
+ if (language && language !== 'text') {
90
+ liParts.push(`# Language: ${language}`);
91
+ }
92
+ liParts.push(trimmed);
93
+
62
94
  return {
63
95
  text: trimmed,
64
96
  content: trimmed,
65
- embedding_text: embeddingParts.join('\n').slice(0, 2000),
97
+ embedding_text: embeddingParts.join('\n').slice(0, getEmbedTextCap()),
98
+ li_text: liParts.join('\n').slice(0, getEmbedTextCap()),
66
99
  metadata: {
67
100
  type: 'document',
68
101
  file: path.basename(filePath),
@@ -40,6 +40,7 @@ if (process.env.SWEET_SEARCH_UV_THREADPOOL_SIZE && !process.env.UV_THREADPOOL_SI
40
40
  import { existsSync } from 'fs';
41
41
 
42
42
  import { DB_PATHS, LATE_INTERACTION_CONFIG } from '../infrastructure/config/index.js';
43
+ import { applyPersistedLiModel } from '../infrastructure/init-config.js';
43
44
  import { resolveRelationshipTargets } from '../graph/relationship-resolver.js';
44
45
  import { requireNativeAnn as requireNativeAnnBackend } from '../vector-store/hnsw-index.js';
45
46
  import { getStats as getIncrementalStats } from './incremental-tracker.js';
@@ -124,11 +125,18 @@ async function main() {
124
125
  setVerboseMode(true);
125
126
  }
126
127
 
127
- // Apply late interaction model overrides before any model code runs
128
+ // Apply late interaction model overrides before any model code runs.
129
+ // Precedence: --no-late-interaction > --late-interaction-model=… > env
130
+ // var (already honoured by LATE_INTERACTION_CONFIG.model at module load) >
131
+ // .sweet-search/config.json::runtime.li.model > built-in default. Only
132
+ // touch the persisted-config branch when neither CLI flag was used —
133
+ // applyPersistedLiModel internally re-checks the env var.
128
134
  if (noLateInteraction) {
129
135
  LATE_INTERACTION_CONFIG.model = false;
130
136
  } else if (lateInteractionModel) {
131
137
  LATE_INTERACTION_CONFIG.model = lateInteractionModel;
138
+ } else {
139
+ applyPersistedLiModel(process.env.SWEET_SEARCH_PROJECT_ROOT || process.cwd());
132
140
  }
133
141
 
134
142
  log(`${colors.bright}╔═══════════════════════════════════════════════════╗${colors.reset}`, 'bright');
@@ -441,7 +449,28 @@ Output:
441
449
  }
442
450
  }
443
451
 
444
- if (import.meta.url === `file://${process.argv[1]}`) {
452
+ // Direct-run guard. The previous `import.meta.url === \`file://${process.argv[1]}\``
453
+ // form silently no-op'd under three real-world conditions:
454
+ // 1. `npm install ../sweet-search-private` (file install) symlinks
455
+ // `node_modules/sweet-search/` to the source — `process.argv[1]` is the
456
+ // symlink path while `import.meta.url` resolves to the realpath.
457
+ // 2. Paths containing spaces or unicode — the URL form encodes them but
458
+ // `file://` + raw path doesn't.
459
+ // 3. Windows backslash vs URL forward-slash mismatch.
460
+ // Resolve both sides through `realpathSync(fileURLToPath(...))` so the
461
+ // comparison survives every common install layout. Falls back to never-direct
462
+ // (safe default) if either side errors.
463
+ import { realpathSync } from 'node:fs';
464
+ import { fileURLToPath } from 'node:url';
465
// True only when this module is the script Node was launched with. Both the
// module URL and argv[1] are resolved to real filesystem paths before being
// compared; a missing argv[1] or any resolution error yields false.
const _isDirectRun = (() => {
  const entryScript = process.argv[1];
  if (!entryScript) {
    return false;
  }
  let modulePath;
  let entryPath;
  try {
    modulePath = realpathSync(fileURLToPath(import.meta.url));
    entryPath = realpathSync(entryScript);
  } catch {
    return false;
  }
  return modulePath === entryPath;
})();
473
+ if (_isDirectRun) {
445
474
  main().catch(err => {
446
475
  console.error(err);
447
476
  process.exit(1);
@@ -79,12 +79,15 @@ export {
79
79
  getEmbeddingPool,
80
80
  } from './indexer-pool.js';
81
81
 
82
- // Late-interaction skip policy (consumed by tests; honors unified exclude globs)
82
+ // Shared indexing file policy (embedding, sparse/BM25 artifacts, and LI)
83
83
  export {
84
- applyLiSkipPolicy,
84
+ applyIndexingChunkPolicy,
85
85
  isExcludedByConfig,
86
86
  chunkLooksGenerated,
87
- } from './li-skip-policy.js';
87
+ } from './indexing-file-policy.js';
88
+
89
+ // Late-interaction compatibility export
90
+ export { applyLiSkipPolicy } from './li-skip-policy.js';
88
91
 
89
92
  // Sparse-gram artifact builder (tier-1 grep acceleration)
90
93
  export { buildSparseGramArtifact } from './indexer-sparse-gram.js';
@@ -12,6 +12,7 @@ import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
12
12
  import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
13
13
  import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
14
14
  import { log, logProgress } from './indexer-utils.js';
15
+ import { JAVA_FAMILY } from './ast-chunker.js';
15
16
 
16
17
  // =============================================================================
17
18
  // DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
@@ -20,6 +21,45 @@ import { log, logProgress } from './indexer-utils.js';
20
21
  const CHECKPOINT_INTERVAL_SEC = 30;
21
22
  const MIN_VECTORS_BETWEEN_SAVES = 1000;
22
23
 
24
/**
 * v6.2: choose which text field of a chunk is fed to the late-interaction
 * (MaxSim) encoder, based on the chunk's language.
 *
 * - `python` and every language in JAVA_FAMILY: prefer `chunk.li_text`,
 *   then `embedding_text`, then `text`, then `content`.
 * - Every other language (including a missing language): prefer
 *   `chunk.li_greedy_text`, then `embedding_text`, then `li_text`, then
 *   `text`, then `content`. Chunks that carry no `li_greedy_text` therefore
 *   fall through to `embedding_text`.
 *
 * JAVA_FAMILY is imported from ast-chunker.js so this routing and the
 * chunker's path policy share one language set.
 *
 * @param {object|null|undefined} chunk - chunk record; a nullish chunk yields ''
 * @returns {string} the selected text, or '' when no candidate field is set
 */
export function pickLiInput(chunk) {
  // The language lookup already tolerated a nullish chunk via `?.`, but the
  // field reads below did not; return the documented empty fallback instead
  // of throwing a TypeError.
  if (chunk == null) {
    return '';
  }

  const lang = chunk.metadata?.language;
  const rawFallback = chunk.text || chunk.content || '';

  if (lang === 'python' || JAVA_FAMILY.has(lang)) {
    return chunk.li_text || chunk.embedding_text || rawFallback;
  }

  // li_greedy_text is read first so that an experimental embedding variant
  // active on embedding_text does not change the LI input.
  return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || rawFallback;
}
62
+
23
63
  function fsyncFile(filePath) {
24
64
  const fd = openSync(filePath, 'r');
25
65
  try { fsyncSync(fd); } finally { closeSync(fd); }
@@ -239,7 +279,7 @@ function buildLateInteractionBatches(chunks, options = {}) {
239
279
  attentionBudget = Math.max(1, Math.floor(batchSizeCap / 2)) * maxLength * maxLength;
240
280
  }
241
281
  const indexed = chunks.map((chunk) => {
242
- const text = chunk.text || chunk.content || '';
282
+ const text = pickLiInput(chunk);
243
283
  return {
244
284
  chunk,
245
285
  text,
@@ -603,17 +643,16 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
603
643
  // Apply LI skip policy: last-mile guard before the slow encoder runs.
604
644
  // The embed indexer has already dropped glob-excluded files at discovery
605
645
  // time using the same loadProjectConfig() excludes; this pass adds the
606
- // LI-specific checks globs can't do content-based @generated markers
607
- // and a per-file token budget. Disable via SWEET_SEARCH_LI_SKIP_DISABLE=1.
646
+ // LI-specific check globs can't do: content-based @generated markers.
647
+ // Disable via SWEET_SEARCH_LI_SKIP_DISABLE=1.
608
648
  let skippedSummary = null;
609
649
  if (Array.isArray(chunks) && chunks.length > 0) {
610
- const { applyLiSkipPolicy } = await import('./li-skip-policy.js');
611
- const { kept, stats } = applyLiSkipPolicy(chunks, { projectRoot });
650
+ const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');
651
+ const { kept, stats } = applyIndexingChunkPolicy(chunks, { projectRoot });
612
652
  if (stats.totalSkipped > 0) {
613
653
  const breakdown = [
614
654
  stats.excluded > 0 ? `${stats.excluded} excluded` : null,
615
655
  stats.generated > 0 ? `${stats.generated} generated` : null,
616
- stats.huge > 0 ? `${stats.huge} huge-file` : null,
617
656
  ].filter(Boolean).join(', ');
618
657
  log(`LI skip policy: dropped ${stats.totalSkipped} chunks across ${stats.skippedFiles} files (${breakdown}); kept ${kept.length} chunks across ${stats.keptFiles} files`, 'dim');
619
658
  chunks = kept;
@@ -537,9 +537,17 @@ export async function chunkFiles(files) {
537
537
  }
538
538
  }
539
539
 
540
+ // Embedding-text cap: defaults to 2000 (byte-identical to shipped). The
541
+ // SWEET_SEARCH_EMBED_TEXT_CAP env var (see ast-chunker.js:getEmbedTextCap)
542
+ // is honored by the chunk builders themselves; this final re-slice mirrors
543
+ // the same cap so an ablation can raise the ceiling end-to-end.
544
+ const _embCap = (() => {
545
+ const v = parseInt(process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '', 10);
546
+ return Number.isFinite(v) && v >= 500 ? v : 2000;
547
+ })();
540
548
  const texts = allChunks.map(chunk => {
541
549
  if (chunk.embedding_text) {
542
- return chunk.embedding_text.slice(0, 2000);
550
+ return chunk.embedding_text.slice(0, _embCap);
543
551
  }
544
552
  return `${chunk.file} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
545
553
  });
@@ -7,7 +7,7 @@ import fs from 'fs/promises';
7
7
  import { existsSync } from 'fs';
8
8
  import path from 'path';
9
9
 
10
- import { DB_PATHS, PROJECT_ROOT, EMBEDDING_CONFIG } from '../infrastructure/config/index.js';
10
+ import { DB_PATHS, PROJECT_ROOT, EMBEDDING_CONFIG, HCGS_CONFIG } from '../infrastructure/config/index.js';
11
11
  import { getChangedFiles, updateState, getStats as getIncrementalStats, updatePhaseProgress, markPhaseComplete, clearPhaseProgress } from './incremental-tracker.js';
12
12
  import { backupSummaries, restoreSummaries, markForRegeneration } from '../graph/summary-manager.js';
13
13
  import { colors, log, logProgress, logError, discoverFiles, readFilesFromStdin, atomicSwapDatabase } from './indexer-utils.js';
@@ -300,8 +300,10 @@ export async function buildCodeGraphWithHCGSPhase(options = {}) {
300
300
  skipSummaryRegen,
301
301
  } = options;
302
302
 
303
+ const hcgsEnabled = HCGS_CONFIG.enabled;
304
+
303
305
  let summaryBackup = { summaries: [], count: 0 };
304
- if (!dryRun) {
306
+ if (!dryRun && hcgsEnabled) {
305
307
  summaryBackup = await backupSummaries(DB_PATHS.codeGraph);
306
308
  if (summaryBackup.count > 0) {
307
309
  log(`Backed up ${summaryBackup.count} existing summaries (with type validation)`, 'green');
@@ -318,14 +320,14 @@ export async function buildCodeGraphWithHCGSPhase(options = {}) {
318
320
 
319
321
  const graphStats = await buildCodeGraph(allFiles, dryRun);
320
322
 
321
- if (!dryRun && summaryBackup.count > 0) {
323
+ if (!dryRun && hcgsEnabled && summaryBackup.count > 0) {
322
324
  const restoreResult = await restoreSummaries(DB_PATHS.codeGraph, summaryBackup);
323
325
  log(`Restored ${restoreResult.restored} summaries (${restoreResult.skipped.total} skipped - entity removed/type changed)`, 'green');
324
326
  }
325
327
 
326
328
  let hcgsPromise = null;
327
329
 
328
- const shouldRunHCGS = !dryRun && (
330
+ const shouldRunHCGS = !dryRun && hcgsEnabled && (
329
331
  fullReindex ||
330
332
  filesFromStdin ||
331
333
  (incrementalInfo && filesToIndex.length > 0)