sweet-search 2.4.2 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +19 -5
- package/core/embedding/embedding-cache.js +177 -15
- package/core/embedding/embedding-service.js +18 -4
- package/core/graph/graph-expansion.js +52 -12
- package/core/graph/graph-extractor.js +30 -1
- package/core/indexing/ast-chunker.js +331 -16
- package/core/indexing/chunking/chunk-builder.js +34 -1
- package/core/indexing/index.js +6 -3
- package/core/indexing/indexer-ann.js +45 -6
- package/core/indexing/indexer-build.js +9 -1
- package/core/indexing/indexer-phases.js +6 -4
- package/core/indexing/indexing-file-policy.js +140 -0
- package/core/indexing/li-skip-policy.js +11 -220
- package/core/infrastructure/codebase-repository.js +21 -0
- package/core/infrastructure/config/embedding.js +20 -1
- package/core/infrastructure/config/graph.js +2 -2
- package/core/infrastructure/config/ranking.js +10 -0
- package/core/infrastructure/config/vector-store.js +1 -1
- package/core/infrastructure/coreml-cascade.js +236 -30
- package/core/infrastructure/coreml-cascade.json +25 -0
- package/core/infrastructure/index.js +15 -0
- package/core/infrastructure/init-config.js +78 -0
- package/core/infrastructure/language-patterns/registry-core.js +18 -0
- package/core/infrastructure/model-registry.js +12 -0
- package/core/infrastructure/native-inference.js +143 -51
- package/core/infrastructure/tree-sitter-provider.js +92 -2
- package/core/ranking/cascaded-scorer.js +6 -2
- package/core/ranking/file-kind-ranking.js +264 -0
- package/core/ranking/late-interaction-index.js +10 -4
- package/core/ranking/late-interaction-policy.js +304 -0
- package/core/search/context-expander.js +267 -28
- package/core/search/index.js +4 -0
- package/core/search/search-cli.js +3 -1
- package/core/search/search-pattern.js +4 -3
- package/core/search/search-postprocess.js +189 -8
- package/core/search/search-read-semantic.js +717 -0
- package/core/search/search-read.js +481 -0
- package/core/search/search-server.js +6 -4
- package/core/search/sweet-search.js +119 -15
- package/mcp/server.js +41 -0
- package/mcp/tool-handlers.js +117 -6
- package/package.json +9 -7
- package/scripts/init.js +386 -5
|
@@ -20,6 +20,276 @@ const MIN_CONTENT_LENGTH = 30;
|
|
|
20
20
|
const MAX_PEEK_LINES = 3;
|
|
21
21
|
const DEFAULT_MAX_REGEX_LINE_LENGTH = 4000;
|
|
22
22
|
|
|
23
|
+
// =============================================================================
|
|
24
|
+
// Embedding-text cap — RESEARCH / ABLATION INFRASTRUCTURE
|
|
25
|
+
// =============================================================================
|
|
26
|
+
//
|
|
27
|
+
// Production default: 2000, byte-identical to shipped v6.2.
|
|
28
|
+
//
|
|
29
|
+
// Set SWEET_SEARCH_EMBED_TEXT_CAP=N to raise/lower the slice ceiling on
|
|
30
|
+
// `embedding_text`, `li_text`, `li_greedy_text`, and the enriched form.
|
|
31
|
+
// The May-2026 chunk-overflow audit found 80.5% of overflowing chunks are
|
|
32
|
+
// header-pushed (raw content ≤2000, headers push the embedding text over),
|
|
33
|
+
// motivating an A/B vs slightly-larger caps. See eval/results/
|
|
34
|
+
// chunk-overflow-audit.md for the audit and eval/run_overflow_ablation.sh
|
|
35
|
+
// for the wiring. Does NOT alter the chunker max size — that lives behind
|
|
36
|
+
// SWEET_SEARCH_CHUNK_HEADER_OVERHEAD in tree-sitter-provider.js.
|
|
37
|
+
/**
 * Resolve the character ceiling applied when slicing embedding/LI text.
 *
 * Reads SWEET_SEARCH_EMBED_TEXT_CAP on every call; the override is honored
 * only when it parses (base 10) to a finite integer of at least 500.
 * Anything else — unset, empty, non-numeric, too small — yields MAX_CHUNK_SIZE.
 *
 * @returns {number} The slice ceiling in characters.
 */
function getEmbedTextCap() {
  const raw = process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '';
  const requested = parseInt(raw, 10);
  // Guard clause: reject unparsable or too-small overrides up front.
  if (!Number.isFinite(requested) || requested < 500) {
    return MAX_CHUNK_SIZE;
  }
  return requested;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Remove a trailing `_<hex>` slug that sits directly before the file
 * extension at the very end of a path.
 *
 *   javascript/Semantic-UI/Table_366c13.js  ->  javascript/Semantic-UI/Table.js
 *   src/Button.tsx (no slug)                ->  src/Button.tsx (unchanged)
 *
 * The slug must be an underscore followed by six or more lowercase hex
 * characters, and the path must end in a single `.ext` of ASCII letters or
 * digits. Paths without an extension, or with an uppercase-hex slug, are
 * returned untouched.
 *
 * NOTE(review): an all-letter hex word such as `_facade` or `_decade` before
 * the extension also matches and is stripped — confirm this is acceptable
 * for the corpora this runs on.
 *
 * Exported so other modules (the document chunk builder, tests) share the
 * same normalization.
 *
 * @param {string} relativePath - Path to normalize; falsy values pass through.
 * @returns {string} The path with the slug removed, or the input unchanged.
 */
export function normalizePathSlug(relativePath) {
  if (relativePath) {
    const slugBeforeExtension = /_[0-9a-f]{6,}(\.[a-zA-Z0-9]+)$/;
    // Keep only the captured extension; the slug itself is dropped.
    return relativePath.replace(slugBeforeExtension, (_whole, extension) => extension);
  }
  return relativePath;
}
|
|
56
|
+
|
|
57
|
+
// Languages that share Java's slug-stripped-path li_text policy.
|
|
58
|
+
// (At LI input routing time these will all be sent to chunk.li_text;
|
|
59
|
+
// see LI_TEXT_LANGUAGES in core/indexing/indexer-ann.js, which imports
|
|
60
|
+
// this set so the policy stays consistent in one place.)
|
|
61
|
+
export const JAVA_FAMILY = new Set([
|
|
62
|
+
'java', 'php',
|
|
63
|
+
'csharp', 'c#',
|
|
64
|
+
'kotlin', 'scala',
|
|
65
|
+
]);
|
|
66
|
+
|
|
67
|
+
// =============================================================================
|
|
68
|
+
// Embedding-text variant switch — RESEARCH / ABLATION INFRASTRUCTURE
|
|
69
|
+
// =============================================================================
|
|
70
|
+
//
|
|
71
|
+
// Production behavior is fixed at variant=current and is byte-identical to
|
|
72
|
+
// shipped v6.2. None of the alternative variants below are recommended for
|
|
73
|
+
// production; an isolated R1 ablation in May 2026 found no variant beats
|
|
74
|
+
// shipped on total MRR@10 (see docs/JS_CHUNK_BLEEDING_ANALYSIS.md for the
|
|
75
|
+
// full table). The switch and helpers are kept ONLY so future R1
|
|
76
|
+
// experiments can be re-run without re-implementing the scaffolding.
|
|
77
|
+
//
|
|
78
|
+
// Default: SWEET_SEARCH_EMBED_TEXT_VARIANT unset → 'current' →
|
|
79
|
+
// embedding_text + LI input both byte-identical to shipped.
|
|
80
|
+
//
|
|
81
|
+
// Available experiment variants (research use only):
|
|
82
|
+
// current shipped form (production default)
|
|
83
|
+
// no_path drop the # path line
|
|
84
|
+
// normalized_path slug-strip the trailing _<hex> in path
|
|
85
|
+
// no_language drop # Language: line
|
|
86
|
+
// parent_only path + parent only, drop function + language
|
|
87
|
+
// enriched identical to current here; also runs enrichment
|
|
88
|
+
// code_breadcrumb compact `# Parent.Symbol` or `# Parent::symbol`
|
|
89
|
+
// signature current + `# Signature: <multi-line-sig>` line
|
|
90
|
+
// between the symbol line and the language line.
|
|
91
|
+
// Signature is AST-extracted by tree-sitter (decl
|
|
92
|
+
// header up to body, whitespace-collapsed, capped
|
|
93
|
+
// at MAX_SIGNATURE_LENGTH). When no signature is
|
|
94
|
+
// available (regex fallback path, non-boundary
|
|
95
|
+
// chunks, or merged sibling buffers) this variant
|
|
96
|
+
// is byte-identical to `current`.
|
|
97
|
+
// signature_rbphp like `signature` but only for Ruby and PHP — the
|
|
98
|
+
// two languages where the May-2026 R1 ablation
|
|
99
|
+
// showed signatures helped (PHP +0.79 MRR@10,
|
|
100
|
+
// Ruby +0.58); other languages route to the same
|
|
101
|
+
// output as `current`. Motivated by per-language
|
|
102
|
+
// pareto front (matching v6.2 LI routing pattern).
|
|
103
|
+
//
|
|
104
|
+
// LI isolation: when a non-current variant is active, `chunk.li_greedy_text`
|
|
105
|
+
// still carries the shipped (variant=current) form, and pickLiInput() for
|
|
106
|
+
// non-Python/Java languages reads it — so an R1 experiment never
|
|
107
|
+
// contaminates the LI input. See enrichEmbeddingText() and pickLiInput().
|
|
108
|
+
const ENRICHMENT_VARIANTS = new Set(['current', 'enriched']);
|
|
109
|
+
|
|
110
|
+
/**
 * Resolve the active embedding-text variant from the environment.
 *
 * SWEET_SEARCH_EMBED_TEXT_VARIANT is lower-cased and accepted only when it
 * names one of the known variants; unset, empty, or unrecognized values all
 * resolve to 'current'.
 *
 * @returns {string} One of the recognized variant names.
 */
function getEmbedTextVariant() {
  const requested = (process.env.SWEET_SEARCH_EMBED_TEXT_VARIANT || 'current').toLowerCase();
  switch (requested) {
    case 'current':
    case 'no_path':
    case 'normalized_path':
    case 'no_language':
    case 'parent_only':
    case 'enriched':
    case 'code_breadcrumb':
    case 'signature':
    case 'signature_rbphp':
      return requested;
    default:
      // Unknown names fall back to the shipped form.
      return 'current';
  }
}
|
|
118
|
+
|
|
119
|
+
// Languages where the May-2026 ablation showed `signature` strictly
|
|
120
|
+
// helped on R1 MRR@10 (PHP +0.79, Ruby +0.58 vs current). The
|
|
121
|
+
// `signature_rbphp` variant only emits the # Signature: line for
|
|
122
|
+
// these — every other language gets exactly the `current` output.
|
|
123
|
+
const SIGNATURE_HELP_LANGUAGES = new Set(['ruby', 'php']);
|
|
124
|
+
|
|
125
|
+
/**
 * Report whether the active embedding-text variant is a member of
 * ENRICHMENT_VARIANTS, i.e. whether post-build enrichment may overwrite
 * `embedding_text`.
 *
 * @returns {boolean} True when the current variant participates in enrichment.
 */
export function shouldRunEnrichment() {
  const activeVariant = getEmbedTextVariant();
  return ENRICHMENT_VARIANTS.has(activeVariant);
}
|
|
128
|
+
|
|
129
|
+
// `Foo::bar` for langs that conventionally use ::, `Foo.bar` otherwise.
|
|
130
|
+
// Used by the experimental code_breadcrumb variant only.
|
|
131
|
+
/**
 * Pick the scope separator used when rendering a `Parent<sep>symbol`
 * breadcrumb: `::` for ruby, php, cpp, c++ and rust, `.` for everything
 * else (including missing or unknown languages).
 *
 * @param {string} language - Language identifier, compared case-sensitively.
 * @returns {string} Either '::' or '.'.
 */
function breadcrumbSep(language) {
  switch (language) {
    case 'ruby':
    case 'php':
    case 'cpp':
    case 'c++':
    case 'rust':
      return '::';
    default:
      return '.';
  }
}
|
|
134
|
+
|
|
135
|
+
/**
 * Assemble a chunk's `embedding_text`: a stack of `#`-prefixed header lines
 * followed by the trimmed chunk content, joined with newlines and cut to
 * the embedding-text cap.
 *
 * The header stack depends on the variant. An explicit `variant` wins;
 * otherwise the environment-driven variant is used. Unrecognized variant
 * names produce the same output as 'current'.
 *
 *   current / enriched   path, parent, symbol, language
 *   no_path              parent, symbol, language
 *   normalized_path      slug-stripped path, parent, symbol, language
 *   no_language          path, parent, symbol
 *   parent_only          path, parent
 *   code_breadcrumb      path, compact `Parent<sep>symbol` crumb, bare language
 *   signature            path, parent, symbol, signature, language
 *   signature_rbphp      like `signature`, but the signature line is emitted
 *                        only for languages in SIGNATURE_HELP_LANGUAGES
 *
 * A header is omitted when its source value is missing; additionally the
 * symbol header is omitted for the placeholder 'unknown' and the language
 * header for 'text'.
 *
 * @param {object} args
 * @param {string} [args.variant] - Explicit variant override.
 * @param {string} args.content - Raw chunk content (trimmed before use).
 * @param {string} [args.relativePath] - Path shown in the path header.
 * @param {string} [args.language] - Language identifier.
 * @param {string} [args.chunkType] - Label placed before the symbol name.
 * @param {string} [args.symbol] - Symbol name.
 * @param {object} [args.hierarchyInfo] - Optional parentSymbol / parentType / signature.
 * @returns {string} Header lines plus content, truncated to the cap.
 */
function buildEmbeddingText({ variant: variantOverride, content, relativePath, language, chunkType, symbol, hierarchyInfo }) {
  const activeVariant = variantOverride || getEmbedTextVariant();
  const body = content.trim();

  const parentSymbol = hierarchyInfo?.parentSymbol;
  const signature = hierarchyInfo?.signature;
  const hasSymbol = Boolean(symbol) && symbol !== 'unknown';
  const hasLanguage = Boolean(language) && language !== 'text';

  // Candidate header lines; null marks "not available".
  const pathHeader = relativePath ? `# ${relativePath}` : null;
  const parentHeader = parentSymbol
    ? `# Parent: ${hierarchyInfo.parentType} ${parentSymbol}`
    : null;
  const symbolHeader = hasSymbol ? `# ${chunkType}: ${symbol}` : null;
  const languageHeader = hasLanguage ? `# Language: ${language}` : null;
  const signatureHeader = signature ? `# Signature: ${signature}` : null;

  // Each variant is just an ordered selection of candidate headers.
  let headers;
  switch (activeVariant) {
    case 'no_path':
      headers = [parentHeader, symbolHeader, languageHeader];
      break;
    case 'normalized_path':
      headers = [
        relativePath ? `# ${normalizePathSlug(relativePath)}` : null,
        parentHeader,
        symbolHeader,
        languageHeader,
      ];
      break;
    case 'no_language':
      headers = [pathHeader, parentHeader, symbolHeader];
      break;
    case 'parent_only':
      headers = [pathHeader, parentHeader];
      break;
    case 'code_breadcrumb': {
      const separator = breadcrumbSep(language);
      let crumb = null;
      if (hasSymbol) {
        crumb = parentSymbol ? `# ${parentSymbol}${separator}${symbol}` : `# ${symbol}`;
      } else if (parentSymbol) {
        crumb = `# ${parentSymbol}`;
      }
      // This variant uses a bare language line, without the "Language:" label.
      headers = [pathHeader, crumb, hasLanguage ? `# ${language}` : null];
      break;
    }
    case 'signature':
      // Without a signature this collapses to the 'current' header stack.
      headers = [pathHeader, parentHeader, symbolHeader, signatureHeader, languageHeader];
      break;
    case 'signature_rbphp':
      headers = [
        pathHeader,
        parentHeader,
        symbolHeader,
        signatureHeader && SIGNATURE_HELP_LANGUAGES.has(language) ? signatureHeader : null,
        languageHeader,
      ];
      break;
    case 'enriched':
    case 'current':
    default:
      headers = [pathHeader, parentHeader, symbolHeader, languageHeader];
      break;
  }

  const lines = headers.filter((header) => header !== null);
  lines.push(body);
  return lines.join('\n').slice(0, getEmbedTextCap());
}
|
|
233
|
+
|
|
234
|
+
/**
 * Choose the path header line for late-interaction (MaxSim) input,
 * according to the chunk's language.
 *
 *   - No path at all            -> null
 *   - python                    -> null (path line deliberately omitted)
 *   - JAVA_FAMILY languages     -> `# <path with trailing _<hex> slug stripped>`
 *   - anything else             -> `# <path>` unchanged
 *
 * For languages outside python and JAVA_FAMILY, the LI input router
 * (`pickLiInput` in indexer-ann.js) prefers other chunk fields over
 * `li_text`, so the full-path branch only matters when those fields are
 * missing.
 *
 * @param {string} relativePath - Chunk's path relative to the project root.
 * @param {string} language - Language identifier of the chunk.
 * @returns {string|null} The header line, or null when none should be emitted.
 */
function pathLineForLi(relativePath, language) {
  const omitPath = !relativePath || language === 'python';
  if (omitPath) return null;

  const shownPath = JAVA_FAMILY.has(language)
    ? normalizePathSlug(relativePath)
    : relativePath;
  return `# ${shownPath}`;
}
|
|
253
|
+
|
|
254
|
+
/**
 * Assemble `li_text`, the late-interaction (MaxSim) reranking input for a
 * chunk.
 *
 * Every language gets the same label-shaped headers — `# Parent: ...`,
 * `# <chunkType>: <symbol>`, `# Language: ...` — and differs only in the
 * leading path line, which pathLineForLi() decides (omitted, slug-stripped,
 * or full). Headers with no source value are left out; the symbol header is
 * also skipped for the placeholder 'unknown' and the language header for
 * 'text'. The trimmed content follows the headers and the result is cut to
 * the embedding-text cap.
 *
 * Whether this text or another chunk field is actually fed to the encoder
 * is decided separately by `pickLiInput` in core/indexing/indexer-ann.js.
 *
 * @param {object} args
 * @param {string} args.content - Raw chunk content (trimmed before use).
 * @param {string} [args.relativePath] - Path relative to the project root.
 * @param {string} [args.language] - Language identifier.
 * @param {string} [args.chunkType] - Label placed before the symbol name.
 * @param {string} [args.symbol] - Symbol name.
 * @param {object} [args.hierarchyInfo] - Optional parentSymbol / parentType.
 * @returns {string} Header lines plus content, truncated to the cap.
 */
function buildLiText({ content, relativePath, language, chunkType, symbol, hierarchyInfo }) {
  const body = content.trim();
  const pathHeader = pathLineForLi(relativePath, language);
  const parentName = hierarchyInfo?.parentSymbol;

  const segments = [
    pathHeader || null,
    parentName ? `# Parent: ${hierarchyInfo.parentType} ${parentName}` : null,
    symbol && symbol !== 'unknown' ? `# ${chunkType}: ${symbol}` : null,
    language && language !== 'text' ? `# Language: ${language}` : null,
  ].filter((segment) => segment !== null);

  segments.push(body);
  return segments.join('\n').slice(0, getEmbedTextCap());
}
|
|
292
|
+
|
|
23
293
|
/**
|
|
24
294
|
* AST-like semantic code chunker supporting 35+ languages.
|
|
25
295
|
* Uses regex boundary patterns from core/language-patterns.js registry.
|
|
@@ -100,6 +370,7 @@ export class ASTChunker {
|
|
|
100
370
|
parentChunkId: chunk.parentChunkId,
|
|
101
371
|
parentSymbol: chunk.parentSymbol,
|
|
102
372
|
parentType: chunk.parentType,
|
|
373
|
+
signature: chunk.signature || null,
|
|
103
374
|
}
|
|
104
375
|
)
|
|
105
376
|
);
|
|
@@ -606,19 +877,43 @@ export class ASTChunker {
|
|
|
606
877
|
const hash = createHash('sha256').update(content).digest('hex').slice(0, 16);
|
|
607
878
|
const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
|
|
608
879
|
|
|
609
|
-
//
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
880
|
+
// Production embedding text. With the default variant (current) this
|
|
881
|
+
// is byte-identical to shipped v6.2; under a research-only variant
|
|
882
|
+
// (SWEET_SEARCH_EMBED_TEXT_VARIANT) it produces the experimental form.
|
|
883
|
+
const embedding_text = buildEmbeddingText({
|
|
884
|
+
content,
|
|
885
|
+
relativePath,
|
|
886
|
+
language,
|
|
887
|
+
chunkType,
|
|
888
|
+
symbol,
|
|
889
|
+
hierarchyInfo,
|
|
890
|
+
});
|
|
891
|
+
|
|
892
|
+
// Research isolation: always build the shipped (current-variant)
|
|
893
|
+
// form alongside, so the LI stage can read a canonical input even
|
|
894
|
+
// when an R1 experiment is active on embedding_text. In production
|
|
895
|
+
// (variant=current) this is the same content as embedding_text and
|
|
896
|
+
// pickLiInput's preference for li_greedy_text changes nothing.
|
|
897
|
+
const li_greedy_text = buildEmbeddingText({
|
|
898
|
+
variant: 'current',
|
|
899
|
+
content,
|
|
900
|
+
relativePath,
|
|
901
|
+
language,
|
|
902
|
+
chunkType,
|
|
903
|
+
symbol,
|
|
904
|
+
hierarchyInfo,
|
|
905
|
+
});
|
|
906
|
+
|
|
907
|
+
// Build LI text via the v6.2 builder (language-conditioned path
|
|
908
|
+
// policy). See buildLiText() docstring for rationale.
|
|
909
|
+
const li_text = buildLiText({
|
|
910
|
+
content,
|
|
911
|
+
relativePath,
|
|
912
|
+
language,
|
|
913
|
+
chunkType,
|
|
914
|
+
symbol,
|
|
915
|
+
hierarchyInfo,
|
|
916
|
+
});
|
|
622
917
|
|
|
623
918
|
const metadata = {
|
|
624
919
|
type: 'codebase',
|
|
@@ -647,15 +942,26 @@ export class ASTChunker {
|
|
|
647
942
|
return {
|
|
648
943
|
text: content.trim(),
|
|
649
944
|
content: content.trim(),
|
|
650
|
-
embedding_text
|
|
945
|
+
embedding_text,
|
|
946
|
+
li_greedy_text,
|
|
947
|
+
li_text,
|
|
651
948
|
metadata,
|
|
652
949
|
tags: ['codebase', language, this.inferProjectTag(filePath)]
|
|
653
950
|
};
|
|
654
951
|
}
|
|
655
952
|
|
|
656
953
|
/**
|
|
657
|
-
* Enrich a chunk's embedding_text with scope chain and import
|
|
954
|
+
* Enrich a chunk's embedding_text with scope chain and import info.
|
|
658
955
|
* Called after initial chunking when scope info is available.
|
|
956
|
+
*
|
|
957
|
+
* Production path (variant=current): updates BOTH `embedding_text`
|
|
958
|
+
* and `li_greedy_text` to the same enriched string — byte-identical
|
|
959
|
+
* to shipped v6.2.
|
|
960
|
+
*
|
|
961
|
+
* Research path (variant != current/enriched): only updates
|
|
962
|
+
* `li_greedy_text`. The variant's `embedding_text` is preserved so
|
|
963
|
+
* an R1 ablation can compare its embedding form against shipped
|
|
964
|
+
* without losing the LI side's enriched text.
|
|
659
965
|
*/
|
|
660
966
|
static enrichEmbeddingText(chunk, scopeChain, imports) {
|
|
661
967
|
const parts = [];
|
|
@@ -681,7 +987,16 @@ export class ASTChunker {
|
|
|
681
987
|
}
|
|
682
988
|
|
|
683
989
|
parts.push(chunk.content);
|
|
684
|
-
|
|
990
|
+
const enriched = parts.join('\n').slice(0, getEmbedTextCap());
|
|
991
|
+
|
|
992
|
+
// Always update the byte-stable LI passthrough form.
|
|
993
|
+
chunk.li_greedy_text = enriched;
|
|
994
|
+
|
|
995
|
+
// Only update embedding_text when the active variant participates
|
|
996
|
+
// in post-build enrichment.
|
|
997
|
+
if (shouldRunEnrichment()) {
|
|
998
|
+
chunk.embedding_text = enriched;
|
|
999
|
+
}
|
|
685
1000
|
return chunk;
|
|
686
1001
|
}
|
|
687
1002
|
|
|
@@ -5,10 +5,19 @@
|
|
|
5
5
|
import { createHash } from 'crypto';
|
|
6
6
|
import path from 'path';
|
|
7
7
|
import { detectProjectBoundary } from '../../infrastructure/project-detector.js';
|
|
8
|
+
import { JAVA_FAMILY, normalizePathSlug } from '../ast-chunker.js';
|
|
8
9
|
|
|
9
10
|
const MAX_CHUNK_SIZE = 2000; // chars — matches ast-chunker.js
|
|
10
11
|
const MIN_CHUNK_SIZE = 30; // chars — matches ast-chunker.js threshold
|
|
11
12
|
|
|
13
|
+
// Embedding-text cap mirrors ast-chunker.js:getEmbedTextCap. Default 2000,
|
|
14
|
+
// byte-identical to shipped. Override via SWEET_SEARCH_EMBED_TEXT_CAP for
|
|
15
|
+
// the May-2026 budget-alignment ablation.
|
|
16
|
+
/**
 * Resolve the character ceiling used when slicing document-chunk
 * `embedding_text` / `li_text`.
 *
 * SWEET_SEARCH_EMBED_TEXT_CAP is honored when it parses (base 10) to a
 * finite integer >= 500; otherwise the result is MAX_CHUNK_SIZE.
 *
 * @returns {number} The slice ceiling in characters.
 */
function getEmbedTextCap() {
  const override = parseInt(process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '', 10);
  const usable = Number.isFinite(override) && override >= 500;
  return usable ? override : MAX_CHUNK_SIZE;
}
|
|
20
|
+
|
|
12
21
|
/** Default config for markdown chunker */
|
|
13
22
|
const MD_DEFAULTS = {
|
|
14
23
|
maxChunkSize: MAX_CHUNK_SIZE,
|
|
@@ -59,10 +68,34 @@ function buildDocChunk(content, filePath, language, chunkType, symbol, lineStart
|
|
|
59
68
|
}
|
|
60
69
|
embeddingParts.push(trimmed);
|
|
61
70
|
|
|
71
|
+
// Build LI text via v6.2 — language-conditioned path strategy.
|
|
72
|
+
// Document chunks are typically markdown/text and fall under the
|
|
73
|
+
// "other" branch (full raw path). Code-language chunks reuse the
|
|
74
|
+
// shared JAVA_FAMILY set from ast-chunker.js so the path policy
|
|
75
|
+
// here cannot drift from the AST chunker.
|
|
76
|
+
const liParts = [];
|
|
77
|
+
if (relativePath) {
|
|
78
|
+
if (language === 'python') {
|
|
79
|
+
// skip path line for Python
|
|
80
|
+
} else if (JAVA_FAMILY.has(language)) {
|
|
81
|
+
liParts.push(`# ${normalizePathSlug(relativePath)}`);
|
|
82
|
+
} else {
|
|
83
|
+
liParts.push(`# ${relativePath}`);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
if (symbol && symbol !== 'unknown') {
|
|
87
|
+
liParts.push(`# ${chunkType}: ${symbol}`);
|
|
88
|
+
}
|
|
89
|
+
if (language && language !== 'text') {
|
|
90
|
+
liParts.push(`# Language: ${language}`);
|
|
91
|
+
}
|
|
92
|
+
liParts.push(trimmed);
|
|
93
|
+
|
|
62
94
|
return {
|
|
63
95
|
text: trimmed,
|
|
64
96
|
content: trimmed,
|
|
65
|
-
embedding_text: embeddingParts.join('\n').slice(0,
|
|
97
|
+
embedding_text: embeddingParts.join('\n').slice(0, getEmbedTextCap()),
|
|
98
|
+
li_text: liParts.join('\n').slice(0, getEmbedTextCap()),
|
|
66
99
|
metadata: {
|
|
67
100
|
type: 'document',
|
|
68
101
|
file: path.basename(filePath),
|
package/core/indexing/index.js
CHANGED
|
@@ -79,12 +79,15 @@ export {
|
|
|
79
79
|
getEmbeddingPool,
|
|
80
80
|
} from './indexer-pool.js';
|
|
81
81
|
|
|
82
|
-
//
|
|
82
|
+
// Shared indexing file policy (embedding, sparse/BM25 artifacts, and LI)
|
|
83
83
|
export {
|
|
84
|
-
|
|
84
|
+
applyIndexingChunkPolicy,
|
|
85
85
|
isExcludedByConfig,
|
|
86
86
|
chunkLooksGenerated,
|
|
87
|
-
} from './
|
|
87
|
+
} from './indexing-file-policy.js';
|
|
88
|
+
|
|
89
|
+
// Late-interaction compatibility export
|
|
90
|
+
export { applyLiSkipPolicy } from './li-skip-policy.js';
|
|
88
91
|
|
|
89
92
|
// Sparse-gram artifact builder (tier-1 grep acceleration)
|
|
90
93
|
export { buildSparseGramArtifact } from './indexer-sparse-gram.js';
|
|
@@ -12,6 +12,7 @@ import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
|
|
|
12
12
|
import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
|
|
13
13
|
import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
|
|
14
14
|
import { log, logProgress } from './indexer-utils.js';
|
|
15
|
+
import { JAVA_FAMILY } from './ast-chunker.js';
|
|
15
16
|
|
|
16
17
|
// =============================================================================
|
|
17
18
|
// DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
|
|
@@ -20,6 +21,45 @@ import { log, logProgress } from './indexer-utils.js';
|
|
|
20
21
|
const CHECKPOINT_INTERVAL_SEC = 30;
|
|
21
22
|
const MIN_VECTORS_BETWEEN_SAVES = 1000;
|
|
22
23
|
|
|
24
|
+
/**
 * v6.2: language-family-conditioned routing of the late-interaction (MaxSim)
 * input text for a chunk.
 *
 * Per-language ablation on GenCodeSearchNet (May 2026, see
 * docs/JS_CHUNK_BLEEDING_ANALYSIS.md) determined that different languages
 * benefit from different metadata richness in the LI input:
 *
 *   chunk.li_text first (simple headers, per-language path policy applied
 *   by the chunker):
 *     - python (no path line)
 *     - JAVA_FAMILY: java, php, csharp/c#, kotlin, scala (slug-stripped path)
 *     Fallback order: li_text, embedding_text, text, content.
 *
 *   chunk.li_greedy_text first (the shipped `current`-variant form):
 *     - javascript, typescript, jsx, tsx, ruby, go, c, cpp, c++, rust
 *     - any unknown or missing language, as the safe fallback
 *     Fallback order: li_greedy_text, embedding_text, li_text, text, content.
 *
 * Reading li_greedy_text ahead of embedding_text keeps a research-only
 * embedding variant (SWEET_SEARCH_EMBED_TEXT_VARIANT) from changing the LI
 * input; chunks that lack li_greedy_text fall through to embedding_text.
 *
 * JAVA_FAMILY is imported from ast-chunker.js so this routing and the
 * chunker's path policy share one definition.
 *
 * @param {object|null|undefined} chunk - Chunk record; may be nullish.
 * @returns {string} The text to encode, or '' when no usable field exists.
 */
export function pickLiInput(chunk) {
  // The language lookup already tolerated a nullish chunk, but the field
  // reads below did not and threw a TypeError. A missing chunk has no text.
  if (chunk == null) return '';

  const lang = chunk.metadata?.language;
  if (lang === 'python' || JAVA_FAMILY.has(lang)) {
    return chunk.li_text || chunk.embedding_text || chunk.text || chunk.content || '';
  }
  return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || chunk.text || chunk.content || '';
}
|
|
62
|
+
|
|
23
63
|
function fsyncFile(filePath) {
|
|
24
64
|
const fd = openSync(filePath, 'r');
|
|
25
65
|
try { fsyncSync(fd); } finally { closeSync(fd); }
|
|
@@ -239,7 +279,7 @@ function buildLateInteractionBatches(chunks, options = {}) {
|
|
|
239
279
|
attentionBudget = Math.max(1, Math.floor(batchSizeCap / 2)) * maxLength * maxLength;
|
|
240
280
|
}
|
|
241
281
|
const indexed = chunks.map((chunk) => {
|
|
242
|
-
const text = chunk
|
|
282
|
+
const text = pickLiInput(chunk);
|
|
243
283
|
return {
|
|
244
284
|
chunk,
|
|
245
285
|
text,
|
|
@@ -603,17 +643,16 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
603
643
|
// Apply LI skip policy: last-mile guard before the slow encoder runs.
|
|
604
644
|
// The embed indexer has already dropped glob-excluded files at discovery
|
|
605
645
|
// time using the same loadProjectConfig() excludes; this pass adds the
|
|
606
|
-
// LI-specific
|
|
607
|
-
//
|
|
646
|
+
// LI-specific check globs can't do: content-based @generated markers.
|
|
647
|
+
// Disable via SWEET_SEARCH_LI_SKIP_DISABLE=1.
|
|
608
648
|
let skippedSummary = null;
|
|
609
649
|
if (Array.isArray(chunks) && chunks.length > 0) {
|
|
610
|
-
const {
|
|
611
|
-
const { kept, stats } =
|
|
650
|
+
const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');
|
|
651
|
+
const { kept, stats } = applyIndexingChunkPolicy(chunks, { projectRoot });
|
|
612
652
|
if (stats.totalSkipped > 0) {
|
|
613
653
|
const breakdown = [
|
|
614
654
|
stats.excluded > 0 ? `${stats.excluded} excluded` : null,
|
|
615
655
|
stats.generated > 0 ? `${stats.generated} generated` : null,
|
|
616
|
-
stats.huge > 0 ? `${stats.huge} huge-file` : null,
|
|
617
656
|
].filter(Boolean).join(', ');
|
|
618
657
|
log(`LI skip policy: dropped ${stats.totalSkipped} chunks across ${stats.skippedFiles} files (${breakdown}); kept ${kept.length} chunks across ${stats.keptFiles} files`, 'dim');
|
|
619
658
|
chunks = kept;
|
|
@@ -537,9 +537,17 @@ export async function chunkFiles(files) {
|
|
|
537
537
|
}
|
|
538
538
|
}
|
|
539
539
|
|
|
540
|
+
// Embedding-text cap: defaults to 2000 (byte-identical to shipped). The
|
|
541
|
+
// SWEET_SEARCH_EMBED_TEXT_CAP env var (see ast-chunker.js:getEmbedTextCap)
|
|
542
|
+
// is honored by the chunk builders themselves; this final re-slice mirrors
|
|
543
|
+
// the same cap so an ablation can raise the ceiling end-to-end.
|
|
544
|
+
const _embCap = (() => {
|
|
545
|
+
const v = parseInt(process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '', 10);
|
|
546
|
+
return Number.isFinite(v) && v >= 500 ? v : 2000;
|
|
547
|
+
})();
|
|
540
548
|
const texts = allChunks.map(chunk => {
|
|
541
549
|
if (chunk.embedding_text) {
|
|
542
|
-
return chunk.embedding_text.slice(0,
|
|
550
|
+
return chunk.embedding_text.slice(0, _embCap);
|
|
543
551
|
}
|
|
544
552
|
return `${chunk.file} ${chunk.metadata?.symbol || ''}\n${(chunk.text || chunk.content || '').slice(0, 1500)}`;
|
|
545
553
|
});
|
|
@@ -7,7 +7,7 @@ import fs from 'fs/promises';
|
|
|
7
7
|
import { existsSync } from 'fs';
|
|
8
8
|
import path from 'path';
|
|
9
9
|
|
|
10
|
-
import { DB_PATHS, PROJECT_ROOT, EMBEDDING_CONFIG } from '../infrastructure/config/index.js';
|
|
10
|
+
import { DB_PATHS, PROJECT_ROOT, EMBEDDING_CONFIG, HCGS_CONFIG } from '../infrastructure/config/index.js';
|
|
11
11
|
import { getChangedFiles, updateState, getStats as getIncrementalStats, updatePhaseProgress, markPhaseComplete, clearPhaseProgress } from './incremental-tracker.js';
|
|
12
12
|
import { backupSummaries, restoreSummaries, markForRegeneration } from '../graph/summary-manager.js';
|
|
13
13
|
import { colors, log, logProgress, logError, discoverFiles, readFilesFromStdin, atomicSwapDatabase } from './indexer-utils.js';
|
|
@@ -300,8 +300,10 @@ export async function buildCodeGraphWithHCGSPhase(options = {}) {
|
|
|
300
300
|
skipSummaryRegen,
|
|
301
301
|
} = options;
|
|
302
302
|
|
|
303
|
+
const hcgsEnabled = HCGS_CONFIG.enabled;
|
|
304
|
+
|
|
303
305
|
let summaryBackup = { summaries: [], count: 0 };
|
|
304
|
-
if (!dryRun) {
|
|
306
|
+
if (!dryRun && hcgsEnabled) {
|
|
305
307
|
summaryBackup = await backupSummaries(DB_PATHS.codeGraph);
|
|
306
308
|
if (summaryBackup.count > 0) {
|
|
307
309
|
log(`Backed up ${summaryBackup.count} existing summaries (with type validation)`, 'green');
|
|
@@ -318,14 +320,14 @@ export async function buildCodeGraphWithHCGSPhase(options = {}) {
|
|
|
318
320
|
|
|
319
321
|
const graphStats = await buildCodeGraph(allFiles, dryRun);
|
|
320
322
|
|
|
321
|
-
if (!dryRun && summaryBackup.count > 0) {
|
|
323
|
+
if (!dryRun && hcgsEnabled && summaryBackup.count > 0) {
|
|
322
324
|
const restoreResult = await restoreSummaries(DB_PATHS.codeGraph, summaryBackup);
|
|
323
325
|
log(`Restored ${restoreResult.restored} summaries (${restoreResult.skipped.total} skipped - entity removed/type changed)`, 'green');
|
|
324
326
|
}
|
|
325
327
|
|
|
326
328
|
let hcgsPromise = null;
|
|
327
329
|
|
|
328
|
-
const shouldRunHCGS = !dryRun && (
|
|
330
|
+
const shouldRunHCGS = !dryRun && hcgsEnabled && (
|
|
329
331
|
fullReindex ||
|
|
330
332
|
filesFromStdin ||
|
|
331
333
|
(incrementalInfo && filesToIndex.length > 0)
|