sweet-search 2.4.2 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +19 -5
- package/core/embedding/embedding-cache.js +177 -15
- package/core/embedding/embedding-service.js +18 -4
- package/core/graph/graph-expansion.js +52 -12
- package/core/graph/graph-extractor.js +30 -1
- package/core/indexing/ast-chunker.js +331 -16
- package/core/indexing/chunking/chunk-builder.js +34 -1
- package/core/indexing/index.js +6 -3
- package/core/indexing/indexer-ann.js +45 -6
- package/core/indexing/indexer-build.js +9 -1
- package/core/indexing/indexer-phases.js +6 -4
- package/core/indexing/indexing-file-policy.js +140 -0
- package/core/indexing/li-skip-policy.js +11 -220
- package/core/infrastructure/codebase-repository.js +21 -0
- package/core/infrastructure/config/embedding.js +20 -1
- package/core/infrastructure/config/graph.js +2 -2
- package/core/infrastructure/config/ranking.js +10 -0
- package/core/infrastructure/config/vector-store.js +1 -1
- package/core/infrastructure/coreml-cascade.js +236 -30
- package/core/infrastructure/coreml-cascade.json +25 -0
- package/core/infrastructure/index.js +15 -0
- package/core/infrastructure/init-config.js +78 -0
- package/core/infrastructure/language-patterns/registry-core.js +18 -0
- package/core/infrastructure/model-registry.js +12 -0
- package/core/infrastructure/native-inference.js +143 -51
- package/core/infrastructure/tree-sitter-provider.js +92 -2
- package/core/ranking/cascaded-scorer.js +6 -2
- package/core/ranking/file-kind-ranking.js +264 -0
- package/core/ranking/late-interaction-index.js +10 -4
- package/core/ranking/late-interaction-policy.js +304 -0
- package/core/search/context-expander.js +267 -28
- package/core/search/index.js +4 -0
- package/core/search/search-cli.js +3 -1
- package/core/search/search-pattern.js +4 -3
- package/core/search/search-postprocess.js +189 -8
- package/core/search/search-read-semantic.js +717 -0
- package/core/search/search-read.js +481 -0
- package/core/search/search-server.js +6 -4
- package/core/search/sweet-search.js +119 -15
- package/mcp/server.js +41 -0
- package/mcp/tool-handlers.js +117 -6
- package/package.json +9 -7
- package/scripts/init.js +386 -5
|
@@ -103,6 +103,18 @@ export const MODEL_REGISTRY = {
|
|
|
103
103
|
],
|
|
104
104
|
},
|
|
105
105
|
|
|
106
|
+
'lateon-code-edge-fp32': {
|
|
107
|
+
hfId: 'lightonai/LateOn-Code-edge',
|
|
108
|
+
profile: 'full',
|
|
109
|
+
description: 'Late interaction edge model (FP32 safetensors, backbone 256d, 2-stage projection) for native inference',
|
|
110
|
+
files: [
|
|
111
|
+
{ path: 'model.safetensors', sizeBytes: 67195976, sha256: '7ffc36b8ff71367249cd5220dbdd4bdbe177bc0e305b2e978a8b598bd8296f04' },
|
|
112
|
+
{ path: '1_Dense/model.safetensors', sizeBytes: 524376, sha256: '9efb17fcb2106cd8fcb01d57a9cd9c997a487ad20630ec8e44ce3f9d89efe0a7' },
|
|
113
|
+
{ path: '2_Dense/model.safetensors', sizeBytes: 98392, sha256: 'a7a388138b3c4bb1a81c8c3bcb9de123f1e652b9e9464a72707ca19ee86a26b1' },
|
|
114
|
+
{ path: 'config.json', sizeBytes: 1252, sha256: null },
|
|
115
|
+
],
|
|
116
|
+
},
|
|
117
|
+
|
|
106
118
|
'ms-marco-tinybert': {
|
|
107
119
|
hfId: 'Xenova/ms-marco-TinyBERT-L-2-v2',
|
|
108
120
|
profile: 'full',
|
|
@@ -55,6 +55,7 @@ import { getModelCacheDir, fetchModel } from './model-fetcher.js';
|
|
|
55
55
|
import { getModelEntry } from './model-registry.js';
|
|
56
56
|
import { getCoremlCascadeResolvedDirs } from './coreml-cascade.js';
|
|
57
57
|
import { detectHardwareCapability } from './hardware-capability.js';
|
|
58
|
+
import { LATE_INTERACTION_CONFIG } from './config/ranking.js';
|
|
58
59
|
|
|
59
60
|
const require = createRequire(import.meta.url);
|
|
60
61
|
|
|
@@ -63,12 +64,21 @@ const require = createRequire(import.meta.url);
|
|
|
63
64
|
let _addon = null;
|
|
64
65
|
let _embeddingModel = null;
|
|
65
66
|
let _embeddingModelLoadPromise = null; // race-gate for concurrent first calls
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
// Per-variant LI model cache. Keyed by FP32 registry key
|
|
68
|
+
// ('lateon-code-fp32' or 'lateon-code-edge-fp32') so a variant swap
|
|
69
|
+
// inside a single process (e.g. ORT-eval session followed by native
|
|
70
|
+
// indexing) doesn't return a stale model. Each entry is
|
|
71
|
+
// `{ model, promise }` where `promise` race-gates concurrent first
|
|
72
|
+
// calls and `model` becomes non-null on resolution.
|
|
73
|
+
const _liModels = new Map();
|
|
68
74
|
let _embTokenizer = null;
|
|
69
75
|
let _embTokenizerLoadPromise = null;
|
|
70
|
-
|
|
71
|
-
|
|
76
|
+
// Per-variant LI tokenizer cache. Keyed by tokenizer source key
|
|
77
|
+
// (matches the ORT-side registry key — `lateon-code` /
|
|
78
|
+
// `lateon-code-edge`). Standard and edge tokenizer.json files are
|
|
79
|
+
// byte-identical today but per-variant resolution is correct and
|
|
80
|
+
// future-proof.
|
|
81
|
+
const _liTokenizers = new Map();
|
|
72
82
|
let _available = null;
|
|
73
83
|
let _coremlCascadeLogged = false;
|
|
74
84
|
|
|
@@ -123,12 +133,17 @@ function propagateCudaComputeCapToAddonEnv() {
|
|
|
123
133
|
* Logged exactly once per process so a mis-configured cascade surfaces
|
|
124
134
|
* at startup instead of silently falling through on every call.
|
|
125
135
|
*
|
|
136
|
+
* Routes the LI cascade dir to `coreml-cascade/li/` (standard) or
|
|
137
|
+
* `coreml-cascade/li-edge/` (edge) based on the active variant in
|
|
138
|
+
* `LATE_INTERACTION_CONFIG`. The embed cascade is shared.
|
|
139
|
+
*
|
|
126
140
|
* Always returns an object — never throws. The returned dirs can be
|
|
127
141
|
* `null`, which the Rust addon treats as "CoreML path disabled" and
|
|
128
142
|
* falls back to candle unconditionally.
|
|
129
143
|
*/
|
|
130
144
|
function resolveCoremlCascadeForAddon() {
|
|
131
|
-
const
|
|
145
|
+
const liVariantKey = LATE_INTERACTION_CONFIG.model;
|
|
146
|
+
const resolved = getCoremlCascadeResolvedDirs(liVariantKey);
|
|
132
147
|
if (!_coremlCascadeLogged) {
|
|
133
148
|
_coremlCascadeLogged = true;
|
|
134
149
|
const hw = detectHardwareCapability();
|
|
@@ -137,7 +152,7 @@ function resolveCoremlCascadeForAddon() {
|
|
|
137
152
|
if (resolved.embedDir || resolved.liDir) {
|
|
138
153
|
process.stderr.write(
|
|
139
154
|
`[NativeInference] CoreML cascade: ${resolved.status}` +
|
|
140
|
-
` (embed=${resolved.embedDir ? 'yes' : 'no'}, li=${resolved.liDir ? 'yes' : 'no'},` +
|
|
155
|
+
` (embed=${resolved.embedDir ? 'yes' : 'no'}, li=${resolved.liDir ? 'yes' : 'no'} [${liVariantKey}],` +
|
|
141
156
|
` chip=${hw.brandString || 'unknown'})\n`
|
|
142
157
|
);
|
|
143
158
|
} else if (hw.coremlCascadeEligible) {
|
|
@@ -327,57 +342,117 @@ export async function nativeEmbed(texts, options = {}) {
|
|
|
327
342
|
// ─── Late Interaction Model ───
|
|
328
343
|
|
|
329
344
|
/**
 * Resolve the active LI variant from `LATE_INTERACTION_CONFIG`. Returns
 * the manifest the native loaders need (registry keys + projection
 * paths and dims). Pure helper — no I/O, no caching.
 *
 * Missing fields on the active config fall back defensively (every
 * shipping config is expected to have them):
 *   - `nativeRegistryKey` falls back to `${model}-fp32`;
 *   - `projectionPaths` / `projectionDims` fall back *together* to the
 *     standard `lateon-code` layout (single `1_Dense` projection, 128d)
 *     when either is missing, empty, or their lengths disagree;
 *   - `tokenDimension` falls back to the last projection dim.
 *
 * @returns {{
 *   cfgKey: string,
 *   fp32RegistryKey: string,
 *   tokenizerKey: string,
 *   projectionPaths: string[],
 *   projectionDims: number[],
 *   tokenDimension: number,
 * }} Manifest describing the active variant.
 * @throws {Error} When `LATE_INTERACTION_CONFIG.activeModel` is not set,
 *   i.e. `LATE_INTERACTION_CONFIG.model` does not name a known variant.
 */
export function resolveNativeLiVariant() {
  const cfg = LATE_INTERACTION_CONFIG.activeModel;
  const cfgKey = LATE_INTERACTION_CONFIG.model;
  if (!cfg) {
    throw new Error(
      `[NativeInference] LATE_INTERACTION_CONFIG.model='${cfgKey}' is not a known variant`,
    );
  }
  const fp32RegistryKey = cfg.nativeRegistryKey || `${cfgKey}-fp32`;

  // Paths and dims describe the same projection stages, so they are only
  // usable as a pair. A half-specified pair would hand the native loader
  // mismatched arrays; fall back to the standard layout instead.
  const hasProjection = Array.isArray(cfg.projectionPaths)
    && cfg.projectionPaths.length > 0
    && Array.isArray(cfg.projectionDims)
    && cfg.projectionDims.length === cfg.projectionPaths.length;
  const projectionPaths = hasProjection ? cfg.projectionPaths : ['1_Dense/model.safetensors'];
  const projectionDims = hasProjection ? cfg.projectionDims : [128];

  // The per-token output width is the output of the final projection stage.
  const tokenDimension = cfg.tokenDimension ?? projectionDims[projectionDims.length - 1];

  return {
    cfgKey,                // 'lateon-code' | 'lateon-code-edge'
    fp32RegistryKey,       // 'lateon-code-fp32' | 'lateon-code-edge-fp32'
    tokenizerKey: cfgKey,  // tokenizer lives next to the ORT model
    projectionPaths,       // ['1_Dense/...'] | ['1_Dense/...', '2_Dense/...']
    projectionDims,        // [128] | [512, 48]
    tokenDimension,        // 128 | 48
  };
}
|
|
370
|
+
|
|
371
|
+
/**
 * Internal: load the native LI model for a specific variant on the
 * default device. Race-gated per variant via the `_liModels` Map so
 * concurrent first callers share one load.
 *
 * A load that *rejects* (for example `fetchModel` failing) is evicted
 * from the cache so a later call can retry; without that, one transient
 * failure would be replayed to every caller until `_liModels` is
 * cleared. A load that resolves to `null` stays cached, as before.
 *
 * @param {{
 *   cfgKey: string,
 *   fp32RegistryKey: string,
 *   projectionPaths: string[],
 *   projectionDims: number[],
 * }} variant Manifest as produced by `resolveNativeLiVariant()`.
 * @returns {Promise<object|null>} The loaded model, or null if the addon
 *   isn't available or required files are missing.
 */
async function loadNativeLiVariantOnDefaultDevice(variant) {
  const key = variant.fp32RegistryKey;
  const cached = _liModels.get(key);
  if (cached?.model) return cached.model;
  if (cached?.promise) return cached.promise;

  // The slot object doubles as an identity token: if the cache is cleared
  // (or replaced) while this load is in flight, the late result must not
  // be written into — or evict — somebody else's slot.
  const slot = { model: null, promise: null };

  slot.promise = (async () => {
    const addon = loadAddon();
    if (!addon?.NativeLateInteractionModel) return null;

    await fetchModel(key);

    const entry = getModelEntry(key);
    const modelDir = getModelCacheDir(entry.hfId);
    const backbonePath = join(modelDir, 'model.safetensors');
    const configPath = join(modelDir, 'config.json');
    const projAbsPaths = variant.projectionPaths.map((p) => join(modelDir, p));

    if (!existsSync(backbonePath) || !existsSync(configPath)) return null;
    if (!projAbsPaths.every((p) => existsSync(p))) return null;

    // Resolve the CoreML cascade dir for ModernBERT LI. The dir
    // depends on the active variant (`coreml-cascade/li/` vs
    // `coreml-cascade/li-edge/`).
    const cascade = resolveCoremlCascadeForAddon();

    const t0 = Date.now();
    const model = addon.NativeLateInteractionModel.load(
      backbonePath,
      projAbsPaths,
      variant.projectionDims,
      configPath,
      cascade.liDir || undefined,
    );
    console.log(
      `[NativeInference] LI model '${variant.cfgKey}' loaded in ${Date.now() - t0}ms `
      + `(dim: ${model.dim}, device: ${addon.nativeInferenceDevice()})`,
    );

    if (_liModels.get(key) === slot) slot.model = model;
    return model;
  })().catch((err) => {
    // Don't let a failed load poison the cache; allow the next call to retry.
    if (_liModels.get(key) === slot) _liModels.delete(key);
    throw err;
  });

  _liModels.set(key, slot);
  return slot.promise;
}
|
|
424
|
+
|
|
425
|
+
/**
 * Public entry point for the native late-interaction model on the
 * default device. Resolves whichever variant `LATE_INTERACTION_CONFIG`
 * currently selects and delegates to the per-variant, race-gated
 * loader.
 *
 * @returns {Promise<object|null>} The model instance, or null when it
 *   is unavailable.
 */
export async function getNativeLiModel() {
  return loadNativeLiVariantOnDefaultDevice(resolveNativeLiVariant());
}
|
|
367
434
|
|
|
368
435
|
/**
 * Get or create the LI tokenizer for the currently-configured variant.
 * Race-gated per variant via the `_liTokenizers` Map so concurrent
 * first callers share one `createTokenizer` call.
 *
 * A creation that rejects (for example because `tokenizer.json` is not
 * on disk yet) is evicted from the cache so a later call can retry
 * instead of replaying the same failure.
 *
 * @returns {Promise<object>} The tokenizer for the active variant.
 */
async function getLiTokenizer() {
  const { tokenizerKey } = resolveNativeLiVariant();
  const cached = _liTokenizers.get(tokenizerKey);
  if (cached?.tokenizer) return cached.tokenizer;
  if (cached?.promise) return cached.promise;

  // Identity token: a result that arrives after the cache was cleared or
  // replaced must not touch the newer slot.
  const slot = { tokenizer: null, promise: null };

  slot.promise = (async () => {
    const entry = getModelEntry(tokenizerKey);
    const tokenizerPath = join(getModelCacheDir(entry.hfId), 'tokenizer.json');
    const tokenizer = await createTokenizer(tokenizerPath);
    if (_liTokenizers.get(tokenizerKey) === slot) slot.tokenizer = tokenizer;
    return tokenizer;
  })().catch((err) => {
    // Don't cache the failure; let the next caller try again.
    if (_liTokenizers.get(tokenizerKey) === slot) _liTokenizers.delete(tokenizerKey);
    throw err;
  });

  _liTokenizers.set(tokenizerKey, slot);
  return slot.promise;
}
|
|
382
457
|
|
|
383
458
|
/**
|
|
@@ -457,7 +532,11 @@ export function isNativeEmbeddingModelLoaded() {
|
|
|
457
532
|
}
|
|
458
533
|
|
|
459
534
|
/**
 * Report whether the native LI model for the *active* variant is loaded.
 *
 * A stale standard model lingering after a config swap to edge does not
 * count — otherwise it would mask the fact that edge encoding still has
 * to load.
 *
 * This is a status predicate, so it never throws: when the configured
 * variant cannot be resolved, nothing can be loaded for it and the
 * answer is `false`. The configuration error itself still surfaces from
 * the loaders.
 *
 * @returns {boolean} True only when a model instance is cached for the
 *   active variant.
 */
export function isNativeLiModelLoaded() {
  let variant;
  try {
    variant = resolveNativeLiVariant();
  } catch {
    // Unknown variant: by definition no model is loaded for it.
    return false;
  }
  return _liModels.get(variant.fp32RegistryKey)?.model != null;
}
|
|
462
541
|
|
|
463
542
|
// ─── Device-explicit loading ───
|
|
@@ -518,28 +597,32 @@ export async function loadNativeEmbeddingModelWithDevice(deviceKind, cascadeDirO
|
|
|
518
597
|
}
|
|
519
598
|
|
|
520
599
|
/**
|
|
521
|
-
* Load the native LI model on a specific device
|
|
600
|
+
* Load the native LI model on a specific device for the
|
|
601
|
+
* currently-configured variant. Race-gated per variant.
|
|
522
602
|
*/
|
|
523
603
|
export async function loadNativeLiModelWithDevice(deviceKind, cascadeDirOverride) {
|
|
524
|
-
|
|
525
|
-
|
|
604
|
+
const variant = resolveNativeLiVariant();
|
|
605
|
+
const cached = _liModels.get(variant.fp32RegistryKey);
|
|
606
|
+
if (cached?.model) return cached.model;
|
|
607
|
+
if (cached?.promise) return cached.promise;
|
|
526
608
|
|
|
527
|
-
|
|
609
|
+
const promise = (async () => {
|
|
528
610
|
const addon = loadAddon();
|
|
529
611
|
if (!addon?.NativeLateInteractionModel?.loadWithDevice) return null;
|
|
530
612
|
|
|
531
613
|
// See loadNativeEmbeddingModelWithDevice for why this is CUDA-only.
|
|
532
614
|
if (deviceKind === 'cuda') propagateCudaComputeCapToAddonEnv();
|
|
533
615
|
|
|
534
|
-
await fetchModel(
|
|
616
|
+
await fetchModel(variant.fp32RegistryKey);
|
|
535
617
|
|
|
536
|
-
const entry = getModelEntry(
|
|
618
|
+
const entry = getModelEntry(variant.fp32RegistryKey);
|
|
537
619
|
const modelDir = getModelCacheDir(entry.hfId);
|
|
538
620
|
const backbonePath = join(modelDir, 'model.safetensors');
|
|
539
|
-
const projPath = join(modelDir, '1_Dense', 'model.safetensors');
|
|
540
621
|
const configPath = join(modelDir, 'config.json');
|
|
622
|
+
const projAbsPaths = variant.projectionPaths.map((p) => join(modelDir, p));
|
|
541
623
|
|
|
542
|
-
if (!existsSync(backbonePath) || !existsSync(
|
|
624
|
+
if (!existsSync(backbonePath) || !existsSync(configPath)) return null;
|
|
625
|
+
if (!projAbsPaths.every(existsSync)) return null;
|
|
543
626
|
|
|
544
627
|
// CUDA has no cascade — see the matching comment in
|
|
545
628
|
// loadNativeEmbeddingModelWithDevice.
|
|
@@ -550,19 +633,26 @@ export async function loadNativeLiModelWithDevice(deviceKind, cascadeDirOverride
|
|
|
550
633
|
);
|
|
551
634
|
|
|
552
635
|
const t0 = Date.now();
|
|
553
|
-
|
|
636
|
+
const model = addon.NativeLateInteractionModel.loadWithDevice(
|
|
554
637
|
backbonePath,
|
|
555
|
-
|
|
638
|
+
projAbsPaths,
|
|
639
|
+
variant.projectionDims,
|
|
556
640
|
configPath,
|
|
557
641
|
cascadeDir,
|
|
558
642
|
deviceKind,
|
|
559
643
|
);
|
|
560
|
-
console.log(
|
|
644
|
+
console.log(
|
|
645
|
+
`[NativeInference] LI model '${variant.cfgKey}' loaded in ${Date.now() - t0}ms `
|
|
646
|
+
+ `(dim: ${model.dim}, device: ${deviceKind})`,
|
|
647
|
+
);
|
|
561
648
|
|
|
562
|
-
|
|
649
|
+
const slot = _liModels.get(variant.fp32RegistryKey);
|
|
650
|
+
if (slot) slot.model = model;
|
|
651
|
+
return model;
|
|
563
652
|
})();
|
|
564
653
|
|
|
565
|
-
|
|
654
|
+
_liModels.set(variant.fp32RegistryKey, { model: null, promise });
|
|
655
|
+
return promise;
|
|
566
656
|
}
|
|
567
657
|
|
|
568
658
|
// ─── Warmup primitives ───
|
|
@@ -575,10 +665,14 @@ export async function warmupNativeEmbeddingModel() {
|
|
|
575
665
|
}
|
|
576
666
|
|
|
577
667
|
/**
 * Run one warmup forward pass on the LI model of the active variant.
 *
 * Only an already-loaded model for the *active* variant is warmed;
 * models cached for other variants are left alone, and the call is a
 * no-op when nothing is loaded or the model has no `warmupForward`.
 *
 * @returns {Promise<void>}
 */
export async function warmupNativeLiModel() {
  const { cfgKey, fp32RegistryKey } = resolveNativeLiVariant();
  const liModel = _liModels.get(fp32RegistryKey)?.model;
  if (!liModel?.warmupForward) {
    return;
  }

  const startedAt = Date.now();
  await liModel.warmupForward();
  console.log(`[NativeInference] LI warmup forward (${cfgKey}) in ${Date.now() - startedAt}ms`);
}
|
|
583
677
|
|
|
584
678
|
// ─── Cleanup ───
|
|
@@ -586,12 +680,10 @@ export async function warmupNativeLiModel() {
|
|
|
586
680
|
export function unloadNativeModels() {
|
|
587
681
|
_embeddingModel = null;
|
|
588
682
|
_embeddingModelLoadPromise = null;
|
|
589
|
-
|
|
590
|
-
_liModelLoadPromise = null;
|
|
683
|
+
_liModels.clear();
|
|
591
684
|
_embTokenizer = null;
|
|
592
685
|
_embTokenizerLoadPromise = null;
|
|
593
|
-
|
|
594
|
-
_liTokenizerLoadPromise = null;
|
|
686
|
+
_liTokenizers.clear();
|
|
595
687
|
_addon = null;
|
|
596
688
|
_available = null;
|
|
597
689
|
_coremlCascadeLogged = false;
|
|
@@ -69,6 +69,28 @@ const BOUNDARY_TYPES = new Set([
|
|
|
69
69
|
'class_specifier', 'namespace_definition',
|
|
70
70
|
]);
|
|
71
71
|
|
|
72
|
+
// Node types treated as the "body" of a declaration. extractSignature()
// looks for the first child of one of these types; everything in the
// source before that child (decorators, name, parameters, return type)
// is the signature, and the body itself is excluded.
const BODY_TYPES = new Set([
  // Block-style bodies of functions and classes.
  'block',
  'statement_block',
  'class_body',
  'function_body',
  // Compound statements and member lists.
  'compound_statement',
  'field_declaration_list',
  // PHP-specific entry kept from the original list.
  'compound_statement_php',
  // Enum and interface bodies.
  'enum_class_body',
  'enum_body',
  'interface_body',
  // Declaration lists (e.g. Rust impl/trait bodies).
  'declaration_list',
]);

// Upper bound, in characters, on a signature once its whitespace has
// been normalized. Anything longer is cut and suffixed with `…`.
const MAX_SIGNATURE_LENGTH = 200;
|
|
93
|
+
|
|
72
94
|
// Map tree-sitter node type -> our chunk type label
|
|
73
95
|
const NODE_TYPE_MAP = {
|
|
74
96
|
'function_declaration': 'function',
|
|
@@ -410,12 +432,23 @@ export class TreeSitterProvider {
|
|
|
410
432
|
/**
|
|
411
433
|
* Parse file content into semantic chunks using the cAST recursive algorithm.
|
|
412
434
|
* Returns array of chunk objects or null if tree-sitter can't handle it.
|
|
435
|
+
*
|
|
436
|
+
* Header-aware budget (research-only ablation, May 2026): set
|
|
437
|
+
* SWEET_SEARCH_CHUNK_HEADER_OVERHEAD=N to subtract N chars from the
|
|
438
|
+
* cAST max chunk size, leaving room for the embedding-text headers
|
|
439
|
+
* (path / parent / symbol / language ≈ 50–100 chars) without spilling
|
|
440
|
+
* past the embedding cap. Default 0 = byte-identical to shipped. The
|
|
441
|
+
* audit motivating this lever lives in eval/results/chunk-overflow-audit.md.
|
|
413
442
|
*/
|
|
414
443
|
async parseFileToChunks(content, languageId, options = {}) {
|
|
415
444
|
const tree = await this.parse(content, languageId);
|
|
416
445
|
if (!tree) return null;
|
|
417
446
|
|
|
418
|
-
const
|
|
447
|
+
const headerOverhead = (() => {
|
|
448
|
+
const v = parseInt(process.env.SWEET_SEARCH_CHUNK_HEADER_OVERHEAD || '', 10);
|
|
449
|
+
return Number.isFinite(v) && v >= 0 ? v : 0;
|
|
450
|
+
})();
|
|
451
|
+
const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
|
|
419
452
|
this._chunkCounter = 0;
|
|
420
453
|
|
|
421
454
|
const children = this._getChildren(tree.rootNode);
|
|
@@ -467,6 +500,7 @@ export class TreeSitterProvider {
|
|
|
467
500
|
const firstBoundary = buffer.find(n => BOUNDARY_TYPES.has(n.type));
|
|
468
501
|
const name = firstBoundary ? this._extractNodeName(firstBoundary) : null;
|
|
469
502
|
const type = firstBoundary ? (NODE_TYPE_MAP[firstBoundary.type] || 'code') : 'code';
|
|
503
|
+
const signature = firstBoundary ? this._extractSignature(firstBoundary, content) : null;
|
|
470
504
|
|
|
471
505
|
chunks.push({
|
|
472
506
|
chunkId: this._nextChunkId(),
|
|
@@ -478,6 +512,7 @@ export class TreeSitterProvider {
|
|
|
478
512
|
endLine: buffer[buffer.length - 1].endPosition.row,
|
|
479
513
|
type,
|
|
480
514
|
name: name || (buffer.length === 1 ? null : null),
|
|
515
|
+
signature,
|
|
481
516
|
});
|
|
482
517
|
}
|
|
483
518
|
buffer = [];
|
|
@@ -536,6 +571,7 @@ export class TreeSitterProvider {
|
|
|
536
571
|
endLine: node.endPosition.row,
|
|
537
572
|
type: NODE_TYPE_MAP[node.type] || 'code',
|
|
538
573
|
name: this._extractNodeName(node),
|
|
574
|
+
signature: this._extractSignature(node, content),
|
|
539
575
|
});
|
|
540
576
|
}
|
|
541
577
|
}
|
|
@@ -546,6 +582,60 @@ export class TreeSitterProvider {
|
|
|
546
582
|
return chunks;
|
|
547
583
|
}
|
|
548
584
|
|
|
585
|
+
/**
|
|
586
|
+
* Extract a compact, single-line signature for a boundary AST node.
|
|
587
|
+
*
|
|
588
|
+
* Strategy: find the first body-like child (block / statement_block /
|
|
589
|
+
* compound_statement / class_body / declaration_list / …), and return
|
|
590
|
+
* the source span [node.startIndex, body.startIndex) with whitespace
|
|
591
|
+
* normalized to single spaces. If no body child is found (e.g.
|
|
592
|
+
* declarations without a body, abstract methods, interface members),
|
|
593
|
+
* return the full first line of the node.
|
|
594
|
+
*
|
|
595
|
+
* Returns null when the node has no children to inspect.
|
|
596
|
+
*
|
|
597
|
+
* Used by the `signature` R1 embedding-text variant. Intentionally
|
|
598
|
+
* does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
|
|
599
|
+
* surface is research-only on `embedding_text`.
|
|
600
|
+
*/
|
|
601
|
+
_extractSignature(node, content) {
|
|
602
|
+
if (!node || !content) return null;
|
|
603
|
+
if (!BOUNDARY_TYPES.has(node.type)) return null;
|
|
604
|
+
|
|
605
|
+
let bodyStart = null;
|
|
606
|
+
// Try field-name lookup first (works for most modern grammars).
|
|
607
|
+
const bodyField = node.childForFieldName?.('body');
|
|
608
|
+
if (bodyField && BODY_TYPES.has(bodyField.type)) {
|
|
609
|
+
bodyStart = bodyField.startIndex;
|
|
610
|
+
} else {
|
|
611
|
+
// Fall back to scanning children for a body-shaped child.
|
|
612
|
+
for (let i = 0; i < node.childCount; i++) {
|
|
613
|
+
const child = node.child(i);
|
|
614
|
+
if (BODY_TYPES.has(child.type)) {
|
|
615
|
+
bodyStart = child.startIndex;
|
|
616
|
+
break;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
let raw;
|
|
622
|
+
if (bodyStart != null && bodyStart > node.startIndex) {
|
|
623
|
+
raw = content.substring(node.startIndex, bodyStart);
|
|
624
|
+
} else {
|
|
625
|
+
// No body found — declaration only (e.g. abstract method, type
|
|
626
|
+
// alias). Take the whole node text.
|
|
627
|
+
raw = content.substring(node.startIndex, node.endIndex);
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
// Normalize: collapse runs of whitespace (including newlines) to a
|
|
631
|
+
// single space, drop leading/trailing whitespace.
|
|
632
|
+
const normalized = raw.replace(/\s+/g, ' ').trim();
|
|
633
|
+
if (!normalized) return null;
|
|
634
|
+
|
|
635
|
+
if (normalized.length <= MAX_SIGNATURE_LENGTH) return normalized;
|
|
636
|
+
return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…';
|
|
637
|
+
}
|
|
638
|
+
|
|
549
639
|
/** Extract symbol name from an AST node */
|
|
550
640
|
_extractNodeName(node) {
|
|
551
641
|
// Try field name first (most reliable)
|
|
@@ -662,4 +752,4 @@ export function resetTreeSitterProvider() {
|
|
|
662
752
|
}
|
|
663
753
|
|
|
664
754
|
// Re-export constants for testing
|
|
665
|
-
export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
|
|
755
|
+
export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, BODY_TYPES, MAX_SIGNATURE_LENGTH, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
|
|
@@ -121,11 +121,15 @@ function partitionByTokenAvailability(candidates, liIndex) {
|
|
|
121
121
|
if (!liIndex) {
|
|
122
122
|
return { withTokens: [], withoutTokens: [...candidates] };
|
|
123
123
|
}
|
|
124
|
-
|
|
124
|
+
// Graph-expanded candidates have entity_id-based public ids that don't
|
|
125
|
+
// match LI-indexed chunk ids; they carry the resolved chunk id under
|
|
126
|
+
// _liChunkId. Honour it so expanded candidates can participate in MaxSim.
|
|
127
|
+
const lookupId = (c) => c._liChunkId || c.id || c.entity_id;
|
|
128
|
+
const available = liIndex.hasTokens(candidates.map(lookupId));
|
|
125
129
|
const withTokens = [];
|
|
126
130
|
const withoutTokens = [];
|
|
127
131
|
for (const c of candidates) {
|
|
128
|
-
(available.has(c
|
|
132
|
+
(available.has(lookupId(c)) ? withTokens : withoutTokens).push(c);
|
|
129
133
|
}
|
|
130
134
|
return { withTokens, withoutTokens };
|
|
131
135
|
}
|