sweet-search 2.4.2 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/core/cli.js +19 -5
  2. package/core/embedding/embedding-cache.js +177 -15
  3. package/core/embedding/embedding-service.js +18 -4
  4. package/core/graph/graph-expansion.js +52 -12
  5. package/core/graph/graph-extractor.js +30 -1
  6. package/core/indexing/ast-chunker.js +331 -16
  7. package/core/indexing/chunking/chunk-builder.js +34 -1
  8. package/core/indexing/index.js +6 -3
  9. package/core/indexing/indexer-ann.js +45 -6
  10. package/core/indexing/indexer-build.js +9 -1
  11. package/core/indexing/indexer-phases.js +6 -4
  12. package/core/indexing/indexing-file-policy.js +140 -0
  13. package/core/indexing/li-skip-policy.js +11 -220
  14. package/core/infrastructure/codebase-repository.js +21 -0
  15. package/core/infrastructure/config/embedding.js +20 -1
  16. package/core/infrastructure/config/graph.js +2 -2
  17. package/core/infrastructure/config/ranking.js +10 -0
  18. package/core/infrastructure/config/vector-store.js +1 -1
  19. package/core/infrastructure/coreml-cascade.js +236 -30
  20. package/core/infrastructure/coreml-cascade.json +25 -0
  21. package/core/infrastructure/index.js +15 -0
  22. package/core/infrastructure/init-config.js +78 -0
  23. package/core/infrastructure/language-patterns/registry-core.js +18 -0
  24. package/core/infrastructure/model-registry.js +12 -0
  25. package/core/infrastructure/native-inference.js +143 -51
  26. package/core/infrastructure/tree-sitter-provider.js +92 -2
  27. package/core/ranking/cascaded-scorer.js +6 -2
  28. package/core/ranking/file-kind-ranking.js +264 -0
  29. package/core/ranking/late-interaction-index.js +10 -4
  30. package/core/ranking/late-interaction-policy.js +304 -0
  31. package/core/search/context-expander.js +267 -28
  32. package/core/search/index.js +4 -0
  33. package/core/search/search-cli.js +3 -1
  34. package/core/search/search-pattern.js +4 -3
  35. package/core/search/search-postprocess.js +189 -8
  36. package/core/search/search-read-semantic.js +717 -0
  37. package/core/search/search-read.js +481 -0
  38. package/core/search/search-server.js +6 -4
  39. package/core/search/sweet-search.js +119 -15
  40. package/mcp/server.js +41 -0
  41. package/mcp/tool-handlers.js +117 -6
  42. package/package.json +9 -7
  43. package/scripts/init.js +386 -5
@@ -103,6 +103,18 @@ export const MODEL_REGISTRY = {
103
103
  ],
104
104
  },
105
105
 
106
+ 'lateon-code-edge-fp32': {
107
+ hfId: 'lightonai/LateOn-Code-edge',
108
+ profile: 'full',
109
+ description: 'Late interaction edge model (FP32 safetensors, backbone 256d, 2-stage projection) for native inference',
110
+ files: [
111
+ { path: 'model.safetensors', sizeBytes: 67195976, sha256: '7ffc36b8ff71367249cd5220dbdd4bdbe177bc0e305b2e978a8b598bd8296f04' },
112
+ { path: '1_Dense/model.safetensors', sizeBytes: 524376, sha256: '9efb17fcb2106cd8fcb01d57a9cd9c997a487ad20630ec8e44ce3f9d89efe0a7' },
113
+ { path: '2_Dense/model.safetensors', sizeBytes: 98392, sha256: 'a7a388138b3c4bb1a81c8c3bcb9de123f1e652b9e9464a72707ca19ee86a26b1' },
114
+ { path: 'config.json', sizeBytes: 1252, sha256: null },
115
+ ],
116
+ },
117
+
106
118
  'ms-marco-tinybert': {
107
119
  hfId: 'Xenova/ms-marco-TinyBERT-L-2-v2',
108
120
  profile: 'full',
@@ -55,6 +55,7 @@ import { getModelCacheDir, fetchModel } from './model-fetcher.js';
55
55
  import { getModelEntry } from './model-registry.js';
56
56
  import { getCoremlCascadeResolvedDirs } from './coreml-cascade.js';
57
57
  import { detectHardwareCapability } from './hardware-capability.js';
58
+ import { LATE_INTERACTION_CONFIG } from './config/ranking.js';
58
59
 
59
60
  const require = createRequire(import.meta.url);
60
61
 
@@ -63,12 +64,21 @@ const require = createRequire(import.meta.url);
63
64
  let _addon = null;
64
65
  let _embeddingModel = null;
65
66
  let _embeddingModelLoadPromise = null; // race-gate for concurrent first calls
66
- let _liModel = null;
67
- let _liModelLoadPromise = null;
67
+ // Per-variant LI model cache. Keyed by FP32 registry key
68
+ // ('lateon-code-fp32' or 'lateon-code-edge-fp32') so a variant swap
69
+ // inside a single process (e.g. ORT-eval session followed by native
70
+ // indexing) doesn't return a stale model. Each entry is
71
+ // `{ model, promise }` where `promise` race-gates concurrent first
72
+ // calls and `model` becomes non-null on resolution.
73
+ const _liModels = new Map();
68
74
  let _embTokenizer = null;
69
75
  let _embTokenizerLoadPromise = null;
70
- let _liTokenizer = null;
71
- let _liTokenizerLoadPromise = null;
76
+ // Per-variant LI tokenizer cache. Keyed by tokenizer source key
77
+ // (matches the ORT-side registry key — `lateon-code` /
78
+ // `lateon-code-edge`). Standard and edge tokenizer.json files are
79
+ // byte-identical today but per-variant resolution is correct and
80
+ // future-proof.
81
+ const _liTokenizers = new Map();
72
82
  let _available = null;
73
83
  let _coremlCascadeLogged = false;
74
84
 
@@ -123,12 +133,17 @@ function propagateCudaComputeCapToAddonEnv() {
123
133
  * Logged exactly once per process so a mis-configured cascade surfaces
124
134
  * at startup instead of silently falling through on every call.
125
135
  *
136
+ * Routes the LI cascade dir to `coreml-cascade/li/` (standard) or
137
+ * `coreml-cascade/li-edge/` (edge) based on the active variant in
138
+ * `LATE_INTERACTION_CONFIG`. The embed cascade is shared.
139
+ *
126
140
  * Always returns an object — never throws. The returned dirs can be
127
141
  * `null`, which the Rust addon treats as "CoreML path disabled" and
128
142
  * falls back to candle unconditionally.
129
143
  */
130
144
  function resolveCoremlCascadeForAddon() {
131
- const resolved = getCoremlCascadeResolvedDirs();
145
+ const liVariantKey = LATE_INTERACTION_CONFIG.model;
146
+ const resolved = getCoremlCascadeResolvedDirs(liVariantKey);
132
147
  if (!_coremlCascadeLogged) {
133
148
  _coremlCascadeLogged = true;
134
149
  const hw = detectHardwareCapability();
@@ -137,7 +152,7 @@ function resolveCoremlCascadeForAddon() {
137
152
  if (resolved.embedDir || resolved.liDir) {
138
153
  process.stderr.write(
139
154
  `[NativeInference] CoreML cascade: ${resolved.status}` +
140
- ` (embed=${resolved.embedDir ? 'yes' : 'no'}, li=${resolved.liDir ? 'yes' : 'no'},` +
155
+ ` (embed=${resolved.embedDir ? 'yes' : 'no'}, li=${resolved.liDir ? 'yes' : 'no'} [${liVariantKey}],` +
141
156
  ` chip=${hw.brandString || 'unknown'})\n`
142
157
  );
143
158
  } else if (hw.coremlCascadeEligible) {
@@ -327,57 +342,117 @@ export async function nativeEmbed(texts, options = {}) {
327
342
  // ─── Late Interaction Model ───
328
343
 
329
344
  /**
330
- * Load the native LI model (LateOn-Code FP32 safetensors + projection).
331
- * Returns the model instance or null if unavailable. Race-gated.
345
+ * Resolve the active LI variant from `LATE_INTERACTION_CONFIG`. Returns
346
+ * the manifest the native loaders need (registry keys + projection
347
+ * paths and dims). Pure helper — no I/O, no caching.
348
+ *
349
+ * Throws if `LATE_INTERACTION_CONFIG.activeModel` is unset; the only
350
+ * fallback is `nativeRegistryKey`, which defaults to `${cfgKey}-fp32`.
332
351
  */
333
- export async function getNativeLiModel() {
334
- if (_liModel) return _liModel;
335
- if (_liModelLoadPromise) return _liModelLoadPromise;
336
- _liModelLoadPromise = (async () => {
352
+ export function resolveNativeLiVariant() {
353
+ const cfg = LATE_INTERACTION_CONFIG.activeModel;
354
+ const cfgKey = LATE_INTERACTION_CONFIG.model;
355
+ if (!cfg) {
356
+ throw new Error(
357
+ `[NativeInference] LATE_INTERACTION_CONFIG.model='${cfgKey}' is not a known variant`,
358
+ );
359
+ }
360
+ const fp32RegistryKey = cfg.nativeRegistryKey || `${cfgKey}-fp32`;
361
+ return {
362
+ cfgKey, // 'lateon-code' | 'lateon-code-edge'
363
+ fp32RegistryKey, // 'lateon-code-fp32' | 'lateon-code-edge-fp32'
364
+ tokenizerKey: cfgKey, // tokenizer lives next to the ORT model
365
+ projectionPaths: cfg.projectionPaths, // ['1_Dense/...'] | ['1_Dense/...', '2_Dense/...']
366
+ projectionDims: cfg.projectionDims, // [128] | [512, 48]
367
+ tokenDimension: cfg.tokenDimension, // 128 | 48
368
+ };
369
+ }
370
+
371
+ /**
372
+ * Internal: load the native LI model for a specific variant on the
373
+ * default device. Race-gated per variant via the `_liModels` Map so
374
+ * concurrent first callers share one load. Returns null if the addon
375
+ * isn't available or required files are missing.
376
+ */
377
+ async function loadNativeLiVariantOnDefaultDevice(variant) {
378
+ const cached = _liModels.get(variant.fp32RegistryKey);
379
+ if (cached?.model) return cached.model;
380
+ if (cached?.promise) return cached.promise;
381
+
382
+ const promise = (async () => {
337
383
  const addon = loadAddon();
338
384
  if (!addon?.NativeLateInteractionModel) return null;
339
385
 
340
- await fetchModel('lateon-code-fp32');
386
+ await fetchModel(variant.fp32RegistryKey);
341
387
 
342
- const entry = getModelEntry('lateon-code-fp32');
388
+ const entry = getModelEntry(variant.fp32RegistryKey);
343
389
  const modelDir = getModelCacheDir(entry.hfId);
344
390
  const backbonePath = join(modelDir, 'model.safetensors');
345
- const projPath = join(modelDir, '1_Dense', 'model.safetensors');
346
391
  const configPath = join(modelDir, 'config.json');
392
+ const projAbsPaths = variant.projectionPaths.map((p) => join(modelDir, p));
347
393
 
348
- if (!existsSync(backbonePath) || !existsSync(projPath) || !existsSync(configPath)) return null;
394
+ if (!existsSync(backbonePath) || !existsSync(configPath)) return null;
395
+ if (!projAbsPaths.every(existsSync)) return null;
349
396
 
350
397
  // Resolve the CoreML cascade dir for ModernBERT LI. Same contract
351
- // as the embedding model above — see that comment.
398
+ // as the embedding model above — see that comment. The dir
399
+ // depends on the active variant (`coreml-cascade/li/` vs
400
+ // `coreml-cascade/li-edge/`).
352
401
  const cascade = resolveCoremlCascadeForAddon();
353
402
 
354
403
  const t0 = Date.now();
355
- _liModel = addon.NativeLateInteractionModel.load(
404
+ const model = addon.NativeLateInteractionModel.load(
356
405
  backbonePath,
357
- projPath,
406
+ projAbsPaths,
407
+ variant.projectionDims,
358
408
  configPath,
359
409
  cascade.liDir || undefined,
360
410
  );
361
- console.log(`[NativeInference] LI model loaded in ${Date.now() - t0}ms (dim: ${_liModel.dim}, device: ${addon.nativeInferenceDevice()})`);
411
+ console.log(
412
+ `[NativeInference] LI model '${variant.cfgKey}' loaded in ${Date.now() - t0}ms `
413
+ + `(dim: ${model.dim}, device: ${addon.nativeInferenceDevice()})`,
414
+ );
362
415
 
363
- return _liModel;
416
+ const slot = _liModels.get(variant.fp32RegistryKey);
417
+ if (slot) slot.model = model;
418
+ return model;
364
419
  })();
365
- return _liModelLoadPromise;
420
+
421
+ _liModels.set(variant.fp32RegistryKey, { model: null, promise });
422
+ return promise;
423
+ }
424
+
425
+ /**
426
+ * Load the native LI model for the currently-configured variant.
427
+ * Returns the model instance or null if unavailable. Race-gated per
428
+ * variant.
429
+ */
430
+ export async function getNativeLiModel() {
431
+ const variant = resolveNativeLiVariant();
432
+ return loadNativeLiVariantOnDefaultDevice(variant);
366
433
  }
367
434
 
368
435
  /**
369
- * Get or create the LI tokenizer. Race-gated.
436
+ * Get or create the LI tokenizer for the currently-configured variant.
437
+ * Race-gated per variant via the `_liTokenizers` Map.
370
438
  */
371
439
  async function getLiTokenizer() {
372
- if (_liTokenizer) return _liTokenizer;
373
- if (_liTokenizerLoadPromise) return _liTokenizerLoadPromise;
374
- _liTokenizerLoadPromise = (async () => {
375
- const entry = getModelEntry('lateon-code');
440
+ const variant = resolveNativeLiVariant();
441
+ const cached = _liTokenizers.get(variant.tokenizerKey);
442
+ if (cached?.tokenizer) return cached.tokenizer;
443
+ if (cached?.promise) return cached.promise;
444
+
445
+ const promise = (async () => {
446
+ const entry = getModelEntry(variant.tokenizerKey);
376
447
  const tokenizerPath = join(getModelCacheDir(entry.hfId), 'tokenizer.json');
377
- _liTokenizer = await createTokenizer(tokenizerPath);
378
- return _liTokenizer;
448
+ const tokenizer = await createTokenizer(tokenizerPath);
449
+ const slot = _liTokenizers.get(variant.tokenizerKey);
450
+ if (slot) slot.tokenizer = tokenizer;
451
+ return tokenizer;
379
452
  })();
380
- return _liTokenizerLoadPromise;
453
+
454
+ _liTokenizers.set(variant.tokenizerKey, { tokenizer: null, promise });
455
+ return promise;
381
456
  }
382
457
 
383
458
  /**
@@ -457,7 +532,11 @@ export function isNativeEmbeddingModelLoaded() {
457
532
  }
458
533
 
459
534
  export function isNativeLiModelLoaded() {
460
- return _liModel != null;
535
+ // True only when the *active* variant is loaded — a stale standard
536
+ // model lingering after a config swap to edge would otherwise
537
+ // mask the fact that edge encoding still has to load.
538
+ const variant = resolveNativeLiVariant();
539
+ return _liModels.get(variant.fp32RegistryKey)?.model != null;
461
540
  }
462
541
 
463
542
  // ─── Device-explicit loading ───
@@ -518,28 +597,32 @@ export async function loadNativeEmbeddingModelWithDevice(deviceKind, cascadeDirO
518
597
  }
519
598
 
520
599
  /**
521
- * Load the native LI model on a specific device.
600
+ * Load the native LI model on a specific device for the
601
+ * currently-configured variant. Race-gated per variant.
522
602
  */
523
603
  export async function loadNativeLiModelWithDevice(deviceKind, cascadeDirOverride) {
524
- if (_liModel) return _liModel;
525
- if (_liModelLoadPromise) return _liModelLoadPromise;
604
+ const variant = resolveNativeLiVariant();
605
+ const cached = _liModels.get(variant.fp32RegistryKey);
606
+ if (cached?.model) return cached.model;
607
+ if (cached?.promise) return cached.promise;
526
608
 
527
- _liModelLoadPromise = (async () => {
609
+ const promise = (async () => {
528
610
  const addon = loadAddon();
529
611
  if (!addon?.NativeLateInteractionModel?.loadWithDevice) return null;
530
612
 
531
613
  // See loadNativeEmbeddingModelWithDevice for why this is CUDA-only.
532
614
  if (deviceKind === 'cuda') propagateCudaComputeCapToAddonEnv();
533
615
 
534
- await fetchModel('lateon-code-fp32');
616
+ await fetchModel(variant.fp32RegistryKey);
535
617
 
536
- const entry = getModelEntry('lateon-code-fp32');
618
+ const entry = getModelEntry(variant.fp32RegistryKey);
537
619
  const modelDir = getModelCacheDir(entry.hfId);
538
620
  const backbonePath = join(modelDir, 'model.safetensors');
539
- const projPath = join(modelDir, '1_Dense', 'model.safetensors');
540
621
  const configPath = join(modelDir, 'config.json');
622
+ const projAbsPaths = variant.projectionPaths.map((p) => join(modelDir, p));
541
623
 
542
- if (!existsSync(backbonePath) || !existsSync(projPath) || !existsSync(configPath)) return null;
624
+ if (!existsSync(backbonePath) || !existsSync(configPath)) return null;
625
+ if (!projAbsPaths.every(existsSync)) return null;
543
626
 
544
627
  // CUDA has no cascade — see the matching comment in
545
628
  // loadNativeEmbeddingModelWithDevice.
@@ -550,19 +633,26 @@ export async function loadNativeLiModelWithDevice(deviceKind, cascadeDirOverride
550
633
  );
551
634
 
552
635
  const t0 = Date.now();
553
- _liModel = addon.NativeLateInteractionModel.loadWithDevice(
636
+ const model = addon.NativeLateInteractionModel.loadWithDevice(
554
637
  backbonePath,
555
- projPath,
638
+ projAbsPaths,
639
+ variant.projectionDims,
556
640
  configPath,
557
641
  cascadeDir,
558
642
  deviceKind,
559
643
  );
560
- console.log(`[NativeInference] LI model loaded in ${Date.now() - t0}ms (dim: ${_liModel.dim}, device: ${deviceKind})`);
644
+ console.log(
645
+ `[NativeInference] LI model '${variant.cfgKey}' loaded in ${Date.now() - t0}ms `
646
+ + `(dim: ${model.dim}, device: ${deviceKind})`,
647
+ );
561
648
 
562
- return _liModel;
649
+ const slot = _liModels.get(variant.fp32RegistryKey);
650
+ if (slot) slot.model = model;
651
+ return model;
563
652
  })();
564
653
 
565
- return _liModelLoadPromise;
654
+ _liModels.set(variant.fp32RegistryKey, { model: null, promise });
655
+ return promise;
566
656
  }
567
657
 
568
658
  // ─── Warmup primitives ───
@@ -575,10 +665,14 @@ export async function warmupNativeEmbeddingModel() {
575
665
  }
576
666
 
577
667
  export async function warmupNativeLiModel() {
578
- if (!_liModel?.warmupForward) return;
668
+ // Warm up only the *active* variant — warming up an unused stale
669
+ // variant would be wasted Metal queue time.
670
+ const variant = resolveNativeLiVariant();
671
+ const model = _liModels.get(variant.fp32RegistryKey)?.model;
672
+ if (!model?.warmupForward) return;
579
673
  const t0 = Date.now();
580
- await _liModel.warmupForward();
581
- console.log(`[NativeInference] LI warmup forward in ${Date.now() - t0}ms`);
674
+ await model.warmupForward();
675
+ console.log(`[NativeInference] LI warmup forward (${variant.cfgKey}) in ${Date.now() - t0}ms`);
582
676
  }
583
677
 
584
678
  // ─── Cleanup ───
@@ -586,12 +680,10 @@ export async function warmupNativeLiModel() {
586
680
  export function unloadNativeModels() {
587
681
  _embeddingModel = null;
588
682
  _embeddingModelLoadPromise = null;
589
- _liModel = null;
590
- _liModelLoadPromise = null;
683
+ _liModels.clear();
591
684
  _embTokenizer = null;
592
685
  _embTokenizerLoadPromise = null;
593
- _liTokenizer = null;
594
- _liTokenizerLoadPromise = null;
686
+ _liTokenizers.clear();
595
687
  _addon = null;
596
688
  _available = null;
597
689
  _coremlCascadeLogged = false;
@@ -69,6 +69,28 @@ const BOUNDARY_TYPES = new Set([
69
69
  'class_specifier', 'namespace_definition',
70
70
  ]);
71
71
 
72
+ // AST node types that represent function/class bodies. Used by
73
+ // extractSignature() to find where the declaration's body starts so
74
+ // the signature span is everything before it (decorators + name +
75
+ // parameters + return type, excluding body).
76
+ const BODY_TYPES = new Set([
77
+ // JS/TS, Java, Go, Rust, Kotlin, Swift, C#, Ruby (sometimes)
78
+ 'block', 'statement_block', 'class_body', 'function_body',
79
+ // C / C++ — function bodies and struct/class member lists
80
+ 'compound_statement', 'field_declaration_list',
81
+ // Python uses `block` (already covered) but `:` precedes it
82
+ // PHP — function/method body
83
+ 'compound_statement_php',
84
+ // Swift / Kotlin — sometimes labelled differently
85
+ 'enum_class_body', 'enum_body', 'interface_body',
86
+ // Rust impl/trait bodies
87
+ 'declaration_list',
88
+ ]);
89
+
90
+ // Maximum signature length (chars) after whitespace normalization.
91
+ // Signatures longer than this get truncated with `…`.
92
+ const MAX_SIGNATURE_LENGTH = 200;
93
+
72
94
  // Map tree-sitter node type -> our chunk type label
73
95
  const NODE_TYPE_MAP = {
74
96
  'function_declaration': 'function',
@@ -410,12 +432,23 @@ export class TreeSitterProvider {
410
432
  /**
411
433
  * Parse file content into semantic chunks using the cAST recursive algorithm.
412
434
  * Returns array of chunk objects or null if tree-sitter can't handle it.
435
+ *
436
+ * Header-aware budget (research-only ablation, May 2026): set
437
+ * SWEET_SEARCH_CHUNK_HEADER_OVERHEAD=N to subtract N chars from the
438
+ * cAST max chunk size, leaving room for the embedding-text headers
439
+ * (path / parent / symbol / language ≈ 50–100 chars) without spilling
440
+ * past the embedding cap. Default 0 = byte-identical to shipped. The
441
+ * audit motivating this lever lives in eval/results/chunk-overflow-audit.md.
413
442
  */
414
443
  async parseFileToChunks(content, languageId, options = {}) {
415
444
  const tree = await this.parse(content, languageId);
416
445
  if (!tree) return null;
417
446
 
418
- const maxChunkSize = options.maxChunkSize || 2000;
447
+ const headerOverhead = (() => {
448
+ const v = parseInt(process.env.SWEET_SEARCH_CHUNK_HEADER_OVERHEAD || '', 10);
449
+ return Number.isFinite(v) && v >= 0 ? v : 0;
450
+ })();
451
+ const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
419
452
  this._chunkCounter = 0;
420
453
 
421
454
  const children = this._getChildren(tree.rootNode);
@@ -467,6 +500,7 @@ export class TreeSitterProvider {
467
500
  const firstBoundary = buffer.find(n => BOUNDARY_TYPES.has(n.type));
468
501
  const name = firstBoundary ? this._extractNodeName(firstBoundary) : null;
469
502
  const type = firstBoundary ? (NODE_TYPE_MAP[firstBoundary.type] || 'code') : 'code';
503
+ const signature = firstBoundary ? this._extractSignature(firstBoundary, content) : null;
470
504
 
471
505
  chunks.push({
472
506
  chunkId: this._nextChunkId(),
@@ -478,6 +512,7 @@ export class TreeSitterProvider {
478
512
  endLine: buffer[buffer.length - 1].endPosition.row,
479
513
  type,
480
514
  name: name || (buffer.length === 1 ? null : null),
515
+ signature,
481
516
  });
482
517
  }
483
518
  buffer = [];
@@ -536,6 +571,7 @@ export class TreeSitterProvider {
536
571
  endLine: node.endPosition.row,
537
572
  type: NODE_TYPE_MAP[node.type] || 'code',
538
573
  name: this._extractNodeName(node),
574
+ signature: this._extractSignature(node, content),
539
575
  });
540
576
  }
541
577
  }
@@ -546,6 +582,60 @@ export class TreeSitterProvider {
546
582
  return chunks;
547
583
  }
548
584
 
585
+ /**
586
+ * Extract a compact, single-line signature for a boundary AST node.
587
+ *
588
+ * Strategy: find the first body-like child (block / statement_block /
589
+ * compound_statement / class_body / declaration_list / …), and return
590
+ * the source span [node.startIndex, body.startIndex) with whitespace
591
+ * normalized to single spaces. If no body child is found (e.g.
592
+ * declarations without a body, abstract methods, interface members),
593
+ * return the whole node text, capped at MAX_SIGNATURE_LENGTH chars.
594
+ *
595
+ * Returns null for missing input, a non-boundary node, or an empty span.
596
+ *
597
+ * Used by the `signature` R1 embedding-text variant. Intentionally
598
+ * does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
599
+ * surface is research-only on `embedding_text`.
600
+ */
601
+ _extractSignature(node, content) {
602
+ if (!node || !content) return null;
603
+ if (!BOUNDARY_TYPES.has(node.type)) return null;
604
+
605
+ let bodyStart = null;
606
+ // Try field-name lookup first (works for most modern grammars).
607
+ const bodyField = node.childForFieldName?.('body');
608
+ if (bodyField && BODY_TYPES.has(bodyField.type)) {
609
+ bodyStart = bodyField.startIndex;
610
+ } else {
611
+ // Fall back to scanning children for a body-shaped child.
612
+ for (let i = 0; i < node.childCount; i++) {
613
+ const child = node.child(i);
614
+ if (BODY_TYPES.has(child.type)) {
615
+ bodyStart = child.startIndex;
616
+ break;
617
+ }
618
+ }
619
+ }
620
+
621
+ let raw;
622
+ if (bodyStart != null && bodyStart > node.startIndex) {
623
+ raw = content.substring(node.startIndex, bodyStart);
624
+ } else {
625
+ // No body found — declaration only (e.g. abstract method, type
626
+ // alias). Take the whole node text.
627
+ raw = content.substring(node.startIndex, node.endIndex);
628
+ }
629
+
630
+ // Normalize: collapse runs of whitespace (including newlines) to a
631
+ // single space, drop leading/trailing whitespace.
632
+ const normalized = raw.replace(/\s+/g, ' ').trim();
633
+ if (!normalized) return null;
634
+
635
+ if (normalized.length <= MAX_SIGNATURE_LENGTH) return normalized;
636
+ return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…';
637
+ }
638
+
549
639
  /** Extract symbol name from an AST node */
550
640
  _extractNodeName(node) {
551
641
  // Try field name first (most reliable)
@@ -662,4 +752,4 @@ export function resetTreeSitterProvider() {
662
752
  }
663
753
 
664
754
  // Re-export constants for testing
665
- export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
755
+ export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, BODY_TYPES, MAX_SIGNATURE_LENGTH, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
@@ -121,11 +121,15 @@ function partitionByTokenAvailability(candidates, liIndex) {
121
121
  if (!liIndex) {
122
122
  return { withTokens: [], withoutTokens: [...candidates] };
123
123
  }
124
- const available = liIndex.hasTokens(candidates.map(c => c.id || c.entity_id));
124
+ // Graph-expanded candidates have entity_id-based public ids that don't
125
+ // match LI-indexed chunk ids; they carry the resolved chunk id under
126
+ // _liChunkId. Honour it so expanded candidates can participate in MaxSim.
127
+ const lookupId = (c) => c._liChunkId || c.id || c.entity_id;
128
+ const available = liIndex.hasTokens(candidates.map(lookupId));
125
129
  const withTokens = [];
126
130
  const withoutTokens = [];
127
131
  for (const c of candidates) {
128
- (available.has(c.id || c.entity_id) ? withTokens : withoutTokens).push(c);
132
+ (available.has(lookupId(c)) ? withTokens : withoutTokens).push(c);
129
133
  }
130
134
  return { withTokens, withoutTokens };
131
135
  }