sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,66 @@
1
+ import path from 'path';
2
+ import { DB_PATHS } from '../infrastructure/config/index.js';
3
+ import {
4
+ buildNextManifest,
5
+ readManifest,
6
+ writeManifest,
7
+ zeroManifest,
8
+ } from '../incremental-indexing/infrastructure/manifest.mjs';
9
+ import { FALLBACK_WEIGHTS_ID } from '../incremental-indexing/infrastructure/sparse-gram-delta.mjs';
10
+
11
+ function basename(filePath) {
12
+ return path.basename(filePath);
13
+ }
14
+
15
+ function sparseGramWeightsIdFromResult(result) {
16
+ if (typeof result?.weightsId === 'string' && result.weightsId) return result.weightsId;
17
+ if (typeof result?.weights_id === 'string' && result.weights_id) return result.weights_id;
18
+ if (result?.usedFallbackWeights) return FALLBACK_WEIGHTS_ID;
19
+ return null;
20
+ }
21
+
22
+ export function defaultIndexerManifestPaths() {
23
+ const liBase = basename(DB_PATHS.lateInteraction);
24
+ return {
25
+ codeGraph: basename(DB_PATHS.codeGraph),
26
+ vectors: basename(DB_PATHS.codebase),
27
+ hnsw: basename(DB_PATHS.hnswIndex),
28
+ hnswStale: basename(DB_PATHS.hnswIndex) + '.stale.bin',
29
+ binaryHnsw: basename(DB_PATHS.binaryHnswIndex),
30
+ liManifest: `${liBase}.segments/manifest.json`,
31
+ sparseBase: basename(DB_PATHS.sparseGramIndex),
32
+ };
33
+ }
34
+
35
+ export function defaultIndexerStateDir() {
36
+ return path.dirname(DB_PATHS.codebase);
37
+ }
38
+
39
+ export function publishIndexerManifest(options = {}) {
40
+ const stateDir = options.stateDir || defaultIndexerStateDir();
41
+ const defaultManifest = zeroManifest(defaultIndexerManifestPaths());
42
+ const previous = readManifest(stateDir) || defaultManifest;
43
+ const epoch = Number.isInteger(options.epoch) ? options.epoch : (previous.epoch ?? 0) + 1;
44
+ const sparseWeightsId = sparseGramWeightsIdFromResult(options.sparseGramResult);
45
+ const defaultTiers = {
46
+ codeGraph: defaultManifest.codeGraph,
47
+ vectors: defaultManifest.vectors,
48
+ hnsw: defaultManifest.hnsw,
49
+ binaryHnsw: defaultManifest.binaryHnsw,
50
+ lateInteraction: defaultManifest.lateInteraction,
51
+ sparseGram: {
52
+ ...defaultManifest.sparseGram,
53
+ ...(sparseWeightsId ? { weightsId: sparseWeightsId } : {}),
54
+ },
55
+ };
56
+ const tiers = {};
57
+ for (const [tier, descriptor] of Object.entries(defaultTiers)) {
58
+ tiers[tier] = { ...descriptor, ...(options.tiers?.[tier] || {}) };
59
+ }
60
+ const manifest = buildNextManifest(previous, {
61
+ epoch,
62
+ tiers,
63
+ });
64
+ writeManifest(stateDir, manifest);
65
+ return manifest;
66
+ }
@@ -16,12 +16,14 @@ import { runDedupPhase, formatDedupSummary } from './dedup/dedup-phase.js';
16
16
  import { DEDUP_CONFIG } from '../infrastructure/config/index.js';
17
17
  import { incrementalUpdateHNSW, buildHNSWIndex, buildLateInteractionIndex, buildQuantizedArtifactsPhase } from './indexer-ann.js';
18
18
  import { buildSparseGramArtifact } from './indexer-sparse-gram.js';
19
+ import { publishIndexerManifest } from './indexer-manifest.js';
20
+ import { contentHashSync } from '../incremental-indexing/infrastructure/hashing.mjs';
19
21
  import {
20
22
  configureLocalModelRuntime,
21
23
  resetLocalModelRuntime,
22
24
  } from '../embedding/embedding-local-model.js';
23
25
  import { isNativeInferenceAvailable } from '../infrastructure/native-inference.js';
24
- import { teardownAllModels, initIndexGpuPool, teardownIndexGpuPool, warmupQueryCpuModels, GPU_ARMING_MIN_FILES } from './model-pool.js';
26
+ import { teardownAllModels, initIndexGpuPool, teardownIndexGpuPool, warmupQueryCpuModels, GPU_ARMING_MIN_FILES, isIndexAcceleratorAvailable } from './model-pool.js';
25
27
  import {
26
28
  configureLateInteractionRuntime,
27
29
  resetLateInteractionRuntime,
@@ -423,34 +425,47 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
423
425
  }
424
426
 
425
427
  // The embedding worker pool uses ORT INT8 CPU in each worker. It must only
426
- // be active when the query-time encoder ALSO uses ORT INT8 CPU, otherwise
428
+ // be active when the index-time encoder ALSO uses ORT INT8 CPU, otherwise
427
429
  // the stored index and the query vectors live in different embedding spaces
428
- // (gencodesearchnet 83% → 58% MRR regression). Two cases where that's true:
430
+ // (gencodesearchnet 83% → 58% MRR regression — queries are always ORT INT8
431
+ // CPU). Three cases where index-time embed is ORT INT8 CPU:
429
432
  // 1. Native inference isn't available at all (pre-native hosts).
430
- // 2. SWEET_SEARCH_EMBED_USE_CPU=1 — the user opted into CPU embed on
433
+ // 2. No usable accelerator (Metal/CoreML/CUDA) — even if the native addon
434
+ // is installed, the native model is never loaded on a no-accelerator
435
+ // host (see model-pool.initIndexGpuPool), so embed dispatch falls to
436
+ // ORT INT8. Running the pool here makes that path multi-threaded
437
+ // instead of inline.
438
+ // 3. SWEET_SEARCH_EMBED_USE_CPU=1 — the user opted into CPU embed on
431
439
  // both sides (index + query), so pool ORT embed matches dispatcher
432
- // ORT embed. This is the "ORT embed on CPU ‖ native LI on Metal"
440
+ // ORT embed. This is the "ORT embed on CPU ‖ native LI on accelerator"
433
441
  // pipeline that maximises index throughput by running embed and LI
434
442
  // on different devices.
435
443
  //
436
444
  // The historical `!shouldParallelLI` gate existed for the all-CPU era where
437
445
  // pool workers and parallel LI both wanted CPU and fought. In the CPU-embed
438
- // + Metal-LI world, that conflict goes away — pool workers do ORT on CPU
439
- // cores, the main thread drives Metal LI dispatches (negligible CPU), no
440
- // contention. So when `SWEET_SEARCH_EMBED_USE_CPU=1` we lift the gate and
441
- // let the pool run alongside parallel LI.
446
+ // + accelerator-LI world, that conflict goes away — pool workers do ORT on
447
+ // CPU cores, the main thread drives accelerator LI dispatches (negligible
448
+ // CPU), no contention. So when `SWEET_SEARCH_EMBED_USE_CPU=1` (and LI is on
449
+ // a real accelerator) we lift the gate and let the pool run alongside
450
+ // parallel LI. On a no-accelerator host LI is also on ORT CPU, so the gate
451
+ // stays in force and pool + parallel LI take turns rather than contend.
442
452
  const forceEmbedCpu = process.env.SWEET_SEARCH_EMBED_USE_CPU === '1';
443
- const queryTimeEmbedIsCpu = !isNativeInferenceAvailable() || forceEmbedCpu;
444
- // When LI is on Metal (native), pool + parallelLI is safe — the LI driver
445
- // is just dispatching commands, not competing for CPU cores.
446
- const liOnMetal = isNativeInferenceAvailable() && !noLateInteraction;
447
- const allowPoolWithParallelLi = forceEmbedCpu && liOnMetal;
453
+ const indexTimeEmbedIsCpu = !isNativeInferenceAvailable()
454
+ || !isIndexAcceleratorAvailable()
455
+ || forceEmbedCpu;
456
+ // LI runs on a native accelerator only when one is actually armed. When it
457
+ // is, pool + parallelLI is safe — the LI driver is just dispatching GPU
458
+ // commands, not competing for CPU cores.
459
+ const liOnAccelerator = isNativeInferenceAvailable()
460
+ && isIndexAcceleratorAvailable()
461
+ && !noLateInteraction;
462
+ const allowPoolWithParallelLi = forceEmbedCpu && liOnAccelerator;
448
463
  const useEmbeddingPool = !dryRun
449
464
  && filesToIndex.length > 0
450
465
  && EMBEDDING_CONFIG.provider === 'local'
451
466
  && resourcePlan.useWorkerPool
452
467
  && (!shouldParallelLI || allowPoolWithParallelLi)
453
- && queryTimeEmbedIsCpu;
468
+ && indexTimeEmbedIsCpu;
454
469
 
455
470
  if (!dryRun && EMBEDDING_CONFIG.provider === 'local' && filesToIndex.length > 0) {
456
471
  configureLocalModelRuntime({ intraOpThreads: embeddingThreads });
@@ -500,15 +515,26 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
500
515
  // run a dummy forward pass to compile Metal pipelines / CoreML variants
501
516
  // / BLAS threads.
502
517
  //
518
+ // No-accelerator skip: a host with no usable Metal / CoreML / CUDA
519
+ // accelerator indexes on the optimized ORT INT8 CPU path and never arms
520
+ // candle/native. `isIndexAcceleratorAvailable()` gates this even when the
521
+ // optional native addon is installed (e.g. Linux + the CUDA package but a
522
+ // failed/absent CUDA runtime, or SWEET_SEARCH_CUDA=0) — the JS layer is the
523
+ // authoritative selector; we never lean on Rust degrading loadWithDevice()
524
+ // to CPU. Skipping arming also skips the teardown/CPU-rewarm lifecycle in
525
+ // the `finally` below, so a CPU-only full reindex simply runs on ORT CPU.
526
+ //
503
527
  // Small-changeset skip: incremental runs with fewer than
504
528
  // GPU_ARMING_MIN_FILES files keep the ORT CPU path. The GPU load +
505
529
  // warmup + teardown + CPU rewarm round-trip costs 5–15s on M3 class
506
530
  // hardware and would dwarf the actual work (<1s per file on CPU).
507
- // Full reindex always arms the GPU regardless of file count.
531
+ // Full reindex always arms the GPU regardless of file count — but only
532
+ // when an accelerator exists.
508
533
  const shouldArmGpu = !dryRun
509
534
  && filesToIndex.length > 0
510
535
  && EMBEDDING_CONFIG.provider === 'local'
511
536
  && isNativeInferenceAvailable()
537
+ && isIndexAcceleratorAvailable()
512
538
  && (fullReindex || filesToIndex.length >= GPU_ARMING_MIN_FILES);
513
539
 
514
540
  if (shouldArmGpu) {
@@ -531,6 +557,8 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
531
557
  }
532
558
  } else if (!dryRun && filesToIndex.length > 0 && filesToIndex.length < GPU_ARMING_MIN_FILES) {
533
559
  log(`Small changeset (${filesToIndex.length} < ${GPU_ARMING_MIN_FILES} files) — using ORT CPU`, 'dim');
560
+ } else if (!dryRun && filesToIndex.length > 0 && !isIndexAcceleratorAvailable()) {
561
+ log('No inference accelerator detected — indexing on ORT INT8 CPU', 'dim');
534
562
  }
535
563
 
536
564
  try {
@@ -696,7 +724,7 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
696
724
  }
697
725
 
698
726
  export async function updateIncrementalStatePhase(options = {}) {
699
- const { dryRun, fullReindex, incrementalInfo, allFiles, vectorStats, graphStats } = options;
727
+ const { dryRun, fullReindex, incrementalInfo, allFiles, vectorStats, graphStats, manifestStateDir, sparseGramResult } = options;
700
728
 
701
729
  if (dryRun) return;
702
730
 
@@ -709,18 +737,18 @@ export async function updateIncrementalStatePhase(options = {}) {
709
737
  log('\nIncremental state updated', 'green');
710
738
  } else if (fullReindex) {
711
739
  const hashes = {};
712
- const crypto = await import('crypto');
713
740
  for (const file of allFiles) {
714
741
  try {
715
742
  const fullPath = path.join(PROJECT_ROOT, file);
716
743
  const [content, stat] = await Promise.all([
717
- fs.readFile(fullPath, 'utf-8'),
718
- fs.stat(fullPath).catch(() => null),
744
+ fs.readFile(fullPath),
745
+ fs.stat(fullPath, { bigint: true }).catch(() => null),
719
746
  ]);
720
747
  hashes[file] = {
721
- hash: crypto.createHash('sha256').update(content).digest('hex').slice(0, 16),
722
- size: stat?.size ?? null,
723
- mtime_ns: stat ? String(BigInt(Math.round(stat.mtimeMs)) * 1000000n) : null,
748
+ hash: contentHashSync(content),
749
+ size: stat ? stat.size.toString() : null,
750
+ mtime_ns: stat ? stat.mtimeNs.toString() : null,
751
+ inode: stat ? stat.ino.toString() : null,
724
752
  };
725
753
  } catch (e) { /* skip */ }
726
754
  }
@@ -731,6 +759,11 @@ export async function updateIncrementalStatePhase(options = {}) {
731
759
  });
732
760
  log('\nIncremental state saved', 'green');
733
761
  }
762
+ publishIndexerManifest({
763
+ ...(manifestStateDir ? { stateDir: manifestStateDir } : {}),
764
+ ...(sparseGramResult ? { sparseGramResult } : {}),
765
+ });
766
+ log('Reconcile manifest published', 'green');
734
767
  }
735
768
 
736
769
  export function printSummaryPhase(options) {
@@ -9,6 +9,8 @@ import {
9
9
  hasNativeSparseGramSupport,
10
10
  resolveSparseSymbolMask,
11
11
  } from '../infrastructure/native-sparse-gram.js';
12
+ import { contentHash } from '../incremental-indexing/infrastructure/hashing.mjs';
13
+ import { FALLBACK_WEIGHTS_ID } from '../incremental-indexing/infrastructure/sparse-gram-delta.mjs';
12
14
  import { atomicSwapDatabase, log } from './indexer-utils.js';
13
15
 
14
16
  async function unlinkIfExists(filePath) {
@@ -28,18 +30,7 @@ async function collectFileSymbolMasks(codeFiles) {
28
30
  const db = new Database(DB_PATHS.codebase, { readonly: true });
29
31
 
30
32
  try {
31
- const rows = db.prepare('SELECT file_path, metadata FROM vectors').iterate();
32
- for (const row of rows) {
33
- if (!masks.has(row.file_path)) continue;
34
- try {
35
- const metadata = JSON.parse(row.metadata || '{}');
36
- const typeMask = resolveSparseSymbolMask(metadata.type);
37
- if (typeMask === 0) continue;
38
- masks.set(row.file_path, masks.get(row.file_path) | typeMask);
39
- } catch {
40
- // Ignore malformed metadata rows; sparse gram build is best effort.
41
- }
42
- }
33
+ collectFileSymbolMasksFromDb(db, codeFiles, masks);
43
34
  } finally {
44
35
  db.close();
45
36
  }
@@ -47,6 +38,48 @@ async function collectFileSymbolMasks(codeFiles) {
47
38
  return codeFiles.map((filePath) => masks.get(filePath) || 0);
48
39
  }
49
40
 
41
+ async function resolveSparseGramWeightsId(result, artifactPath) {
42
+ if (typeof result?.weightsId === 'string' && result.weightsId) {
43
+ return result.weightsId;
44
+ }
45
+ if (typeof result?.weights_id === 'string' && result.weights_id) {
46
+ return result.weights_id;
47
+ }
48
+ if (result?.usedFallbackWeights) return FALLBACK_WEIGHTS_ID;
49
+ const artifactBytes = await fs.readFile(artifactPath);
50
+ return `corpus-bigram-v1-${await contentHash(artifactBytes)}`;
51
+ }
52
+
53
+ function collectFileSymbolMasksFromDb(db, codeFiles, existingMasks = null) {
54
+ const masks = existingMasks || new Map(codeFiles.map((filePath) => [filePath, 0]));
55
+ const liveSql = liveVectorSql(db);
56
+ const rows = db.prepare(`SELECT file_path, metadata FROM vectors WHERE ${liveSql}`).iterate();
57
+ for (const row of rows) {
58
+ if (!masks.has(row.file_path)) continue;
59
+ try {
60
+ const metadata = JSON.parse(row.metadata || '{}');
61
+ const typeMask = resolveSparseSymbolMask(metadata.type);
62
+ if (typeMask === 0) continue;
63
+ masks.set(row.file_path, masks.get(row.file_path) | typeMask);
64
+ } catch {
65
+ // Ignore malformed metadata rows; sparse gram build is best effort.
66
+ }
67
+ }
68
+ return codeFiles.map((filePath) => masks.get(filePath) || 0);
69
+ }
70
+
71
+ function hasVectorColumn(db, column) {
72
+ try {
73
+ return db.prepare('PRAGMA table_info(vectors)').all().some((col) => col.name === column);
74
+ } catch (_err) {
75
+ return false;
76
+ }
77
+ }
78
+
79
+ function liveVectorSql(db) {
80
+ return hasVectorColumn(db, 'epoch_retired') ? 'epoch_retired IS NULL' : '1=1';
81
+ }
82
+
50
83
  export async function buildSparseGramArtifact(allFiles, dryRun) {
51
84
  if (dryRun) {
52
85
  log('DRY RUN: Skipping sparse gram artifact build', 'magenta');
@@ -85,14 +118,22 @@ export async function buildSparseGramArtifact(allFiles, dryRun) {
85
118
  fileSymbolMasks,
86
119
  outputPath: stagedPath,
87
120
  });
121
+ const weightsId = await resolveSparseGramWeightsId(result, stagedPath);
88
122
  await atomicSwapDatabase(stagedPath, DB_PATHS.sparseGramIndex);
89
123
  log(
90
124
  `Sparse gram artifact promoted (${result.filesIndexed} files, ${result.grams} grams, ${result.postings} postings)`,
91
125
  'green'
92
126
  );
93
- return result;
127
+ return { ...result, weightsId };
94
128
  } catch (err) {
95
129
  await unlinkIfExists(stagedPath);
96
130
  throw err;
97
131
  }
98
132
  }
133
+
134
+ export const __TEST__ = {
135
+ collectFileSymbolMasks,
136
+ collectFileSymbolMasksFromDb,
137
+ liveVectorSql,
138
+ resolveSparseGramWeightsId,
139
+ };
@@ -5,11 +5,21 @@
5
5
 
6
6
  import fs from 'fs/promises';
7
7
  import { existsSync } from 'fs';
8
- import { spawn } from 'child_process';
9
8
  import path from 'path';
10
9
  import fg from 'fast-glob';
11
10
 
12
- import { PROJECT_ROOT, setQuietMode as setGlobalQuietMode, loadProjectConfig, AGENTIC_GITIGNORE_ALLOWLIST } from '../infrastructure/config/index.js';
11
+ import { PROJECT_ROOT, setQuietMode as setGlobalQuietMode } from '../infrastructure/config/index.js';
12
+ import { createAdmissionPolicy } from './admission-policy.js';
13
+
14
+ // `.gitignore` alignment now lives in gitignore-filter.js (shared with the
15
+ // incremental admission policy). Re-exported here so existing
16
+ // `import { ... } from indexer-utils` / barrel call sites keep working.
17
+ export {
18
+ toPosixPath,
19
+ isGitignoreAllowlistedAgenticPath,
20
+ getGitIgnoredPathSet,
21
+ applyGitignoreAlignment,
22
+ } from './gitignore-filter.js';
13
23
 
14
24
  const glob = fg.glob || fg;
15
25
 
@@ -205,10 +215,6 @@ export function stripWslUncPrefix(filePath) {
205
215
  return filePath;
206
216
  }
207
217
 
208
- export function toPosixPath(filePath) {
209
- return filePath.replace(/\\/g, '/');
210
- }
211
-
212
218
  export async function readFilesFromStdin() {
213
219
  return new Promise((resolve, reject) => {
214
220
  let data = '';
@@ -278,201 +284,6 @@ export async function readFilesFromStdin() {
278
284
  });
279
285
  }
280
286
 
281
- // =============================================================================
282
- // GITIGNORE ALIGNMENT
283
- // =============================================================================
284
-
285
- export function isGitignoreAllowlistedAgenticPath(relativePath) {
286
- const normalized = toPosixPath(relativePath).replace(/^\.\//, '');
287
- const basename = path.posix.basename(normalized);
288
-
289
- if (AGENTIC_GITIGNORE_ALLOWLIST.files.includes(basename)) {
290
- return true;
291
- }
292
-
293
- if (AGENTIC_GITIGNORE_ALLOWLIST.filePrefixes.some(prefix => basename.startsWith(prefix))) {
294
- return true;
295
- }
296
-
297
- return AGENTIC_GITIGNORE_ALLOWLIST.directories.some(dirPrefix =>
298
- normalized.startsWith(dirPrefix) || normalized.includes(`/${dirPrefix}`)
299
- );
300
- }
301
-
302
- /**
303
- * Run `git check-ignore` on a single batch of paths.
304
- * Returns a Set of ignored paths, or null on fatal error.
305
- */
306
- function checkIgnoreBatch(batch, reportError) {
307
- return new Promise((resolve) => {
308
- const ignoredChunks = [];
309
- let settled = false;
310
-
311
- const git = spawn('git', ['check-ignore', '-z', '--stdin'], { cwd: PROJECT_ROOT });
312
-
313
- git.stdout.on('data', chunk => ignoredChunks.push(chunk));
314
- git.stderr.on('data', () => {}); // Suppress — batched caller handles partial failures
315
-
316
- git.on('error', (err) => {
317
- if (settled) return;
318
- settled = true;
319
- reportError(`WARN: Unable to run git check-ignore (${err.message})`);
320
- resolve(null);
321
- });
322
-
323
- git.on('close', (code) => {
324
- if (settled) return;
325
- settled = true;
326
-
327
- // code 0 = some ignored, code 1 = none ignored, both valid.
328
- // code 128 = fatal (e.g. path beyond symlink) — still use partial stdout.
329
- if (code !== 0 && code !== 1 && ignoredChunks.length === 0) {
330
- resolve(null);
331
- return;
332
- }
333
-
334
- const ignored = Buffer.concat(ignoredChunks)
335
- .toString('utf8')
336
- .split('\0')
337
- .filter(Boolean)
338
- .map(toPosixPath);
339
-
340
- resolve(ignored);
341
- });
342
-
343
- const stdinPayload = `${batch.map(toPosixPath).join('\0')}\0`;
344
- git.stdin.on('error', () => {}); // Suppress EPIPE if git exits early
345
- git.stdin.end(stdinPayload);
346
- });
347
- }
348
-
349
- const CHECK_IGNORE_BATCH_SIZE = 5000;
350
-
351
- /**
352
- * Find directory components that are symlinks, so we can filter out paths
353
- * that traverse them (git check-ignore fatals on "beyond a symbolic link").
354
- */
355
- async function findSymlinkDirs(paths) {
356
- const checked = new Map();
357
- const symlinkPrefixes = [];
358
-
359
- for (const p of paths) {
360
- const parts = p.split('/');
361
- let dir = '';
362
- for (let i = 0; i < parts.length - 1; i++) {
363
- dir = dir ? `${dir}/${parts[i]}` : parts[i];
364
- if (checked.has(dir)) continue;
365
- try {
366
- const stat = await fs.lstat(path.join(PROJECT_ROOT, dir));
367
- const isLink = stat.isSymbolicLink();
368
- checked.set(dir, isLink);
369
- if (isLink) symlinkPrefixes.push(dir + '/');
370
- } catch {
371
- checked.set(dir, false);
372
- }
373
- }
374
- }
375
-
376
- return symlinkPrefixes;
377
- }
378
-
379
- export async function getGitIgnoredPathSet(paths, options = {}) {
380
- const silent = options.silent ?? false;
381
- const reportError = silent ? () => {} : logError;
382
-
383
- if (paths.length === 0) {
384
- return new Set();
385
- }
386
-
387
- const ignored = new Set();
388
-
389
- // Pre-filter paths that traverse symlinks — git check-ignore fatals on these.
390
- // Files beyond symlinks are also checked: if the symlink dir itself is ignored,
391
- // all files under it are treated as ignored too.
392
- const symlinkPrefixes = await findSymlinkDirs(paths);
393
- let safePaths = paths;
394
- if (symlinkPrefixes.length > 0) {
395
- // Check if the symlink directories themselves are ignored
396
- const symlinkDirs = symlinkPrefixes.map(p => p.slice(0, -1)); // remove trailing /
397
- const symlinkIgnored = await checkIgnoreBatch(symlinkDirs, reportError);
398
- const ignoredSymlinks = new Set(symlinkIgnored || []);
399
-
400
- safePaths = [];
401
- for (const p of paths) {
402
- const matchedPrefix = symlinkPrefixes.find(prefix => p.startsWith(prefix));
403
- if (matchedPrefix) {
404
- // Path traverses a symlink — check if symlink dir is gitignored
405
- const dir = matchedPrefix.slice(0, -1);
406
- if (ignoredSymlinks.has(dir)) {
407
- ignored.add(toPosixPath(p)); // inherit parent's ignored status
408
- }
409
- // Either way, skip git check-ignore (would fatal)
410
- } else {
411
- safePaths.push(p);
412
- }
413
- }
414
- }
415
-
416
- let failedBatches = 0;
417
-
418
- for (let i = 0; i < safePaths.length; i += CHECK_IGNORE_BATCH_SIZE) {
419
- const batch = safePaths.slice(i, i + CHECK_IGNORE_BATCH_SIZE);
420
- const result = await checkIgnoreBatch(batch, reportError);
421
- if (result) {
422
- for (const p of result) ignored.add(p);
423
- } else {
424
- failedBatches++;
425
- }
426
- }
427
-
428
- const totalBatches = Math.ceil(safePaths.length / CHECK_IGNORE_BATCH_SIZE);
429
- if (failedBatches === totalBatches && totalBatches > 0) {
430
- reportError('WARN: git check-ignore failed on all batches — gitignore filtering disabled');
431
- return null;
432
- }
433
-
434
- return ignored;
435
- }
436
-
437
- export async function applyGitignoreAlignment(files, respectGitignore, options = {}) {
438
- if (!respectGitignore || !existsSync(path.join(PROJECT_ROOT, '.git'))) {
439
- return { files, gitignored: 0 };
440
- }
441
-
442
- const bypassGitignore = new Set();
443
- const candidates = [];
444
- for (const file of files) {
445
- if (isGitignoreAllowlistedAgenticPath(file)) {
446
- bypassGitignore.add(file);
447
- } else {
448
- candidates.push(file);
449
- }
450
- }
451
-
452
- const ignoredSet = await getGitIgnoredPathSet(candidates, options);
453
- if (!ignoredSet) {
454
- return { files, gitignored: 0 };
455
- }
456
-
457
- const kept = [];
458
- let gitignored = 0;
459
- for (const file of files) {
460
- if (bypassGitignore.has(file)) {
461
- kept.push(file);
462
- continue;
463
- }
464
-
465
- const normalized = toPosixPath(file);
466
- if (ignoredSet.has(normalized)) {
467
- gitignored++;
468
- continue;
469
- }
470
- kept.push(file);
471
- }
472
-
473
- return { files: kept, gitignored };
474
- }
475
-
476
287
  // =============================================================================
477
288
  // FILE DISCOVERY
478
289
  // =============================================================================
@@ -487,19 +298,26 @@ export async function discoverFiles(options = {}) {
487
298
 
488
299
  writeLog('\n━━━ Discovering Files ━━━', 'bright');
489
300
 
490
- const projectConfig = loadProjectConfig(projectRoot);
491
- const respectGitignore = projectConfig.respectGitignore !== false;
492
- const maxFileSize = projectConfig.maxFileSize || (1 * 1024 * 1024);
493
-
494
- const discovered = await glob(projectConfig.include, {
495
- ignore: projectConfig.exclude,
301
+ // Single shared admission policy — the same include allowlist / deny-list /
302
+ // `.sweet-search-ignore` / `.gitignore` / size gates the incremental
303
+ // maintainer uses, so a fresh full index and an incrementally-maintained
304
+ // index admit exactly the same files.
305
+ const policy = createAdmissionPolicy({ projectRoot });
306
+ const maxFileSize = policy.maxFileSize;
307
+
308
+ // Enumerate via the include globs (with the exclude globs pruning big dirs
309
+ // during traversal), then apply the policy's shape gate so `.sweet-search-ignore`
310
+ // is honoured here too — the one rule full discovery did not previously apply.
311
+ const discovered = await glob(policy.includeGlobs, {
312
+ ignore: policy.excludeGlobs,
496
313
  cwd: projectRoot,
497
314
  absolute: false,
498
315
  onlyFiles: true,
499
316
  dot: true,
500
317
  });
318
+ const shaped = discovered.filter((rel) => policy.admitsShape(rel));
501
319
 
502
- const { files: allFiles, gitignored } = await applyGitignoreAlignment(discovered, respectGitignore, { silent });
320
+ const { files: allFiles, gitignored } = await policy.applyGitignore(shaped, { silent });
503
321
 
504
322
  const files = [];
505
323
  let oversized = 0;
@@ -17,6 +17,28 @@ function normalizePath(p) {
17
17
  return p.replace(/\\/g, '/');
18
18
  }
19
19
 
20
+ function chunkPath(chunk) {
21
+ return firstSafeRelativePath(
22
+ chunk?.metadata?.relative_path,
23
+ chunk?.metadata?.path,
24
+ chunk?.metadata?.file_path,
25
+ chunk?.file,
26
+ chunk?.metadata?.file,
27
+ ) || '';
28
+ }
29
+
30
+ function firstSafeRelativePath(...candidates) {
31
+ for (const candidate of candidates) {
32
+ if (typeof candidate !== 'string') continue;
33
+ const normalized = candidate.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
34
+ if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
35
+ if (/^[A-Za-z]:\//.test(normalized)) continue;
36
+ if (normalized === '..' || normalized.startsWith('../') || normalized.includes('/../')) continue;
37
+ return normalized;
38
+ }
39
+ return null;
40
+ }
41
+
20
42
  const _excludesByRoot = new Map();
21
43
 
22
44
  function getExcludes(projectRoot) {
@@ -93,24 +115,26 @@ export function applyIndexingChunkPolicy(chunks, options = {}) {
93
115
  const fileFirstReason = new Map();
94
116
 
95
117
  for (const chunk of chunks) {
96
- if (!chunk?.file) continue;
97
- if (fileFirstReason.has(chunk.file)) continue;
118
+ const file = chunkPath(chunk);
119
+ if (!file) continue;
120
+ if (fileFirstReason.has(file)) continue;
98
121
 
99
122
  let reason = null;
100
- if (isExcludedByConfig(chunk.file, projectRoot)) {
123
+ if (isExcludedByConfig(file, projectRoot)) {
101
124
  reason = 'excluded';
102
125
  } else {
103
126
  const text = chunk.text || chunk.content || '';
104
127
  if (chunkLooksGenerated(text)) reason = 'generated';
105
128
  }
106
- fileFirstReason.set(chunk.file, reason);
129
+ fileFirstReason.set(file, reason);
107
130
  }
108
131
 
109
132
  const kept = [];
110
133
  const skipped = [];
111
134
  const stats = emptyStats();
112
135
  for (const chunk of chunks) {
113
- const reason = chunk?.file ? fileFirstReason.get(chunk.file) : null;
136
+ const file = chunkPath(chunk);
137
+ const reason = file ? fileFirstReason.get(file) : null;
114
138
  if (reason) {
115
139
  skipped.push(chunk);
116
140
  stats[reason]++;
@@ -119,8 +143,8 @@ export function applyIndexingChunkPolicy(chunks, options = {}) {
119
143
  kept.push(chunk);
120
144
  }
121
145
  }
122
- stats.skippedFiles = new Set(skipped.map((c) => c.file).filter(Boolean)).size;
123
- stats.keptFiles = new Set(kept.map((c) => c.file).filter(Boolean)).size;
146
+ stats.skippedFiles = new Set(skipped.map(chunkPath).filter(Boolean)).size;
147
+ stats.keptFiles = new Set(kept.map(chunkPath).filter(Boolean)).size;
124
148
  return { kept, skipped, stats };
125
149
  }
126
150
 
@@ -136,5 +160,6 @@ function emptyStats() {
136
160
 
137
161
  export const _internals = {
138
162
  GENERATED_MARKERS,
163
+ chunkPath,
139
164
  resetCache,
140
165
  };