sweet-search 2.5.13 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +36 -9
  2. package/core/cli.js +41 -3
  3. package/core/embedding/embedding-local-model.js +106 -10
  4. package/core/embedding/embedding-service.js +59 -1
  5. package/core/embedding/model-client.mjs +257 -0
  6. package/core/embedding/model-server.mjs +217 -0
  7. package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
  8. package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
  9. package/core/incremental-indexing/application/operator-cli.mjs +14 -5
  10. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
  11. package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
  12. package/core/incremental-indexing/application/reconciler.mjs +87 -15
  13. package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
  14. package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
  15. package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
  16. package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
  17. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
  18. package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
  19. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
  20. package/core/indexing/artifact-builder.js +1 -1
  21. package/core/indexing/dedup/dedup-phase.js +36 -17
  22. package/core/indexing/dedup/exemplar-selector.js +5 -0
  23. package/core/indexing/index-codebase-v21.js +37 -14
  24. package/core/indexing/index-maintainer.mjs +337 -6
  25. package/core/indexing/indexer-ann.js +27 -434
  26. package/core/indexing/indexer-build.js +30 -14
  27. package/core/indexing/indexer-manifest.js +0 -3
  28. package/core/indexing/indexer-phases.js +101 -25
  29. package/core/indexing/maintainer-launcher.mjs +22 -0
  30. package/core/indexing/maintainer-watcher.mjs +397 -0
  31. package/core/indexing/os-priority.mjs +160 -0
  32. package/core/indexing/rss-budget.mjs +425 -0
  33. package/core/indexing/streaming-vectors.js +450 -0
  34. package/core/infrastructure/config/platform.js +14 -10
  35. package/core/infrastructure/onnx-session-utils.js +37 -0
  36. package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
  37. package/core/ranking/late-interaction-index.js +58 -7
  38. package/core/search/daemon-registry.js +199 -0
  39. package/core/search/search-read-semantic.js +9 -3
  40. package/core/search/search-semantic.js +6 -29
  41. package/core/search/search-server.js +527 -27
  42. package/core/search/session-daemon-prewarm.mjs +110 -1
  43. package/core/search/sweet-search.js +0 -38
  44. package/core/vector-store/binary-hnsw-index.js +692 -78
  45. package/core/vector-store/index.js +1 -4
  46. package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
  47. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
  48. package/eval/agent-read-workflows/bin/ss-read +2 -0
  49. package/mcp/tool-handlers.js +1 -2
  50. package/package.json +11 -8
  51. package/scripts/uninstall.js +2 -0
  52. package/core/vector-store/hnsw-index.js +0 -751
@@ -14,7 +14,7 @@ import { colors, log, logProgress, logError, discoverFiles, readFilesFromStdin,
14
14
  import { buildCodeGraph, buildVectorIndex, chunkFiles } from './indexer-build.js';
15
15
  import { runDedupPhase, formatDedupSummary } from './dedup/dedup-phase.js';
16
16
  import { DEDUP_CONFIG } from '../infrastructure/config/index.js';
17
- import { incrementalUpdateHNSW, buildHNSWIndex, buildLateInteractionIndex, buildQuantizedArtifactsPhase } from './indexer-ann.js';
17
+ import { buildLateInteractionIndex, buildQuantizedArtifactsPhase } from './indexer-ann.js';
18
18
  import { buildSparseGramArtifact } from './indexer-sparse-gram.js';
19
19
  import { publishIndexerManifest } from './indexer-manifest.js';
20
20
  import { contentHashSync } from '../incremental-indexing/infrastructure/hashing.mjs';
@@ -410,11 +410,31 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
410
410
  : resourcePlan.threadsPerLateInteractionWorker;
411
411
  const stagedLateInteractionPath = DB_PATHS.lateInteraction + '.tmp';
412
412
 
413
- // Always chunk files up front so both vector and LI encoders share
414
- // the same chunk list. Vectors are written to SQLite and HNSW reads
415
- // from the DB directly (Phase B no in-memory arrays passed around).
413
+ // ── Bounded-memory streaming path for large full rebuilds ──
414
+ //
415
+ // The in-memory path below materialises the WHOLE chunk corpus (chunkFiles
416
+ // allChunks/texts) plus all exemplar embeddings, all alias rows, and every
417
+ // LI per-token slab — peak heap O(repo). On big repos (libsql ≈ 431k chunks,
418
+ // swc ≈ 217k) that exceeds the default ~4 GB heap and crashes on EVERY
419
+ // backend (CUDA/Metal/CoreML/ORT-CPU), since the hogs are JS-side, not the
420
+ // model. For large full rebuilds we instead spill chunks to disk and embed/LI
421
+ // in bounded windows (see streaming-vectors.js) so peak heap is O(window).
422
+ //
423
+ // Gated by file count so small repos + incremental runs keep the original
424
+ // in-memory path byte-for-byte (benchmark indexes unaffected). Auto-selected,
425
+ // no opt-in flag; SWEET_SEARCH_STREAM_VECTORS=0 forces the legacy path and
426
+ // SWEET_SEARCH_STREAM_MIN_FILES tunes the threshold.
427
+ const streamMinFiles = Number(process.env.SWEET_SEARCH_STREAM_MIN_FILES) || 5000;
428
+ const useStreaming = !dryRun
429
+ && fullReindex
430
+ && filesToIndex.length >= streamMinFiles
431
+ && process.env.SWEET_SEARCH_STREAM_VECTORS !== '0';
432
+
433
+ // The in-memory path pre-chunks up front so both vector + LI encoders share
434
+ // one chunk list. The streaming path does its own windowed chunking + dedup,
435
+ // so skip this for it (this is the O(repo) allocation we're avoiding).
416
436
  let preChunked = null;
417
- if (!dryRun && filesToIndex.length > 0) {
437
+ if (!dryRun && !useStreaming && filesToIndex.length > 0) {
418
438
  preChunked = await chunkFiles(filesToIndex);
419
439
 
420
440
  // Near-duplicate dedup: annotates chunks in place with {simhash, clusterId,
@@ -562,6 +582,82 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
562
582
  }
563
583
 
564
584
  try {
585
+ // ── Streaming path: bounded-memory vectors + LI for large full rebuilds ──
586
+ if (useStreaming) {
587
+ const { getModelInfo } = await import('../embedding/embedding-service.js');
588
+ const { buildVectorsAndLiStreaming } = await import('./streaming-vectors.js');
589
+ const modelInfo = getModelInfo();
590
+
591
+ const streamed = await buildVectorsAndLiStreaming({
592
+ filesToIndex,
593
+ modelInfo,
594
+ sqliteFastMode,
595
+ noLateInteraction,
596
+ li: {
597
+ poolFactor: lateInteractionPool,
598
+ extendedSkiplist: lateInteractionExtendedSkiplist,
599
+ loadFromPath: DB_PATHS.lateInteraction,
600
+ saveToPath: stagedLateInteractionPath,
601
+ finalIndexPath: DB_PATHS.lateInteraction,
602
+ stagingSegmentDir: stagedLateInteractionSegmentDir(stagedLateInteractionPath),
603
+ workerCount: lateInteractionWorkers,
604
+ threadsPerWorker: lateInteractionWorkerThreads,
605
+ batchSize: resourcePlan.lateInteractionBatchSize,
606
+ batchSizeUpperCap: resourcePlan.lateInteractionBatchSizeUpperCap,
607
+ tokenBudget: resourcePlan.lateInteractionTokenBudget,
608
+ attentionBudget: resourcePlan.lateInteractionAttentionBudget,
609
+ },
610
+ });
611
+
612
+ // HCGS (off by default) runs independently of vectors — drain it if armed.
613
+ let hcgsResult = null;
614
+ if (hcgsPromise) {
615
+ try { hcgsResult = await hcgsPromise; } catch (e) { hcgsResult = { error: e.message }; }
616
+ if (hcgsResult && !hcgsResult.error) {
617
+ log(`Summaries regenerated (${hcgsResult.generated} generated, ${hcgsResult.skipped} skipped)`, 'green');
618
+ }
619
+ }
620
+
621
+ const vectorStats = streamed.vectorStats || { chunks: 0, embeddings: 0 };
622
+ if (vectorStats.embeddings > 0) await markPhaseComplete('vectors');
623
+
624
+ // Promote the staged LI index (built bounded), or invalidate on failure —
625
+ // same contract as the in-memory path's swap/invalidate below.
626
+ let lateInteractionResult = streamed.lateInteractionResult;
627
+ if (!noLateInteraction) {
628
+ if (streamed.liBuilt && lateInteractionResult && !lateInteractionResult.error) {
629
+ await atomicSwapLateInteractionIndex(stagedLateInteractionPath, DB_PATHS.lateInteraction);
630
+ log('Late interaction index promoted', 'green');
631
+ await markPhaseComplete('late-interaction');
632
+ } else {
633
+ await cleanupStagedLateInteractionIndex(stagedLateInteractionPath);
634
+ await invalidateLateInteractionIndex();
635
+ if (lateInteractionResult?.error) {
636
+ log(`Late interaction rebuild failed; invalidated existing index: ${lateInteractionResult.error}`, 'yellow');
637
+ lateInteractionResult = { error: lateInteractionResult.error, invalidated: true };
638
+ }
639
+ }
640
+ }
641
+
642
+ // Binary HNSW + int8 artifacts stream from the swapped codebase.db.
643
+ if (vectorStats.embeddings > 0) {
644
+ await updatePhaseProgress({ phase: 'artifacts', status: 'in_progress' });
645
+ await buildQuantizedArtifactsPhase(dryRun, {
646
+ changedFiles: filesToIndex.length,
647
+ force: forceArtifacts || fullReindex,
648
+ });
649
+ await markPhaseComplete('artifacts');
650
+ }
651
+
652
+ let sparseGramResult = null;
653
+ if (Array.isArray(allFiles) && allFiles.length > 0) {
654
+ sparseGramResult = await buildSparseGramArtifact(allFiles, dryRun);
655
+ }
656
+
657
+ await clearPhaseProgress();
658
+ return { vectorStats, hcgsResult, lateInteractionResult, sparseGramResult };
659
+ }
660
+
565
661
  const vectorPromise = buildVectorIndex(filesToIndex, dryRun, vectorOptions);
566
662
 
567
663
  // Compute LI file removal list (used by both parallel and sequential paths)
@@ -633,25 +729,6 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
633
729
  await markPhaseComplete('vectors');
634
730
  }
635
731
 
636
- try {
637
- if (!dryRun && vectorStats.embeddings > 0) {
638
- await updatePhaseProgress({ phase: 'hnsw', status: 'in_progress' });
639
- if (incrementalInfo && !fullReindex) {
640
- const allFilesToRemoveFromHNSW = [
641
- ...incrementalInfo.toIndex,
642
- ...(incrementalInfo.toRemove || [])
643
- ];
644
- await incrementalUpdateHNSW(DB_PATHS.codebase, allFilesToRemoveFromHNSW, dryRun);
645
- } else {
646
- await buildHNSWIndex(DB_PATHS.codebase, dryRun);
647
- }
648
- await markPhaseComplete('hnsw');
649
- }
650
- } catch (err) {
651
- await cleanupStagedLateInteractionIndex(stagedLateInteractionPath);
652
- throw err;
653
- }
654
-
655
732
  let lateInteractionResult = liOutcome.result;
656
733
 
657
734
  if (!liPromise && !dryRun && !noLateInteraction && (preChunked?.allChunks?.length > 0 || filesToRemoveFromLI.length > 0)) {
@@ -811,7 +888,6 @@ export function printSummaryPhase(options) {
811
888
  if (!vectorsOnly) log(` - ${DB_PATHS.codeGraph}`, 'green');
812
889
  if (!graphOnly) {
813
890
  log(` - ${DB_PATHS.codebase}`, 'green');
814
- log(` - ${DB_PATHS.hnswIndex}`, 'green');
815
891
  if (existsSync(DB_PATHS.binaryHnswIndex.replace('.idx', '.meta.json'))) {
816
892
  log(` - ${DB_PATHS.binaryHnswIndex} (Binary HNSW, 32x smaller)`, 'green');
817
893
  }
@@ -33,11 +33,27 @@ import { existsSync, readFileSync } from 'node:fs';
33
33
  import { dirname, join } from 'node:path';
34
34
  import { fileURLToPath } from 'node:url';
35
35
  import { reconcileEnablement } from '../incremental-indexing/domain/interval-autotune.mjs';
36
+ import { applyBackgroundPriority } from './os-priority.mjs';
36
37
 
37
38
  const __dirname = dirname(fileURLToPath(import.meta.url));
38
39
 
39
40
  export const MAINTAINER_LOCK_FILENAME = 'index-maintainer.lock';
40
41
 
/**
 * Decide whether the spawned maintainer child should be demoted to OS
 * background priority. The gate is ON by default and is switched off only
 * when `SWEET_SEARCH_MAINTAINER_BG_PRIORITY` holds one of the off-tokens
 * `0`, `false` or `off` (letter case and surrounding whitespace ignored).
 *
 * @param {NodeJS.ProcessEnv} env Environment to read the switch from.
 * @returns {boolean} `true` when background priority should be applied.
 */
function bgPriorityEnabled(env) {
  const setting = env.SWEET_SEARCH_MAINTAINER_BG_PRIORITY;
  // Unset or empty means "use the default", which is enabled.
  if (setting == null || setting === '') {
    return true;
  }
  switch (String(setting).trim().toLowerCase()) {
    case '0':
    case 'false':
    case 'off':
      return false;
    default:
      // Any other value (including unrecognised tokens) leaves the gate on.
      return true;
  }
}
56
+
41
57
  /** Default maintainer entry: the sibling daemon in this same context. */
42
58
  export function defaultMaintainerEntry() {
43
59
  return join(__dirname, 'index-maintainer.mjs');
@@ -128,6 +144,12 @@ export function launchMaintainer(options = {}) {
128
144
  },
129
145
  });
130
146
  child.unref();
147
+ // Demote the detached child to OS background priority (best-effort, never
148
+ // throws). Runs in this foreground caller, targeting the child by pid, so
149
+ // only the child is demoted. Gate default-on (Tier-1, output-identical).
150
+ if (bgPriorityEnabled(env)) {
151
+ applyBackgroundPriority(child.pid);
152
+ }
131
153
  log(`maintainer spawned (pid ${child.pid}, detached)`);
132
154
  return { spawned: true, reason: 'spawned', pid: child.pid, stateDir };
133
155
  } catch (err) {
@@ -0,0 +1,397 @@
1
+ /**
2
+ * Event-driven maintainer file watcher (G6, Phase 3, gated).
3
+ *
4
+ * Plan: docs/INDEX_MAINTAINER_EFFICIENCY_IMPLEMENTATION_PLAN.md § "G6".
5
+ * Research: docs/INDEX_MAINTAINER_EFFICIENCY_RESEARCH.md (lever C — event-driven
6
+ * watching + a rare reconcile backstop).
7
+ *
8
+ * This module replaces the per-tick full `stat()` walk as the PRIMARY dirty-set
9
+ * producer with native filesystem events (`@parcel/watcher` — FSEvents on macOS,
10
+ * inotify on Linux, ReadDirectoryChangesW on Windows; the watcher VS Code uses).
11
+ * The full stat-walk is NOT removed: G4 demotes it to a periodic backstop so the
12
+ * correctness guarantee (eventual convergence — e.g. a dir becoming gitignored
13
+ * with no file event still gets retired) is preserved exactly.
14
+ *
15
+ * Ownership boundary (single-writer rule): this file is owned by G6. The three
16
+ * call sites in `index-maintainer.mjs` (start after the lock, early-wake in the
17
+ * sleep loop, teardown in `finally`) are owned by G4 and are NOT edited here —
18
+ * this module matches the exact `startWatcher({stateDir, projectRoot,
19
+ * admissionPolicy, onEvent, onOverflow})` contract G4 wired (and returns a
20
+ * handle with a `.close()` method G4's `finally` calls).
21
+ *
22
+ * Gate: `SWEET_SEARCH_MAINTAINER_WATCH === '1'` (checked by G4 before this module
23
+ * is imported). When the flag is off OR `@parcel/watcher` is unavailable,
24
+ * `startWatcher` returns `null` and behavior is EXACTLY today's: G4 sees a falsy
25
+ * handle, `watcherState.active` stays false, and the full per-tick walk remains
26
+ * the sole producer.
27
+ *
28
+ * Highest-severity risk (event-storm guard): the daemon writes its own queue +
29
+ * databases under the `.sweet-search` stateDir. Those writes must NEVER
30
+ * re-trigger the watcher, or each enqueue would feed back into a new enqueue. The
31
+ * resolved stateDir is therefore the first entry in the native `ignore` list, and
32
+ * a redundant in-handler stateDir-prefix guard backs it up (defense in depth).
33
+ *
34
+ * The watcher NEVER touches merkle and NEVER makes the final admit decision: it
35
+ * appends candidate paths to the queue exactly as the dirty-scan producer does
36
+ * (same line shape), and the consumer (`production-reconciler`) re-admits +
37
+ * content-hashes each file. A false-positive enqueue is harmless (the consumer
38
+ * drops it); a missed event is caught by the periodic backstop walk.
39
+ */
40
+
41
+ import fs from 'node:fs';
42
+ import path from 'node:path';
43
+
// Name of the dirty-queue file inside stateDir. It has to stay identical to the
// name used by the dirty-scan producer and the reconcile consumer
// (`dirty-scan.mjs` DIRTY_QUEUE). It is repeated here as a literal instead of
// being imported so that this module does not create an import cycle through
// the reconciler.
const DIRTY_QUEUE = 'index-maintainer-queue.jsonl';

// File name of the watcher snapshot. It is stored under stateDir, which is
// itself excluded from watching, so writing it at shutdown cannot show up as an
// event on the next start. `getEventsSince` uses it to replay whatever changed
// between the last clean shutdown and the next startup.
const SNAPSHOT_FILE = 'maintainer-watch-snapshot.bin';

// Directory names handed to the native watcher's `ignore` option to cut down
// what the OS reports (watch budget / event volume). Per the original notes
// this mirrors `path-filter.mjs` DEFAULT_DENY_DIRS, which is module-local there
// and therefore cannot be imported. `admissionPolicy.isExcluded` remains the
// authoritative per-event deny check; this list only narrows the watch set.
// The stateDir is not listed as a special case here — it is prepended
// dynamically because it is the event-storm guard, not a generic deny dir.
const DEFAULT_DENY_DIRS = Object.freeze([
  'node_modules', '.git', '.sweet-search',
  'dist', 'build', '.next', '.nuxt', 'target', 'vendor',
  '__pycache__', '.venv', 'venv',
  '.cache', '.turbo', 'coverage', '.parcel-cache', '.svelte-kit', '.vercel',
]);

// `.git` as a whole is denied above, yet a branch switch / commit / reset
// rewrites these two files and has to force a backstop walk. They are polled by
// mtime instead of adding a second native watch inside the denied `.git` dir.
const GIT_BACKSTOP_FILES = Object.freeze(['HEAD', 'index']);

// Polling period, in milliseconds, for the git backstop files.
const GIT_POLL_INTERVAL_MS = 2000;
90
+
/**
 * Normalise an absolute path to a project-relative POSIX path, or `null` if the
 * path is not strictly inside `projectRoot`. macOS reports realpaths
 * (`/private/tmp` for `/tmp`), so the caller is expected to pass both sides
 * already resolved to realpaths.
 *
 * Only a relative path that IS `..` or that starts with a `..` path segment is
 * treated as "outside the root". A plain `startsWith('..')` test would also
 * reject legitimate in-root entries whose name merely begins with two dots
 * (for example `..env/a.js` or `..hidden.js`).
 *
 * @param {string} absPath Absolute path reported by the watcher.
 * @param {string} rootAbs Already resolved project root.
 * @returns {string|null} Forward-slash relative path, or `null` for the root
 *   itself and for anything outside it.
 */
function toRel(absPath, rootAbs) {
  const rel = path.relative(rootAbs, absPath);
  // Empty => the root itself; absolute => different drive/volume (Windows).
  if (!rel || path.isAbsolute(rel)) return null;
  // Escapes the root only when the FIRST SEGMENT is exactly `..`.
  if (rel === '..' || rel.startsWith(`..${path.sep}`)) return null;
  return rel.split(path.sep).join('/');
}
105
+
/**
 * Append one JSON line per relative path to the dirty queue file in `stateDir`.
 *
 * Each line has the shape `{file_path, timestamp, queued_at, source}`, which
 * per the original notes is the exact shape the dirty-scan producer writes
 * (`dirty-scan.mjs:230`). The `source` field is always `'watch'` here so that
 * queue lines coming from the event path can be told apart in diagnostics.
 * All lines of one call share the same timestamp. An empty batch is a no-op
 * (the directory is not even created).
 *
 * @param {string} stateDir Directory holding the queue file; created if missing.
 * @param {string[]} rels Project-relative POSIX paths.
 */
function appendQueueLines(stateDir, rels) {
  if (rels.length === 0) return;
  fs.mkdirSync(stateDir, { recursive: true });

  const timestamp = Date.now();
  const queuedAt = new Date(timestamp).toISOString();

  // Build the whole batch first so it goes out in a single append call.
  let payload = '';
  for (const rel of rels) {
    payload += `${JSON.stringify({ file_path: rel, timestamp, queued_at: queuedAt, source: 'watch' })}\n`;
  }
  fs.appendFileSync(path.join(stateDir, DIRTY_QUEUE), payload);
}
126
+
/**
 * Build the native `ignore` list handed to @parcel/watcher.
 *
 * The resolved stateDir comes FIRST (event-storm guard). After it, every
 * default-deny directory name contributes three entries:
 *   - the bare name — @parcel/watcher treats a non-glob entry as a path and
 *     resolves it against the watched directory, so this covers only
 *     `<root>/<name>`;
 *   - `**\/<name>` — the directory itself at any depth (lets backends skip the
 *     whole subtree when they walk directories);
 *   - `**\/<name>/**` — everything beneath such a directory at any depth (for
 *     backends that test each reported event path).
 * The two globs are what actually give "denied at any depth"; with bare names
 * alone a nested `packages/x/node_modules` would still be watched.
 *
 * NOTE(review): glob semantics are taken from the @parcel/watcher docs (globs
 * are matched against paths relative to the watched root) — confirm against
 * the installed version.
 *
 * @param {string} rootAbs Resolved project root. Not needed to build the list;
 *   kept so the signature stays stable for existing callers.
 * @param {string} stateDirAbs Resolved `.sweet-search` state directory.
 * @returns {string[]} Entries for the watcher's `ignore` option.
 */
function buildIgnore(rootAbs, stateDirAbs) {
  const ignore = [stateDirAbs];
  for (const name of DEFAULT_DENY_DIRS) {
    ignore.push(name, `**/${name}`, `**/${name}/**`);
  }
  return ignore;
}
142
+
/**
 * Load `@parcel/watcher` on demand.
 *
 * Resolves to the watcher API object (the module's default export when there
 * is one, otherwise the module namespace) provided it exposes a `subscribe`
 * function. Resolves to `null` when the import fails for any reason or when
 * the loaded object has no `subscribe`, so callers can fall back silently.
 *
 * @returns {Promise<object|null>}
 */
async function loadParcelWatcher() {
  try {
    const loaded = await import('@parcel/watcher');
    const api = loaded.default ?? loaded;
    const usable = api && typeof api.subscribe === 'function';
    return usable ? api : null;
  } catch {
    // Dependency or native binding unavailable: report "no watcher".
    return null;
  }
}
160
+
/**
 * Start the event-driven maintainer watcher.
 *
 * Contract (per the original notes, matched to the wiring in
 * `index-maintainer.mjs`):
 *   startWatcher({ stateDir, projectRoot, admissionPolicy, onEvent, onOverflow })
 *     -> Promise<{ close(): Promise<void> } | null>
 *
 * Resolves to `null` when `projectRoot`/`stateDir` is missing, when
 * `@parcel/watcher` cannot be loaded, or when subscribing fails.
 *
 * - `onEvent()` is invoked after each event batch that enqueued at least one
 *   candidate path (early-wake signal for the caller).
 * - `onOverflow()` is invoked when the event stream cannot be trusted: a
 *   native error/overflow, a failed snapshot replay, a fault while handling a
 *   batch, or a change of `.git/HEAD` / `.git/index`. The caller is expected
 *   to answer with a full backstop walk.
 * Both callbacks are optional and exceptions thrown by `onOverflow` are
 * swallowed.
 *
 * @param {object} opts
 * @param {string} opts.stateDir Resolved `.sweet-search` dir (ignored by the watch).
 * @param {string} opts.projectRoot Repo root to watch.
 * @param {object} opts.admissionPolicy `createAdmissionPolicy(...)` result.
 * @param {Function} [opts.onEvent] Early-wake signal.
 * @param {Function} [opts.onOverflow] Force-backstop signal.
 * @returns {Promise<{close: () => Promise<void>} | null>}
 */
export async function startWatcher({ stateDir, projectRoot, admissionPolicy, onEvent, onOverflow } = {}) {
  if (!projectRoot || !stateDir) return null;

  const watcher = await loadParcelWatcher();
  if (!watcher) return null;

  // Work on realpaths so relativising and the stateDir-prefix guard compare
  // like with like (macOS: /tmp -> /private/tmp). safeRealpath falls back to
  // the resolved path when the target does not exist yet.
  const rootAbs = safeRealpath(path.resolve(projectRoot));
  const stateDirAbs = safeRealpath(path.resolve(stateDir));
  const stateDirPrefix = stateDirAbs.endsWith(path.sep) ? stateDirAbs : stateDirAbs + path.sep;

  const notify = typeof onEvent === 'function' ? onEvent : () => {};
  const overflow = typeof onOverflow === 'function' ? onOverflow : () => {};

  // Single place for the "ask for a backstop walk, never throw" pattern.
  const requestBackstop = () => {
    try {
      overflow();
    } catch {
      /* best-effort */
    }
  };

  const snapshotPath = path.join(stateDirAbs, SNAPSHOT_FILE);
  const subscribeOpts = { ignore: buildIgnore(rootAbs, stateDirAbs) };
  const eventCtx = { rootAbs, stateDirPrefix, stateDir, admissionPolicy, notify };

  // Gap-free restart: before subscribing, replay what the OS recorded since the
  // snapshot written at the last clean shutdown, so edits made while the daemon
  // was down still reach the queue. A missing snapshot is simply skipped.
  try {
    if (fs.existsSync(snapshotPath)) {
      const replayed = await watcher.getEventsSince(rootAbs, snapshotPath, subscribeOpts);
      handleEvents(replayed, eventCtx);
    }
  } catch {
    // Replay failure is non-fatal; a backstop walk covers the gap instead.
    requestBackstop();
  }

  const onNativeBatch = (err, events) => {
    // Native error / overflow (IN_Q_OVERFLOW, ERROR_NOTIFY_ENUM_DIR, …): the
    // stream is unreliable, so demand a full backstop walk.
    if (err) {
      requestBackstop();
      return;
    }
    try {
      handleEvents(events, eventCtx);
    } catch {
      // A handler fault must never crash the daemon; fall back to backstop.
      requestBackstop();
    }
  };

  let subscription = null;
  try {
    subscription = await watcher.subscribe(rootAbs, onNativeBatch, subscribeOpts);
  } catch {
    // Subscribing failed (e.g. inotify watch limit on Linux): report "no
    // watcher" so the caller keeps using the full per-tick walk.
    return null;
  }

  // Poll `.git/HEAD` + `.git/index`; any change requests a backstop walk.
  const gitDir = path.join(rootAbs, '.git');
  const gitMtimes = new Map();
  primeGitMtimes(gitDir, gitMtimes);
  const gitTimer = setInterval(() => {
    try {
      if (gitChanged(gitDir, gitMtimes)) requestBackstop();
    } catch {
      /* best-effort */
    }
  }, GIT_POLL_INTERVAL_MS);
  // Do not let the poll timer keep the process alive on its own.
  if (gitTimer?.unref) gitTimer.unref();

  const close = async () => {
    clearInterval(gitTimer);
    // Persist the snapshot under the (ignored) stateDir for the next startup's
    // replay. Best-effort: failures are swallowed.
    try {
      fs.mkdirSync(stateDirAbs, { recursive: true });
      await watcher.writeSnapshot(rootAbs, snapshotPath, subscribeOpts);
    } catch {
      /* best-effort */
    }
    try {
      if (typeof subscription?.unsubscribe === 'function') {
        await subscription.unsubscribe();
      }
    } catch {
      /* best-effort */
    }
  };

  return { close };
}
269
+
/**
 * Turn a batch of native watcher events into dirty-queue lines. It has no
 * dependency on the native watcher itself, so the live subscription and the
 * `getEventsSince` replay both go through it.
 *
 * Each event is filtered in this order:
 *   1. event-storm guard — anything that is the stateDir or lies beneath it is
 *      dropped (second line of defence behind the native `ignore`);
 *   2. relativise — anything `toRel` cannot place inside the project root is
 *      dropped, as are repeats of a path already accepted in this batch;
 *   3. directory guard — for non-`delete` events the path is stat'ed and
 *      dropped when it is a directory. `delete` events are kept unchecked
 *      (nothing left to stat), and so are paths whose stat fails;
 *   4. admission deny — dropped when `admissionPolicy.isExcluded(rel)` returns
 *      truthy. A missing policy or a throwing policy lets the event through.
 *
 * This is not the final admit decision; per the original notes the consumer
 * re-admits and content-hashes every queued path, so over-admitting here is
 * harmless. When at least one path survives, the batch is appended to the
 * queue and `notify()` is called once (its exceptions are swallowed).
 *
 * @param {Array<{path:string,type:string}>} events
 * @param {object} ctx `{ rootAbs, stateDirPrefix, stateDir, admissionPolicy, notify }`
 */
function handleEvents(events, { rootAbs, stateDirPrefix, stateDir, admissionPolicy, notify }) {
  if (!Array.isArray(events) || events.length === 0) return;

  // stateDirPrefix ends with a separator; the bare form matches the dir itself.
  const stateDirBare = stateDirPrefix.slice(0, -1);

  const isDirectoryOnDisk = (abs) => {
    try {
      return fs.statSync(abs).isDirectory();
    } catch {
      // Vanished between event and stat (rapid churn): let the consumer decide.
      return false;
    }
  };

  const isDenied = (rel) => {
    try {
      return Boolean(
        admissionPolicy
          && typeof admissionPolicy.isExcluded === 'function'
          && admissionPolicy.isExcluded(rel),
      );
    } catch {
      // A policy fault should not drop the event — let the consumer re-admit.
      return false;
    }
  };

  // Insertion-ordered, so it serves as both the de-dup set and the output list.
  const accepted = new Set();
  for (const ev of events) {
    const abs = ev?.path;
    if (typeof abs !== 'string' || abs.length === 0) continue;

    // 1. Event-storm guard: never re-trigger on our own state writes.
    const underStateDir = abs === stateDirBare || abs.startsWith(stateDirPrefix);
    if (underStateDir) continue;

    // 2. Relativise; drop paths outside the project root and repeats.
    const rel = toRel(abs, rootAbs);
    if (!rel || accepted.has(rel)) continue;

    // 3. Directory guard: only files become queue lines; deletes are not stat'ed.
    if (ev.type !== 'delete' && isDirectoryOnDisk(abs)) continue;

    // 4. Admission deny-list.
    if (isDenied(rel)) continue;

    accepted.add(rel);
  }

  if (accepted.size === 0) return;
  appendQueueLines(stateDir, [...accepted]);
  try {
    notify();
  } catch {
    /* best-effort */
  }
}
332
+
/**
 * Resolve `p` to its realpath. If that fails (for instance because the path
 * does not exist), the input is handed back unchanged.
 *
 * @param {string} p
 * @returns {string}
 */
function safeRealpath(p) {
  let resolved = p;
  try {
    resolved = fs.realpathSync(p);
  } catch {
    // Keep the caller's path as the fallback.
  }
  return resolved;
}
345
+
/**
 * Seed `store` with the current mtime of every git backstop file found in
 * `gitDir`. A file that cannot be stat'ed is recorded with mtime `0`.
 *
 * @param {string} gitDir Directory containing the backstop files.
 * @param {Map<string, number>} store Receives one entry per backstop file name.
 */
function primeGitMtimes(gitDir, store) {
  GIT_BACKSTOP_FILES.forEach((name) => {
    let mtimeMs = 0;
    try {
      mtimeMs = fs.statSync(path.join(gitDir, name)).mtimeMs;
    } catch {
      // Missing/unreadable file: remember it as 0.
      mtimeMs = 0;
    }
    store.set(name, mtimeMs);
  });
}
360
+
/**
 * Compare the current mtime of every git backstop file with the value kept in
 * `store`, refresh `store` for each file that differs, and report whether any
 * of them differed. A file that cannot be stat'ed counts as mtime `0`, so a
 * file appearing or disappearing is a change. All files are always visited —
 * there is no early exit on the first difference.
 *
 * @param {string} gitDir Directory containing the backstop files.
 * @param {Map<string, number>} store Mtimes remembered from the previous poll.
 * @returns {boolean} `true` when at least one file changed since the last poll.
 */
function gitChanged(gitDir, store) {
  const readMtime = (name) => {
    try {
      return fs.statSync(path.join(gitDir, name)).mtimeMs;
    } catch {
      return 0;
    }
  };

  let dirty = false;
  for (const name of GIT_BACKSTOP_FILES) {
    const current = readMtime(name);
    if (store.get(name) === current) continue;
    store.set(name, current);
    dirty = true;
  }
  return dirty;
}
384
+
// Test seam. Exposes the module-private helpers and constants so the unit
// tests can check queue-line shape, event filtering and the git mtime logic
// directly, without having to load the native watcher.
export const __testing = {
  // helpers
  toRel, buildIgnore, appendQueueLines, handleEvents, gitChanged, primeGitMtimes,
  // constants
  DIRTY_QUEUE, SNAPSHOT_FILE, DEFAULT_DENY_DIRS,
};