sweet-search 2.5.13 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -9
- package/core/cli.js +41 -3
- package/core/embedding/embedding-local-model.js +106 -10
- package/core/embedding/embedding-service.js +59 -1
- package/core/embedding/model-client.mjs +257 -0
- package/core/embedding/model-server.mjs +217 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
- package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
- package/core/incremental-indexing/application/operator-cli.mjs +14 -5
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
- package/core/incremental-indexing/application/reconciler.mjs +87 -15
- package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
- package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
- package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
- package/core/indexing/artifact-builder.js +1 -1
- package/core/indexing/dedup/dedup-phase.js +36 -17
- package/core/indexing/dedup/exemplar-selector.js +5 -0
- package/core/indexing/index-codebase-v21.js +37 -14
- package/core/indexing/index-maintainer.mjs +337 -6
- package/core/indexing/indexer-ann.js +27 -434
- package/core/indexing/indexer-build.js +30 -14
- package/core/indexing/indexer-manifest.js +0 -3
- package/core/indexing/indexer-phases.js +101 -25
- package/core/indexing/maintainer-launcher.mjs +22 -0
- package/core/indexing/maintainer-watcher.mjs +397 -0
- package/core/indexing/os-priority.mjs +160 -0
- package/core/indexing/rss-budget.mjs +425 -0
- package/core/indexing/streaming-vectors.js +450 -0
- package/core/infrastructure/config/platform.js +14 -10
- package/core/infrastructure/onnx-session-utils.js +37 -0
- package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
- package/core/ranking/late-interaction-index.js +58 -7
- package/core/search/daemon-registry.js +199 -0
- package/core/search/search-read-semantic.js +9 -3
- package/core/search/search-semantic.js +6 -29
- package/core/search/search-server.js +527 -27
- package/core/search/session-daemon-prewarm.mjs +110 -1
- package/core/search/sweet-search.js +0 -38
- package/core/vector-store/binary-hnsw-index.js +692 -78
- package/core/vector-store/index.js +1 -4
- package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
- package/eval/agent-read-workflows/bin/ss-read +2 -0
- package/mcp/tool-handlers.js +1 -2
- package/package.json +11 -8
- package/scripts/uninstall.js +2 -0
- package/core/vector-store/hnsw-index.js +0 -751
|
@@ -14,7 +14,7 @@ import { colors, log, logProgress, logError, discoverFiles, readFilesFromStdin,
|
|
|
14
14
|
import { buildCodeGraph, buildVectorIndex, chunkFiles } from './indexer-build.js';
|
|
15
15
|
import { runDedupPhase, formatDedupSummary } from './dedup/dedup-phase.js';
|
|
16
16
|
import { DEDUP_CONFIG } from '../infrastructure/config/index.js';
|
|
17
|
-
import {
|
|
17
|
+
import { buildLateInteractionIndex, buildQuantizedArtifactsPhase } from './indexer-ann.js';
|
|
18
18
|
import { buildSparseGramArtifact } from './indexer-sparse-gram.js';
|
|
19
19
|
import { publishIndexerManifest } from './indexer-manifest.js';
|
|
20
20
|
import { contentHashSync } from '../incremental-indexing/infrastructure/hashing.mjs';
|
|
@@ -410,11 +410,31 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
|
|
|
410
410
|
: resourcePlan.threadsPerLateInteractionWorker;
|
|
411
411
|
const stagedLateInteractionPath = DB_PATHS.lateInteraction + '.tmp';
|
|
412
412
|
|
|
413
|
-
//
|
|
414
|
-
//
|
|
415
|
-
//
|
|
413
|
+
// ── Bounded-memory streaming path for large full rebuilds ──
|
|
414
|
+
//
|
|
415
|
+
// The in-memory path below materialises the WHOLE chunk corpus (chunkFiles →
|
|
416
|
+
// allChunks/texts) plus all exemplar embeddings, all alias rows, and every
|
|
417
|
+
// LI per-token slab — peak heap O(repo). On big repos (libsql ≈ 431k chunks,
|
|
418
|
+
// swc ≈ 217k) that exceeds the default ~4 GB heap and crashes on EVERY
|
|
419
|
+
// backend (CUDA/Metal/CoreML/ORT-CPU), since the hogs are JS-side, not the
|
|
420
|
+
// model. For large full rebuilds we instead spill chunks to disk and embed/LI
|
|
421
|
+
// in bounded windows (see streaming-vectors.js) so peak heap is O(window).
|
|
422
|
+
//
|
|
423
|
+
// Gated by file count so small repos + incremental runs keep the original
|
|
424
|
+
// in-memory path byte-for-byte (benchmark indexes unaffected). Auto-selected,
|
|
425
|
+
// no opt-in flag; SWEET_SEARCH_STREAM_VECTORS=0 forces the legacy path and
|
|
426
|
+
// SWEET_SEARCH_STREAM_MIN_FILES tunes the threshold.
|
|
427
|
+
const streamMinFiles = Number(process.env.SWEET_SEARCH_STREAM_MIN_FILES) || 5000;
|
|
428
|
+
const useStreaming = !dryRun
|
|
429
|
+
&& fullReindex
|
|
430
|
+
&& filesToIndex.length >= streamMinFiles
|
|
431
|
+
&& process.env.SWEET_SEARCH_STREAM_VECTORS !== '0';
|
|
432
|
+
|
|
433
|
+
// The in-memory path pre-chunks up front so both vector + LI encoders share
|
|
434
|
+
// one chunk list. The streaming path does its own windowed chunking + dedup,
|
|
435
|
+
// so skip this for it (this is the O(repo) allocation we're avoiding).
|
|
416
436
|
let preChunked = null;
|
|
417
|
-
if (!dryRun && filesToIndex.length > 0) {
|
|
437
|
+
if (!dryRun && !useStreaming && filesToIndex.length > 0) {
|
|
418
438
|
preChunked = await chunkFiles(filesToIndex);
|
|
419
439
|
|
|
420
440
|
// Near-duplicate dedup: annotates chunks in place with {simhash, clusterId,
|
|
@@ -562,6 +582,82 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
|
|
|
562
582
|
}
|
|
563
583
|
|
|
564
584
|
try {
|
|
585
|
+
// ── Streaming path: bounded-memory vectors + LI for large full rebuilds ──
|
|
586
|
+
if (useStreaming) {
|
|
587
|
+
const { getModelInfo } = await import('../embedding/embedding-service.js');
|
|
588
|
+
const { buildVectorsAndLiStreaming } = await import('./streaming-vectors.js');
|
|
589
|
+
const modelInfo = getModelInfo();
|
|
590
|
+
|
|
591
|
+
const streamed = await buildVectorsAndLiStreaming({
|
|
592
|
+
filesToIndex,
|
|
593
|
+
modelInfo,
|
|
594
|
+
sqliteFastMode,
|
|
595
|
+
noLateInteraction,
|
|
596
|
+
li: {
|
|
597
|
+
poolFactor: lateInteractionPool,
|
|
598
|
+
extendedSkiplist: lateInteractionExtendedSkiplist,
|
|
599
|
+
loadFromPath: DB_PATHS.lateInteraction,
|
|
600
|
+
saveToPath: stagedLateInteractionPath,
|
|
601
|
+
finalIndexPath: DB_PATHS.lateInteraction,
|
|
602
|
+
stagingSegmentDir: stagedLateInteractionSegmentDir(stagedLateInteractionPath),
|
|
603
|
+
workerCount: lateInteractionWorkers,
|
|
604
|
+
threadsPerWorker: lateInteractionWorkerThreads,
|
|
605
|
+
batchSize: resourcePlan.lateInteractionBatchSize,
|
|
606
|
+
batchSizeUpperCap: resourcePlan.lateInteractionBatchSizeUpperCap,
|
|
607
|
+
tokenBudget: resourcePlan.lateInteractionTokenBudget,
|
|
608
|
+
attentionBudget: resourcePlan.lateInteractionAttentionBudget,
|
|
609
|
+
},
|
|
610
|
+
});
|
|
611
|
+
|
|
612
|
+
// HCGS (off by default) runs independently of vectors — drain it if armed.
|
|
613
|
+
let hcgsResult = null;
|
|
614
|
+
if (hcgsPromise) {
|
|
615
|
+
try { hcgsResult = await hcgsPromise; } catch (e) { hcgsResult = { error: e.message }; }
|
|
616
|
+
if (hcgsResult && !hcgsResult.error) {
|
|
617
|
+
log(`Summaries regenerated (${hcgsResult.generated} generated, ${hcgsResult.skipped} skipped)`, 'green');
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
const vectorStats = streamed.vectorStats || { chunks: 0, embeddings: 0 };
|
|
622
|
+
if (vectorStats.embeddings > 0) await markPhaseComplete('vectors');
|
|
623
|
+
|
|
624
|
+
// Promote the staged LI index (built bounded), or invalidate on failure —
|
|
625
|
+
// same contract as the in-memory path's swap/invalidate below.
|
|
626
|
+
let lateInteractionResult = streamed.lateInteractionResult;
|
|
627
|
+
if (!noLateInteraction) {
|
|
628
|
+
if (streamed.liBuilt && lateInteractionResult && !lateInteractionResult.error) {
|
|
629
|
+
await atomicSwapLateInteractionIndex(stagedLateInteractionPath, DB_PATHS.lateInteraction);
|
|
630
|
+
log('Late interaction index promoted', 'green');
|
|
631
|
+
await markPhaseComplete('late-interaction');
|
|
632
|
+
} else {
|
|
633
|
+
await cleanupStagedLateInteractionIndex(stagedLateInteractionPath);
|
|
634
|
+
await invalidateLateInteractionIndex();
|
|
635
|
+
if (lateInteractionResult?.error) {
|
|
636
|
+
log(`Late interaction rebuild failed; invalidated existing index: ${lateInteractionResult.error}`, 'yellow');
|
|
637
|
+
lateInteractionResult = { error: lateInteractionResult.error, invalidated: true };
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
// Binary HNSW + int8 artifacts stream from the swapped codebase.db.
|
|
643
|
+
if (vectorStats.embeddings > 0) {
|
|
644
|
+
await updatePhaseProgress({ phase: 'artifacts', status: 'in_progress' });
|
|
645
|
+
await buildQuantizedArtifactsPhase(dryRun, {
|
|
646
|
+
changedFiles: filesToIndex.length,
|
|
647
|
+
force: forceArtifacts || fullReindex,
|
|
648
|
+
});
|
|
649
|
+
await markPhaseComplete('artifacts');
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
let sparseGramResult = null;
|
|
653
|
+
if (Array.isArray(allFiles) && allFiles.length > 0) {
|
|
654
|
+
sparseGramResult = await buildSparseGramArtifact(allFiles, dryRun);
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
await clearPhaseProgress();
|
|
658
|
+
return { vectorStats, hcgsResult, lateInteractionResult, sparseGramResult };
|
|
659
|
+
}
|
|
660
|
+
|
|
565
661
|
const vectorPromise = buildVectorIndex(filesToIndex, dryRun, vectorOptions);
|
|
566
662
|
|
|
567
663
|
// Compute LI file removal list (used by both parallel and sequential paths)
|
|
@@ -633,25 +729,6 @@ export async function buildVectorsAndArtifactsPhase(options = {}) {
|
|
|
633
729
|
await markPhaseComplete('vectors');
|
|
634
730
|
}
|
|
635
731
|
|
|
636
|
-
try {
|
|
637
|
-
if (!dryRun && vectorStats.embeddings > 0) {
|
|
638
|
-
await updatePhaseProgress({ phase: 'hnsw', status: 'in_progress' });
|
|
639
|
-
if (incrementalInfo && !fullReindex) {
|
|
640
|
-
const allFilesToRemoveFromHNSW = [
|
|
641
|
-
...incrementalInfo.toIndex,
|
|
642
|
-
...(incrementalInfo.toRemove || [])
|
|
643
|
-
];
|
|
644
|
-
await incrementalUpdateHNSW(DB_PATHS.codebase, allFilesToRemoveFromHNSW, dryRun);
|
|
645
|
-
} else {
|
|
646
|
-
await buildHNSWIndex(DB_PATHS.codebase, dryRun);
|
|
647
|
-
}
|
|
648
|
-
await markPhaseComplete('hnsw');
|
|
649
|
-
}
|
|
650
|
-
} catch (err) {
|
|
651
|
-
await cleanupStagedLateInteractionIndex(stagedLateInteractionPath);
|
|
652
|
-
throw err;
|
|
653
|
-
}
|
|
654
|
-
|
|
655
732
|
let lateInteractionResult = liOutcome.result;
|
|
656
733
|
|
|
657
734
|
if (!liPromise && !dryRun && !noLateInteraction && (preChunked?.allChunks?.length > 0 || filesToRemoveFromLI.length > 0)) {
|
|
@@ -811,7 +888,6 @@ export function printSummaryPhase(options) {
|
|
|
811
888
|
if (!vectorsOnly) log(` - ${DB_PATHS.codeGraph}`, 'green');
|
|
812
889
|
if (!graphOnly) {
|
|
813
890
|
log(` - ${DB_PATHS.codebase}`, 'green');
|
|
814
|
-
log(` - ${DB_PATHS.hnswIndex}`, 'green');
|
|
815
891
|
if (existsSync(DB_PATHS.binaryHnswIndex.replace('.idx', '.meta.json'))) {
|
|
816
892
|
log(` - ${DB_PATHS.binaryHnswIndex} (Binary HNSW, 32x smaller)`, 'green');
|
|
817
893
|
}
|
|
@@ -33,11 +33,27 @@ import { existsSync, readFileSync } from 'node:fs';
|
|
|
33
33
|
import { dirname, join } from 'node:path';
|
|
34
34
|
import { fileURLToPath } from 'node:url';
|
|
35
35
|
import { reconcileEnablement } from '../incremental-indexing/domain/interval-autotune.mjs';
|
|
36
|
+
import { applyBackgroundPriority } from './os-priority.mjs';
|
|
36
37
|
|
|
37
38
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
38
39
|
|
|
39
40
|
export const MAINTAINER_LOCK_FILENAME = 'index-maintainer.lock';
|
|
40
41
|
|
|
42
|
+
/**
 * Background-priority gate (research §4.A A.2/A.3).
 *
 * Reads `SWEET_SEARCH_MAINTAINER_BG_PRIORITY` from the supplied environment.
 * The gate is ON unless the variable is explicitly set to one of the off-tokens
 * `0`, `false` or `off` (compared after trimming whitespace, case-insensitive).
 * An unset or empty variable, and any other value, leaves the gate ON.
 *
 * @param {NodeJS.ProcessEnv} env Environment map to read the flag from.
 * @returns {boolean} `true` when background priority should be applied.
 */
function bgPriorityEnabled(env) {
  const setting = env.SWEET_SEARCH_MAINTAINER_BG_PRIORITY;
  const isUnset = setting === undefined || setting === null || setting === '';
  if (isUnset) return true; // default-on

  switch (String(setting).trim().toLowerCase()) {
    case '0':
    case 'false':
    case 'off':
      return false;
    default:
      return true;
  }
}
|
|
56
|
+
|
|
41
57
|
/** Default maintainer entry: the sibling daemon in this same context. */
|
|
42
58
|
export function defaultMaintainerEntry() {
|
|
43
59
|
return join(__dirname, 'index-maintainer.mjs');
|
|
@@ -128,6 +144,12 @@ export function launchMaintainer(options = {}) {
|
|
|
128
144
|
},
|
|
129
145
|
});
|
|
130
146
|
child.unref();
|
|
147
|
+
// Demote the detached child to OS background priority (best-effort, never
|
|
148
|
+
// throws). Runs in this foreground caller, targeting the child by pid, so
|
|
149
|
+
// only the child is demoted. Gate default-on (Tier-1, output-identical).
|
|
150
|
+
if (bgPriorityEnabled(env)) {
|
|
151
|
+
applyBackgroundPriority(child.pid);
|
|
152
|
+
}
|
|
131
153
|
log(`maintainer spawned (pid ${child.pid}, detached)`);
|
|
132
154
|
return { spawned: true, reason: 'spawned', pid: child.pid, stateDir };
|
|
133
155
|
} catch (err) {
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Event-driven maintainer file watcher (G6, Phase 3, gated).
|
|
3
|
+
*
|
|
4
|
+
* Plan: docs/INDEX_MAINTAINER_EFFICIENCY_IMPLEMENTATION_PLAN.md § "G6".
|
|
5
|
+
* Research: docs/INDEX_MAINTAINER_EFFICIENCY_RESEARCH.md (lever C — event-driven
|
|
6
|
+
* watching + a rare reconcile backstop).
|
|
7
|
+
*
|
|
8
|
+
* This module replaces the per-tick full `stat()` walk as the PRIMARY dirty-set
|
|
9
|
+
* producer with native filesystem events (`@parcel/watcher` — FSEvents on macOS,
|
|
10
|
+
* inotify on Linux, ReadDirectoryChangesW on Windows; the watcher VS Code uses).
|
|
11
|
+
* The full stat-walk is NOT removed: G4 demotes it to a periodic backstop so the
|
|
12
|
+
* correctness guarantee (eventual convergence — e.g. a dir becoming gitignored
|
|
13
|
+
* with no file event still gets retired) is preserved exactly.
|
|
14
|
+
*
|
|
15
|
+
* Ownership boundary (single-writer rule): this file is owned by G6. The three
|
|
16
|
+
* call sites in `index-maintainer.mjs` (start after the lock, early-wake in the
|
|
17
|
+
* sleep loop, teardown in `finally`) are owned by G4 and are NOT edited here —
|
|
18
|
+
* this module matches the exact `startWatcher({stateDir, projectRoot,
|
|
19
|
+
* admissionPolicy, onEvent, onOverflow})` contract G4 wired (and returns a
|
|
20
|
+
* handle with a `.close()` method G4's `finally` calls).
|
|
21
|
+
*
|
|
22
|
+
* Gate: `SWEET_SEARCH_MAINTAINER_WATCH === '1'` (checked by G4 before this module
|
|
23
|
+
* is imported). When the flag is off OR `@parcel/watcher` is unavailable,
|
|
24
|
+
* `startWatcher` returns `null` and behavior is EXACTLY today's: G4 sees a falsy
|
|
25
|
+
* handle, `watcherState.active` stays false, and the full per-tick walk remains
|
|
26
|
+
* the sole producer.
|
|
27
|
+
*
|
|
28
|
+
* Highest-severity risk (event-storm guard): the daemon writes its own queue +
|
|
29
|
+
* databases under the `.sweet-search` stateDir. Those writes must NEVER
|
|
30
|
+
* re-trigger the watcher, or each enqueue would feed back into a new enqueue. The
|
|
31
|
+
* resolved stateDir is therefore the first entry in the native `ignore` list, and
|
|
32
|
+
* a redundant in-handler stateDir-prefix guard backs it up (defense in depth).
|
|
33
|
+
*
|
|
34
|
+
* The watcher NEVER touches merkle and NEVER makes the final admit decision: it
|
|
35
|
+
* appends candidate paths to the queue exactly as the dirty-scan producer does
|
|
36
|
+
* (same line shape), and the consumer (`production-reconciler`) re-admits +
|
|
37
|
+
* content-hashes each file. A false-positive enqueue is harmless (the consumer
|
|
38
|
+
* drops it); a missed event is caught by the periodic backstop walk.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
import fs from 'node:fs';
|
|
42
|
+
import path from 'node:path';
|
|
43
|
+
|
|
44
|
+
// The dirty queue file name MUST match the dirty-scan producer + the reconcile
|
|
45
|
+
// consumer (`dirty-scan.mjs` DIRTY_QUEUE). Kept as a literal here (rather than
|
|
46
|
+
// imported) to avoid a cross-module import cycle through the reconciler.
|
|
47
|
+
const DIRTY_QUEUE = 'index-maintainer-queue.jsonl';
|
|
48
|
+
|
|
49
|
+
// The FSEvents snapshot is written OUTSIDE the watched tree's event surface (it
|
|
50
|
+
// lives under stateDir, which is itself ignored) so writing it on shutdown can
|
|
51
|
+
// never produce a spurious startup event. `getEventsSince` replays the gap
|
|
52
|
+
// between the last clean shutdown and the next startup for gap-free freshness.
|
|
53
|
+
const SNAPSHOT_FILE = 'maintainer-watch-snapshot.bin';
|
|
54
|
+
|
|
55
|
+
// Mirror of `path-filter.mjs` DEFAULT_DENY_DIRS (that const is module-local and
|
|
56
|
+
// not exported). These are scoped OUT of the native watch set so the OS never
|
|
57
|
+
// even reports events under them (inotify-watch-budget + event-volume control on
|
|
58
|
+
// Linux; FSEvents is O(1) regardless but the ignore still trims event volume).
|
|
59
|
+
// `admissionPolicy.isExcluded` is the authoritative per-event deny check below;
|
|
60
|
+
// this list only narrows what the kernel watches. The stateDir is prepended
|
|
61
|
+
// dynamically (it is the event-storm guard, not a generic deny dir).
|
|
62
|
+
const DEFAULT_DENY_DIRS = Object.freeze([
|
|
63
|
+
'node_modules',
|
|
64
|
+
'.git',
|
|
65
|
+
'.sweet-search',
|
|
66
|
+
'dist',
|
|
67
|
+
'build',
|
|
68
|
+
'.next',
|
|
69
|
+
'.nuxt',
|
|
70
|
+
'target',
|
|
71
|
+
'vendor',
|
|
72
|
+
'__pycache__',
|
|
73
|
+
'.venv',
|
|
74
|
+
'venv',
|
|
75
|
+
'.cache',
|
|
76
|
+
'.turbo',
|
|
77
|
+
'coverage',
|
|
78
|
+
'.parcel-cache',
|
|
79
|
+
'.svelte-kit',
|
|
80
|
+
'.vercel',
|
|
81
|
+
]);
|
|
82
|
+
|
|
83
|
+
// `.git` is denied as a watched subtree (above), but a branch switch / commit /
|
|
84
|
+
// reset mutates these two files and must force a backstop walk (bursty churn is
|
|
85
|
+
// handled by the bounded full walk, not unbounded per-file events). We poll them
|
|
86
|
+
// cheaply via mtime rather than carving a second native watch into the denied
|
|
87
|
+
// `.git` dir (which @parcel/watcher's `ignore` would otherwise suppress).
|
|
88
|
+
const GIT_BACKSTOP_FILES = Object.freeze(['HEAD', 'index']);
|
|
89
|
+
const GIT_POLL_INTERVAL_MS = 2000;
|
|
90
|
+
|
|
91
|
+
/**
 * Normalise an absolute path to a project-relative POSIX path, or `null` if the
 * path is not strictly inside `rootAbs`.
 *
 * Returns `null` when:
 *   - `absPath` is the root itself (empty relative path);
 *   - the relative path climbs out of the root (`..` or `../…`);
 *   - `path.relative` yields an absolute path (e.g. a different drive on
 *     Windows).
 *
 * Only a leading `..` path *segment* counts as "outside": an entry inside the
 * root whose name merely begins with two dots (`..hidden`, `..cfg/x.js`) is a
 * valid in-root path and is returned, not rejected.
 *
 * The comparison is purely lexical; callers are expected to pass values that
 * were resolved the same way (e.g. both realpath-resolved) so they compare
 * like with like.
 *
 * @param {string} absPath Absolute path to relativise.
 * @param {string} rootAbs Already resolved project root.
 * @returns {string|null} `/`-separated relative path, or `null`.
 */
function toRel(absPath, rootAbs) {
  const rel = path.relative(rootAbs, absPath);
  if (!rel || path.isAbsolute(rel)) return null;
  // Reject only a real parent-directory segment, not names that start with "..".
  if (rel === '..' || rel.startsWith(`..${path.sep}`)) return null;
  return rel.split(path.sep).join('/');
}
|
|
105
|
+
|
|
106
|
+
/**
 * Append one JSON line per relative path to the dirty queue file
 * (`<stateDir>/<DIRTY_QUEUE>`), creating `stateDir` first if needed.
 *
 * Each line has the shape
 *   {file_path, timestamp, queued_at, source}
 * where `timestamp` is epoch milliseconds, `queued_at` is the same instant as
 * an ISO-8601 string, and `source` is always `'watch'`. All lines of one call
 * share the same timestamp and are written with a single append.
 *
 * NOTE(review): the upstream comment states this mirrors the line shape written
 * by the dirty-scan producer (which uses `source: 'scan'`) and that the
 * consumer ignores `source`; neither is visible from this module — confirm
 * against `dirty-scan.mjs` and the reconciler.
 *
 * An empty `rels` is a no-op (nothing is created or written).
 *
 * @param {string} stateDir Directory holding the queue file.
 * @param {string[]} rels Project-relative POSIX paths.
 */
function appendQueueLines(stateDir, rels) {
  if (rels.length === 0) return;

  fs.mkdirSync(stateDir, { recursive: true });

  const stampMs = Date.now();
  const stampIso = new Date(stampMs).toISOString();

  let payload = '';
  for (const relPath of rels) {
    const record = { file_path: relPath, timestamp: stampMs, queued_at: stampIso, source: 'watch' };
    payload += `${JSON.stringify(record)}\n`;
  }

  const queueFile = path.join(stateDir, DIRTY_QUEUE);
  fs.appendFileSync(queueFile, payload);
}
|
|
126
|
+
|
|
127
|
+
/**
 * Build the native `ignore` list handed to the watcher: the resolved stateDir
 * FIRST (event-storm guard), followed by every entry of `DEFAULT_DENY_DIRS` in
 * declaration order. A fresh array is returned on each call.
 *
 * `rootAbs` is accepted for signature stability but is not used here.
 *
 * NOTE(review): the deny entries are passed as bare directory names. The
 * original comment says the native matcher matches them at any depth; per the
 * @parcel/watcher docs, non-glob `ignore` entries are treated as paths
 * (relative ones resolved against the watched directory), which would only
 * cover top-level directories — confirm, and consider `**`-style globs if
 * nested deny dirs must be excluded natively. The per-event admission check
 * in the handler still applies either way.
 *
 * @param {string} rootAbs Resolved project root (currently unused).
 * @param {string} stateDirAbs Resolved state directory to ignore.
 * @returns {string[]} `[stateDirAbs, ...DEFAULT_DENY_DIRS]`.
 */
function buildIgnore(rootAbs, stateDirAbs) {
  return [stateDirAbs, ...DEFAULT_DENY_DIRS];
}
|
|
142
|
+
|
|
143
|
+
/**
 * Lazily import `@parcel/watcher`.
 *
 * Resolves to the module's default export (or the namespace itself when there
 * is no default) provided it exposes a `subscribe` function. Resolves to `null`
 * when the import fails for any reason or the loaded object has no usable
 * `subscribe`, so callers can fall back without the watcher. Never rejects.
 *
 * @returns {Promise<object|null>} The watcher API, or `null` if unavailable.
 */
async function loadParcelWatcher() {
  try {
    const ns = await import('@parcel/watcher');
    const api = ns.default ?? ns;
    const usable = Boolean(api) && typeof api.subscribe === 'function';
    return usable ? api : null;
  } catch {
    // Missing package or native binding: treat as "no watcher available".
    return null;
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Start the event-driven maintainer watcher.
 *
 * Contract:
 *   startWatcher({ stateDir, projectRoot, admissionPolicy, onEvent, onOverflow })
 *     -> Promise<{ close(): Promise<void> } | null>
 *
 * Resolves to `null` (no watcher) when `projectRoot` or `stateDir` is missing,
 * when `@parcel/watcher` cannot be loaded, or when subscribing throws.
 *
 * Sequence:
 *   1. Resolve realpaths of the root and stateDir (falling back to the resolved
 *      path if realpath fails) so event paths and the stateDir guard compare
 *      like with like.
 *   2. If a snapshot file exists under stateDir, replay `getEventsSince` into
 *      the queue BEFORE subscribing. A failed replay requests a backstop walk.
 *   3. Subscribe. A native error on the callback, or a fault while handling a
 *      batch, requests a backstop walk instead of propagating.
 *   4. Poll `.git` backstop files every `GIT_POLL_INTERVAL_MS`; a change
 *      requests a backstop walk. The timer is unref'd so it does not keep the
 *      process alive.
 *
 * - `onEvent()` is invoked (via the batch handler) after a batch that enqueued
 *   at least one candidate.
 * - `onOverflow()` is invoked for every "request a backstop walk" case above.
 *   Exceptions thrown by it are swallowed (best-effort).
 *
 * The returned handle's `close()` clears the git poll timer, writes a snapshot
 * under stateDir, then unsubscribes; each step is best-effort and never throws.
 *
 * @param {object} opts
 * @param {string} opts.stateDir         State directory (ignored by the watch).
 * @param {string} opts.projectRoot      Repo root to watch.
 * @param {object} opts.admissionPolicy  Object exposing `isExcluded(rel)`.
 * @param {Function} [opts.onEvent]      Called after a batch enqueued candidates.
 * @param {Function} [opts.onOverflow]   Called when a backstop walk is needed.
 * @returns {Promise<{close: () => Promise<void>} | null>}
 */
export async function startWatcher({ stateDir, projectRoot, admissionPolicy, onEvent, onOverflow } = {}) {
  if (!projectRoot || !stateDir) return null;

  const watcher = await loadParcelWatcher();
  if (!watcher) return null;

  // Realpath both sides so prefix/relative comparisons are consistent.
  const rootAbs = safeRealpath(path.resolve(projectRoot));
  const stateDirAbs = safeRealpath(path.resolve(stateDir));
  const stateDirPrefix = stateDirAbs.endsWith(path.sep) ? stateDirAbs : stateDirAbs + path.sep;

  const notify = typeof onEvent === 'function' ? onEvent : () => {};
  const overflow = typeof onOverflow === 'function' ? onOverflow : () => {};
  // Single best-effort wrapper: an overflow callback fault must never escape.
  const requestBackstop = () => {
    try { overflow(); } catch { /* best-effort */ }
  };

  const snapshotPath = path.join(stateDirAbs, SNAPSHOT_FILE);
  const ignore = buildIgnore(rootAbs, stateDirAbs);
  const subscribeOpts = { ignore };
  const eventCtx = { rootAbs, stateDirPrefix, stateDir, admissionPolicy, notify };

  // Replay events recorded since the last snapshot before going live.
  try {
    if (fs.existsSync(snapshotPath)) {
      const missedEvents = await watcher.getEventsSince(rootAbs, snapshotPath, subscribeOpts);
      handleEvents(missedEvents, eventCtx);
    }
  } catch {
    // A failed replay is non-fatal: ask for a backstop walk to cover the gap.
    requestBackstop();
  }

  const onNativeBatch = (err, events) => {
    // Native error / overflow: the event stream cannot be trusted.
    if (err) {
      requestBackstop();
      return;
    }
    try {
      handleEvents(events, eventCtx);
    } catch {
      // A handler fault must never crash the daemon; fall back to backstop.
      requestBackstop();
    }
  };

  let subscription = null;
  try {
    subscription = await watcher.subscribe(rootAbs, onNativeBatch, subscribeOpts);
  } catch {
    // Could not subscribe: report "no watcher" so the caller keeps its fallback.
    return null;
  }

  // Poll the `.git` backstop files; a change requests a backstop walk.
  const gitDir = path.join(rootAbs, '.git');
  const gitMtimes = new Map();
  primeGitMtimes(gitDir, gitMtimes);
  const gitTimer = setInterval(() => {
    try {
      if (gitChanged(gitDir, gitMtimes)) {
        requestBackstop();
      }
    } catch { /* best-effort */ }
  }, GIT_POLL_INTERVAL_MS);
  if (gitTimer?.unref) gitTimer.unref();

  return {
    async close() {
      clearInterval(gitTimer);
      // Persist a snapshot under stateDir for replay on next startup.
      try {
        fs.mkdirSync(stateDirAbs, { recursive: true });
        await watcher.writeSnapshot(rootAbs, snapshotPath, subscribeOpts);
      } catch { /* best-effort */ }
      try {
        if (subscription && typeof subscription.unsubscribe === 'function') {
          await subscription.unsubscribe();
        }
      } catch { /* best-effort */ }
    },
  };
}
|
|
269
|
+
|
|
270
|
+
/**
 * Turn a batch of native watcher events into dirty-queue lines. Used for both
 * live subscription batches and the startup replay.
 *
 * Each event passes through these filters, in order; the first match drops it:
 *   1. malformed — no non-empty string `path`;
 *   2. event-storm guard — the path is the stateDir itself or lies under it;
 *   3. relativise — the path is not inside the project root, or its relative
 *      path was already accepted earlier in this batch;
 *   4. directory guard — for non-`delete` events, the path currently stats as a
 *      directory. A failed stat (path vanished) does NOT drop the event, and
 *      `delete` events are never stat'ed;
 *   5. admission deny — `admissionPolicy.isExcluded(rel)` returns truthy. A
 *      missing policy, or a policy that throws, does NOT drop the event.
 *
 * Surviving relative paths are appended to the queue in first-seen order with a
 * single `appendQueueLines` call, after which `notify()` is invoked
 * (exceptions from `notify` are swallowed). If nothing survives, nothing is
 * written and `notify` is not called.
 *
 * This is a pre-filter only; per the module notes the queue consumer makes the
 * final admit decision.
 *
 * @param {Array<{path:string,type:string}>} events Native event batch.
 * @param {object} ctx
 * @param {string} ctx.rootAbs         Resolved project root.
 * @param {string} ctx.stateDirPrefix  Resolved stateDir with trailing separator.
 * @param {string} ctx.stateDir        Directory the queue file is written to.
 * @param {object} ctx.admissionPolicy Object exposing `isExcluded(rel)`.
 * @param {Function} ctx.notify        Called once if anything was enqueued.
 */
function handleEvents(events, { rootAbs, stateDirPrefix, stateDir, admissionPolicy, notify }) {
  if (!Array.isArray(events) || events.length === 0) return;

  // Event-storm guard: never re-trigger on our own state writes.
  const isOwnState = (abs) => abs === stateDirPrefix.slice(0, -1) || abs.startsWith(stateDirPrefix);

  // Only files become queue lines; deletes cannot be stat'ed, so keep them.
  const isLiveDirectory = (type, abs) => {
    if (type === 'delete') return false;
    try {
      return fs.statSync(abs).isDirectory();
    } catch {
      // Vanished between event and stat: let the consumer decide.
      return false;
    }
  };

  // Deny-list check; a policy fault must not drop the event.
  const isDenied = (rel) => {
    try {
      return Boolean(
        admissionPolicy && typeof admissionPolicy.isExcluded === 'function' && admissionPolicy.isExcluded(rel),
      );
    } catch {
      return false;
    }
  };

  // Insertion-ordered, so it doubles as the de-dup set and the output list.
  const accepted = new Set();
  for (const ev of events) {
    const abs = ev && ev.path;
    if (typeof abs !== 'string' || abs.length === 0) continue;
    if (isOwnState(abs)) continue;

    const rel = toRel(abs, rootAbs);
    if (!rel || accepted.has(rel)) continue;

    if (isLiveDirectory(ev.type, abs)) continue;
    if (isDenied(rel)) continue;

    accepted.add(rel);
  }

  if (accepted.size === 0) return;
  appendQueueLines(stateDir, [...accepted]);
  try { notify(); } catch { /* best-effort */ }
}
|
|
332
|
+
|
|
333
|
+
/**
 * Resolve `p` to its canonical path via `fs.realpathSync`. If that throws
 * (for example because the path does not exist), return `p` unchanged.
 *
 * @param {string} p Path to canonicalise.
 * @returns {string} The realpath, or `p` itself on failure.
 */
function safeRealpath(p) {
  let canonical = p;
  try {
    canonical = fs.realpathSync(p);
  } catch {
    // Keep the caller's path as the fallback.
  }
  return canonical;
}
|
|
345
|
+
|
|
346
|
+
/**
 * Seed `store` with the current `mtimeMs` of every file named in
 * `GIT_BACKSTOP_FILES` inside `gitDir`. A file that cannot be stat'ed is
 * recorded as `0`, so every backstop name always ends up with an entry.
 *
 * @param {string} gitDir Directory containing the backstop files.
 * @param {Map<string, number>} store Map of file name -> last seen mtime (ms).
 */
function primeGitMtimes(gitDir, store) {
  GIT_BACKSTOP_FILES.forEach((name) => {
    let seenMtime;
    try {
      seenMtime = fs.statSync(path.join(gitDir, name)).mtimeMs;
    } catch {
      seenMtime = 0; // missing / unreadable
    }
    store.set(name, seenMtime);
  });
}
|
|
360
|
+
|
|
361
|
+
/**
 * Compare the current `mtimeMs` of each `GIT_BACKSTOP_FILES` entry in `gitDir`
 * with the value remembered in `store`, updating `store` for every entry that
 * differs. A file that cannot be stat'ed reads as `0`, so a file appearing or
 * disappearing (0 ↔ real mtime) counts as a change.
 *
 * All backstop files are always checked (no short-circuit), so `store` is fully
 * refreshed by each call.
 *
 * @param {string} gitDir Directory containing the backstop files.
 * @param {Map<string, number>} store Map of file name -> last seen mtime (ms).
 * @returns {boolean} `true` if at least one entry differed from `store`.
 */
function gitChanged(gitDir, store) {
  const readMtime = (name) => {
    try {
      return fs.statSync(path.join(gitDir, name)).mtimeMs;
    } catch {
      return 0;
    }
  };

  let sawChange = false;
  for (const name of GIT_BACKSTOP_FILES) {
    const current = readMtime(name);
    if (store.get(name) === current) continue;
    store.set(name, current);
    sawChange = true;
  }
  return sawChange;
}
|
|
384
|
+
|
|
385
|
+
// Test seam: pure helpers exercised directly by the unit tests so they do not
|
|
386
|
+
// need the native watcher to assert queue-line shape / filtering / git logic.
|
|
387
|
+
export const __testing = {
|
|
388
|
+
toRel,
|
|
389
|
+
buildIgnore,
|
|
390
|
+
appendQueueLines,
|
|
391
|
+
handleEvents,
|
|
392
|
+
gitChanged,
|
|
393
|
+
primeGitMtimes,
|
|
394
|
+
DIRTY_QUEUE,
|
|
395
|
+
SNAPSHOT_FILE,
|
|
396
|
+
DEFAULT_DENY_DIRS,
|
|
397
|
+
};
|