sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,163 @@
1
+ /**
2
+ * Crash-orphan temp sweep for the reconcile state directory.
3
+ *
4
+ * Every per-tier writer in the incremental-indexing context stages its
5
+ * output to a sibling temp path and then `rename`s it into the canonical
6
+ * name:
7
+ * - HNSW / Binary HNSW sidecars: `<name>.tmp.<pid>`
8
+ * - manifest / merkle / metrics (production-reconciler, index-maintainer,
9
+ * operator-cli, production-li-delta): `<name>.tmp.<pid>`
10
+ * - reconcile manifest + LI segment manifest: `<name>.json.tmp`
11
+ * - sparse-gram + LI segment compaction: `<name>.compacting.tmp`
12
+ * - tombstone bitmaps: `<name>.bin.tmp`
13
+ * - LI stub self-heal: `<name>.selfheal.tmp`
14
+ *
15
+ * The rename is atomic and the writers unlink their own temp on an
16
+ * in-process error, so under normal operation no temp survives a tick. A
17
+ * `SIGKILL` between stage and rename, however, leaves the temp orphaned —
18
+ * and because `*.tmp.<pid>` and sparse `*.compacting.tmp` carry
19
+ * per-process / per-epoch names, repeated crashes leak monotonically.
20
+ *
21
+ * Readers never consult these paths (they read canonical names plus the
22
+ * manifest-referenced delta/segment lists; `listDeltaSegments` and the LI
23
+ * segment manifest both ignore non-canonical suffixes), so an orphan is a
24
+ * disk-usage / operator-confusion problem, not a correctness one. This
25
+ * sweep runs once at daemon startup, AFTER the state lock is held (so the
26
+ * reconcile daemon is the single writer), and removes orphans older than a
27
+ * grace window. The grace window protects a temp another writer might still
28
+ * be mid-rename on — defensive belt-and-braces, since the lock already
29
+ * excludes a second reconcile daemon.
30
+ *
31
+ * Safety contract:
32
+ * - Only files whose basename matches one of the reconcile staging-temp
33
+ * suffixes above are ever removed. Canonical artifacts (`*.usearch`,
34
+ * `*.meta.json`, `*.vectors.json`, `*.db`, `*.ssgrmdelta`, `*.sslx`,
35
+ * `reconcile-manifest.json`, `*.idx`, `*.stale.bin`, `merkle-state.json`,
36
+ * `*.jsonl`, …) never match.
37
+ * - The cold-build full-artifact stages owned by the *indexer* context
38
+ * (`*.db.tmp`, `*.idx.tmp`) are deliberately NOT matched — those are
39
+ * fixed-name (overwritten on the next cold build, so they don't leak)
40
+ * and may be a large in-flight build the reconcile daemon must not
41
+ * touch.
42
+ * - SQLite `-wal` / `-shm`, the dirty / processing / dead-letter queues,
43
+ * the lockfile, and reader heartbeats are never matched and never
44
+ * removed.
45
+ * - Directories are never removed; the LI self-heal owns `*.tmp.segments`
46
+ * directories, which this sweep refuses to recurse into.
47
+ */
48
+
49
+ import fs from 'node:fs';
50
+ import path from 'node:path';
51
+
52
/** Default grace window in milliseconds; younger temps are left in place. */
export const DEFAULT_TMP_SWEEP_MAX_AGE_MS = 60_000;

// Staging temps that end in a numeric pid, e.g. `foo.tmp.12345`.
const PID_TMP_RE = /\.tmp\.\d+$/;

// Explicit allowlist of reconcile / maintenance staging-temp suffixes (the
// module header explains why this is an allowlist and not a `*.tmp`
// catch-all).
const ORPHAN_TEMP_SUFFIXES = [
  '.compacting.tmp', // sparse-gram + LI segment compaction
  '.selfheal.tmp', // LI stub self-heal
  '.json.tmp', // reconcile manifest + LI segment manifest
  '.bin.tmp', // tombstone bitmaps
];

/**
 * Decide whether a basename names a reconcile/maintenance crash-orphan
 * staging temp. A name qualifies when it ends in `.tmp.<digits>` or in one of
 * the allowlisted suffixes; everything else (canonical artifacts, cold-build
 * `*.db.tmp` / `*.idx.tmp` stages, non-strings, the empty string) does not.
 *
 * @param {string} name Basename to classify.
 * @returns {boolean} true when the name matches a sweepable temp pattern.
 */
export function isOrphanTempName(name) {
  if (typeof name !== 'string' || name === '') return false;
  return (
    PID_TMP_RE.test(name)
    || ORPHAN_TEMP_SUFFIXES.some((suffix) => name.endsWith(suffix))
  );
}
82
+
83
/**
 * Decide whether a state-dir subdirectory should be scanned for compaction
 * temps. Only names ending in `.deltas` (sparse-gram) or `.segments` (LI)
 * qualify, and any name containing `.tmp.` is rejected so the sweep never
 * descends into a `*.tmp.segments` directory that the LI self-heal is
 * responsible for migrating.
 *
 * @param {string} name Directory basename.
 * @returns {boolean} true when the directory may be swept.
 */
export function isScannableArtifactSubdir(name) {
  if (typeof name !== 'string' || name.includes('.tmp.')) return false;
  return ['.deltas', '.segments'].some((suffix) => name.endsWith(suffix));
}
97
+
98
/**
 * Sweep one directory (non-recursively) for orphan staging temps and record
 * the outcome on `ctx.summary`. Best-effort: an unreadable directory, a file
 * that cannot be stat'ed, or a failed unlink is skipped without raising.
 *
 * @param {string} dir Directory to scan.
 * @param {{maxAgeMs:number, now:number, summary:object}} ctx Shared sweep
 *   context; `summary` counters are mutated in place.
 * @returns {void}
 */
function sweepDir(dir, ctx) {
  let listing;
  try {
    listing = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return;
  }

  const { summary } = ctx;
  // Only regular files with an allowlisted temp name are candidates.
  const candidates = listing.filter(
    (entry) => entry.isFile() && isOrphanTempName(entry.name),
  );

  for (const { name } of candidates) {
    const target = path.join(dir, name);
    summary.scanned += 1;

    let info;
    try {
      info = fs.statSync(target);
    } catch {
      continue;
    }

    // Clamp to >= 0: `mtimeMs` is a sub-millisecond float while `now` is an
    // integer, so a file written microseconds ago can read as "in the
    // future" and produce a spuriously negative age.
    const age = Math.max(0, ctx.now - info.mtimeMs);
    if (age < ctx.maxAgeMs) {
      summary.skippedRecent += 1;
      continue;
    }

    try {
      fs.unlinkSync(target);
    } catch {
      // Tolerate races / permission issues — best-effort cleanup.
      continue;
    }
    summary.removed += 1;
    summary.bytesReclaimed += info.size;
    summary.removedPaths.push(target);
  }
}
130
+
131
/**
 * Remove crash-orphaned reconcile staging temps from the state directory.
 * The top level is swept first, then each immediate subdirectory accepted by
 * `isScannableArtifactSubdir`. No environment access happens here beyond the
 * filesystem and the `Date.now()` fallback — callers thread `maxAgeMs` /
 * `now`.
 *
 * @param {string} stateDir State directory to sweep; a falsy value yields an
 *   all-zero summary.
 * @param {{maxAgeMs?:number, now?:number}} [opts] `maxAgeMs` must be a finite
 *   number >= 0 or the default grace window applies; `now` must be finite or
 *   `Date.now()` is used.
 * @returns {{scanned:number, removed:number, skippedRecent:number, bytesReclaimed:number, removedPaths:string[]}}
 */
export function sweepStaleArtifactTemps(stateDir, opts = {}) {
  const summary = {
    scanned: 0,
    removed: 0,
    skippedRecent: 0,
    bytesReclaimed: 0,
    removedPaths: [],
  };
  if (!stateDir) return summary;

  let maxAgeMs = DEFAULT_TMP_SWEEP_MAX_AGE_MS;
  if (Number.isFinite(opts.maxAgeMs) && opts.maxAgeMs >= 0) {
    maxAgeMs = opts.maxAgeMs;
  }
  let now = opts.now;
  if (!Number.isFinite(now)) now = Date.now();
  const ctx = { maxAgeMs, now, summary };

  let topLevel;
  try {
    topLevel = fs.readdirSync(stateDir, { withFileTypes: true });
  } catch {
    return summary;
  }

  sweepDir(stateDir, ctx);
  const artifactSubdirs = topLevel.filter(
    (entry) => entry.isDirectory() && isScannableArtifactSubdir(entry.name),
  );
  for (const { name } of artifactSubdirs) {
    sweepDir(path.join(stateDir, name), ctx);
  }
  return summary;
}
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Baseline-readiness gate for the default-on incremental maintainer.
3
+ *
4
+ * Product contract: the incremental reconciler must NEVER be the first index
5
+ * builder for a non-empty repo. The first index must come from the normal full
6
+ * indexing path (`sweet-search index`). Before a complete baseline exists, the
7
+ * maintainer stays dormant and reports `waiting_for_initial_index`; once a
8
+ * complete baseline exists (including a valid empty one) reconcile runs normally.
9
+ *
10
+ * Why this is needed: the daemon's tick is a producer (`dirty-scan.mjs` diffs the
11
+ * tree against `merkle-state.json`) plus a consumer (`production-reconciler.mjs`,
12
+ * whose adapters call `createVectorSchema`/`createGraphSchema`). With no baseline,
13
+ * `merkle-state.json` is absent so the producer enqueues the WHOLE tree, and the
14
+ * consumer then builds `codebase.db` / `code-graph.db` / HNSW / LI / sparse from
15
+ * scratch one budget-bounded tick at a time — leaving a PARTIAL index that search
16
+ * mistakes for a complete one.
17
+ *
18
+ * "Complete baseline" is proven by what the FULL indexer writes in its final phase
19
+ * (`indexing/indexer-phases.js::updateIncrementalStatePhase`), which the
20
+ * incremental reconciler does NOT produce on its own:
21
+ *
22
+ * 1. `reconcile-manifest.json` published at `epoch >= 1`. The full indexer
23
+ * publishes this as its LAST step, so its presence means vectors + graph +
24
+ * HNSW + LI + sparse all finished building. A crash before that step leaves
25
+ * no manifest; a corrupt manifest reads back as null. (epoch alone is NOT a
26
+ * discriminator: a reconciler-only first tick also yields epoch 1.)
27
+ * 2. `merkle-state.json` carrying a `config_fingerprint`. ONLY the full
28
+ * indexer's tracker (`indexing/incremental-tracker.js::updateState`) writes
29
+ * this field; the reconciler's `persistManifest` never adds it (it only
30
+ * preserves one already present). So `config_fingerprint` present ⟺ a full
31
+ * index ran at least once — the exact signal that distinguishes a real
32
+ * baseline from the reconciler-only partial state the old bug produced.
33
+ * 3. The vectors DB named by the manifest exists on disk (the artifact search
34
+ * reads). Guards a manually-deleted / half-written baseline.
35
+ *
36
+ * A valid EMPTY baseline (a full index that produced an empty-but-valid index)
37
+ * satisfies 1-3 with zero tracked files, so it counts as ready. A
38
+ * partially-written, corrupt, or reconciler-only baseline fails 1 or 2 and does
39
+ * not. The check is read-only: it never mutates the state dir.
40
+ */
41
+
42
+ import fs from 'node:fs';
43
+ import path from 'node:path';
44
+ import { readManifest } from './manifest.mjs';
45
+
46
/** Status label surfaced in logs and `reconcile status` when no baseline exists. */
export const WAITING_FOR_INITIAL_INDEX = 'waiting_for_initial_index';

// File names resolved relative to the state directory.
const MERKLE_STATE = 'merkle-state.json';
const DEFAULT_VECTORS_DB = 'codebase.db';

/**
 * Read and parse a UTF-8 JSON file, treating every failure (missing file,
 * unreadable file, malformed JSON) as "no data".
 *
 * @param {string} filePath
 * @returns {*} The parsed value, or null when reading or parsing fails.
 */
function readJsonSafe(filePath) {
  try {
    const text = fs.readFileSync(filePath, 'utf8');
    return JSON.parse(text);
  } catch {
    return null;
  }
}
59
+
60
/**
 * Report whether a parsed merkle state carries a usable `config_fingerprint`.
 * The full indexer writes a populated object; the reconciler never adds one.
 * A non-empty string or an object with at least one own enumerable key both
 * count, for forward/backward tolerance; any other value does not.
 *
 * @param {object|null|undefined} merkle Parsed `merkle-state.json` contents.
 * @returns {boolean}
 */
function hasConfigFingerprint(merkle) {
  const fingerprint = merkle ? merkle.config_fingerprint : null;
  if (!fingerprint) return false;
  switch (typeof fingerprint) {
    case 'string':
      return fingerprint.length > 0;
    case 'object':
      return Object.keys(fingerprint).length > 0;
    default:
      return false;
  }
}
72
+
73
/**
 * Check whether a complete baseline index exists for `stateDir`. The checks
 * run in a fixed order and the first failure determines `reason`:
 * state dir exists → manifest readable → manifest epoch is an integer >= 1 →
 * merkle state readable → merkle state has a config fingerprint → the vectors
 * DB (manifest `vectors.path`, else `codebase.db`) exists on disk.
 *
 * @param {string} stateDir The `.sweet-search` directory.
 * @returns {{ready: boolean, reason: string}}
 *   `reason` is one of: `ready`, `no-state-dir`, `no-manifest`,
 *   `manifest-epoch-zero`, `no-merkle-state`, `no-config-fingerprint`,
 *   `missing-vectors-db`.
 */
export function hasCompleteBaseIndex(stateDir) {
  const notReady = (reason) => ({ ready: false, reason });

  if (!stateDir || !fs.existsSync(stateDir)) return notReady('no-state-dir');

  const manifest = readManifest(stateDir);
  if (!manifest) return notReady('no-manifest');

  const epochPublished = Number.isInteger(manifest.epoch) && manifest.epoch >= 1;
  if (!epochPublished) return notReady('manifest-epoch-zero');

  const merkle = readJsonSafe(path.join(stateDir, MERKLE_STATE));
  if (!merkle) return notReady('no-merkle-state');
  if (!hasConfigFingerprint(merkle)) return notReady('no-config-fingerprint');

  // A manifest-supplied path may be absolute; otherwise it is relative to
  // the state directory.
  const declaredPath = manifest.vectors && manifest.vectors.path;
  const vectorsRel = declaredPath || DEFAULT_VECTORS_DB;
  const vectorsPath = path.isAbsolute(vectorsRel)
    ? vectorsRel
    : path.join(stateDir, vectorsRel);
  if (!fs.existsSync(vectorsPath)) return notReady('missing-vectors-db');

  return { ready: true, reason: 'ready' };
}
111
+
112
/**
 * Baseline readiness extended with a `state` label for status surfaces:
 * `'indexed'` when the baseline is ready, otherwise the
 * `WAITING_FOR_INITIAL_INDEX` label.
 *
 * @param {string} stateDir
 * @returns {{ready: boolean, reason: string, state: 'indexed'|'waiting_for_initial_index'}}
 */
export function baselineStatus(stateDir) {
  const readiness = hasCompleteBaseIndex(stateDir);
  const state = readiness.ready ? 'indexed' : WAITING_FOR_INITIAL_INDEX;
  return { ...readiness, state };
}
@@ -0,0 +1,233 @@
1
+ /**
2
+ * In-memory dirty path set.
3
+ *
4
+ * Plan § 6.1, § 9.1-§ 9.5. The watcher and polling backstop both push
5
+ * paths into this set; the reconcile tick drains it at every tick start.
6
+ *
7
+ * Guarantees:
8
+ * - Paths are normalised to forward-slash form (cross-platform).
9
+ * - Duplicate inserts are coalesced (Set semantics).
10
+ * - Insertion order is preserved on drain for deterministic tests.
11
+ * - A bounded-size policy guards against burst overflow (50 k events
12
+ * from `git checkout` in <1 s — plan § 11). Past the cap, the set
13
+ * keeps the most recent entries and emits a `dropped` count via the
14
+ * callback so the next polling backstop sweep can re-discover them.
15
+ *
16
+ * The set has no global state. Path canonicalisation uses filesystem
17
+ * realpaths when an ancestor exists so watcher events cannot enter the
18
+ * dirty set through symlink escapes.
19
+ */
20
+
21
+ import fs from 'node:fs';
22
+ import path from 'node:path';
23
+
24
const DEFAULT_MAX = 100_000;

/**
 * Convert backslashes to forward slashes.
 *
 * @param {*} p
 * @returns {string|null} The normalised path, or null when `p` is not a string.
 */
function normalise(p) {
  if (typeof p !== 'string') return null;
  return p.replace(/\\/g, '/');
}

export class DirtySet {
  /**
   * @param {object} [options]
   * @param {number} [options.maxSize] Hard cap on entries. Numeric strings
   *   are converted to numbers; NaN and non-numeric values fall back to the
   *   default cap, because a NaN cap would make the `size >= maxSize` guard
   *   always false and silently disable overflow protection.
   * @param {(payload:{dropped:number, droppedTotal:number})=>void} [options.onOverflow]
   *   Invoked once per evicted entry.
   */
  constructor({ maxSize = DEFAULT_MAX, onOverflow } = {}) {
    // path → { addedAt, lastSeenAt, firstSource, lastSource, meta }
    this._set = new Map();
    const cap = typeof maxSize === 'string' ? Number(maxSize) : maxSize;
    this._maxSize = typeof cap === 'number' && !Number.isNaN(cap) ? cap : DEFAULT_MAX;
    this._onOverflow = onOverflow;
    this._totalEnqueued = 0;
    this._totalDropped = 0;
  }

  /** Number of paths currently queued. */
  get size() {
    return this._set.size;
  }

  /** Effective entry cap. */
  get maxSize() {
    return this._maxSize;
  }

  /**
   * Enqueue a path. `source` is one of:
   *   - 'watcher'  - notify / FSEvents / inotify
   *   - 'polling'  - mtime backstop sweep
   *   - 'cli'      - explicit `sweet-search index --add <path>` hint
   *   - 'queue'    - drained from index-maintainer-queue.jsonl
   *
   * Re-adding a queued path refreshes `lastSource` / `lastSeenAt` and
   * shallow-merges `meta`, but keeps the entry's original position. When the
   * set is at capacity, the oldest-inserted entry is evicted to make room.
   *
   * @param {string} filePath
   * @param {string} [source='watcher']
   * @param {object} [meta]
   * @returns {boolean} true when the path was inserted or refreshed; false
   *   only when `filePath` is not a non-empty string. Evicting another entry
   *   to make room still returns true.
   */
  add(filePath, source = 'watcher', meta) {
    const p = normalise(filePath);
    if (!p) return false;
    this._totalEnqueued += 1;

    const existing = this._set.get(p);
    if (existing !== undefined) {
      existing.lastSource = source;
      existing.lastSeenAt = Date.now();
      if (meta) existing.meta = { ...existing.meta, ...meta };
      return true;
    }

    if (this._set.size >= this._maxSize) {
      // Drop the oldest entry to keep the most recent — those reflect the
      // current edit pattern best. The dropped path will be re-discovered
      // by the next polling backstop sweep.
      const firstKey = this._set.keys().next().value;
      if (firstKey !== undefined) {
        this._set.delete(firstKey);
        this._totalDropped += 1;
        if (this._onOverflow) {
          this._onOverflow({ dropped: 1, droppedTotal: this._totalDropped });
        }
      }
    }

    const now = Date.now();
    this._set.set(p, {
      addedAt: now,
      lastSeenAt: now,
      firstSource: source,
      lastSource: source,
      meta: meta || null,
    });
    return true;
  }

  /**
   * Bulk-add an iterable of paths. Useful when polling identifies a
   * batch of dirty paths in one syscall pass.
   *
   * @param {Iterable<string>} paths
   * @param {string} [source='polling']
   * @returns {number} How many `add` calls returned true.
   */
  addMany(paths, source = 'polling') {
    let added = 0;
    for (const p of paths) {
      if (this.add(p, source)) added += 1;
    }
    return added;
  }

  /**
   * @param {string} filePath
   * @returns {boolean} true when the normalised path is queued.
   */
  has(filePath) {
    return this._set.has(normalise(filePath));
  }

  /**
   * Drain the set into an insertion-ordered array and clear it. Returns the
   * snapshot the caller should process this tick; subsequent inserts go into
   * the next tick.
   *
   * @returns {Array<{path:string, firstSource:string, lastSource:string, addedAt:number, lastSeenAt:number, meta:object|null}>}
   */
  drain() {
    const out = Array.from(this._set, ([p, entry]) => ({ path: p, ...entry }));
    this._set.clear();
    return out;
  }

  /**
   * Peek without draining — primarily for tests / debug.
   *
   * @returns {string[]} Queued paths in insertion order.
   */
  peek() {
    return Array.from(this._set.keys());
  }

  /**
   * Remove a single path without draining the rest.
   *
   * @param {string} filePath
   * @returns {boolean} true when an entry was removed.
   */
  remove(filePath) {
    return this._set.delete(normalise(filePath));
  }

  /**
   * Diagnostic counters for the operator dashboard (plan § 20.2).
   * `totalEnqueued` counts every accepted `add` call, including refreshes.
   *
   * @returns {{size:number, maxSize:number, totalEnqueued:number, totalDropped:number}}
   */
  stats() {
    return {
      size: this._set.size,
      maxSize: this._maxSize,
      totalEnqueued: this._totalEnqueued,
      totalDropped: this._totalDropped,
    };
  }
}
163
+
164
/**
 * Resolve a path to its canonical absolute form within a project root.
 * Plan § 22.1 / § 22.4: canonicalise to drop case-insensitive collisions
 * and to anchor paths inside the indexed tree.
 *
 * Two containment checks apply:
 *   - lexical (relative inputs only): rejects `../` traversal out of the
 *     resolved root before touching the filesystem;
 *   - realpath (all inputs): the nearest existing ancestor is realpath'ed,
 *     the missing tail is re-appended, and the result must sit inside the
 *     root's realpath. This catches symlinked parents pointing outside the
 *     worktree while still allowing delete events for missing files under a
 *     real in-tree parent.
 *
 * @param {string} projectRoot
 * @param {string} relativeOrAbsolute
 * @returns {string|null} Forward-slash canonical path, or null if the input
 *   is not a string, cannot be resolved, or escapes projectRoot.
 */
export function canonicaliseInsideRoot(projectRoot, relativeOrAbsolute) {
  if (typeof relativeOrAbsolute !== 'string') return null;

  const isAbsoluteInput = path.isAbsolute(relativeOrAbsolute);
  const candidate = path.resolve(
    isAbsoluteInput
      ? relativeOrAbsolute
      : path.resolve(projectRoot, relativeOrAbsolute),
  );
  const lexicalRoot = path.resolve(projectRoot);
  if (!isAbsoluteInput && !isInsidePath(lexicalRoot, candidate)) return null;

  const realRoot = realpathOrNull(lexicalRoot);
  if (!realRoot) return null;

  // Absolute inputs may walk up to the filesystem root; relative inputs stop
  // at the project root.
  const walkFloor = isAbsoluteInput ? path.parse(candidate).root : lexicalRoot;
  const anchor = nearestExistingAncestor(candidate, walkFloor);
  if (!anchor) return null;

  const realAnchor = realpathOrNull(anchor.path);
  if (!realAnchor) return null;

  let canonical = realAnchor;
  if (anchor.rest) canonical = path.join(realAnchor, anchor.rest);
  if (!isInsidePath(realRoot, canonical)) return null;

  return canonical.replace(/\\/g, '/');
}
204
+
205
/**
 * Lexical containment test: is `candidate` equal to `root` or located
 * beneath it? Performs no filesystem access.
 *
 * Only a relative path that IS `..` or whose first segment is `..` counts as
 * an escape. A bare `startsWith('..')` test would also reject legitimate
 * in-tree names that merely begin with two dots (e.g. `..cache/file.js`).
 *
 * @param {string} root
 * @param {string} candidate
 * @returns {boolean}
 */
function isInsidePath(root, candidate) {
  const rel = path.relative(root, candidate);
  if (rel === '') return true;
  // An absolute result means no relative route exists (e.g. another drive).
  if (path.isAbsolute(rel)) return false;
  return rel !== '..' && !rel.startsWith(`..${path.sep}`);
}
209
+
210
/**
 * Resolve `filePath` with `fs.realpathSync.native`, mapping any failure to
 * null instead of throwing.
 *
 * @param {string} filePath
 * @returns {string|null}
 */
function realpathOrNull(filePath) {
  let resolved = null;
  try {
    resolved = fs.realpathSync.native(filePath);
  } catch {
    resolved = null;
  }
  return resolved;
}
216
+ }
217
+
218
/**
 * Walk upward from `absPath` until a path that exists (per `lstat`, so
 * symlinks themselves count) is found, never climbing above `root`.
 *
 * @param {string} absPath Starting absolute path.
 * @param {string} root Upper bound of the walk.
 * @returns {{path:string, rest:string}|null} The existing ancestor plus the
 *   missing trailing segments joined with `path.sep` (empty when `absPath`
 *   itself exists); null when nothing exists at or below `root`, when the
 *   walk leaves `root`, or when `lstat` fails with anything other than
 *   ENOENT / ENOTDIR.
 */
function nearestExistingAncestor(absPath, root) {
  const missingLeafFirst = [];
  for (
    let probe = absPath;
    isInsidePath(root, probe);
    probe = path.dirname(probe)
  ) {
    let exists = true;
    let failureCode;
    try {
      fs.lstatSync(probe);
    } catch (err) {
      exists = false;
      failureCode = err?.code;
    }

    if (exists) {
      return { path: probe, rest: missingLeafFirst.reverse().join(path.sep) };
    }
    if (failureCode !== 'ENOENT' && failureCode !== 'ENOTDIR') return null;
    if (probe === root) return null;
    missingLeafFirst.push(path.basename(probe));
  }
  return null;
}
233
+ }