sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crash-orphan temp sweep for the reconcile state directory.
|
|
3
|
+
*
|
|
4
|
+
* Every per-tier writer in the incremental-indexing context stages its
|
|
5
|
+
* output to a sibling temp path and then `rename`s it into the canonical
|
|
6
|
+
* name:
|
|
7
|
+
* - HNSW / Binary HNSW sidecars: `<name>.tmp.<pid>`
|
|
8
|
+
* - manifest / merkle / metrics (production-reconciler, index-maintainer,
|
|
9
|
+
* operator-cli, production-li-delta): `<name>.tmp.<pid>`
|
|
10
|
+
* - reconcile manifest + LI segment manifest: `<name>.json.tmp`
|
|
11
|
+
* - sparse-gram + LI segment compaction: `<name>.compacting.tmp`
|
|
12
|
+
* - tombstone bitmaps: `<name>.bin.tmp`
|
|
13
|
+
* - LI stub self-heal: `<name>.selfheal.tmp`
|
|
14
|
+
*
|
|
15
|
+
* The rename is atomic and the writers unlink their own temp on an
|
|
16
|
+
* in-process error, so under normal operation no temp survives a tick. A
|
|
17
|
+
* `SIGKILL` between stage and rename, however, leaves the temp orphaned —
|
|
18
|
+
* and because `*.tmp.<pid>` and sparse `*.compacting.tmp` carry
|
|
19
|
+
* per-process / per-epoch names, repeated crashes leak monotonically.
|
|
20
|
+
*
|
|
21
|
+
* Readers never consult these paths (they read canonical names plus the
|
|
22
|
+
* manifest-referenced delta/segment lists; `listDeltaSegments` and the LI
|
|
23
|
+
* segment manifest both ignore non-canonical suffixes), so an orphan is a
|
|
24
|
+
* disk-usage / operator-confusion problem, not a correctness one. This
|
|
25
|
+
* sweep runs once at daemon startup, AFTER the state lock is held (so the
|
|
26
|
+
* reconcile daemon is the single writer), and removes orphans older than a
|
|
27
|
+
* grace window. The grace window protects a temp another writer might still
|
|
28
|
+
* be mid-rename on — defensive belt-and-braces, since the lock already
|
|
29
|
+
* excludes a second reconcile daemon.
|
|
30
|
+
*
|
|
31
|
+
* Safety contract:
|
|
32
|
+
* - Only files whose basename matches one of the reconcile staging-temp
|
|
33
|
+
* suffixes above are ever removed. Canonical artifacts (`*.usearch`,
|
|
34
|
+
* `*.meta.json`, `*.vectors.json`, `*.db`, `*.ssgrmdelta`, `*.sslx`,
|
|
35
|
+
* `reconcile-manifest.json`, `*.idx`, `*.stale.bin`, `merkle-state.json`,
|
|
36
|
+
* `*.jsonl`, …) never match.
|
|
37
|
+
* - The cold-build full-artifact stages owned by the *indexer* context
|
|
38
|
+
* (`*.db.tmp`, `*.idx.tmp`) are deliberately NOT matched — those are
|
|
39
|
+
* fixed-name (overwritten on the next cold build, so they don't leak)
|
|
40
|
+
* and may be a large in-flight build the reconcile daemon must not
|
|
41
|
+
* touch.
|
|
42
|
+
* - SQLite `-wal` / `-shm`, the dirty / processing / dead-letter queues,
|
|
43
|
+
* the lockfile, and reader heartbeats are never matched and never
|
|
44
|
+
* removed.
|
|
45
|
+
* - Directories are never removed; the LI self-heal owns `*.tmp.segments`
|
|
46
|
+
* directories, which this sweep refuses to recurse into.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
import fs from 'node:fs';
|
|
50
|
+
import path from 'node:path';
|
|
51
|
+
|
|
52
|
+
export const DEFAULT_TMP_SWEEP_MAX_AGE_MS = 60_000;
|
|
53
|
+
|
|
54
|
+
// pid-suffixed staging temps: `foo.tmp.12345`
|
|
55
|
+
const PID_TMP_RE = /\.tmp\.\d+$/;
|
|
56
|
+
|
|
57
|
+
// Reconcile / maintenance staging-temp suffixes (explicit allowlist — see the
|
|
58
|
+
// module header for why this is an allowlist and not a `*.tmp` catch-all).
|
|
59
|
+
const ORPHAN_TEMP_SUFFIXES = [
|
|
60
|
+
'.compacting.tmp', // sparse-gram + LI segment compaction
|
|
61
|
+
'.selfheal.tmp', // LI stub self-heal
|
|
62
|
+
'.json.tmp', // reconcile manifest + LI segment manifest
|
|
63
|
+
'.bin.tmp', // tombstone bitmaps
|
|
64
|
+
];
|
|
65
|
+
|
|
66
|
+
/**
 * Classify a basename as a reconcile / maintenance staging temp that a crash
 * may have left behind.
 *
 * A name qualifies when it ends in a pid-style `.tmp.<digits>` tail or in one
 * of the allowlisted staging suffixes. Everything else — including non-string
 * and empty input — is reported as "not an orphan".
 *
 * @param {string} name Basename to classify.
 * @returns {boolean} `true` only for an allowlisted staging-temp name.
 */
export function isOrphanTempName(name) {
  const isUsableName = typeof name === 'string' && name.length > 0;
  if (!isUsableName) return false;
  return PID_TMP_RE.test(name)
    || ORPHAN_TEMP_SUFFIXES.some((suffix) => name.endsWith(suffix));
}
|
|
82
|
+
|
|
83
|
+
/**
 * Decide whether a top-level directory name is an artifact subdir the sweep
 * may look inside: one whose name ends in `.deltas` or `.segments`.
 *
 * Any name containing `.tmp.` is rejected first, so a `*.tmp.segments`
 * directory is never scanned. Non-string input is rejected as well.
 *
 * @param {string} name Directory basename.
 * @returns {boolean} `true` when the directory should be swept.
 */
export function isScannableArtifactSubdir(name) {
  if (typeof name !== 'string' || name.includes('.tmp.')) return false;
  return ['.deltas', '.segments'].some((suffix) => name.endsWith(suffix));
}
|
|
97
|
+
|
|
98
|
+
/**
 * Sweep a single directory (no recursion) and unlink aged staging temps.
 *
 * Only regular-file entries whose basename passes `isOrphanTempName` are
 * considered. Each considered entry bumps `summary.scanned`; entries younger
 * than `ctx.maxAgeMs` bump `summary.skippedRecent`; successfully unlinked
 * entries update `removed`, `bytesReclaimed` and `removedPaths`.
 *
 * @param {string} dir Directory to sweep.
 * @param {{maxAgeMs:number, now:number, summary:object}} ctx Shared sweep
 *   settings and the mutable summary the results are accumulated into.
 * @returns {void}
 */
function sweepDir(dir, ctx) {
  let dirents;
  try {
    dirents = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    // Missing or unreadable directory: nothing to sweep.
    return;
  }

  const candidates = dirents.filter(
    (dirent) => dirent.isFile() && isOrphanTempName(dirent.name),
  );

  for (const { name } of candidates) {
    const target = path.join(dir, name);
    ctx.summary.scanned += 1;

    let info;
    try {
      info = fs.statSync(target);
    } catch {
      // Entry vanished (or became unreadable) after the listing was taken.
      continue;
    }

    // `mtimeMs` is fractional while `now` is an integer, so a file written
    // moments ago can look like it is from the future; floor the age at 0.
    const ageMs = Math.max(0, ctx.now - info.mtimeMs);
    const isStillFresh = ageMs < ctx.maxAgeMs;
    if (isStillFresh) {
      ctx.summary.skippedRecent += 1;
      continue;
    }

    try {
      fs.unlinkSync(target);
      ctx.summary.removed += 1;
      ctx.summary.bytesReclaimed += info.size;
      ctx.summary.removedPaths.push(target);
    } catch {
      // Best-effort cleanup: races and permission problems are tolerated.
    }
  }
}
|
|
130
|
+
|
|
131
|
+
/**
 * Delete crash-orphaned reconcile staging temps under the state directory.
 *
 * The sweep covers the state directory itself plus its immediate `*.deltas` /
 * `*.segments` subdirectories (as accepted by `isScannableArtifactSubdir`).
 * No environment is consulted: the age threshold and clock come from `opts`,
 * falling back to `DEFAULT_TMP_SWEEP_MAX_AGE_MS` and `Date.now()`.
 *
 * @param {string} stateDir Directory to sweep; a falsy or unreadable value
 *   yields an all-zero summary.
 * @param {{maxAgeMs?:number, now?:number}} [opts] `maxAgeMs` must be a finite
 *   number >= 0 and `now` a finite number, otherwise the defaults apply.
 * @returns {{scanned:number, removed:number, skippedRecent:number, bytesReclaimed:number, removedPaths:string[]}}
 */
export function sweepStaleArtifactTemps(stateDir, opts = {}) {
  const summary = {
    scanned: 0,
    removed: 0,
    skippedRecent: 0,
    bytesReclaimed: 0,
    removedPaths: [],
  };
  if (!stateDir) return summary;

  const hasValidMaxAge = Number.isFinite(opts.maxAgeMs) && opts.maxAgeMs >= 0;
  const ctx = {
    maxAgeMs: hasValidMaxAge ? opts.maxAgeMs : DEFAULT_TMP_SWEEP_MAX_AGE_MS,
    now: Number.isFinite(opts.now) ? opts.now : Date.now(),
    summary,
  };

  let topLevel;
  try {
    topLevel = fs.readdirSync(stateDir, { withFileTypes: true });
  } catch {
    return summary;
  }

  const artifactSubdirs = topLevel
    .filter((dirent) => dirent.isDirectory() && isScannableArtifactSubdir(dirent.name))
    .map((dirent) => path.join(stateDir, dirent.name));

  // Top level first, then each eligible artifact subdirectory.
  sweepDir(stateDir, ctx);
  for (const subdir of artifactSubdirs) {
    sweepDir(subdir, ctx);
  }
  return summary;
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline-readiness gate for the default-on incremental maintainer.
|
|
3
|
+
*
|
|
4
|
+
* Product contract: the incremental reconciler must NEVER be the first index
|
|
5
|
+
* builder for a non-empty repo. The first index must come from the normal full
|
|
6
|
+
* indexing path (`sweet-search index`). Before a complete baseline exists, the
|
|
7
|
+
* maintainer stays dormant and reports `waiting_for_initial_index`; once a
|
|
8
|
+
* complete baseline exists (including a valid empty one) reconcile runs normally.
|
|
9
|
+
*
|
|
10
|
+
* Why this is needed: the daemon's tick is a producer (`dirty-scan.mjs` diffs the
|
|
11
|
+
* tree against `merkle-state.json`) plus a consumer (`production-reconciler.mjs`,
|
|
12
|
+
* whose adapters call `createVectorSchema`/`createGraphSchema`). With no baseline,
|
|
13
|
+
* `merkle-state.json` is absent so the producer enqueues the WHOLE tree, and the
|
|
14
|
+
* consumer then builds `codebase.db` / `code-graph.db` / HNSW / LI / sparse from
|
|
15
|
+
* scratch one budget-bounded tick at a time — leaving a PARTIAL index that search
|
|
16
|
+
* mistakes for a complete one.
|
|
17
|
+
*
|
|
18
|
+
* "Complete baseline" is proven by what the FULL indexer writes in its final phase
|
|
19
|
+
* (`indexing/indexer-phases.js::updateIncrementalStatePhase`), which the
|
|
20
|
+
* incremental reconciler does NOT produce on its own:
|
|
21
|
+
*
|
|
22
|
+
* 1. `reconcile-manifest.json` published at `epoch >= 1`. The full indexer
|
|
23
|
+
* publishes this as its LAST step, so its presence means vectors + graph +
|
|
24
|
+
* HNSW + LI + sparse all finished building. A crash before that step leaves
|
|
25
|
+
* no manifest; a corrupt manifest reads back as null. (epoch alone is NOT a
|
|
26
|
+
* discriminator: a reconciler-only first tick also yields epoch 1.)
|
|
27
|
+
* 2. `merkle-state.json` carrying a `config_fingerprint`. ONLY the full
|
|
28
|
+
* indexer's tracker (`indexing/incremental-tracker.js::updateState`) writes
|
|
29
|
+
* this field; the reconciler's `persistManifest` never adds it (it only
|
|
30
|
+
* preserves one already present). So `config_fingerprint` present ⟺ a full
|
|
31
|
+
* index ran at least once — the exact signal that distinguishes a real
|
|
32
|
+
* baseline from the reconciler-only partial state the old bug produced.
|
|
33
|
+
* 3. The vectors DB named by the manifest exists on disk (the artifact search
|
|
34
|
+
* reads). Guards a manually-deleted / half-written baseline.
|
|
35
|
+
*
|
|
36
|
+
* A valid EMPTY baseline (a full index that produced an empty-but-valid index)
|
|
37
|
+
* satisfies 1-3 with zero tracked files, so it counts as ready. A
|
|
38
|
+
* partially-written, corrupt, or reconciler-only baseline fails 1 or 2 and does
|
|
39
|
+
* not. The check is read-only: it never mutates the state dir.
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
import fs from 'node:fs';
|
|
43
|
+
import path from 'node:path';
|
|
44
|
+
import { readManifest } from './manifest.mjs';
|
|
45
|
+
|
|
46
|
+
/** Status label surfaced in logs and `reconcile status` when no baseline exists. */
|
|
47
|
+
export const WAITING_FOR_INITIAL_INDEX = 'waiting_for_initial_index';
|
|
48
|
+
|
|
49
|
+
const MERKLE_STATE = 'merkle-state.json';
|
|
50
|
+
const DEFAULT_VECTORS_DB = 'codebase.db';
|
|
51
|
+
|
|
52
|
+
/**
 * Read a UTF-8 file and parse it as JSON, never throwing.
 *
 * @param {string} filePath File to read.
 * @returns {*} The parsed value, or `null` when the file cannot be read or
 *   does not contain valid JSON.
 */
function readJsonSafe(filePath) {
  let parsed = null;
  try {
    const text = fs.readFileSync(filePath, 'utf8');
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Report whether a parsed merkle state carries a usable `config_fingerprint`.
 *
 * A fingerprint counts when it is a non-empty string or an object with at
 * least one own enumerable key; any other value (or a missing merkle state)
 * does not.
 *
 * @param {object|null|undefined} merkle Parsed merkle state, if any.
 * @returns {boolean}
 */
function hasConfigFingerprint(merkle) {
  const fingerprint = merkle ? merkle.config_fingerprint : null;
  switch (typeof fingerprint) {
    case 'string':
      return fingerprint.length > 0;
    case 'object':
      // `typeof null` is 'object', so rule it out explicitly.
      return fingerprint !== null && Object.keys(fingerprint).length > 0;
    default:
      return false;
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Check whether `stateDir` holds a complete baseline index.
 *
 * The checks run in order and the first failure decides the `reason`:
 * state dir present → manifest readable → manifest `epoch` is an integer >= 1
 * → merkle state readable → merkle state has a config fingerprint → the
 * vectors DB named by the manifest (default `codebase.db`) exists.
 *
 * @param {string} stateDir The `.sweet-search` directory.
 * @returns {{ready: boolean, reason: string}}
 *   `reason` is one of: `ready`, `no-state-dir`, `no-manifest`,
 *   `manifest-epoch-zero`, `no-merkle-state`, `no-config-fingerprint`,
 *   `missing-vectors-db`.
 */
export function hasCompleteBaseIndex(stateDir) {
  const notReady = (reason) => ({ ready: false, reason });

  const stateDirPresent = Boolean(stateDir) && fs.existsSync(stateDir);
  if (!stateDirPresent) return notReady('no-state-dir');

  const manifest = readManifest(stateDir);
  if (!manifest) return notReady('no-manifest');

  const epochPublished = Number.isInteger(manifest.epoch) && manifest.epoch >= 1;
  if (!epochPublished) return notReady('manifest-epoch-zero');

  const merkle = readJsonSafe(path.join(stateDir, MERKLE_STATE));
  if (!merkle) return notReady('no-merkle-state');
  if (!hasConfigFingerprint(merkle)) return notReady('no-config-fingerprint');

  // The manifest may name the vectors DB relative to the state dir or as an
  // absolute path; fall back to the default file name when it names none.
  const vectorsRef = (manifest.vectors && manifest.vectors.path) || DEFAULT_VECTORS_DB;
  const vectorsFile = path.isAbsolute(vectorsRef)
    ? vectorsRef
    : path.join(stateDir, vectorsRef);

  return fs.existsSync(vectorsFile)
    ? { ready: true, reason: 'ready' }
    : notReady('missing-vectors-db');
}
|
|
111
|
+
|
|
112
|
+
/**
 * Baseline readiness decorated with a `state` label for status surfaces.
 *
 * Returns everything `hasCompleteBaseIndex` reports plus `state`, which is
 * `'indexed'` when ready and `WAITING_FOR_INITIAL_INDEX` otherwise.
 *
 * @param {string} stateDir
 * @returns {{ready: boolean, reason: string, state: 'indexed'|'waiting_for_initial_index'}}
 */
export function baselineStatus(stateDir) {
  const readiness = hasCompleteBaseIndex(stateDir);
  const state = readiness.ready ? 'indexed' : WAITING_FOR_INITIAL_INDEX;
  return { ...readiness, state };
}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory dirty path set.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 6.1, § 9.1-§ 9.5. The watcher and polling backstop both push
|
|
5
|
+
* paths into this set; the reconcile tick drains it at every tick start.
|
|
6
|
+
*
|
|
7
|
+
* Guarantees:
|
|
8
|
+
* - Paths are normalised to forward-slash form (cross-platform).
|
|
9
|
+
* - Duplicate inserts are coalesced (Set semantics).
|
|
10
|
+
* - Insertion order is preserved on drain for deterministic tests.
|
|
11
|
+
* - A bounded-size policy guards against burst overflow (50 k events
|
|
12
|
+
* from `git checkout` in <1 s — plan § 11). Past the cap, the set
|
|
13
|
+
* keeps the most recent entries and emits a `dropped` count via the
|
|
14
|
+
* callback so the next polling backstop sweep can re-discover them.
|
|
15
|
+
*
|
|
16
|
+
* The set has no global state. Path canonicalisation uses filesystem
|
|
17
|
+
* realpaths when an ancestor exists so watcher events cannot enter the
|
|
18
|
+
* dirty set through symlink escapes.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import fs from 'node:fs';
|
|
22
|
+
import path from 'node:path';
|
|
23
|
+
|
|
24
|
+
const DEFAULT_MAX = 100_000;
|
|
25
|
+
|
|
26
|
+
/**
 * Convert a path to forward-slash form.
 *
 * @param {*} p Candidate path.
 * @returns {string|null} `p` with every backslash replaced by `/`, or `null`
 *   when `p` is not a string.
 */
function normalise(p) {
  return typeof p === 'string' ? p.split('\\').join('/') : null;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Bounded, insertion-ordered collection of dirty paths.
 *
 * Paths are stored in forward-slash form; re-adding a known path refreshes
 * its bookkeeping in place instead of creating a second entry.
 */
export class DirtySet {
  /**
   * @param {object} [options]
   * @param {number} [options.maxSize] Hard cap on entries.
   * @param {(payload:{dropped:number, droppedTotal:number})=>void} [options.onOverflow]
   *   Invoked once per evicted entry.
   */
  constructor({ maxSize = DEFAULT_MAX, onOverflow } = {}) {
    // path → { addedAt, lastSeenAt, firstSource, lastSource, meta }
    this._set = new Map();
    this._maxSize = maxSize;
    this._onOverflow = onOverflow;
    this._totalEnqueued = 0;
    this._totalDropped = 0;
  }

  /** Number of paths currently held. */
  get size() {
    return this._set.size;
  }

  /** Configured entry cap. */
  get maxSize() {
    return this._maxSize;
  }

  /**
   * Enqueue a path. `source` is one of:
   *   - 'watcher'  - notify / FSEvents / inotify
   *   - 'polling'  - mtime backstop sweep
   *   - 'cli'      - explicit `sweet-search index --add <path>` hint
   *   - 'queue'    - drained from index-maintainer-queue.jsonl
   *
   * A path already present keeps its position and `addedAt` / `firstSource`;
   * only `lastSource`, `lastSeenAt` and (merged) `meta` are refreshed. When a
   * new path arrives at capacity, the oldest-inserted entry is evicted and
   * `onOverflow` is notified before the new path is stored.
   *
   * @param {string} filePath
   * @param {string} [source='watcher']
   * @param {object} [meta]
   * @returns {boolean} `false` only when `filePath` is not a non-empty
   *   string; `true` for every insert or refresh, including inserts that
   *   evicted an older entry.
   */
  add(filePath, source = 'watcher', meta) {
    const key = normalise(filePath);
    if (!key) return false;
    this._totalEnqueued += 1;

    const known = this._set.get(key);
    if (known !== undefined) {
      known.lastSource = source;
      known.lastSeenAt = Date.now();
      if (meta) known.meta = { ...known.meta, ...meta };
      return true;
    }

    const atCapacity = this._set.size >= this._maxSize;
    if (atCapacity) {
      // Evict the first-inserted key so the newest arrivals are retained.
      const [oldestKey] = this._set.keys();
      if (oldestKey !== undefined) {
        this._set.delete(oldestKey);
        this._totalDropped += 1;
        if (this._onOverflow) {
          this._onOverflow({ dropped: 1, droppedTotal: this._totalDropped });
        }
      }
    }

    this._set.set(key, {
      addedAt: Date.now(),
      lastSeenAt: Date.now(),
      firstSource: source,
      lastSource: source,
      meta: meta || null,
    });
    return true;
  }

  /**
   * Enqueue every path from an iterable with a shared `source`.
   *
   * @param {Iterable<string>} paths
   * @param {string} [source='polling']
   * @returns {number} How many `add` calls returned `true`.
   */
  addMany(paths, source = 'polling') {
    let accepted = 0;
    for (const candidate of paths) {
      accepted += this.add(candidate, source) ? 1 : 0;
    }
    return accepted;
  }

  /**
   * @param {string} filePath
   * @returns {boolean} Whether the (normalised) path is currently queued.
   */
  has(filePath) {
    return this._set.has(normalise(filePath));
  }

  /**
   * Empty the set and return its former contents in insertion order. Paths
   * added afterwards belong to the next drain.
   *
   * @returns {Array<{path:string, firstSource:string, lastSource:string, addedAt:number, lastSeenAt:number, meta:object|null}>}
   */
  drain() {
    const snapshot = Array.from(this._set, ([queuedPath, entry]) => ({
      path: queuedPath,
      ...entry,
    }));
    this._set.clear();
    return snapshot;
  }

  /**
   * List the queued paths without removing them — mainly for tests / debug.
   *
   * @returns {string[]}
   */
  peek() {
    return [...this._set.keys()];
  }

  /**
   * Drop one path, leaving the others queued.
   *
   * @param {string} filePath
   * @returns {boolean} `true` when the path was present.
   */
  remove(filePath) {
    return this._set.delete(normalise(filePath));
  }

  /**
   * Diagnostic counters.
   *
   * @returns {{size:number, maxSize:number, totalEnqueued:number, totalDropped:number}}
   */
  stats() {
    const { _set, _maxSize, _totalEnqueued, _totalDropped } = this;
    return {
      size: _set.size,
      maxSize: _maxSize,
      totalEnqueued: _totalEnqueued,
      totalDropped: _totalDropped,
    };
  }
}
|
|
163
|
+
|
|
164
|
+
/**
 * Resolve a path to a canonical, forward-slash absolute form that is
 * guaranteed to sit inside `projectRoot`.
 *
 * Two containment checks are applied. Relative input is first checked
 * lexically against the resolved root, which rejects `../` traversal. Then,
 * for every input, the nearest existing ancestor is real-pathed and the
 * not-yet-existing remainder re-attached; the result must fall under the
 * real-pathed root. This rejects symlinked parents that lead outside the
 * tree while still accepting paths whose tail no longer exists (e.g. delete
 * events) beneath a real in-tree parent.
 *
 * @param {string} projectRoot
 * @param {string} relativeOrAbsolute
 * @returns {string|null} The canonical path, or `null` when the input is not
 *   a string, cannot be resolved, or escapes `projectRoot`.
 */
export function canonicaliseInsideRoot(projectRoot, relativeOrAbsolute) {
  if (typeof relativeOrAbsolute !== 'string') return null;

  const isAbsoluteInput = path.isAbsolute(relativeOrAbsolute);
  const anchored = isAbsoluteInput
    ? relativeOrAbsolute
    : path.resolve(projectRoot, relativeOrAbsolute);
  const candidate = path.resolve(anchored);
  const lexicalRoot = path.resolve(projectRoot);

  // Absolute input skips the lexical check; the realpath check below decides.
  if (!isAbsoluteInput && !isInsidePath(lexicalRoot, candidate)) return null;

  const rootReal = realpathOrNull(lexicalRoot);
  if (!rootReal) return null;

  // Relative input never walks above the root; absolute input may walk up to
  // its own filesystem root while looking for something that exists.
  const walkFloor = isAbsoluteInput ? path.parse(candidate).root : lexicalRoot;
  const anchor = nearestExistingAncestor(candidate, walkFloor);
  if (!anchor) return null;

  const anchorReal = realpathOrNull(anchor.path);
  if (!anchorReal) return null;

  const materialised = anchor.rest
    ? path.join(anchorReal, anchor.rest)
    : anchorReal;

  return isInsidePath(rootReal, materialised)
    ? materialised.replace(/\\/g, '/')
    : null;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Lexical containment test: is `candidate` equal to `root` or located
 * beneath it?
 *
 * The relative path from `root` to `candidate` escapes the root only when it
 * is absolute (e.g. a different drive) or when its FIRST SEGMENT is exactly
 * `..`. A plain `startsWith('..')` test is not enough: it would also reject
 * legitimate in-root entries whose name merely begins with two dots, such as
 * `..hidden` or `..cache/file`.
 *
 * @param {string} root Directory expected to contain `candidate`.
 * @param {string} candidate Path to test.
 * @returns {boolean} `true` when `candidate` is `root` itself or inside it.
 */
function isInsidePath(root, candidate) {
  const rel = path.relative(root, candidate);
  if (rel === '') return true;
  if (path.isAbsolute(rel)) return false;
  const climbsOut = rel === '..' || rel.startsWith(`..${path.sep}`);
  return !climbsOut;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Resolve a path with the native realpath implementation, never throwing.
 *
 * @param {string} filePath
 * @returns {string|null} The resolved path, or `null` when resolution fails
 *   for any reason (e.g. the path does not exist).
 */
function realpathOrNull(filePath) {
  let resolved = null;
  try {
    resolved = fs.realpathSync.native(filePath);
  } catch {
    resolved = null;
  }
  return resolved;
}
|
|
217
|
+
|
|
218
|
+
/**
 * Walk upward from `absPath` until something exists on disk, without ever
 * leaving `root`.
 *
 * @param {string} absPath Absolute path to start from.
 * @param {string} root Directory the walk must stay inside.
 * @returns {{path: string, rest: string}|null} The first existing path found
 *   (`lstat`, so a symlink itself counts) together with the missing trailing
 *   segments joined by `path.sep` (empty when `absPath` itself exists).
 *   `null` when the walk starts or ends up outside `root`, when `root` itself
 *   is missing, or when `lstat` fails for a reason other than
 *   ENOENT / ENOTDIR.
 */
function nearestExistingAncestor(absPath, root) {
  const missingSegments = [];
  for (
    let cursor = absPath;
    isInsidePath(root, cursor);
    cursor = path.dirname(cursor)
  ) {
    let exists = true;
    try {
      fs.lstatSync(cursor);
    } catch (err) {
      const code = err?.code;
      const isMissing = code === 'ENOENT' || code === 'ENOTDIR';
      // Unexpected failures abort the walk; so does running out of
      // ancestors at the root.
      if (!isMissing || cursor === root) return null;
      exists = false;
    }
    if (exists) {
      return { path: cursor, rest: missingSegments.join(path.sep) };
    }
    missingSegments.unshift(path.basename(cursor));
  }
  return null;
}
|