sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reconciler application service.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 6.1, § 8.1, § 13 Phase 2. The reconciler owns:
|
|
5
|
+
*
|
|
6
|
+
* - dirty-set processing,
|
|
7
|
+
* - content-hash diff,
|
|
8
|
+
* - encoder-dependency expansion,
|
|
9
|
+
* - metadata-dirty input rebuild,
|
|
10
|
+
* - per-file per-tier writes,
|
|
11
|
+
* - strict row visibility,
|
|
12
|
+
* - reader heartbeat files,
|
|
13
|
+
* - prune grace periods,
|
|
14
|
+
* - manifest publish.
|
|
15
|
+
*
|
|
16
|
+
* The reconciler coordinates adapter-provided per-tier deltas, enforces
|
|
17
|
+
* tick budgets, schedules maintenance jobs from watermarks, and publishes
|
|
18
|
+
* the manifest. Concrete graph/vector/HNSW/LI/sparse writes stay behind
|
|
19
|
+
* the injected adapters.
|
|
20
|
+
*
|
|
21
|
+
* The reconciler is intentionally pure of I/O orchestration — it accepts
|
|
22
|
+
* dependency-injected adapters so unit tests can drive every tick through
|
|
23
|
+
* synthetic inputs.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { ReconcileCounters } from '../domain/reconcile-counters.mjs';
|
|
27
|
+
import { nextInterval } from '../domain/interval-autotune.mjs';
|
|
28
|
+
import {
|
|
29
|
+
buildNextManifest,
|
|
30
|
+
readManifest,
|
|
31
|
+
writeManifest,
|
|
32
|
+
zeroManifest,
|
|
33
|
+
} from '../infrastructure/manifest.mjs';
|
|
34
|
+
import { evaluateWatermarks, loadWatermarkConfig } from '../domain/watermark-scheduler.mjs';
|
|
35
|
+
import { beginRead, endRead, minLiveEpoch } from '../infrastructure/reader-heartbeat.mjs';
|
|
36
|
+
import { verifyStamp, writeStamp, formatStampMismatch } from '../infrastructure/worktree-stamp.mjs';
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Adapter contract (Phase 2 declaration):
|
|
40
|
+
*
|
|
41
|
+
* {
|
|
42
|
+
* readDirtySet(): Promise<DirtyFile[]>,
|
|
43
|
+
* requeueDirtyFiles(files): Promise<void>, // optional budget tail
|
|
44
|
+
* hashFile(file): Promise<{ contentHash, metadata, contentUnchanged?, chunks? }>,
|
|
45
|
+
* loadCurrentManifest(): object|null,
|
|
46
|
+
* persistManifest(manifest): Promise<void>,
|
|
47
|
+
* applyGraphDelta(file, hashes, epoch): Promise<{ ops }>, // `hashes` is the hashFile() result
|
|
48
|
+
* applyVectorDelta(file, chunks, hashes, epoch): Promise<{ ops }>,
|
|
49
|
+
* applyHNSWDelta(file, vectorOps, epoch): Promise<{ ops }>,
|
|
50
|
+
* applyBinaryHNSWDelta(file, vectorOps, epoch): Promise<{ ops }>,
|
|
51
|
+
* applyLIDelta(file, tokenOps, epoch): Promise<{ ops }>,
|
|
52
|
+
* applySparseGramDelta(file, gramOps, epoch): Promise<{ ops }>,
|
|
53
|
+
* // Any apply* call may also return either:
|
|
54
|
+
* // { manifest: {...tier descriptor...} }
|
|
55
|
+
* // or:
|
|
56
|
+
* // { manifestTiers: { sparseGram: {...}, hnsw: {...} } }
|
|
57
|
+
* // These descriptors are merged into the next epoch manifest.
|
|
58
|
+
* readMaintenanceState(ctx): Promise<object>|object,
|
|
59
|
+
* scheduleMaintenance(job): Promise<void>|void,
|
|
60
|
+
* }
|
|
61
|
+
*
|
|
62
|
+
* Maintenance scheduling uses `domain/watermark-scheduler.mjs`; the adapter
|
|
63
|
+
* persists emitted jobs to the queue used by `maintenance-worker.mjs`.
|
|
64
|
+
*/
|
|
65
|
+
|
|
66
|
+
// Fallbacks applied by the Reconciler constructor when the matching
// `config` field is not supplied.

/** Tick interval: one minute, expressed in milliseconds. */
const DEFAULT_TICK_INTERVAL_MS = 60 * 1_000;

/** Per-tick time budget: two seconds, expressed in milliseconds. */
const DEFAULT_CPU_BUDGET_MS = 2 * 1_000;

/** Upper bound on dirty files attempted in a single tick. */
const DEFAULT_FILES_PER_TICK = 50;
|
|
69
|
+
|
|
70
|
+
/**
 * Whitelist of manifest tier names. Descriptors reported under any other
 * key are ignored when adapter results are merged into the next manifest.
 */
const MANIFEST_TIER_KEYS = new Set(
  ['codeGraph', 'vectors', 'hnsw', 'binaryHnsw', 'lateInteraction', 'sparseGram'],
);
|
|
78
|
+
|
|
79
|
+
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * Always returns a strict boolean. The previous `value && ...` form leaked
 * the falsy operand itself (`null`, `undefined`, `0`, `''`) to callers.
 *
 * Note: this is a structural check only — any non-array object (including
 * class instances) is accepted.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Fold the manifest descriptors carried by one adapter result into `target`.
 *
 * An adapter result may expose `manifest` (a descriptor for `tier` itself)
 * and/or `manifestTiers` (descriptors keyed by tier name). Both are merged,
 * `manifest` first. Falsy results are ignored.
 *
 * @param {object} target  Accumulator of tier descriptors; mutated in place.
 * @param {string} tier    Tier name that `result.manifest` describes.
 * @param {object} [result] Adapter result, possibly undefined.
 * @returns {void}
 */
function collectManifestTier(target, tier, result) {
  if (result) {
    const ownDescriptor = result.manifest;
    if (isPlainObject(ownDescriptor)) {
      mergeManifestTiers(target, { [tier]: ownDescriptor });
    }
    const keyedDescriptors = result.manifestTiers;
    if (isPlainObject(keyedDescriptors)) {
      mergeManifestTiers(target, keyedDescriptors);
    }
  }
}
|
|
92
|
+
|
|
93
|
+
/**
 * Merge tier descriptors from `update` into `target`.
 *
 * Only keys listed in MANIFEST_TIER_KEYS whose value is a plain object are
 * considered; each is combined with any descriptor already present on
 * `target` through mergeTierDescriptor. A non-object `update` is a no-op.
 *
 * @param {object} target Accumulator; mutated in place.
 * @param {object} [update] Descriptors keyed by tier name.
 * @returns {object} The same `target` reference.
 */
function mergeManifestTiers(target, update) {
  if (isPlainObject(update)) {
    Object.entries(update)
      .filter(([tier, descriptor]) => MANIFEST_TIER_KEYS.has(tier) && isPlainObject(descriptor))
      .forEach(([tier, descriptor]) => {
        target[tier] = mergeTierDescriptor(target[tier] || {}, descriptor);
      });
  }
  return target;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Combine two descriptors for the same tier.
 *
 * Fields of `next` override fields of `previous`. When either side carries
 * an array `deltas`, the result's `deltas` is the de-duplicated
 * concatenation (previous entries first); a non-array `deltas` on one side
 * is treated as empty in that case.
 *
 * @param {object} previous
 * @param {object} next
 * @returns {object} A new descriptor; inputs are not mutated.
 */
function mergeTierDescriptor(previous, next) {
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const eitherHasDeltas = Array.isArray(previous.deltas) || Array.isArray(next.deltas);
  if (!eitherHasDeltas) {
    return { ...previous, ...next };
  }
  const uniqueDeltas = new Set([...listOrEmpty(previous.deltas), ...listOrEmpty(next.deltas)]);
  return { ...previous, ...next, deltas: [...uniqueDeltas] };
}
|
|
114
|
+
|
|
115
|
+
/**
 * Reconcile the tick's collected tier descriptors with the previously
 * published manifest before the next manifest is built.
 *
 * Only `sparseGram` gets special handling:
 *  - if its string `base` or `weightsId` differs from the previous
 *    manifest's, the previous delta list is not carried over — `deltas`
 *    becomes this tick's list (or an empty list);
 *  - otherwise, when this tick reported `deltas`, they are appended to the
 *    previous manifest's deltas with duplicates removed.
 *
 * @param {object|null|undefined} previousManifest
 * @param {object} tiers Tier descriptors gathered during the tick.
 * @returns {object} Shallow copy of `tiers` with `sparseGram` adjusted.
 */
function finalizeManifestTiers(previousManifest, tiers) {
  const finalized = { ...tiers };
  const sparse = finalized.sparseGram;
  if (!isPlainObject(sparse)) return finalized;

  const priorSparse = previousManifest?.sparseGram || {};
  const differsFromPrior = (key) => typeof sparse[key] === 'string' && sparse[key] !== priorSparse[key];
  const reportedDeltas = Array.isArray(sparse.deltas);

  if (differsFromPrior('base') || differsFromPrior('weightsId')) {
    finalized.sparseGram = { ...sparse, deltas: reportedDeltas ? sparse.deltas : [] };
  } else if (reportedDeltas) {
    const carriedOver = Array.isArray(priorSparse.deltas) ? priorSparse.deltas : [];
    finalized.sparseGram = {
      ...sparse,
      deltas: [...new Set([...carriedOver, ...sparse.deltas])],
    };
  }
  return finalized;
}
|
|
142
|
+
|
|
143
|
+
export class Reconciler {
  /**
   * @param {object} options
   * @param {string} options.stateDir
   * @param {object} options.adapters  Adapter contract above.
   * @param {object} [options.config]  Tick interval / budgets / etc.
   * @param {Function} [options.now]   Injectable clock for tests.
   * @param {{info:Function, warn:Function, error:Function}} [options.logger]
   * @param {string|null} [options.projectRoot]  When set, `verifyStartup()`
   *   checks (or mints) the worktree stamp for this root.
   * @param {(phase:string)=>void} [options.onProgress]
   * @throws {Error} When `stateDir` or `adapters` is missing.
   */
  constructor({ stateDir, adapters, config = {}, now = Date.now, logger = console, projectRoot = null, onProgress = null }) {
    if (!stateDir) throw new Error('Reconciler: stateDir is required');
    if (!adapters) throw new Error('Reconciler: adapters are required');
    this.stateDir = stateDir;
    this.projectRoot = projectRoot;
    this.adapters = adapters;
    // Tolerate an explicit `config: null` the same way as an omitted config.
    const supplied = config ?? {};
    this.config = {
      // Pass-through for extra keys (watermarks, cpuLoadAvg, ...). Spread
      // FIRST so that an explicit `undefined`/`null` value for a known key
      // cannot clobber the defaults computed below.
      ...supplied,
      intervalMs: supplied.intervalMs ?? DEFAULT_TICK_INTERVAL_MS,
      cpuBudgetMs: supplied.cpuBudgetMs ?? DEFAULT_CPU_BUDGET_MS,
      filesPerTick: supplied.filesPerTick ?? DEFAULT_FILES_PER_TICK,
      autotuneInterval: supplied.autotuneInterval ?? false,
      pinnedIntervalMs: supplied.pinnedIntervalMs ?? false,
    };
    this.now = now;
    this.logger = logger;
    this.onProgress = typeof onProgress === 'function' ? onProgress : null;
    this._lastEpoch = 0;
    this._running = false;
  }

  /**
   * Report a phase marker to the optional `onProgress` callback.
   *
   * @param {string} phase
   */
  progress(phase) {
    this.onProgress?.(phase);
  }

  /**
   * Verify the worktree stamp before any tick. Plan § 8.5 / § 14.2.4 —
   * cross-worktree mixing is a silent footgun; verify or mint a stamp
   * before the daemon writes to this state dir.
   *
   * @returns {{ok:boolean, reason?:string}}
   */
  verifyStartup() {
    if (!this.projectRoot) return { ok: true, reason: 'no-project-root' };
    const check = verifyStamp(this.stateDir, this.projectRoot);
    if (!check.ok) {
      this.logger.error?.(formatStampMismatch(check));
      return check;
    }
    if (check.reason === 'absent') {
      writeStamp(this.stateDir, this.projectRoot);
    }
    return { ok: true, reason: check.reason };
  }

  /**
   * Read the currently published manifest, preferring the adapter hook and
   * falling back to the on-disk manifest in `stateDir`.
   *
   * @returns {object|null|undefined}
   */
  _loadManifest() {
    return this.adapters.loadCurrentManifest
      ? this.adapters.loadCurrentManifest()
      : readManifest(this.stateDir);
  }

  /**
   * Load the current manifest's epoch. Falls back to 0 when no manifest
   * has been written yet.
   *
   * @returns {number}
   */
  currentEpoch() {
    const manifest = this._loadManifest();
    return manifest?.epoch ?? 0;
  }

  /**
   * Build the next-epoch number. Plan § 8.1 step 1-2.
   *
   * Strictly greater than both the published manifest's epoch and the last
   * epoch this instance published.
   *
   * @returns {number}
   */
  nextEpoch() {
    const current = this.currentEpoch();
    return Math.max(current + 1, this._lastEpoch + 1);
  }

  /**
   * Run one reconcile tick. Returns the counters snapshot for the tick.
   *
   * Walk the dirty set through the adapter contract. Adapters own concrete
   * tier writes; the reconciler enforces budget, maintenance scheduling,
   * and manifest publication.
   *
   * On failure before the manifest is published, the drained dirty files
   * are handed back to `adapters.requeueDirtyFiles` (best effort) and the
   * original error is rethrown.
   *
   * @returns {Promise<object>}
   * @throws {Error} When a tick is already in progress on this instance.
   */
  async tick() {
    if (this._running) {
      throw new Error('Reconciler.tick(): tick already in progress (single-instance enforced by lockfile)');
    }
    this._running = true;
    let dirty = [];
    let dirtyCursor = 0;
    let deferredRequeued = false;
    let manifestPublished = false;

    try {
      // Everything that can throw lives inside the try so the `finally`
      // below always clears `_running`; otherwise a failed manifest read
      // would wedge every later tick with "already in progress".
      const startedAt = this.now();
      const counters = new ReconcileCounters();
      const epoch = this.nextEpoch();
      counters.set('epoch', epoch);

      // A nullish result is treated as "nothing dirty".
      dirty = (await this.adapters.readDirtySet()) ?? [];
      this.progress('reconciler:dirty-read');
      counters.set('dirty_paths_seen', dirty.length);

      counters.set('cpu_budget_total_ms', this.config.cpuBudgetMs);

      // Track per-file outcomes for the tick summary.
      const tierOps = {};
      const manifestTiers = {};
      const filesProcessed = [];

      for (; dirtyCursor < dirty.length; dirtyCursor += 1) {
        const filesAttempted = counters._fields.files_processed + counters._fields.content_unchanged;
        if (filesAttempted >= this.config.filesPerTick) break;
        // Always attempt at least one file, even when the budget is tiny.
        if (filesAttempted > 0 && this.now() - startedAt >= this.config.cpuBudgetMs) break;

        const file = dirty[dirtyCursor];
        this.progress('reconciler:file:start');
        const hashes = await this.adapters.hashFile(file);
        this.progress('reconciler:file:hashed');
        if (hashes && hashes.contentUnchanged) {
          counters.observeContentUnchanged();
          continue;
        }
        const fileRes = await this._reconcileOneFile(file, epoch, hashes);
        this.progress('reconciler:file:done');
        filesProcessed.push({ file, ...fileRes });
        mergeManifestTiers(manifestTiers, fileRes?.manifestTiers);
        counters.inc('files_processed');
        counters.inc('chunks_total', fileRes?.chunksTotal ?? 0);
        counters.inc('chunks_encoded', fileRes?.chunksEncoded ?? 0);
        counters.inc('chunks_hash_reused', fileRes?.chunksReused ?? 0);
        counters.inc('chunks_struct_stable', fileRes?.chunksStructStable ?? 0);
        counters.inc('chunks_text_unchanged', fileRes?.chunksTextUnchanged ?? 0);
        counters.inc('chunks_metadata_dirty', fileRes?.chunksMetadataDirty ?? 0);
        counters.inc('chunks_dedup_repaired', fileRes?.chunksDedupRepaired ?? 0);
        counters.inc('tree_sitter_error_nodes_seen', fileRes?.treeSitterErrorNodes ?? 0);
        if ((fileRes?.treeSitterErrorNodes ?? 0) > 0) {
          counters.inc('tree_sitter_files_with_errors');
        }
        for (const [tier, op] of Object.entries(fileRes?.ops ?? {})) {
          if (typeof op === 'number') {
            counters.inc(`ops_per_tier.${tier}`, op);
            tierOps[tier] = (tierOps[tier] || 0) + op;
          }
        }
      }

      // Whatever the budget did not reach goes back to the adapter.
      const deferredFiles = dirty.slice(dirtyCursor);
      if (deferredFiles.length > 0) {
        counters.inc('dirty_paths_deferred', deferredFiles.length);
        if (this.adapters.requeueDirtyFiles) {
          await this.adapters.requeueDirtyFiles(deferredFiles);
          deferredRequeued = true;
        } else {
          this.logger.warn?.(
            `[reconciler] ${deferredFiles.length} dirty paths exceeded the per-tick budget; ` +
              'adapter has no requeueDirtyFiles hook',
          );
        }
      }

      // Publish the new manifest. Plan § 8.1 step 4: write to *.tmp,
      // fsync, atomic rename, fsync parent dir. `writeManifest` already
      // does that.
      const previous = this._loadManifest() ?? zeroManifest({});
      const manifest = buildNextManifest(previous, {
        epoch,
        tiers: finalizeManifestTiers(previous, manifestTiers),
      });
      await this._publishManifest(manifest);
      this.progress('reconciler:manifest-published');
      manifestPublished = true;
      this._lastEpoch = epoch;

      // Plan § 6.1 step 11: maintenance observes a successfully-published
      // epoch. Never enqueue jobs for an epoch whose manifest did not land.
      await this._scheduleMaintenance(epoch, counters, tierOps, filesProcessed);
      this.progress('reconciler:maintenance-scheduled');

      counters.set('tick_ms', this.now() - startedAt);
      counters.set('ts', startedAt / 1000);

      // Interval auto-tune (plan § 14.2.1). Skipped when the operator pins
      // a fixed interval via `config.pinnedIntervalMs` or
      // `config.autotuneInterval === false`.
      if (this.config.autotuneInterval && !this.config.pinnedIntervalMs) {
        const tuned = nextInterval({
          currentMs: this.config.intervalMs,
          lastTickMs: counters._fields.tick_ms,
          dirtyAtTickStart: dirty.length,
          cpuLoadAvg: this.config.cpuLoadAvg ?? 0,
          maintenanceBacklog: this.config.maintenanceBacklog ?? 0,
        });
        if (tuned.nextMs !== this.config.intervalMs) {
          this.logger.info?.(`[reconciler] interval ${this.config.intervalMs}ms → ${tuned.nextMs}ms (${tuned.reasons.join(',')})`);
          this.config.intervalMs = tuned.nextMs;
        }
      }

      return counters.snapshot();
    } catch (err) {
      if (!manifestPublished && dirty.length > 0) {
        // If the deferred tail was already requeued, only the files this
        // tick walked over still need to go back; otherwise requeue all.
        const filesToRequeue = deferredRequeued ? dirty.slice(0, dirtyCursor) : dirty;
        if (filesToRequeue.length > 0) {
          await this._tryRequeueDirtyAfterFailure(filesToRequeue, err);
        }
      }
      throw err;
    } finally {
      this._running = false;
    }
  }

  /**
   * Push one dirty file through every tier adapter, in a fixed order:
   * graph → vectors → HNSW → binary HNSW → late interaction → sparse gram.
   * The vector adapter's result feeds the later tiers (`vectorOps`,
   * `tokenOps`, `gramOps`).
   *
   * @param {*} file       Dirty-set entry, passed through to the adapters.
   * @param {number} epoch Epoch the writes are attributed to.
   * @param {object} [hashes] Result of `adapters.hashFile(file)`.
   * @returns {Promise<object>} Chunk counters, per-op counts (`ops`) and the
   *   manifest tier descriptors reported by the adapters.
   */
  async _reconcileOneFile(file, epoch, hashes) {
    // Dispatch to per-tier adapter methods. Adapters can return undefined
    // when a tier has no work for this file.
    const ops = {};
    this.progress('reconciler:graph:start');
    const graph = await this.adapters.applyGraphDelta?.(file, hashes, epoch);
    this.progress('reconciler:graph:done');
    const manifestTiers = {};
    collectManifestTier(manifestTiers, 'codeGraph', graph);
    if (graph?.ops?.graph_upsert != null) ops.graph_upsert = graph.ops.graph_upsert;
    if (graph?.ops?.graph_tombstone != null) ops.graph_tombstone = graph.ops.graph_tombstone;

    this.progress('reconciler:vector:start');
    const vec = await this.adapters.applyVectorDelta?.(file, hashes?.chunks ?? [], hashes, epoch);
    this.progress('reconciler:vector:done');
    collectManifestTier(manifestTiers, 'vectors', vec);
    if (vec?.ops?.vectors_upsert != null) ops.vectors_upsert = vec.ops.vectors_upsert;
    if (vec?.ops?.vectors_delete != null) ops.vectors_delete = vec.ops.vectors_delete;

    this.progress('reconciler:hnsw:start');
    const hnsw = await this.adapters.applyHNSWDelta?.(file, vec?.vectorOps ?? [], epoch);
    this.progress('reconciler:hnsw:done');
    collectManifestTier(manifestTiers, 'hnsw', hnsw);
    if (hnsw?.ops?.hnsw_add != null) ops.hnsw_add = hnsw.ops.hnsw_add;
    if (hnsw?.ops?.hnsw_tombstone != null) ops.hnsw_tombstone = hnsw.ops.hnsw_tombstone;

    this.progress('reconciler:binary-hnsw:start');
    const bin = await this.adapters.applyBinaryHNSWDelta?.(file, vec?.vectorOps ?? [], epoch);
    this.progress('reconciler:binary-hnsw:done');
    collectManifestTier(manifestTiers, 'binaryHnsw', bin);
    if (bin?.ops?.binary_hnsw_append != null) ops.binary_hnsw_append = bin.ops.binary_hnsw_append;
    if (bin?.ops?.binary_hnsw_tombstone != null) ops.binary_hnsw_tombstone = bin.ops.binary_hnsw_tombstone;

    this.progress('reconciler:li:start');
    const li = await this.adapters.applyLIDelta?.(file, vec?.tokenOps ?? [], epoch);
    this.progress('reconciler:li:done');
    collectManifestTier(manifestTiers, 'lateInteraction', li);
    if (li?.ops?.li_segment_append != null) ops.li_segment_append = li.ops.li_segment_append;
    if (li?.ops?.li_tombstone != null) ops.li_tombstone = li.ops.li_tombstone;

    this.progress('reconciler:sparse:start');
    const sg = await this.adapters.applySparseGramDelta?.(file, vec?.gramOps ?? [], epoch);
    this.progress('reconciler:sparse:done');
    collectManifestTier(manifestTiers, 'sparseGram', sg);
    if (sg?.ops?.sparse_gram_delta_upsert != null) ops.sparse_gram_delta_upsert = sg.ops.sparse_gram_delta_upsert;

    return {
      chunksTotal: vec?.chunksTotal ?? 0,
      chunksEncoded: vec?.chunksEncoded ?? 0,
      chunksReused: vec?.chunksReused ?? 0,
      chunksStructStable: vec?.chunksStructStable ?? 0,
      chunksTextUnchanged: vec?.chunksTextUnchanged ?? 0,
      chunksMetadataDirty: vec?.chunksMetadataDirty ?? 0,
      chunksDedupRepaired: vec?.chunksDedupRepaired ?? 0,
      treeSitterErrorNodes: graph?.treeSitterErrorNodes ?? 0,
      manifestTiers,
      ops,
    };
  }

  /**
   * Ask the adapter for maintenance state and enqueue the resulting jobs.
   *
   * `readMaintenanceState` may return a ready-made job array, or a state
   * object that is run through `evaluateWatermarks`. No-op when the adapter
   * lacks `readMaintenanceState`; logs a warning when jobs exist but the
   * adapter lacks `scheduleMaintenance`.
   *
   * @param {number} epoch
   * @param {object} counters  ReconcileCounters for this tick.
   * @param {object} tierOps   Summed numeric op counts for this tick.
   * @param {Array<object>} filesProcessed
   * @returns {Promise<void>}
   */
  async _scheduleMaintenance(epoch, counters, tierOps, filesProcessed) {
    if (!this.adapters.readMaintenanceState) return;
    const state = await this.adapters.readMaintenanceState({
      epoch,
      tierOps,
      filesProcessed,
      counters: counters.snapshot(),
    });
    const jobs = Array.isArray(state)
      ? state
      : evaluateWatermarks(state || {}, this.config.watermarks || loadWatermarkConfig());
    if (jobs.length === 0) return;
    if (!this.adapters.scheduleMaintenance) {
      this.logger.warn?.(`[reconciler] ${jobs.length} maintenance jobs crossed watermarks; adapter has no scheduleMaintenance hook`);
      return;
    }
    for (const job of jobs) {
      // Jobs without their own epoch are attributed to this tick's epoch.
      await this.adapters.scheduleMaintenance({ ...job, epoch: job.epoch ?? epoch });
      counters.inc('maintenance_jobs_enqueued');
    }
  }

  /**
   * Persist the manifest through the adapter hook when present, otherwise
   * write it into `stateDir` with `writeManifest`.
   *
   * @param {object} manifest
   * @returns {Promise<void>}
   */
  async _publishManifest(manifest) {
    if (this.adapters.persistManifest) {
      await this.adapters.persistManifest(manifest);
      return;
    }
    writeManifest(this.stateDir, manifest);
  }

  /**
   * Best-effort requeue of drained dirty files after a failed tick. Never
   * throws: a requeue failure is logged so the original tick error is the
   * one that propagates.
   *
   * @param {Array} files  Dirty-set entries to hand back.
   * @param {*} cause      The error that failed the tick (for the log line).
   * @returns {Promise<void>}
   */
  async _tryRequeueDirtyAfterFailure(files, cause) {
    if (!this.adapters.requeueDirtyFiles) {
      this.logger.warn?.(
        `[reconciler] tick failed before manifest publish and ${files.length} dirty paths were drained; ` +
          'adapter has no requeueDirtyFiles hook',
      );
      return;
    }
    try {
      await this.adapters.requeueDirtyFiles(files);
    } catch (requeueErr) {
      this.logger.error?.(
        `[reconciler] failed to requeue ${files.length} dirty paths after tick failure ` +
          `(${cause?.message ?? cause}): ${requeueErr?.message ?? requeueErr}`,
      );
    }
  }

  // ----- Reader heartbeat helpers (exposed for tests; production callers
  // use beginRead/endRead from infrastructure/reader-heartbeat.mjs directly).

  beginRead(epoch, meta) { return beginRead(this.stateDir, epoch, meta); }
  endRead(record) { return endRead(this.stateDir, record); }
  minLiveEpoch() { return minLiveEpoch(this.stateDir); }
}
|
|
472
|
+
|
|
473
|
+
/**
 * Internal defaults re-exported so unit tests can assert against them
 * without duplicating the literal values.
 */
export const __testing = {
  DEFAULT_TICK_INTERVAL_MS: DEFAULT_TICK_INTERVAL_MS,
  DEFAULT_CPU_BUDGET_MS: DEFAULT_CPU_BUDGET_MS,
  DEFAULT_FILES_PER_TICK: DEFAULT_FILES_PER_TICK,
};
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthetic tombstone-injection harness.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 13 Phase 5, § 14.1 measurement 1. The harness picks a random
|
|
5
|
+
* fraction of vectors in `codebase.db::vectors`, marks them tombstoned
|
|
6
|
+
* (sets `epoch_retired`), and lets the search path apply the
|
|
7
|
+
* manifest-epoch predicate so the GCSN dev sweep can measure
|
|
8
|
+
* MRR-vs-tombstone-fraction.
|
|
9
|
+
*
|
|
10
|
+
* Discipline (CLAUDE.md `feedback_heldout_discipline_strict.md`):
|
|
11
|
+
* - The sweep runs on **dev** GCSN, not held-out.
|
|
12
|
+
* - Use a fixed seed (`SWEET_SEARCH_TOMBSTONE_SEED`, default 42) so
|
|
13
|
+
* reruns reproduce.
|
|
14
|
+
* - The harness restores the original `epoch_retired` values on exit
|
|
15
|
+
* so the index returns to its pre-experiment state.
|
|
16
|
+
*
|
|
17
|
+
* Output:
|
|
18
|
+
* - A `restore` function the runner calls at the end of each sweep
|
|
19
|
+
* point to undo the injection.
|
|
20
|
+
* - A `report` object with the actual fraction injected (may differ
|
|
21
|
+
* from the request slightly due to row count rounding).
|
|
22
|
+
*
|
|
23
|
+
* This module is pure infrastructure — the runner script in
|
|
24
|
+
* `scripts/incremental-indexing-tombstone-sensitivity.mjs` ties it to
|
|
25
|
+
* the GCSN benchmark loop.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
// PRNG seed used when a caller does not pass `seed` explicitly.
const DEFAULT_SEED = 42;
|
|
29
|
+
|
|
30
|
+
/**
 * mulberry32: a small deterministic 32-bit pseudo-random generator.
 *
 * The seed is coerced to a signed 32-bit integer; a seed that coerces to 0
 * (including NaN / undefined) is replaced by 1. Each call of the returned
 * function advances the internal state and yields a float in [0, 1).
 *
 * @param {number} seed
 * @returns {() => number}
 */
function mulberry32(seed) {
  const STEP = 0x6D2B79F5;
  const TWO_POW_32 = 4294967296;

  let state = seed | 0;
  if (state === 0) {
    state = 1;
  }

  const next = () => {
    state = (state + STEP) | 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const folded = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // `>>> 0` reinterprets the result as unsigned before scaling into [0, 1).
    return ((folded ^ (folded >>> 14)) >>> 0) / TWO_POW_32;
  };
  return next;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Inject tombstones at the requested fraction.
 *
 * Selects the live rows of `vectors` (`epoch_retired IS NULL`, optionally
 * restricted to one `session_id`), samples `floor(total * fraction)` of them
 * with a seeded shuffle, and sets their `epoch_retired` to `epoch` inside a
 * single transaction.
 *
 * Live rows are read `ORDER BY id`: SQL gives no ordering guarantee without
 * it, and the seeded shuffle is only reproducible when its input order is.
 *
 * Only rows the UPDATE actually changed are counted in `injected` /
 * `fraction` and touched again by `restore`, so a row that something else
 * retired at the same epoch is never un-retired by this harness.
 *
 * @param {object} options
 * @param {import('better-sqlite3').Database} options.db The `codebase.db` vectors database.
 * @param {number} options.fraction 0 < fraction <= 0.5
 * @param {number} options.epoch ε+1 to write into epoch_retired
 * @param {number} [options.seed=42]
 * @param {string} [options.sessionFilter] Restrict to one `session_id` if set.
 * @returns {{injected:number, fraction:number, restore:() => void}}
 * @throws {Error} When `fraction` is outside (0, 0.5] or `epoch` is not a positive integer.
 */
export function injectTombstones({ db, fraction, epoch, seed = DEFAULT_SEED, sessionFilter }) {
  if (!Number.isFinite(fraction) || fraction <= 0 || fraction > 0.5) {
    throw new Error(`injectTombstones: fraction must be in (0, 0.5], got ${fraction}`);
  }
  if (!Number.isInteger(epoch) || epoch <= 0) {
    throw new Error(`injectTombstones: epoch must be a positive integer, got ${epoch}`);
  }
  const rand = mulberry32(seed);

  const liveRowsQ = sessionFilter
    ? `SELECT id FROM vectors WHERE session_id = ? AND epoch_retired IS NULL ORDER BY id`
    : `SELECT id FROM vectors WHERE epoch_retired IS NULL ORDER BY id`;
  const liveRowsStmt = db.prepare(liveRowsQ);
  const rows = sessionFilter ? liveRowsStmt.all(sessionFilter) : liveRowsStmt.all();
  const total = rows.length;
  if (total === 0) {
    return { injected: 0, fraction: 0, restore: () => {} };
  }
  const target = Math.floor(total * fraction);

  // Fisher-Yates sample without replacement.
  const indices = Array.from({ length: total }, (_, i) => i);
  for (let i = total - 1; i > 0; i--) {
    const j = Math.floor(rand() * (i + 1));
    [indices[i], indices[j]] = [indices[j], indices[i]];
  }
  const picked = indices.slice(0, target).map((i) => rows[i].id);

  const update = db.prepare(`
    UPDATE vectors
    SET epoch_retired = ?
    WHERE id = ? AND epoch_retired IS NULL
  `);
  // Ids whose row this call really tombstoned; drives the report and restore.
  const applied = [];
  const transaction = db.transaction((ids) => {
    for (const id of ids) {
      const info = update.run(epoch, id);
      // Skip only an explicit "0 rows changed"; a driver that reports nothing
      // is treated as having applied the update.
      if (info?.changes !== 0) applied.push(id);
    }
  });
  transaction(picked);

  const restore = () => {
    const undo = db.prepare(`
      UPDATE vectors
      SET epoch_retired = NULL
      WHERE id = ? AND epoch_retired = ?
    `);
    const undoTxn = db.transaction((ids) => {
      for (const id of ids) undo.run(id, epoch);
    });
    undoTxn(applied);
  };

  return {
    injected: applied.length,
    fraction: applied.length / total,
    restore,
  };
}
|
|
117
|
+
|
|
118
|
+
/**
 * Run a sweep across the requested fraction list, calling `runOnce`
 * between injection and restore. The runner is responsible for actually
 * executing the benchmark — this helper is intentionally agnostic.
 *
 * Sweep points run strictly one after another: each point injects, awaits
 * `runOnce`, and restores (also when `runOnce` rejects) before the next
 * point starts. `runOnce` is validated before the first injection so a bad
 * callback never mutates the index.
 *
 * @param {object} options
 * @param {import('better-sqlite3').Database} options.db
 * @param {number[]} options.fractions
 * @param {(meta:{fraction:number, injected:number}) => Promise<object>} options.runOnce
 * @param {number} [options.seed=42]
 * @param {number} [options.epoch=1_000_000]
 * @param {string} [options.sessionFilter] Forwarded to `injectTombstones` to restrict every point to one `session_id`.
 * @returns {Promise<Array<{fraction:number, injected:number, result:object}>>}
 * @throws {Error} When `fractions` is not a non-empty array.
 * @throws {TypeError} When `runOnce` is not a function.
 */
export async function sweepTombstoneFractions({
  db,
  fractions,
  runOnce,
  seed = DEFAULT_SEED,
  epoch = 1_000_000,
  sessionFilter,
}) {
  if (!Array.isArray(fractions) || fractions.length === 0) {
    throw new Error('sweepTombstoneFractions: fractions must be a non-empty array');
  }
  if (typeof runOnce !== 'function') {
    throw new TypeError('sweepTombstoneFractions: runOnce must be a function');
  }
  const points = [];
  for (const f of fractions) {
    const inj = injectTombstones({ db, fraction: f, epoch, seed, sessionFilter });
    try {
      const result = await runOnce({ fraction: inj.fraction, injected: inj.injected });
      points.push({ fraction: inj.fraction, injected: inj.injected, result });
    } finally {
      // Always undo the injection so the next point starts from a clean index.
      inj.restore();
    }
  }
  return points;
}
|
|
147
|
+
|
|
148
|
+
// Exposes the otherwise module-private PRNG factory under `__testing`
// (presumably for unit tests; confirm against the test suite).
export const __testing = { mulberry32 };
|