sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Path filter (`.sweet-search-ignore` + repo-size cap).
|
|
3
|
+
*
|
|
4
|
+
* Plan § 11 ("Default deny-list (independent of `.gitignore`) catches"),
|
|
5
|
+
* § 22.9 ("Many tiny files / Cursor's 400 k freeze") and § 14.2.7
|
|
6
|
+
* ("resolved-exclude fingerprint"). Three concerns combined:
|
|
7
|
+
*
|
|
8
|
+
* 1. **Default deny-list** for paths that should never reach the
|
|
9
|
+
* reconcile dirty set, independent of project `.gitignore`. This
|
|
10
|
+
* catches `node_modules`, build dirs, common artifact extensions,
|
|
11
|
+
* etc.
|
|
12
|
+
* 2. **`.sweet-search-ignore`** — a project-level ignore file with
|
|
13
|
+
* gitignore-compatible patterns. Allows users to opt out of paths
|
|
14
|
+
* that `.gitignore` doesn't already cover.
|
|
15
|
+
* 3. **Repo-size cap** — Cursor froze at ~400 k indexed files; we cap
|
|
16
|
+
* at 200 k by default with a warning at 50 % of cap.
|
|
17
|
+
*
|
|
18
|
+
* Pattern matching is intentionally simple — substring + extension +
|
|
19
|
+
* directory checks. Full gitignore semantics live in
|
|
20
|
+
* `core/infrastructure/config/search.js::loadProjectConfig` which the
|
|
21
|
+
* reconcile path consults for the authoritative exclude list.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import fs from 'node:fs';
|
|
25
|
+
import path from 'node:path';
|
|
26
|
+
import { loadProjectConfig } from '../../infrastructure/config/search.js';
|
|
27
|
+
|
|
28
|
+
// Path-segment names that are excluded by default. `buildPathFilter` copies
// this list into a Set and rejects any path that contains one of these names
// as a segment.
const DEFAULT_DENY_DIRS = Object.freeze([
  'node_modules', '.git', '.sweet-search',
  'dist', 'build', '.next', '.nuxt', 'target', 'vendor',
  '__pycache__', '.venv', 'venv',
  '.cache', '.turbo', 'coverage', '.parcel-cache', '.svelte-kit', '.vercel',
]);
|
|
48
|
+
|
|
49
|
+
// File-name suffixes that are excluded by default. Single-dot entries are
// compared against `path.extname()` of the base name; the multi-dot entries
// (`.min.js`, `.min.css`, `.bundle.js`) are matched as base-name suffixes by
// `buildPathFilter`.
const DEFAULT_DENY_EXTS = Object.freeze([
  '.lock', '.lockb',
  '.min.js', '.min.css', '.map', '.bundle.js',
  '.pyc',
  '.so', '.dylib', '.dll', '.exe', '.bin', '.wasm',
]);
|
|
64
|
+
|
|
65
|
+
/** Default maximum file count used by `evaluateRepoSizeCap` when `opts.cap` is not given. */
export const DEFAULT_REPO_SIZE_CAP = 200_000;

/** Default fraction of the cap at which `evaluateRepoSizeCap` starts reporting `warn: true`. */
export const DEFAULT_REPO_SIZE_WARN_FRAC = 0.5;
|
|
67
|
+
|
|
68
|
+
/**
 * Parse a `.sweet-search-ignore` file into a list of pattern strings.
 *
 * The format is gitignore-like: one pattern per line, surrounding whitespace
 * is trimmed, blank lines and lines starting with `#` are skipped. Patterns
 * are returned verbatim (no validation happens here).
 *
 * The file is read directly instead of being probed with `existsSync` first,
 * so a file that disappears between the check and the read can no longer
 * make this function throw.
 *
 * @param {string} filePath
 * @returns {string[]} Patterns in file order; `[]` when the file does not exist.
 * @throws {Error} Any read error other than "file not found" (e.g. EISDIR).
 */
export function loadIgnoreFile(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf-8');
  } catch (err) {
    // A missing file (or a missing/non-directory parent) means "no patterns".
    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') return [];
    throw err;
  }
  return raw
    .split('\n')
    .map((rawLine) => rawLine.trim()) // trim() also removes a trailing `\r`
    .filter((line) => line !== '' && !line.startsWith('#'));
}
|
|
87
|
+
|
|
88
|
+
/**
 * Convert one gitignore-style pattern into a RegExp that is tested against a
 * forward-slash relative path.
 *
 * Supported subset:
 *   - `*`  matches any run of characters except `/`
 *   - `?`  matches one character except `/`
 *   - a leading or inner `**` followed by `/` matches zero or more directories,
 *     so `**` + `/package-lock.json` also matches a root-level `package-lock.json`
 *   - trailing `/**` matches the directory itself and everything below it
 *   - any other `**` matches anything, including `/`
 *   - trailing `/` marks a directory pattern (matches the directory and its contents)
 *   - a leading `/`, or any `/` inside the pattern, anchors it to the root;
 *     otherwise the pattern may match at any depth
 *
 * Backslashes are normalised to `/` first, so backslash escapes are not
 * supported. Every other character is matched literally.
 *
 * @param {string} pattern
 * @returns {RegExp}
 */
function patternToRegex(pattern) {
  let p = String(pattern || '').replace(/\\/g, '/');

  // Remember the leading slash BEFORE stripping it: `/build` must only match
  // a root-level `build`, not `src/build`.
  const anchored = p.startsWith('/');
  if (anchored) p = p.slice(1);

  const dirOnly = p.endsWith('/');
  if (dirOnly) p = p.slice(0, -1);

  // A trailing slash alone does not anchor (it was removed above).
  const rooted = anchored || p.includes('/');

  let body = '';
  for (let i = 0; i < p.length; i++) {
    if (p.startsWith('**/', i)) {
      body += '(?:.*/)?';
      i += 2; // consume the whole `**/` token (the loop adds the final +1)
      continue;
    }
    if (p.startsWith('/**', i) && i + 3 === p.length) {
      body += '(?:/.*)?';
      i += 2;
      continue;
    }
    if (p.startsWith('**', i)) {
      body += '.*';
      i += 1;
      continue;
    }
    const ch = p[i];
    if (ch === '*') body += '[^/]*';
    else if (ch === '?') body += '[^/]';
    else body += ch.replace(/[.+^${}()|[\]\\]/g, '\\$&');
  }

  const prefix = rooted ? '^' : '(?:^|.*/)';
  const suffix = dirOnly ? '(?:/.*)?$' : '$';
  return new RegExp(prefix + body + suffix);
}
|
|
125
|
+
|
|
126
|
+
/**
 * Build a path-filter predicate from the default deny-lists, the project
 * configuration, caller-supplied patterns and the ignore file.
 *
 * The returned function yields `true` for paths that should be **excluded**.
 *
 * Pattern sources, in order:
 *   1. `loadProjectConfig(opts.projectRoot).exclude` (only when `projectRoot` is set);
 *   2. `opts.extraPatterns`;
 *   3. the ignore file: `opts.ignoreFile`, or `<projectRoot>/.sweet-search-ignore`.
 *
 * With `opts.allowSweetSearchDir` the `.sweet-search` directory is removed
 * from the default deny-list and project-config patterns containing
 * `.sweet-search` are skipped.
 *
 * Extension and compound-suffix checks are both case-insensitive
 * (`APP.MIN.JS` is excluded just like `app.min.js`). Compound suffixes are
 * derived from `DEFAULT_DENY_EXTS` rather than kept in a second list.
 *
 * @param {{projectRoot?:string, ignoreFile?:string, extraPatterns?:string[], allowSweetSearchDir?:boolean}} [opts]
 * @returns {(relativePath:string)=>boolean}
 */
export function buildPathFilter(opts = {}) {
  const patterns = [];
  if (opts.projectRoot) {
    for (const p of loadProjectConfig(opts.projectRoot).exclude || []) {
      if (opts.allowSweetSearchDir && String(p).includes('.sweet-search')) continue;
      patterns.push(p);
    }
  }
  for (const p of (opts.extraPatterns || [])) patterns.push(p);
  const ignoreFile = opts.ignoreFile
    || (opts.projectRoot ? path.join(opts.projectRoot, '.sweet-search-ignore') : null);
  if (ignoreFile) {
    for (const p of loadIgnoreFile(ignoreFile)) patterns.push(p);
  }
  const regexes = patterns.map(patternToRegex);
  const denyDirs = new Set(DEFAULT_DENY_DIRS);
  if (opts.allowSweetSearchDir) denyDirs.delete('.sweet-search');
  const denyExts = new Set(DEFAULT_DENY_EXTS);
  // `path.extname()` only returns the last dot segment (`.js` for `a.min.js`),
  // so multi-dot deny entries can never hit the Set lookup and must be
  // matched as base-name suffixes instead.
  const compoundSuffixes = DEFAULT_DENY_EXTS.filter((ext) => ext.indexOf('.', 1) !== -1);

  return function isExcluded(relativePath) {
    // Anything that is not a string path is rejected outright.
    if (typeof relativePath !== 'string') return true;
    const norm = relativePath.replace(/\\/g, '/');
    const parts = norm.split('/');
    // Every segment is checked, including the final one.
    for (const part of parts) {
      if (denyDirs.has(part)) return true;
    }
    const base = (parts[parts.length - 1] || '').toLowerCase();
    if (denyExts.has(path.extname(base))) return true;
    for (const compound of compoundSuffixes) {
      if (base.endsWith(compound)) return true;
    }
    for (const re of regexes) {
      if (re.test(norm)) return true;
    }
    return false;
  };
}
|
|
173
|
+
|
|
174
|
+
/**
 * Evaluate a file count against the repo-size cap.
 *
 * `ok` is true while the count does not exceed the cap; `warn` becomes true
 * once the count reaches `cap * warnFrac`. Missing (null/undefined) options
 * fall back to `DEFAULT_REPO_SIZE_CAP` and `DEFAULT_REPO_SIZE_WARN_FRAC`.
 *
 * @param {number} fileCount
 * @param {{cap?:number, warnFrac?:number}} [opts]
 * @returns {{ok:boolean, warn:boolean, cap:number, fileCount:number}}
 */
export function evaluateRepoSizeCap(fileCount, opts = {}) {
  const limit = opts.cap ?? DEFAULT_REPO_SIZE_CAP;
  const fraction = opts.warnFrac ?? DEFAULT_REPO_SIZE_WARN_FRAC;
  const warnThreshold = limit * fraction;

  const withinCap = fileCount <= limit;
  const nearCap = fileCount >= warnThreshold;
  return { ok: withinCap, warn: nearCap, cap: limit, fileCount };
}
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reader heartbeat / grace policy.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 8.1.1. Strict row visibility requires bounded history retention:
|
|
5
|
+
* retired rows cannot be physically pruned until every live reader has
|
|
6
|
+
* advanced past their epoch. We track each reader's pinned epoch via a
|
|
7
|
+
* small JSON file under `.sweet-search/readers/<pid>-<boot>-<read>.json`
|
|
8
|
+
* so the maintenance scheduler can compute `min_live_epoch` across
|
|
9
|
+
* non-stale heartbeats. The per-read token is load-bearing: a long-lived
|
|
10
|
+
* MCP/server process can run concurrent queries pinned to different
|
|
11
|
+
* manifest epochs, and those pins must not overwrite each other.
|
|
12
|
+
*
|
|
13
|
+
* Lifecycle:
|
|
14
|
+
* - Each reader process (sweet-search CLI, MCP server, etc.) calls
|
|
15
|
+
* `beginRead(stateDir, epoch)` before a query and `endRead` when it
|
|
16
|
+
* finishes. The heartbeat file holds `{ epoch, pid, bootId, readId, startedAt }`.
|
|
17
|
+
* - The reconcile maintenance worker enumerates the heartbeats and:
|
|
18
|
+
* - drops files whose process no longer exists,
|
|
19
|
+
* - returns `min({live readers}.epoch)` as the prune frontier.
|
|
20
|
+
* - Heartbeats older than READER_GRACE_MS without a live pid are
|
|
21
|
+
 *     deleted from disk; younger ones are skipped but left in place.
|
|
22
|
+
*
|
|
23
|
+
* This file is pure I/O. The reconciler / maintenance worker uses it to
|
|
24
|
+
* compute `min_live_epoch` but does not block on it (heartbeats are
|
|
25
|
+
* advisory; correctness is preserved by tombstone-then-prune).
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import fs from 'node:fs';
|
|
29
|
+
import os from 'node:os';
|
|
30
|
+
import path from 'node:path';
|
|
31
|
+
|
|
32
|
+
/** Age (ms) past which a dead or malformed heartbeat file is deleted; one hour by default. */
export const READER_GRACE_MS = 1000 * 60 * 60;

// Name of the sub-directory of the state dir that holds heartbeat files.
const HEARTBEAT_DIR = 'readers';

// Per-process counter mixed into every read id by `nextReadId`.
let heartbeatSeq = 0;
|
|
35
|
+
|
|
36
|
+
// Resolve the directory that holds heartbeat files for the given state dir.
function heartbeatDir(stateDir) {
  const readersDir = path.join(stateDir, HEARTBEAT_DIR);
  return readersDir;
}
|
|
39
|
+
|
|
40
|
+
// Build the heartbeat file path: `<pid>-<bootId>.json`, or
// `<pid>-<bootId>-<readId>.json` when a (truthy) read id is supplied.
function heartbeatPath(stateDir, pid, bootId, readId = null) {
  let stem = `${pid}-${bootId}`;
  if (readId) {
    stem += `-${readId}`;
  }
  return path.join(heartbeatDir(stateDir), `${stem}.json`);
}
|
|
44
|
+
|
|
45
|
+
// Produce a per-read token `<time>-<seq>-<rand>`, all parts base-36:
// the current timestamp, a 32-bit wrapping per-process counter, and up to six
// characters of `Math.random()` output.
function nextReadId() {
  heartbeatSeq = (heartbeatSeq + 1) >>> 0;
  const pieces = [
    Date.now().toString(36),
    heartbeatSeq.toString(36),
    Math.random().toString(36).slice(2, 8),
  ];
  return pieces.join('-');
}
|
|
52
|
+
|
|
53
|
+
// Memoized result of `bootIdStub()`. The boot id cannot change while this
// process is alive, so it is computed once.
let cachedBootId = null;

/**
 * Returns a coarse boot-id stand-in.
 *
 * On systems that expose `/proc/sys/kernel/random/boot_id`, the sanitized
 * contents of that file are used. Everywhere else the boot time is estimated
 * as `now - os.uptime()`, rounded to the nearest minute, and returned as
 * `boot-<epochSeconds>`.
 *
 * The value is computed once per process and cached. Without the cache the
 * fallback could flip between adjacent values within one process (wall clock
 * and uptime are sampled independently), which made `isReaderAlive` reject
 * live readers. Minute rounding keeps independent processes in agreement in
 * all but the rare case where their estimates straddle a rounding boundary.
 *
 * @returns {string}
 */
export function bootIdStub() {
  if (cachedBootId !== null) return cachedBootId;

  let bootId = '';
  try {
    const procBoot = '/proc/sys/kernel/random/boot_id';
    if (fs.existsSync(procBoot)) {
      const raw = fs.readFileSync(procBoot, 'utf-8').trim();
      // Keep only characters that are safe inside a file name.
      bootId = raw.replace(/[^a-zA-Z0-9-]/g, '');
    }
  } catch {
    // Unreadable kernel boot id — fall through to the uptime estimate.
  }

  if (bootId === '') {
    const bootSeconds = Date.now() / 1000 - os.uptime();
    const epochSeconds = Math.round(bootSeconds / 60) * 60;
    bootId = `boot-${epochSeconds}`;
  }

  cachedBootId = bootId;
  return cachedBootId;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Record a reader heartbeat for the current process.
 *
 * Creates the heartbeat directory if needed and writes a new JSON file named
 * after the pid, boot id and a freshly generated read id. Every call produces
 * its own file, so it is safe to call before every query.
 *
 * @param {string} stateDir
 * @param {number} epoch  Epoch the reader is pinned to.
 * @param {object} [meta] Optional caller-supplied metadata (mcp-session-id,
 *   query, etc.) — stored verbatim for diagnostics.
 * @returns {{pid:number, bootId:string, readId:string, path:string}}
 */
export function beginRead(stateDir, epoch, meta = {}) {
  fs.mkdirSync(heartbeatDir(stateDir), { recursive: true });

  const { pid } = process;
  const bootId = bootIdStub();
  const readId = nextReadId();
  const filePath = heartbeatPath(stateDir, pid, bootId, readId);

  const startedAt = new Date().toISOString();
  const serialized = JSON.stringify({ epoch, pid, bootId, readId, startedAt, meta });
  fs.writeFileSync(filePath, serialized);

  return { pid, bootId, readId, path: filePath };
}
|
|
105
|
+
|
|
106
|
+
/**
 * Remove a heartbeat file once the read has finished.
 *
 * Uses `record.path` when it is a string; otherwise the path is rebuilt from
 * the record's pid / boot id / read id, falling back to the current process
 * values for missing fields. Failures to delete are ignored.
 *
 * @param {string} stateDir
 * @param {{pid:number, bootId:string, readId?:string, path?:string}|undefined} record
 *   Return value of beginRead.
 */
export function endRead(stateDir, record) {
  const pid = record?.pid ?? process.pid;
  const bootId = record?.bootId ?? bootIdStub();

  let target;
  if (typeof record?.path === 'string') {
    target = record.path;
  } else {
    target = heartbeatPath(stateDir, pid, bootId, record?.readId ?? null);
  }

  try {
    fs.unlinkSync(target);
  } catch {
    // Best effort — the file may already have been swept by the maintenance scheduler.
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Best-effort liveness check for a recorded reader.
 *
 * A reader counts as alive when its pid is a positive integer, its boot id
 * equals the current one, and signal 0 can be delivered to the pid. An EPERM
 * failure still counts as alive; any other failure counts as dead.
 *
 * @param {number} pid
 * @param {string} bootId
 * @returns {boolean}
 */
export function isReaderAlive(pid, bootId) {
  const pidLooksValid = Number.isInteger(pid) && pid > 0;
  if (!pidLooksValid) return false;

  const sameBoot = bootId === bootIdStub();
  if (!sameBoot) return false;

  try {
    process.kill(pid, 0);
  } catch (err) {
    return err.code === 'EPERM';
  }
  return true;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Enumerate heartbeat files and return the records of readers that are
 * still alive, sorted by pinned epoch (ascending).
 *
 * Files that cannot be parsed, that do not contain an object with an integer
 * `epoch`, integer `pid` and string `bootId`, or whose reader is no longer
 * alive are left out of the result and handed to `tryUnlinkAfterGrace`.
 *
 * A file holding valid JSON that is not an object (notably `null`) is treated
 * as malformed instead of throwing, so one bad file cannot abort the sweep.
 *
 * @param {string} stateDir
 * @returns {Array<{epoch:number, pid:number, bootId:string, readId?:string, startedAt:string, meta:object}>}
 */
export function liveReaders(stateDir) {
  const dir = heartbeatDir(stateDir);
  if (!fs.existsSync(dir)) return [];
  const out = [];
  for (const name of fs.readdirSync(dir)) {
    const p = path.join(dir, name);
    let payload;
    try {
      payload = JSON.parse(fs.readFileSync(p, 'utf-8'));
    } catch {
      // Unreadable or malformed — drop after grace.
      tryUnlinkAfterGrace(p);
      continue;
    }
    const wellFormed = payload !== null
      && typeof payload === 'object'
      && Number.isInteger(payload.epoch)
      && Number.isInteger(payload.pid)
      && typeof payload.bootId === 'string';
    if (!wellFormed) {
      tryUnlinkAfterGrace(p);
      continue;
    }
    if (!isReaderAlive(payload.pid, payload.bootId)) {
      tryUnlinkAfterGrace(p);
      continue;
    }
    out.push(payload);
  }
  return out.sort((a, b) => a.epoch - b.epoch);
}
|
|
178
|
+
|
|
179
|
+
// Delete the file at `p` when its mtime is more than READER_GRACE_MS in the
// past. Any stat/unlink failure is ignored.
function tryUnlinkAfterGrace(p) {
  try {
    const { mtimeMs } = fs.statSync(p);
    const pastGrace = Date.now() - mtimeMs > READER_GRACE_MS;
    if (pastGrace) {
      fs.unlinkSync(p);
    }
  } catch {
    // ignore
  }
}
|
|
188
|
+
|
|
189
|
+
/**
 * Compute the prune frontier: the smallest epoch pinned by any live reader.
 *
 * Relies on `liveReaders` returning its records sorted by ascending epoch.
 *
 * @param {string} stateDir
 * @returns {number|null} The minimum pinned epoch, or `null` when there are
 *   no live readers.
 */
export function minLiveEpoch(stateDir) {
  const [oldest] = liveReaders(stateDir);
  return oldest === undefined ? null : oldest.epoch;
}
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema migrations for the incremental-indexing bounded context.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.1.6, § 7.2, § 13 Phase 1, § 33 Phase 1 Pre-Merge Checklist.
|
|
5
|
+
*
|
|
6
|
+
* Adds the strict-row-visibility and exact-encoder-input columns required by
|
|
7
|
+
* the reconcile path. All columns carry `DEFAULT` clauses so an older daemon
|
|
8
|
+
* running the original INSERT path (e.g. after a git rollback) does NOT crash
|
|
9
|
+
* with `SQLITE_CONSTRAINT_NOTNULL`. This is load-bearing: without the
|
|
10
|
+
* defaults a rollback would put the daemon into a permanent crash-loop.
|
|
11
|
+
*
|
|
12
|
+
* The migrations are idempotent: `ALTER TABLE ... ADD COLUMN` is skipped when
|
|
13
|
+
* the column already exists, mirroring the pattern in
|
|
14
|
+
* `core/graph/graph-extractor.js`.
|
|
15
|
+
*
|
|
16
|
+
* Index choice: the epoch visibility index is a full B-tree on
|
|
17
|
+
* `epoch_written` rather than a partial recent-window index. Plan § 0 / § 36.5
|
|
18
|
+
* requires Phase 0 benchmarking before committing to the partial form;
|
|
19
|
+
* SQLite's single-writer monotonic-integer append is already a fast path, so
|
|
20
|
+
* we default to the full B-tree and revisit only if Phase 0 measurement shows
|
|
21
|
+
* insertion latency creep.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
/**
 * Add a column to a table unless `PRAGMA table_info` already lists it.
 * Private helper of this module; `table`, `column` and `definition` are
 * interpolated into the SQL text as-is.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} table
 * @param {string} column
 * @param {string} definition e.g. `"TEXT NOT NULL DEFAULT ''"`.
 * @returns {boolean} true if the column was added; false if it already existed.
 */
function addColumnIfMissing(db, table, column, definition) {
  const existing = db.prepare(`PRAGMA table_info(${table})`).all();
  for (const info of existing) {
    if (info.name === column) return false;
  }
  db.exec(`ALTER TABLE ${table} ADD COLUMN ${column} ${definition}`);
  return true;
}
|
|
40
|
+
|
|
41
|
+
// Execute an index-creation statement. Idempotence comes from the
// `CREATE INDEX IF NOT EXISTS` in the SQL the callers pass; `indexName` is
// accepted for symmetry with addColumnIfMissing (e.g. future logging) and is
// otherwise unused.
function createIndexIfMissing(db, indexName, sql) {
  void indexName;
  db.exec(sql);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Apply the reconcile-v2 migration to the `vectors` table.
 *
 * Columns added when missing (plan § 7.2):
 *   - `chunk_struct_id`, `chunk_text_hash`, `embedding_input_hash`,
 *     `li_input_hash`, `metadata_fingerprint`, `logical_chunk_id`
 *     — each `TEXT NOT NULL DEFAULT ''`
 *   - `epoch_written INTEGER NOT NULL DEFAULT 0`
 *   - `epoch_retired INTEGER`
 *
 * Indexes created with `IF NOT EXISTS`: `idx_vectors_struct`,
 * `idx_vectors_epoch_written`, `idx_vectors_epoch_retired`,
 * `idx_vectors_logical`.
 *
 * When the `vectors` table does not exist the function does nothing and
 * reports no added columns. Idempotent; safe to call on every daemon start.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{added: string[]}} Names of the columns added by this call.
 */
export function migrateVectorsSchema(db) {
  const added = [];

  // Per the original note, the table is created elsewhere (by
  // `createVectorSchema`); without it there is nothing to migrate.
  const tableRow = db
    .prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors'")
    .get();
  if (!tableRow) return { added };

  const textColumn = "TEXT NOT NULL DEFAULT ''";
  const columnSpecs = [
    { name: 'chunk_struct_id', definition: textColumn },
    { name: 'chunk_text_hash', definition: textColumn },
    { name: 'embedding_input_hash', definition: textColumn },
    { name: 'li_input_hash', definition: textColumn },
    { name: 'metadata_fingerprint', definition: textColumn },
    { name: 'logical_chunk_id', definition: textColumn },
    { name: 'epoch_written', definition: 'INTEGER NOT NULL DEFAULT 0' },
    { name: 'epoch_retired', definition: 'INTEGER' },
  ];
  for (const { name, definition } of columnSpecs) {
    const wasAdded = addColumnIfMissing(db, 'vectors', name, definition);
    if (wasAdded) added.push(name);
  }

  const indexSpecs = [
    [
      'idx_vectors_struct',
      "CREATE INDEX IF NOT EXISTS idx_vectors_struct ON vectors(chunk_struct_id) WHERE chunk_struct_id != ''",
    ],
    [
      'idx_vectors_epoch_written',
      'CREATE INDEX IF NOT EXISTS idx_vectors_epoch_written ON vectors(epoch_written)',
    ],
    [
      'idx_vectors_epoch_retired',
      'CREATE INDEX IF NOT EXISTS idx_vectors_epoch_retired ON vectors(epoch_retired) WHERE epoch_retired IS NOT NULL',
    ],
    [
      'idx_vectors_logical',
      "CREATE INDEX IF NOT EXISTS idx_vectors_logical ON vectors(logical_chunk_id) WHERE logical_chunk_id != ''",
    ],
  ];
  for (const [indexName, sql] of indexSpecs) {
    createIndexIfMissing(db, indexName, sql);
  }

  return { added };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Apply the reconcile-v2 migration to the `entities` table.
 *
 * Columns added when missing (plan § 7.1.6):
 *   - `logical_entity_id TEXT NOT NULL DEFAULT ''`
 *   - `epoch_written INTEGER NOT NULL DEFAULT 0`
 *   - `epoch_retired INTEGER`
 *
 * Indexes created with `IF NOT EXISTS`: `idx_entities_logical`,
 * `idx_entities_epoch_written`, `idx_entities_epoch_retired`.
 *
 * No existing column is altered or removed. When the `entities` table does
 * not exist the function does nothing and reports no added columns.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{added: string[]}} Names of the columns added by this call.
 */
export function migrateEntitiesSchema(db) {
  const added = [];

  const tableRow = db
    .prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='entities'")
    .get();
  if (!tableRow) return { added };

  const columnSpecs = [
    ['logical_entity_id', "TEXT NOT NULL DEFAULT ''"],
    ['epoch_written', 'INTEGER NOT NULL DEFAULT 0'],
    ['epoch_retired', 'INTEGER'],
  ];
  for (const [name, definition] of columnSpecs) {
    const wasAdded = addColumnIfMissing(db, 'entities', name, definition);
    if (wasAdded) added.push(name);
  }

  const indexSpecs = [
    [
      'idx_entities_logical',
      "CREATE INDEX IF NOT EXISTS idx_entities_logical ON entities(logical_entity_id) WHERE logical_entity_id != ''",
    ],
    [
      'idx_entities_epoch_written',
      'CREATE INDEX IF NOT EXISTS idx_entities_epoch_written ON entities(epoch_written)',
    ],
    [
      'idx_entities_epoch_retired',
      'CREATE INDEX IF NOT EXISTS idx_entities_epoch_retired ON entities(epoch_retired) WHERE epoch_retired IS NOT NULL',
    ],
  ];
  for (const [indexName, sql] of indexSpecs) {
    createIndexIfMissing(db, indexName, sql);
  }

  return { added };
}
|
|
164
|
+
|
|
165
|
+
/**
 * Bring the `relationships` table up to the reconcile-v2 layout.
 *
 * Columns ensured (plan § 7.1.6 / § 33), in this order:
 *   - `logical_relationship_id TEXT NOT NULL DEFAULT ''`
 *   - `epoch_written INTEGER NOT NULL DEFAULT 0`
 *   - `epoch_retired INTEGER`
 *
 * Afterwards three indexes are ensured: a partial index over non-empty
 * `logical_relationship_id`, an index over `epoch_written`, and a partial
 * index over non-NULL `epoch_retired`.
 *
 * If `sqlite_master` has no `relationships` table, nothing is altered and
 * the returned `added` list is empty.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{added: string[]}} the columns for which `addColumnIfMissing`
 *   reported an addition, in the order listed above
 */
export function migrateRelationshipsSchema(db) {
  const TABLE = 'relationships';

  const existing = db
    .prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='relationships'")
    .get();
  if (!existing) {
    return { added: [] };
  }

  // Every column is attempted, in order; only the ones actually added are reported.
  const added = [
    { column: 'logical_relationship_id', definition: "TEXT NOT NULL DEFAULT ''" },
    { column: 'epoch_written', definition: 'INTEGER NOT NULL DEFAULT 0' },
    { column: 'epoch_retired', definition: 'INTEGER' },
  ]
    .filter(({ column, definition }) => addColumnIfMissing(db, TABLE, column, definition))
    .map(({ column }) => column);

  // Indexes are created after the columns they reference.
  const indexes = [
    {
      name: 'idx_rel_logical',
      sql: "CREATE INDEX IF NOT EXISTS idx_rel_logical ON relationships(logical_relationship_id) WHERE logical_relationship_id != ''",
    },
    {
      name: 'idx_rel_epoch_written',
      sql: 'CREATE INDEX IF NOT EXISTS idx_rel_epoch_written ON relationships(epoch_written)',
    },
    {
      name: 'idx_rel_epoch_retired',
      sql: 'CREATE INDEX IF NOT EXISTS idx_rel_epoch_retired ON relationships(epoch_retired) WHERE epoch_retired IS NOT NULL',
    },
  ];
  for (const { name, sql } of indexes) {
    createIndexIfMissing(db, name, sql);
  }

  return { added };
}
|
|
209
|
+
|
|
210
|
+
/**
 * Ensure the encoder-input dependency sidecar exists
 * (plan § 7.2.1 / § 13 Phase 1).
 *
 * The `encoder_input_dependencies` table records reverse dependencies from
 * external facts to dependent chunks as
 *   `(dependency_key, file_path, chunk_struct_id, consumer)`.
 * All four columns together form the primary key and the table is declared
 * `WITHOUT ROWID`.
 *
 * `consumer` is constrained to `dense | li | dedup`, so the reconciler can
 * mark a chunk metadata-dirty for the specific consumer whose input changed.
 * Future cross-file metadata rules register dependency keys whose changes
 * expand the dirty set; the table also holds the dense / LI / dedup
 * dependencies for same-file metadata edits.
 *
 * A secondary index on `(file_path, chunk_struct_id)` is ensured as well.
 * Both statements use `IF NOT EXISTS`.
 *
 * @param {import('better-sqlite3').Database} db
 */
export function ensureEncoderDepsSchema(db) {
  const createTableSql = `
    CREATE TABLE IF NOT EXISTS encoder_input_dependencies (
      dependency_key TEXT NOT NULL,
      file_path TEXT NOT NULL,
      chunk_struct_id TEXT NOT NULL,
      consumer TEXT NOT NULL CHECK (consumer IN ('dense', 'li', 'dedup')),
      PRIMARY KEY (dependency_key, file_path, chunk_struct_id, consumer)
    ) WITHOUT ROWID;
  `;
  const createChunkIndexSql = `
    CREATE INDEX IF NOT EXISTS idx_encoder_deps_by_chunk
      ON encoder_input_dependencies (file_path, chunk_struct_id);
  `;

  // The table must exist before the index that targets it.
  db.exec(createTableSql);
  db.exec(createChunkIndexSql);
}
|
|
239
|
+
|
|
240
|
+
/**
 * Apply all reconcile-v2 schema migrations in one call.
 *
 * Each destination table has its own focused helper for callers that need
 * just one of them; this umbrella is the common path for the reconcile
 * bootstrap. Steps run in a fixed order:
 *   1. `migrateVectorsSchema` on the `vectors` database
 *   2. `migrateEntitiesSchema` on the `codeGraph` database
 *   3. `migrateRelationshipsSchema` on the `codeGraph` database
 *   4. `ensureEncoderDepsSchema` on the `codeGraph` database
 *      (its result is not part of the report)
 *
 * @param {{ codeGraph: import('better-sqlite3').Database, vectors: import('better-sqlite3').Database }} dbs
 * @returns {{vectors:{added:string[]}, entities:{added:string[]}, relationships:{added:string[]}}}
 */
export function applyReconcileSchemaMigrations(dbs) {
  const graphDb = dbs.codeGraph;
  const vectorDb = dbs.vectors;

  // Object-literal properties evaluate top to bottom, which fixes the run order.
  const report = {
    vectors: migrateVectorsSchema(vectorDb),
    entities: migrateEntitiesSchema(graphDb),
    relationships: migrateRelationshipsSchema(graphDb),
  };

  ensureEncoderDepsSchema(graphDb);
  return report;
}
|