moflo 4.10.0 → 4.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/healer/SKILL.md +3 -1
- package/bin/lib/db-repair.mjs +358 -41
- package/bin/session-start-launcher.mjs +42 -6
- package/dist/src/cli/commands/doctor-checks-config.js +60 -0
- package/dist/src/cli/commands/doctor-checks-memory-access.js +27 -1
- package/dist/src/cli/commands/doctor-embedding-hygiene.js +48 -12
- package/dist/src/cli/commands/doctor-fixes.js +57 -0
- package/dist/src/cli/commands/doctor-registry.js +10 -1
- package/dist/src/cli/commands/doctor-render.js +118 -74
- package/dist/src/cli/commands/doctor.js +70 -25
- package/dist/src/cli/memory/bridge-core.js +36 -0
- package/dist/src/cli/memory/bridge-embedder.js +84 -3
- package/dist/src/cli/memory/memory-initializer.js +2 -2
- package/dist/src/cli/services/ephemeral-namespace-purge.js +15 -5
- package/dist/src/cli/services/memory-db-integrity-repair.js +119 -0
- package/dist/src/cli/version.js +1 -1
- package/package.json +2 -2
|
@@ -30,7 +30,9 @@ Thin wrapper around the `flo healer` CLI. All check + fix logic lives in the CLI
|
|
|
30
30
|
- `✓ N passing` (count only)
|
|
31
31
|
- `⚠ warnings` — list `name: message`; flag with `[auto-fixable]` when the result has a `fix` field
|
|
32
32
|
- `✗ failures` — same
|
|
33
|
-
- If `--fix` mode,
|
|
33
|
+
- If `--fix` mode, read `fixesApplied[]` from the JSON payload and list `{name, applied}` per entry — applied=true → "fixed", applied=false → "needs manual action". The `results[]` array is post-fix state (re-evaluated), so report the final status.
|
|
34
|
+
- If `--install` was passed, surface `claudeCodeInstall.installed` from the payload.
|
|
35
|
+
- If `--kill-zombies` was passed, surface `zombieScan.killed` / `zombieScan.found` from the payload.
|
|
34
36
|
|
|
35
37
|
4. **Nudge based on what changed.** Only mention next steps for state that *actually* changed:
|
|
36
38
|
- Daemon restarted → `Statusline should refresh within ~5s.`
|
package/bin/lib/db-repair.mjs
CHANGED
|
@@ -1,32 +1,54 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Memory-DB integrity check +
|
|
2
|
+
* Memory-DB integrity check + tiered repair (#743, #1090-followup).
|
|
3
3
|
*
|
|
4
|
-
* The `.moflo/moflo.db` SQLite file
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* concurrent writes (
|
|
4
|
+
* The `.moflo/moflo.db` SQLite file picks up corruption in two distinct modes:
|
|
5
|
+
*
|
|
6
|
+
* 1. **Index drift** — `row N missing from sqlite_autoindex_memory_entries_1`.
|
|
7
|
+
* Row data is intact; only the unique-key b-tree is wrong. Trigger: sql.js's
|
|
8
|
+
* whole-file dump-on-flush racing with concurrent writes (#714, #743 —
|
|
9
|
+
* fixed for new installs by Phase 5 / #1084 which removed sql.js entirely).
|
|
10
|
+
* **REINDEX** rebuilds the index from canonical row data.
|
|
11
|
+
*
|
|
12
|
+
* 2. **Table b-tree corruption** — `Tree N page M cell K: Rowid X out of
|
|
13
|
+
* order`, where Tree N is a TABLE root page (not just an index). Row data
|
|
14
|
+
* is partly intact, but page ordering is broken. Triggers we've seen:
|
|
15
|
+
* - sql.js → node:sqlite migration: an old 4.9.x sql.js daemon flushes its
|
|
16
|
+
* full-file dump OVER a WAL frame that the new 4.10 backend has already
|
|
17
|
+
* written, leaving WAL referencing pages that no longer exist in main.
|
|
18
|
+
* - Concurrent multi-process writes when the daemon was disabled (#981).
|
|
19
|
+
* **REINDEX cannot fix this** — the table itself is broken. Recovery path:
|
|
20
|
+
* a) `VACUUM INTO` a fresh file (single-shot rebuild; fails fast if
|
|
21
|
+
* iteration hits an unreadable page),
|
|
22
|
+
* b) row-level salvage — chunked `SELECT rowid > ?` per table, catching
|
|
23
|
+
* per-chunk errors and skipping past corrupt page ranges,
|
|
24
|
+
* c) atomic swap with .corrupt.<TS> backup retained for forensics.
|
|
25
|
+
*
|
|
26
|
+
* 3. **Unrecoverable** — header damage, encrypted-by-malware, etc. We can't
|
|
27
|
+
* fix this; surface a clear failure and let the user decide between manual
|
|
28
|
+
* `flo memory rebuild-index` (destructive) and offline recovery tools.
|
|
9
29
|
*
|
|
10
30
|
* Symptoms when uncorrected:
|
|
11
31
|
* - `index-guidance.mjs` and `index-patterns.mjs` fail mid-write with
|
|
12
32
|
* `database disk image is malformed`, leaving partial state.
|
|
13
33
|
* - The ephemeral-namespace purge (#729) fails silently, so hive-mind /
|
|
14
34
|
* tasklist / epic-state / test-bridge-fix rows accumulate.
|
|
15
|
-
* - Vector counts in the statusline stay inflated
|
|
16
|
-
*
|
|
35
|
+
* - Vector counts in the statusline stay inflated.
|
|
36
|
+
* - Healer's deep checks throw with "database disk image is malformed",
|
|
37
|
+
* surfacing as the synthetic 'Check' failure (doctor.ts:214).
|
|
17
38
|
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
* report; manual `flo memory rebuild-index` is the fallback.
|
|
22
|
-
*
|
|
23
|
-
* MUST run BEFORE any long-lived sql.js consumer (MCP server, daemon) opens
|
|
24
|
-
* the DB and BEFORE the embeddings migration / soft-delete purge / ephemeral
|
|
25
|
-
* purge — those all swallow corruption errors and silently no-op.
|
|
39
|
+
* MUST run BEFORE any long-lived consumer (MCP server, daemon) opens the DB
|
|
40
|
+
* and BEFORE the embeddings migration / soft-delete purge / ephemeral purge —
|
|
41
|
+
* those all swallow corruption errors and silently no-op.
|
|
26
42
|
*/
|
|
27
|
-
import { existsSync } from 'node:fs';
|
|
43
|
+
import { existsSync, renameSync, unlinkSync } from 'node:fs';
|
|
28
44
|
import { memoryDbPath } from './moflo-paths.mjs';
|
|
29
45
|
import { openBackend } from './get-backend.mjs';
|
|
46
|
+
import './suppress-sqlite-warning.mjs';
|
|
47
|
+
// Resolve node:sqlite once at module load — get-backend.mjs has already
|
|
48
|
+
// loaded it by this point, so the dynamic import is a cache hit. Avoids
|
|
49
|
+
// three independent `await import('node:sqlite')` calls inside the repair
|
|
50
|
+
// functions (style cleanup; was producing no functional difference).
|
|
51
|
+
const { DatabaseSync } = await import('node:sqlite');
|
|
30
52
|
|
|
31
53
|
function isOk(execResult) {
|
|
32
54
|
const rows = execResult?.[0]?.values ?? [];
|
|
@@ -38,42 +60,337 @@ function corruptionCount(execResult) {
|
|
|
38
60
|
}
|
|
39
61
|
|
|
40
62
|
/**
|
|
41
|
-
*
|
|
42
|
-
*
|
|
43
|
-
*
|
|
44
|
-
*
|
|
45
|
-
*
|
|
46
|
-
*
|
|
63
|
+
* Open `.moflo/moflo.db` raw via node:sqlite in readonly mode and run
|
|
64
|
+
* `PRAGMA integrity_check`. Bypasses {@link openBackend} because that path
|
|
65
|
+
* sets `journal_mode=WAL`, `busy_timeout`, and `synchronous=NORMAL` on every
|
|
66
|
+
* non-readonly open — those PRAGMAs can themselves throw against a corrupt
|
|
67
|
+
* file, and the pre-#1090 code path caught those throws and reported the DB
|
|
68
|
+
* as healthy. Readonly + no PRAGMAs = the probe always reaches the
|
|
69
|
+
* `integrity_check` call regardless of file health.
|
|
70
|
+
*
|
|
71
|
+
* Exported so the TS doctor check (`checkMemoryDbIntegrity` in
|
|
72
|
+
* `src/cli/commands/doctor-checks-config.ts`) can call into the same
|
|
73
|
+
* implementation instead of re-deriving the readonly-no-PRAGMAs probe.
|
|
74
|
+
*
|
|
75
|
+
* @param {string} dbPath
|
|
76
|
+
* @returns {Promise<{ ok: boolean, errors: number, openFailed?: boolean }>}
|
|
77
|
+
*/
|
|
78
|
+
export async function probeIntegrityRaw(dbPath) {
  // Single failure shape for "the check could not be run at all" — used both
  // when the readonly open throws and when the PRAGMA itself throws.
  const unreadable = () => ({ ok: false, errors: 0, openFailed: true });

  let handle;
  try {
    // Readonly, and deliberately no setup PRAGMAs before the probe.
    handle = new DatabaseSync(dbPath, { readOnly: true });
  } catch {
    return unreadable();
  }

  try {
    const findings = handle.prepare('PRAGMA integrity_check').all();
    // Healthy means exactly one row whose value is 'ok' (case-insensitive);
    // anything else is reported with the number of rows returned.
    const verdict = String(findings[0]?.integrity_check ?? '').toLowerCase();
    const healthy = findings.length === 1 && verdict === 'ok';
    return healthy
      ? { ok: true, errors: 0 }
      : { ok: false, errors: findings.length };
  } catch {
    return unreadable();
  } finally {
    try {
      handle.close();
    } catch {
      /* already-dead handle */
    }
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Tier-2 recovery: rebuild the database into a brand-new file with a single
 * `VACUUM INTO` statement.
 *
 * Never throws: every failure is reported as `{ ok: false, error }` so the
 * caller can fall back to row-level salvage.
 *
 * @param {string} srcPath  Path of the (possibly corrupt) database to read.
 * @param {string} dstPath  Path of the fresh file to write; any existing file
 *   at this path is removed first (best effort).
 * @returns {Promise<{ ok: boolean, error?: string }>}
 */
async function tryVacuumInto(srcPath, dstPath) {
  // Clear leftover output from an earlier attempt; failure to do so is not
  // fatal here — the VACUUM below will report it if it matters.
  try {
    if (existsSync(dstPath)) unlinkSync(dstPath);
  } catch {
    /* best effort */
  }

  const failure = (err, fallback) => ({ ok: false, error: err?.message ?? fallback });

  let source;
  try {
    // Opened writable (not readonly) and without the project's usual WAL
    // pragmas, which can themselves throw on a corrupt file.
    source = new DatabaseSync(srcPath);
  } catch (err) {
    return failure(err, 'open failed');
  }

  try {
    try {
      source.exec('PRAGMA wal_checkpoint(TRUNCATE)');
    } catch {
      /* corrupt WAL ok */
    }
    // The target is embedded as a SQL string literal, so single quotes in the
    // path are doubled.
    const quotedTarget = `'${dstPath.replace(/'/g, "''")}'`;
    source.exec(`VACUUM INTO ${quotedTarget}`);
    return { ok: true };
  } catch (err) {
    return failure(err, 'vacuum failed');
  } finally {
    try {
      source.close();
    } catch {
      /* */
    }
  }
}
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Tier-3 recovery: row-level salvage. Iterate each non-empty table in
|
|
132
|
+
* `rowid > ?` chunks; on any chunk-read failure, skip past that chunk's
|
|
133
|
+
* rowid range and continue. Per-table loss stats returned so the caller can
|
|
134
|
+
* surface what was preserved vs lost.
|
|
135
|
+
*
|
|
136
|
+
* Schema is copied verbatim from `sqlite_master.sql` so triggers/indexes/views
|
|
137
|
+
* are preserved alongside tables. `INSERT OR IGNORE` handles unique-key
|
|
138
|
+
* collisions from any duplicate-rowid corruption mode.
|
|
139
|
+
*
|
|
140
|
+
* @param {string} srcPath
|
|
141
|
+
* @param {string} dstPath
|
|
142
|
+
* @returns {Promise<{
|
|
143
|
+
* ok: boolean,
|
|
144
|
+
* error?: string,
|
|
145
|
+
* lossStats?: Record<string, { read: number, written: number, errors: number }>,
|
|
146
|
+
* }>}
|
|
147
|
+
*/
|
|
148
|
+
async function trySalvageRowByRow(srcPath, dstPath) {
|
|
149
|
+
try { if (existsSync(dstPath)) unlinkSync(dstPath); } catch { /* */ }
|
|
150
|
+
|
|
151
|
+
let src;
|
|
152
|
+
try {
|
|
153
|
+
src = new DatabaseSync(srcPath, { readOnly: true });
|
|
154
|
+
} catch (err) {
|
|
155
|
+
return { ok: false, error: err?.message ?? 'src open failed' };
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Open dst defensively. If this throws (e.g. permissions, dst path in a
|
|
159
|
+
// dir we can't create, or a concurrent lock on dstPath), keep the
|
|
160
|
+
// "never throws" contract by returning the failure shape — otherwise the
|
|
161
|
+
// open exception would escape past `repairMemoryDbIfCorrupt` and block
|
|
162
|
+
// session start, which is the failure mode this whole module exists to
|
|
163
|
+
// prevent.
|
|
164
|
+
let dst;
|
|
165
|
+
try {
|
|
166
|
+
dst = new DatabaseSync(dstPath);
|
|
167
|
+
} catch (err) {
|
|
168
|
+
try { src.close(); } catch { /* */ }
|
|
169
|
+
return { ok: false, error: err?.message ?? 'dst open failed' };
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
const lossStats = {};
|
|
173
|
+
const CHUNK = 500;
|
|
174
|
+
|
|
175
|
+
try {
|
|
176
|
+
// Copy schema. Order matters: tables first (else indexes/triggers/views
|
|
177
|
+
// reference nonexistent tables), then everything else. sqlite_* objects
|
|
178
|
+
// (sqlite_sequence, sqlite_autoindex_*) are created implicitly by SQLite.
|
|
179
|
+
const schemaRows = src
|
|
180
|
+
.prepare(
|
|
181
|
+
"SELECT type, name, tbl_name, sql FROM sqlite_master " +
|
|
182
|
+
"WHERE sql IS NOT NULL ORDER BY CASE type " +
|
|
183
|
+
"WHEN 'table' THEN 1 WHEN 'index' THEN 2 WHEN 'view' THEN 3 ELSE 4 END",
|
|
184
|
+
)
|
|
185
|
+
.all();
|
|
186
|
+
for (const s of schemaRows) {
|
|
187
|
+
if (String(s.name).startsWith('sqlite_')) continue;
|
|
188
|
+
try { dst.exec(s.sql + ';'); } catch { /* malformed schema row — skip */ }
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Salvage rows table-by-table.
|
|
192
|
+
const tables = src
|
|
193
|
+
.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
|
|
194
|
+
.all();
|
|
195
|
+
|
|
196
|
+
for (const t of tables) {
|
|
197
|
+
const name = String(t.name);
|
|
198
|
+
lossStats[name] = { read: 0, written: 0, errors: 0 };
|
|
199
|
+
|
|
200
|
+
const cols = src.prepare(`PRAGMA table_info('${name.replace(/'/g, "''")}')`).all();
|
|
201
|
+
if (cols.length === 0) continue;
|
|
202
|
+
const colList = cols.map((c) => '"' + String(c.name).replace(/"/g, '""') + '"').join(',');
|
|
203
|
+
const placeholders = cols.map(() => '?').join(',');
|
|
204
|
+
const insert = dst.prepare(
|
|
205
|
+
`INSERT OR IGNORE INTO "${name.replace(/"/g, '""')}" (${colList}) VALUES (${placeholders})`,
|
|
206
|
+
);
|
|
207
|
+
|
|
208
|
+
let lastRowid = 0;
|
|
209
|
+
let safetyCap = 0;
|
|
210
|
+
const MAX_ITERATIONS = 100_000;
|
|
211
|
+
|
|
212
|
+
while (safetyCap++ < MAX_ITERATIONS) {
|
|
213
|
+
let rows;
|
|
214
|
+
try {
|
|
215
|
+
rows = src
|
|
216
|
+
.prepare(
|
|
217
|
+
`SELECT rowid as __rid, * FROM "${name.replace(/"/g, '""')}" ` +
|
|
218
|
+
`WHERE rowid > ? ORDER BY rowid LIMIT ${CHUNK}`,
|
|
219
|
+
)
|
|
220
|
+
.all(lastRowid);
|
|
221
|
+
} catch {
|
|
222
|
+
lossStats[name].errors++;
|
|
223
|
+
lastRowid += CHUNK;
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
226
|
+
if (!rows || rows.length === 0) break;
|
|
227
|
+
lossStats[name].read += rows.length;
|
|
228
|
+
for (const r of rows) {
|
|
229
|
+
try {
|
|
230
|
+
insert.run(...cols.map((c) => r[c.name]));
|
|
231
|
+
lossStats[name].written++;
|
|
232
|
+
} catch {
|
|
233
|
+
lossStats[name].errors++;
|
|
234
|
+
}
|
|
235
|
+
lastRowid = Number(r.__rid);
|
|
236
|
+
}
|
|
237
|
+
if (rows.length < CHUNK) break;
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Verify the recovered file. If integrity_check still fails, the
|
|
242
|
+
// salvage didn't actually produce a clean file — surface as failure
|
|
243
|
+
// (caller will keep the corrupted original in place).
|
|
244
|
+
const checkRows = dst.prepare('PRAGMA integrity_check').all();
|
|
245
|
+
const recoveredOk =
|
|
246
|
+
checkRows.length === 1 &&
|
|
247
|
+
String(checkRows[0]?.integrity_check ?? '').toLowerCase() === 'ok';
|
|
248
|
+
if (!recoveredOk) {
|
|
249
|
+
return { ok: false, error: 'recovered file failed integrity_check', lossStats };
|
|
250
|
+
}
|
|
251
|
+
return { ok: true, lossStats };
|
|
252
|
+
} catch (err) {
|
|
253
|
+
return { ok: false, error: err?.message ?? 'salvage failed' };
|
|
254
|
+
} finally {
|
|
255
|
+
try { src.close(); } catch { /* */ }
|
|
256
|
+
try { dst.close(); } catch { /* */ }
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/**
 * Swap a freshly recovered DB into the canonical path, keeping the corrupted
 * original (and its `-wal` / `-shm` sidecars, when present) next to it under a
 * `.corrupt.<TS>` suffix for forensics.
 *
 * The swap is all-or-nothing from the caller's point of view: if the recovered
 * file is missing, or moving it into place fails, every file that was already
 * moved aside is renamed back so the canonical path is never left empty.
 *
 * Caller must guarantee no live writer holds the canonical file open before
 * invoking this — see `stopWritersBeforeRepair` for the daemon-coordinated
 * entry point.
 *
 * Never throws; failures are returned as `{ ok: false, error, corruptSuffix }`.
 *
 * @param {string} canonicalPath  Path the application opens (e.g. `.moflo/moflo.db`).
 * @param {string} recoveredPath  Path of the rebuilt database to move into place.
 * @returns {{ ok: boolean, error?: string, corruptSuffix: string }}
 */
function atomicSwap(canonicalPath, recoveredPath) {
  // ISO timestamp made filename-safe: ':' and '.' become '-', trailing 'Z' dropped.
  const ts = new Date().toISOString().replace(/[:.]/g, '-').replace(/Z$/, '');
  const corruptSuffix = `.corrupt.${ts}`;

  // [originalPath, backupPath] pairs already renamed, newest last, so a failed
  // swap can be undone.
  const movedAside = [];

  try {
    // Check BEFORE touching the original: without a replacement there is
    // nothing to swap in, and moving the original away would only lose it.
    if (!existsSync(recoveredPath)) {
      return { ok: false, error: 'recovered file missing', corruptSuffix };
    }

    if (existsSync(canonicalPath)) {
      const backupPath = canonicalPath + corruptSuffix;
      renameSync(canonicalPath, backupPath);
      movedAside.push([canonicalPath, backupPath]);
    }

    // Sidecars are best effort: a failed rename here does not abort the swap.
    // NOTE(review): a sidecar that could not be moved stays beside the
    // recovered file — confirm that is acceptable for the WAL case.
    for (const sidecar of [canonicalPath + '-wal', canonicalPath + '-shm']) {
      if (!existsSync(sidecar)) continue;
      try {
        renameSync(sidecar, sidecar + corruptSuffix);
        movedAside.push([sidecar, sidecar + corruptSuffix]);
      } catch {
        /* not always present */
      }
    }

    renameSync(recoveredPath, canonicalPath);
    return { ok: true, corruptSuffix };
  } catch (err) {
    // Roll back in reverse order so the original DB is back at its canonical
    // path; each restore is best effort.
    for (const [originalPath, backupPath] of movedAside.reverse()) {
      try { renameSync(backupPath, originalPath); } catch { /* leave backup in place */ }
    }
    return { ok: false, error: err?.message ?? 'swap failed', corruptSuffix };
  }
}
|
|
292
|
+
|
|
293
|
+
/**
|
|
294
|
+
* Probe the memory DB for corruption and run a tiered repair if found:
|
|
295
|
+
*
|
|
296
|
+
* - Tier 1: `REINDEX` in place (index-only corruption — #743).
|
|
297
|
+
* - Tier 2: `VACUUM INTO` fresh file + atomic swap (table b-tree corruption).
|
|
298
|
+
* - Tier 3: row-level salvage + atomic swap (deep corruption with partial
|
|
299
|
+
* row loss).
|
|
300
|
+
*
|
|
301
|
+
* Returns a structured result:
|
|
302
|
+
* - `{ repaired: false, errors: 0 }` — healthy or absent.
|
|
303
|
+
* - `{ repaired: true, errors: N, tier: 'reindex' }` — Tier 1 worked.
|
|
304
|
+
* - `{ repaired: true, errors: N, tier: 'vacuum', corruptBackup }` — Tier 2.
|
|
305
|
+
* - `{ repaired: true, errors: N, tier: 'salvage', corruptBackup, lossStats }`
|
|
306
|
+
* — Tier 3 (partial row loss possible; see `lossStats`).
|
|
307
|
+
* - `{ repaired: false, errors: N, persistent: true }` — nothing worked;
|
|
308
|
+
* manual recovery needed.
|
|
47
309
|
*
|
|
48
310
|
* Never throws; any internal failure becomes `{ repaired: false, errors: 0 }`
|
|
49
311
|
* so a probe failure cannot block session start.
|
|
312
|
+
*
|
|
313
|
+
* @param {string} projectRoot
|
|
314
|
+
* @returns {Promise<{
|
|
315
|
+
* repaired: boolean,
|
|
316
|
+
* errors: number,
|
|
317
|
+
* tier?: 'reindex' | 'vacuum' | 'salvage',
|
|
318
|
+
* persistent?: boolean,
|
|
319
|
+
* corruptBackup?: string,
|
|
320
|
+
* lossStats?: Record<string, { read: number, written: number, errors: number }>,
|
|
321
|
+
* }>}
|
|
50
322
|
*/
|
|
51
323
|
export async function repairMemoryDbIfCorrupt(projectRoot) {
|
|
52
324
|
const dbPath = memoryDbPath(projectRoot);
|
|
53
325
|
if (!existsSync(dbPath)) return { repaired: false, errors: 0 };
|
|
54
326
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
327
|
+
// Step 1 — defensive readonly probe (cannot throw on WAL-setup errors
|
|
328
|
+
// against corrupt files). If the open itself fails, fall through to the
|
|
329
|
+
// openBackend path which has retry semantics for transient lock issues;
|
|
330
|
+
// truly unopenable files surface as persistent below.
|
|
331
|
+
const probe = await probeIntegrityRaw(dbPath);
|
|
332
|
+
if (probe.ok) return { repaired: false, errors: 0 };
|
|
58
333
|
|
|
59
|
-
|
|
60
|
-
if (isOk(before)) {
|
|
61
|
-
return { repaired: false, errors: 0 };
|
|
62
|
-
}
|
|
334
|
+
const errors = probe.errors;
|
|
63
335
|
|
|
64
|
-
|
|
65
|
-
|
|
336
|
+
// Step 2 — Tier 1: REINDEX via the existing backend path. Fast for the
|
|
337
|
+
// common index-drift mode and preserves the file in place.
|
|
338
|
+
if (!probe.openFailed) {
|
|
339
|
+
try {
|
|
340
|
+
const db = await openBackend(projectRoot, { create: false });
|
|
341
|
+
try {
|
|
342
|
+
db.run('REINDEX');
|
|
343
|
+
const after = db.exec('PRAGMA integrity_check');
|
|
344
|
+
if (isOk(after)) {
|
|
345
|
+
db.save();
|
|
346
|
+
return { repaired: true, errors, tier: 'reindex' };
|
|
347
|
+
}
|
|
348
|
+
} finally {
|
|
349
|
+
try { db.close(); } catch { /* */ }
|
|
350
|
+
}
|
|
351
|
+
} catch {
|
|
352
|
+
// REINDEX path failed (often because openBackend's WAL pragmas throw
|
|
353
|
+
// on a corrupt file). Fall through to deeper recovery.
|
|
354
|
+
}
|
|
355
|
+
}
|
|
66
356
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
357
|
+
// Step 3 — Tier 2: VACUUM INTO a fresh file.
|
|
358
|
+
const recoveredPath = dbPath + '.recovered';
|
|
359
|
+
const vacuum = await tryVacuumInto(dbPath, recoveredPath);
|
|
360
|
+
if (vacuum.ok) {
|
|
361
|
+
const recoveredProbe = await probeIntegrityRaw(recoveredPath);
|
|
362
|
+
if (recoveredProbe.ok) {
|
|
363
|
+
const swap = atomicSwap(dbPath, recoveredPath);
|
|
364
|
+
if (swap.ok) {
|
|
365
|
+
return {
|
|
366
|
+
repaired: true,
|
|
367
|
+
errors: errors || corruptionCount(recoveredProbe),
|
|
368
|
+
tier: 'vacuum',
|
|
369
|
+
corruptBackup: dbPath + swap.corruptSuffix,
|
|
370
|
+
};
|
|
371
|
+
}
|
|
70
372
|
}
|
|
373
|
+
try { unlinkSync(recoveredPath); } catch { /* */ }
|
|
374
|
+
}
|
|
71
375
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
376
|
+
// Step 4 — Tier 3: row-level salvage.
|
|
377
|
+
const salvage = await trySalvageRowByRow(dbPath, recoveredPath);
|
|
378
|
+
if (salvage.ok) {
|
|
379
|
+
const swap = atomicSwap(dbPath, recoveredPath);
|
|
380
|
+
if (swap.ok) {
|
|
381
|
+
return {
|
|
382
|
+
repaired: true,
|
|
383
|
+
errors,
|
|
384
|
+
tier: 'salvage',
|
|
385
|
+
corruptBackup: dbPath + swap.corruptSuffix,
|
|
386
|
+
lossStats: salvage.lossStats,
|
|
387
|
+
};
|
|
388
|
+
}
|
|
389
|
+
try { unlinkSync(recoveredPath); } catch { /* */ }
|
|
390
|
+
} else {
|
|
391
|
+
try { if (existsSync(recoveredPath)) unlinkSync(recoveredPath); } catch { /* */ }
|
|
78
392
|
}
|
|
393
|
+
|
|
394
|
+
// Step 5 — give up.
|
|
395
|
+
return { repaired: false, errors, persistent: true };
|
|
79
396
|
}
|
|
@@ -268,15 +268,51 @@ try {
|
|
|
268
268
|
try {
|
|
269
269
|
const repair = await repairMemoryDbIfCorrupt(projectRoot);
|
|
270
270
|
if (repair?.repaired) {
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
)
|
|
271
|
+
// Three recovery tiers, three messages. Tier surfaces what level of
|
|
272
|
+
// damage the DB had so the user (and any downstream telemetry) knows
|
|
273
|
+
// whether row data was lost. See bin/lib/db-repair.mjs for the cascade.
|
|
274
|
+
if (repair.tier === 'reindex') {
|
|
275
|
+
emitMutation(
|
|
276
|
+
'repaired memory db index',
|
|
277
|
+
`${plural(repair.errors, 'index error')} fixed via REINDEX`,
|
|
278
|
+
);
|
|
279
|
+
} else if (repair.tier === 'vacuum') {
|
|
280
|
+
emitMutation(
|
|
281
|
+
'rebuilt memory db',
|
|
282
|
+
`${plural(repair.errors, 'integrity violation')} fixed via VACUUM INTO; corrupt original kept at ${repair.corruptBackup ?? '.moflo/moflo.db.corrupt.*'}`,
|
|
283
|
+
);
|
|
284
|
+
} else if (repair.tier === 'salvage') {
|
|
285
|
+
// Row-level salvage may have dropped rows; summarise loss so the
|
|
286
|
+
// user sees what's gone before downstream consumers (indexer,
|
|
287
|
+
// embeddings) re-process the survivors.
|
|
288
|
+
let lossSummary = '';
|
|
289
|
+
if (repair.lossStats) {
|
|
290
|
+
const losses = Object.entries(repair.lossStats)
|
|
291
|
+
.map(([tbl, s]) => {
|
|
292
|
+
const lost = Math.max(0, s.read - s.written);
|
|
293
|
+
return lost > 0 ? `${tbl} ${s.written}/${s.read}` : null;
|
|
294
|
+
})
|
|
295
|
+
.filter(Boolean);
|
|
296
|
+
if (losses.length > 0) lossSummary = ` (rows preserved: ${losses.join(', ')})`;
|
|
297
|
+
}
|
|
298
|
+
emitMutation(
|
|
299
|
+
'salvaged memory db',
|
|
300
|
+
`${plural(repair.errors, 'integrity violation')} recovered via row-level salvage${lossSummary}; corrupt original kept at ${repair.corruptBackup ?? '.moflo/moflo.db.corrupt.*'}`,
|
|
301
|
+
);
|
|
302
|
+
} else {
|
|
303
|
+
// Older db-repair without a `tier` field — fall back to legacy text.
|
|
304
|
+
emitMutation(
|
|
305
|
+
'repaired memory db',
|
|
306
|
+
`${plural(repair.errors, 'integrity violation')} fixed`,
|
|
307
|
+
);
|
|
308
|
+
}
|
|
275
309
|
} else if (repair?.persistent) {
|
|
276
310
|
// Surface to stderr — Claude additionalContext + the user both see this.
|
|
277
|
-
//
|
|
311
|
+
// Every recovery tier exhausted; user options are destructive only.
|
|
278
312
|
process.stderr.write(
|
|
279
|
-
`moflo: memory db has ${plural(repair.errors, '
|
|
313
|
+
`moflo: memory db has ${plural(repair.errors, 'integrity violation')} ` +
|
|
314
|
+
`that REINDEX / VACUUM INTO / row-level salvage could not fix — ` +
|
|
315
|
+
`run 'flo memory rebuild-index' (destructive) or restore from backup\n`,
|
|
280
316
|
);
|
|
281
317
|
}
|
|
282
318
|
} catch {
|
|
@@ -8,6 +8,7 @@ import { join } from 'path';
|
|
|
8
8
|
import os from 'os';
|
|
9
9
|
import { getDaemonLockHolder } from '../services/daemon-lock.js';
|
|
10
10
|
import { legacyMemoryDbPath, memoryDbCandidatePaths, memoryDbPath, } from '../services/moflo-paths.js';
|
|
11
|
+
import { probeDbIntegrity } from '../services/memory-db-integrity-repair.js';
|
|
11
12
|
import { errorDetail } from '../shared/utils/error-detail.js';
|
|
12
13
|
export async function checkConfigFile() {
|
|
13
14
|
// JSON configs (parse-validated). LEGACY-CONFIG: `.claude-flow.json` and
|
|
@@ -131,6 +132,65 @@ export async function checkMemoryDatabase() {
|
|
|
131
132
|
}
|
|
132
133
|
return { name: 'Memory Database', status: 'warn', message: 'Not initialized', fix: 'claude-flow memory configure --backend hybrid' };
|
|
133
134
|
}
|
|
135
|
+
/**
 * Doctor check "Memory DB Integrity": reports whether the memory database at
 * `memoryDbPath(cwd)` passes the integrity probe.
 *
 * The probing itself is delegated to `probeDbIntegrity`, so what counts as
 * "healthy" is decided in one place rather than re-derived here.
 *
 * Result statuses:
 *  - `pass` — the DB file does not exist, or the probe reports `ok`.
 *  - `fail` — the probe reports violations, the probe could not read the DB
 *    (`openFailed`), or the probe call itself threw. Every `fail` carries the
 *    healer command in `fix`.
 *
 * @param {string} [cwd=process.cwd()]  Project root used to locate the DB.
 * @returns {Promise<{ name: string, status: 'pass' | 'fail', message: string, fix?: string }>}
 */
export async function checkMemoryDbIntegrity(cwd = process.cwd()) {
    const name = 'Memory DB Integrity';
    const fix = 'flo healer --fix -c memory-db-integrity';
    const dbPath = memoryDbPath(cwd);
    if (!existsSync(dbPath)) {
        return { name, status: 'pass', message: 'DB absent (no integrity probe needed)' };
    }
    try {
        const probe = await probeDbIntegrity(dbPath);
        if (probe.ok) {
            return { name, status: 'pass', message: 'PRAGMA integrity_check: ok' };
        }
        let message;
        if (probe.openFailed) {
            message = 'Unable to probe DB (readonly open failed — likely deep corruption)';
        }
        else {
            message = `${probe.errors} integrity violation(s) detected`;
        }
        return { name, status: 'fail', message, fix };
    }
    catch (e) {
        // An unreadable DB is handled above via `openFailed`, so landing here
        // means the probe call itself threw (presumably the probe module could
        // not be loaded — verify against probeDbIntegrity). Reported as `fail`,
        // not `warn`, so it is not hidden from the healer summary.
        return { name, status: 'fail', message: `Integrity probe unavailable: ${errorDetail(e)}`, fix };
    }
}
|
|
134
194
|
/**
|
|
135
195
|
* Standard MCP-config search paths: home (Claude Desktop on macOS/Linux),
|
|
136
196
|
* XDG config dir, project-local `.mcp.json`, and APPDATA on Windows.
|
|
@@ -144,7 +144,33 @@ async function runMemoryRoundTrip(ctx) {
|
|
|
144
144
|
}
|
|
145
145
|
else {
|
|
146
146
|
const top = searchOut.results?.find(r => r.key === key);
|
|
147
|
-
|
|
147
|
+
if (top) {
|
|
148
|
+
pushDetail(ctx.details, { id: `${ctx.idPrefix}.search-finds-key`, mcpTool: 'memory_search', expected: `result containing key=${key}` }, { topKey: top.key, similarity: top.similarity }, null);
|
|
149
|
+
}
|
|
150
|
+
else {
|
|
151
|
+
// #1120: search returned results but our just-stored key wasn't among
|
|
152
|
+
// them. Mirrors the #1111 empty-HNSW fallback for the non-zero case:
|
|
153
|
+
// if the row IS reachable by literal key, demote to warn — memory
|
|
154
|
+
// access works, the HNSW index just hasn't propagated the new write
|
|
155
|
+
// yet (stale-neighbor race when healer runs 2+ times in one session
|
|
156
|
+
// against accumulated probe rows). If literal retrieve also fails,
|
|
157
|
+
// surface the original fail unchanged.
|
|
158
|
+
const otherKeys = searchOut?.results?.map(r => r.key).join(', ') ?? 'none';
|
|
159
|
+
const retrievable = await literalKeyReachable(ctx.memoryTools, key, namespace);
|
|
160
|
+
if (retrievable) {
|
|
161
|
+
ctx.details.push({
|
|
162
|
+
id: `${ctx.idPrefix}.search-finds-key`,
|
|
163
|
+
mcpTool: 'memory_search',
|
|
164
|
+
status: 'warn',
|
|
165
|
+
observed: { topKeys: searchOut?.results?.map(r => r.key), retrievable: true },
|
|
166
|
+
expected: `result containing key=${key}`,
|
|
167
|
+
message: `search returned results but our key was not among them (got: ${otherKeys}); row IS reachable by literal retrieve — HNSW stale-neighbor race (newly-written row not yet propagated to the index). Memory access path works.`,
|
|
168
|
+
});
|
|
169
|
+
}
|
|
170
|
+
else {
|
|
171
|
+
pushDetail(ctx.details, { id: `${ctx.idPrefix}.search-finds-key`, mcpTool: 'memory_search', expected: `result containing key=${key}` }, { allKeys: searchOut.results?.map(r => r.key) }, `stored key ${key} not in results (got: ${otherKeys})`);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
148
174
|
}
|
|
149
175
|
// 4. memory_retrieve returns the full value (search content is truncated
|
|
150
176
|
// to a 60-char snippet). Catches write clobber and namespace bleed — we
|