@jhizzard/termdeck 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/packages/cli/src/stack.js +20 -3
- package/packages/client/public/app.js +57 -0
- package/packages/server/src/agent-adapters/gemini.js +14 -8
- package/packages/server/src/health.js +354 -110
- package/packages/server/src/index.js +138 -20
- package/packages/server/src/preflight.js +7 -1
- package/packages/server/src/setup/migrations.js +27 -1
- package/packages/server/src/setup/mnestra-migrations/021_project_tag_canonicalize_claimguard.sql +175 -0
- package/packages/server/src/setup/mnestra-migrations/022_source_agent_backfill.sql +182 -0
|
@@ -20,6 +20,33 @@
|
|
|
20
20
|
// checks (mnestra-webhook, rumen-pool) are best-effort: a failure surfaces
|
|
21
21
|
// as `warn` with detail, but does not flip `ok`.
|
|
22
22
|
//
|
|
23
|
+
// Failure taxonomy (Sprint 63 T3 §3.2 — Brad r730 cascade 2026-05-11)
|
|
24
|
+
// ──────────────────────────────────────────────────────────────────
|
|
25
|
+
// Pre-Sprint-63 every check that didn't return `pass` collapsed to `fail`
|
|
26
|
+
// with a free-text `detail` string. Operators triaging "why is the install
|
|
27
|
+
// red?" had to read each detail and guess. The cost was real: on 2026-05-11
|
|
28
|
+
// a SQLite ABI mismatch left `db = null` at boot; the resulting
|
|
29
|
+
// `red: timeout` strings (from probes that timed-out trying to use the null
|
|
30
|
+
// handle indirectly) masked the actual `init-failed` root cause for hours.
|
|
31
|
+
//
|
|
32
|
+
// Every non-pass check now carries a `category` field with one of:
|
|
33
|
+
// `red:unreachable` — network/socket level (ECONNREFUSED / EHOSTUNREACH
|
|
34
|
+
// / ENETUNREACH / ENOTFOUND on connect)
|
|
35
|
+
// `red:timeout` — request issued, no response in the window
|
|
36
|
+
// (AbortError / req timeout / pg ETIMEDOUT)
|
|
37
|
+
// `red:dependency-down` — peer responded but the dependency is unhealthy
|
|
38
|
+
// (HTTP 5xx / SQL schema error from a reachable DB)
|
|
39
|
+
// `red:init-failed` — local handle the probe needs was never initialized
|
|
40
|
+
// (db === null at boot / DATABASE_URL not set)
|
|
41
|
+
//
|
|
42
|
+
// `detail` strings are prefixed with the category for human readability:
|
|
43
|
+
// `red:unreachable (could not connect to Postgres using DATABASE_URL)`.
|
|
44
|
+
//
|
|
45
|
+
// init-failed surfaces use a log-once gate so a 30s-poll cycle on a process
|
|
46
|
+
// with a missing handle (e.g. better-sqlite3 not loaded) writes ONE warn at
|
|
47
|
+
// boot, not 2880 warns/day. Probes still emit `red:init-failed` per cycle
|
|
48
|
+
// in the JSON report — only the log emission is gated.
|
|
49
|
+
//
|
|
23
50
|
// Caching
|
|
24
51
|
// ───────
|
|
25
52
|
// Reports cached in module scope for 30s. `getFullHealth(config, { refresh: true })`
|
|
@@ -29,8 +56,9 @@
|
|
|
29
56
|
// Error handling
|
|
30
57
|
// ──────────────
|
|
31
58
|
// Every check is wrapped: any unexpected error downgrades that single check
|
|
32
|
-
// to `fail` (or `warn` for warn-checks) with the error message in `detail
|
|
33
|
-
// `
|
|
59
|
+
// to `fail` (or `warn` for warn-checks) with the error message in `detail`
|
|
60
|
+
// and a `category` from the taxonomy above. `getFullHealth()` always
|
|
61
|
+
// resolves with a structured report — never throws.
|
|
34
62
|
|
|
35
63
|
'use strict';
|
|
36
64
|
|
|
@@ -50,21 +78,104 @@ const REQUIRED_CHECKS = new Set([
|
|
|
50
78
|
'cron-job-active'
|
|
51
79
|
]);
|
|
52
80
|
|
|
81
|
+
// Sprint 63 T3 §3.2 — stable taxonomy strings. Exported so dashboard / doctor
|
|
82
|
+
// / external graders can filter by category instead of pattern-matching the
|
|
83
|
+
// detail prose. Frozen object so callers can rely on `CATEGORIES.UNREACHABLE`
|
|
84
|
+
// without accidentally rebinding.
|
|
85
|
+
/**
 * Stable failure-taxonomy strings attached to every non-pass check as its
 * `category` field (and used as the prefix of its `detail` string).
 * Frozen so the property values cannot be reassigned by callers.
 */
const CATEGORIES = Object.freeze({
  // Network/socket-level connect failure.
  UNREACHABLE: 'red:unreachable',
  // Request was issued but no response arrived within the window.
  TIMEOUT: 'red:timeout',
  // Peer responded, but the dependency itself is unhealthy.
  DEPENDENCY_DOWN: 'red:dependency-down',
  // A local handle / setting the probe needs was never initialized.
  INIT_FAILED: 'red:init-failed',
});
|
|
91
|
+
|
|
53
92
|
let _cache = null;
|
|
54
93
|
let _cachedAt = 0;
|
|
55
94
|
|
|
95
|
+
// Sprint 63 T3 §3.2 — log-once gate for init-failed surfaces. A 30s health
|
|
96
|
+
// poll on a process with a missing handle would otherwise log every cycle
|
|
97
|
+
// (~2880 warn lines/day per missing handle). Probes that detect a null
|
|
98
|
+
// handle at boot call `logInitFailedOnce(name, reason)`; the first call
|
|
99
|
+
// emits a warn line, subsequent calls are silent for the lifetime of the
|
|
100
|
+
// process. Probes still emit `red:init-failed` in the JSON report on every
|
|
101
|
+
// cycle — only the log line is gated. Reset via `_resetInitLogged()` test
|
|
102
|
+
// seam between cases.
|
|
103
|
+
// probeName → reason recorded by the first init-failed report for that probe.
const _initLoggedOnce = new Map();

/**
 * Emit a single `console.warn` the first time a probe reports a missing
 * handle; every later call for the same probe name is a silent no-op
 * (even if `reason` differs) until `_initLoggedOnce` is cleared.
 *
 * @param {string} probeName - Key used for the log-once gate.
 * @param {string} reason - Human-readable cause, appended to the warn line.
 * @returns {void}
 */
function logInitFailedOnce(probeName, reason) {
  const alreadyLogged = _initLoggedOnce.has(probeName);
  if (!alreadyLogged) {
    _initLoggedOnce.set(probeName, reason);
    const line = [
      `[health] ${probeName} handle null at boot — probes will return `,
      `${CATEGORIES.INIT_FAILED} until next process start; reason: ${reason}`,
    ].join('');
    // eslint-disable-next-line no-console
    console.warn(line);
  }
}
|
|
113
|
+
|
|
114
|
+
// Classify an HTTP-side failure shape `{ ok, status, error, code }` (as
|
|
115
|
+
// returned by `httpReachable`) into one of the four red:* categories.
|
|
116
|
+
/**
 * Map an HTTP probe failure onto one of the four `red:*` categories.
 *
 * Precedence (first match wins):
 *   1. timeout codes / `error === 'timeout'`      → `CATEGORIES.TIMEOUT`
 *   2. connect-level socket codes                 → `CATEGORIES.UNREACHABLE`
 *   3. any numeric `status` (the peer did answer) → `CATEGORIES.DEPENDENCY_DOWN`
 *   4. nothing recognisable, or no envelope       → `CATEGORIES.UNREACHABLE`
 *
 * @param {{ok?: boolean, status?: number, error?: string, code?: string}|null|undefined} r
 *   Failure envelope in the shape resolved by `httpReachable`.
 * @returns {string} One of the `CATEGORIES` values.
 */
function classifyHttpFailure(r) {
  if (!r) return CATEGORIES.UNREACHABLE;
  const { code, error, status } = r;

  // `ETIMEDOUT` is the socket-level timeout code; it belongs with the other
  // timeout signals (as `classifyDbFailure` already treats it) rather than
  // falling through to the "unreachable" default.
  if (
    code === 'TIMEOUT' ||
    code === 'ABORT_ERR' ||
    code === 'ERR_TIMEOUT' ||
    code === 'ETIMEDOUT' ||
    error === 'timeout'
  ) {
    return CATEGORIES.TIMEOUT;
  }

  if (
    code === 'ECONNREFUSED' ||
    code === 'EHOSTUNREACH' ||
    code === 'ENETUNREACH' ||
    code === 'ENOTFOUND'
  ) {
    return CATEGORIES.UNREACHABLE;
  }

  // A numeric status means the peer produced an HTTP response, so whatever
  // the status is, the failure is "peer responded badly", not a network one.
  if (typeof status === 'number') return CATEGORIES.DEPENDENCY_DOWN;

  return CATEGORIES.UNREACHABLE;
}
|
|
128
|
+
|
|
129
|
+
// Classify a database / Node-side failure into one of the four categories.
|
|
130
|
+
// Accepts either a raw Error or a `{ error, code }` envelope from
|
|
131
|
+
// `safeQueryRow` / `safeQueryRows`.
|
|
132
|
+
/**
 * Map a database / Node-side failure onto one of the four `red:*` categories.
 *
 * Accepts a raw `Error`, a `{ error, code }` envelope (as produced by
 * `safeQueryRow` / `safeQueryRows`), or the `{ client, reason, error, code }`
 * envelope returned by `openPgClient`.
 *
 * Precedence (first match wins):
 *   1. nothing to inspect                                   → `DEPENDENCY_DOWN`
 *   2. `reason` of `'no-url'` / `'pg-runner-unavailable'`   → `INIT_FAILED`
 *   3. connect-level socket codes                           → `UNREACHABLE`
 *   4. timeout codes, or a message mentioning a timeout     → `TIMEOUT`
 *   5. everything else (e.g. SQL errors from a reachable DB) → `DEPENDENCY_DOWN`
 *
 * @param {Error|{error?: string, code?: string, reason?: string}|null|undefined} errOrEnvelope
 * @returns {string} One of the `CATEGORIES` values.
 */
function classifyDbFailure(errOrEnvelope) {
  if (!errOrEnvelope) return CATEGORIES.DEPENDENCY_DOWN;

  // `openPgClient` reports "never attempted a connection" through `reason`
  // with no `code`/`error`. Those are local setup problems (missing URL or
  // missing pg-runner module), not an unhealthy peer.
  const reason = errOrEnvelope.reason;
  if (reason === 'no-url' || reason === 'pg-runner-unavailable') {
    return CATEGORIES.INIT_FAILED;
  }

  const code = errOrEnvelope.code || (errOrEnvelope._err && errOrEnvelope._err.code);
  const msg = String(errOrEnvelope.message || errOrEnvelope.error || errOrEnvelope);

  if (code === 'ECONNREFUSED' || code === 'EHOSTUNREACH' || code === 'ENETUNREACH' || code === 'ENOTFOUND') {
    return CATEGORIES.UNREACHABLE;
  }

  // Match both "timeout" and "timed out" / "timed-out" phrasings.
  if (code === 'ETIMEDOUT' || code === 'ERR_TIMEOUT' || /\b(?:timeout|timed[ -]?out)\b/i.test(msg)) {
    return CATEGORIES.TIMEOUT;
  }

  // SQL errors (42703 column-not-exist, 42P01 relation-not-exist, etc.):
  // the dependency answered but its schema is misconfigured. That is
  // "dependency unhealthy", not "unreachable" or "timeout".
  return CATEGORIES.DEPENDENCY_DOWN;
}
|
|
147
|
+
|
|
148
|
+
// Helpers to compose check results with a category-prefixed detail. Keeps
|
|
149
|
+
// each call site readable + ensures the prefix is consistent across probes.
|
|
150
|
+
/**
 * Shared builder for non-pass check rows so `failCheck` and `warnCheck`
 * cannot drift apart in shape or in the `category (why)` detail format.
 *
 * @param {'fail'|'warn'} status
 * @param {string} name - Check name.
 * @param {string} category - Taxonomy category string for this failure.
 * @param {string} why - Free-text explanation placed inside the parentheses.
 * @returns {{name: string, status: string, category: string, detail: string}}
 */
function _nonPassCheck(status, name, category, why) {
  return { name, status, category, detail: `${category} (${why})` };
}

/**
 * Build a `fail` check row with a category-prefixed detail string.
 *
 * @param {string} name
 * @param {string} category
 * @param {string} why
 * @returns {{name: string, status: 'fail', category: string, detail: string}}
 */
function failCheck(name, category, why) {
  return _nonPassCheck('fail', name, category, why);
}

/**
 * Build a `warn` check row with a category-prefixed detail string.
 *
 * @param {string} name
 * @param {string} category
 * @param {string} why
 * @returns {{name: string, status: 'warn', category: string, detail: string}}
 */
function warnCheck(name, category, why) {
  return _nonPassCheck('warn', name, category, why);
}

/**
 * Build a `pass` check row; passing rows carry no category or detail.
 *
 * @param {string} name
 * @returns {{name: string, status: 'pass'}}
 */
function passCheck(name) {
  return { name, status: 'pass' };
}
|
|
159
|
+
|
|
56
160
|
// ── SQLite check ────────────────────────────────────────────────────────────
|
|
57
161
|
|
|
58
162
|
function checkSqlite(db) {
|
|
59
163
|
if (!db) {
|
|
60
|
-
|
|
164
|
+
// Sprint 63 T3 §3.2 — `db === null` is `red:init-failed`, NOT `red:timeout`.
|
|
165
|
+
// The v1.1.1 fail-fast on SQLite ABI mismatch makes this surface rare in
|
|
166
|
+
// practice, but the probe must still classify correctly because future
|
|
167
|
+
// optional deps may be allowed to be null. Log-once gate prevents the
|
|
168
|
+
// 30s poll from flooding logs.
|
|
169
|
+
logInitFailedOnce('sqlite', 'better-sqlite3 not initialized');
|
|
170
|
+
return failCheck('sqlite', CATEGORIES.INIT_FAILED, 'better-sqlite3 not initialized');
|
|
61
171
|
}
|
|
62
172
|
try {
|
|
63
173
|
const row = db.prepare('SELECT 1 AS ok').get();
|
|
64
|
-
if (row && row.ok === 1) return
|
|
65
|
-
return
|
|
174
|
+
if (row && row.ok === 1) return passCheck('sqlite');
|
|
175
|
+
return failCheck('sqlite', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
|
|
66
176
|
} catch (err) {
|
|
67
|
-
|
|
177
|
+
const cat = classifyDbFailure(err);
|
|
178
|
+
return failCheck('sqlite', cat, err && err.message ? err.message : String(err));
|
|
68
179
|
}
|
|
69
180
|
}
|
|
70
181
|
|
|
@@ -81,7 +192,12 @@ async function safeQueryRow(client, sql) {
|
|
|
81
192
|
if (r.rows && r.rows.length > 0 && r.rows[0].ok) return { ok: true };
|
|
82
193
|
return { ok: false };
|
|
83
194
|
} catch (err) {
|
|
84
|
-
|
|
195
|
+
// Surface `code` so the caller can classify into the red:* taxonomy
|
|
196
|
+
// without re-parsing the message string.
|
|
197
|
+
return {
|
|
198
|
+
error: err && err.message ? err.message : String(err),
|
|
199
|
+
code: err && err.code,
|
|
200
|
+
};
|
|
85
201
|
}
|
|
86
202
|
}
|
|
87
203
|
|
|
@@ -90,34 +206,80 @@ async function safeQueryRows(client, sql) {
|
|
|
90
206
|
const r = await client.query(sql);
|
|
91
207
|
return { rows: r.rows || [] };
|
|
92
208
|
} catch (err) {
|
|
93
|
-
return {
|
|
209
|
+
return {
|
|
210
|
+
error: err && err.message ? err.message : String(err),
|
|
211
|
+
code: err && err.code,
|
|
212
|
+
};
|
|
94
213
|
}
|
|
95
214
|
}
|
|
96
215
|
|
|
216
|
+
// Sprint 63 T3 §3.2 — track whether the most recent connect attempt timed out
|
|
217
|
+
// vs. was outright unreachable. The pg client doesn't expose this from inside
|
|
218
|
+
// the helper, so the helper records it in a return envelope.
|
|
97
219
|
async function openPgClient(databaseUrl) {
|
|
98
|
-
if (!databaseUrl) return null;
|
|
220
|
+
if (!databaseUrl) return { client: null, reason: 'no-url' };
|
|
99
221
|
let pgRunner;
|
|
100
|
-
try { pgRunner = require('./setup/pg-runner'); }
|
|
101
|
-
|
|
222
|
+
try { pgRunner = require('./setup/pg-runner'); }
|
|
223
|
+
catch (_e) { return { client: null, reason: 'pg-runner-unavailable' }; }
|
|
224
|
+
try {
|
|
225
|
+
const client = await pgRunner.connect(databaseUrl);
|
|
226
|
+
return { client, reason: null };
|
|
227
|
+
} catch (err) {
|
|
228
|
+
return {
|
|
229
|
+
client: null,
|
|
230
|
+
reason: 'connect-failed',
|
|
231
|
+
error: err && err.message ? err.message : String(err),
|
|
232
|
+
code: err && err.code,
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Sprint 63 T3 §3.2 — dependent-checks shape when there's no client. Pre-
|
|
238
|
+
// Sprint-63 these collapsed to status:'fail', detail:'pg unavailable' with
|
|
239
|
+
// no category; operators couldn't distinguish "DATABASE_URL not set"
|
|
240
|
+
// (`init-failed` — fix the .env) from "Postgres unreachable" (`unreachable`
|
|
241
|
+
// — fix the network) from "Postgres took 5s and gave up" (`timeout` — bump
|
|
242
|
+
// timeout or check pgbouncer). Each downstream check now carries the same
|
|
243
|
+
// category as the connect attempt so the dashboard can render one row
|
|
244
|
+
// "Postgres unreachable" and dim the six dependents instead of six
|
|
245
|
+
// independent-looking RED rows.
|
|
246
|
+
/**
 * Append the full Postgres suite as failures when no client is available:
 * one row for the primary check followed by one row per dependent check,
 * all sharing the same category.
 *
 * @param {Array<object>} checks - Array to append to (mutated in place).
 * @param {string} primaryName - Name of the primary connection check.
 * @param {string} category - Category applied to every pushed row.
 * @param {string} primaryDetail - Explanation for the primary row.
 * @param {string} dependentDetail - Explanation shared by the dependent rows.
 * @returns {void}
 */
function pushPgUnavailableChecks(checks, primaryName, category, primaryDetail, dependentDetail) {
  const dependentNames = [
    'memory-items-col',
    'pg-cron-ext',
    'pg-net-ext',
    'vault-secret',
    'cron-job-active',
  ];

  checks.push(failCheck(primaryName, category, primaryDetail));
  dependentNames.forEach((dependent) => {
    checks.push(failCheck(dependent, category, dependentDetail));
  });
}
|
|
103
252
|
|
|
104
253
|
async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
105
254
|
const checks = [];
|
|
106
|
-
const client = _pgClient || (await openPgClient(databaseUrl));
|
|
107
|
-
const owned = !_pgClient;
|
|
108
255
|
|
|
256
|
+
let client = _pgClient || null;
|
|
257
|
+
let owned = false;
|
|
258
|
+
let connectEnvelope = null;
|
|
109
259
|
if (!client) {
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
260
|
+
connectEnvelope = await openPgClient(databaseUrl);
|
|
261
|
+
client = connectEnvelope.client;
|
|
262
|
+
owned = client != null;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
if (!client) {
|
|
266
|
+
if (!databaseUrl) {
|
|
267
|
+
// No URL → init-failed (operator never set DATABASE_URL). Log-once.
|
|
268
|
+
logInitFailedOnce('mnestra-pg', 'DATABASE_URL not configured');
|
|
269
|
+
pushPgUnavailableChecks(
|
|
270
|
+
checks,
|
|
271
|
+
'mnestra-pg',
|
|
272
|
+
CATEGORIES.INIT_FAILED,
|
|
273
|
+
'DATABASE_URL not configured — set in ~/.termdeck/secrets.env',
|
|
274
|
+
'pg unavailable — DATABASE_URL not configured'
|
|
275
|
+
);
|
|
276
|
+
} else {
|
|
277
|
+
// URL set but connect failed → classify by code (timeout vs unreachable).
|
|
278
|
+
const cat = classifyDbFailure(connectEnvelope || {});
|
|
279
|
+
const why = connectEnvelope && connectEnvelope.error
|
|
280
|
+
? `could not connect to Postgres using DATABASE_URL — ${connectEnvelope.error}`
|
|
281
|
+
: 'could not connect to Postgres using DATABASE_URL';
|
|
282
|
+
pushPgUnavailableChecks(checks, 'mnestra-pg', cat, why, 'pg unavailable — connect failed');
|
|
121
283
|
}
|
|
122
284
|
return checks;
|
|
123
285
|
}
|
|
@@ -125,11 +287,11 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
|
125
287
|
try {
|
|
126
288
|
const ping = await safeQueryRow(client, 'SELECT 1 AS ok');
|
|
127
289
|
if (ping.error) {
|
|
128
|
-
checks.push(
|
|
290
|
+
checks.push(failCheck('mnestra-pg', classifyDbFailure(ping), ping.error));
|
|
129
291
|
} else if (!ping.ok) {
|
|
130
|
-
checks.push(
|
|
292
|
+
checks.push(failCheck('mnestra-pg', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned no row'));
|
|
131
293
|
} else {
|
|
132
|
-
checks.push(
|
|
294
|
+
checks.push(passCheck('mnestra-pg'));
|
|
133
295
|
}
|
|
134
296
|
|
|
135
297
|
// memory_items.source_session_id — the v0.6.5 column from Brad's saga.
|
|
@@ -137,84 +299,78 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
|
137
299
|
"SELECT 1 AS ok FROM information_schema.columns " +
|
|
138
300
|
"WHERE table_schema = 'public' AND table_name = 'memory_items' AND column_name = 'source_session_id'");
|
|
139
301
|
if (col.error) {
|
|
140
|
-
checks.push(
|
|
302
|
+
checks.push(failCheck('memory-items-col', classifyDbFailure(col), col.error));
|
|
141
303
|
} else if (!col.ok) {
|
|
142
|
-
checks.push(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
});
|
|
304
|
+
checks.push(failCheck(
|
|
305
|
+
'memory-items-col',
|
|
306
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
307
|
+
'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
|
|
308
|
+
'(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
|
|
309
|
+
));
|
|
149
310
|
} else {
|
|
150
|
-
checks.push(
|
|
311
|
+
checks.push(passCheck('memory-items-col'));
|
|
151
312
|
}
|
|
152
313
|
|
|
153
314
|
const cron = await safeQueryRow(client,
|
|
154
315
|
"SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_cron'");
|
|
155
316
|
if (cron.error) {
|
|
156
|
-
checks.push(
|
|
317
|
+
checks.push(failCheck('pg-cron-ext', classifyDbFailure(cron), cron.error));
|
|
157
318
|
} else if (!cron.ok) {
|
|
158
|
-
checks.push(
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
319
|
+
checks.push(failCheck(
|
|
320
|
+
'pg-cron-ext',
|
|
321
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
322
|
+
'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
|
|
323
|
+
));
|
|
163
324
|
} else {
|
|
164
|
-
checks.push(
|
|
325
|
+
checks.push(passCheck('pg-cron-ext'));
|
|
165
326
|
}
|
|
166
327
|
|
|
167
328
|
const net = await safeQueryRow(client,
|
|
168
329
|
"SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_net'");
|
|
169
330
|
if (net.error) {
|
|
170
|
-
checks.push(
|
|
331
|
+
checks.push(failCheck('pg-net-ext', classifyDbFailure(net), net.error));
|
|
171
332
|
} else if (!net.ok) {
|
|
172
|
-
checks.push(
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
333
|
+
checks.push(failCheck(
|
|
334
|
+
'pg-net-ext',
|
|
335
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
336
|
+
'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
|
|
337
|
+
));
|
|
177
338
|
} else {
|
|
178
|
-
checks.push(
|
|
339
|
+
checks.push(passCheck('pg-net-ext'));
|
|
179
340
|
}
|
|
180
341
|
|
|
181
342
|
const vault = await safeQueryRow(client,
|
|
182
343
|
"SELECT 1 AS ok FROM vault.decrypted_secrets WHERE name = 'rumen_service_role_key'");
|
|
183
344
|
if (vault.error) {
|
|
184
|
-
checks.push({
|
|
185
|
-
name: 'vault-secret',
|
|
186
|
-
status: 'fail',
|
|
187
|
-
detail: `vault.decrypted_secrets unreadable — ${vault.error}`
|
|
188
|
-
});
|
|
345
|
+
checks.push(failCheck('vault-secret', classifyDbFailure(vault), `vault.decrypted_secrets unreadable — ${vault.error}`));
|
|
189
346
|
} else if (!vault.ok) {
|
|
190
|
-
checks.push(
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
347
|
+
checks.push(failCheck(
|
|
348
|
+
'vault-secret',
|
|
349
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
350
|
+
'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
|
|
351
|
+
));
|
|
195
352
|
} else {
|
|
196
|
-
checks.push(
|
|
353
|
+
checks.push(passCheck('vault-secret'));
|
|
197
354
|
}
|
|
198
355
|
|
|
199
356
|
const job = await safeQueryRows(client,
|
|
200
357
|
"SELECT active FROM cron.job WHERE jobname = 'rumen-tick'");
|
|
201
358
|
if (job.error) {
|
|
202
|
-
checks.push(
|
|
359
|
+
checks.push(failCheck('cron-job-active', classifyDbFailure(job), `cron.job unreadable — ${job.error}`));
|
|
203
360
|
} else if (!job.rows || job.rows.length === 0) {
|
|
204
|
-
checks.push(
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
361
|
+
checks.push(failCheck(
|
|
362
|
+
'cron-job-active',
|
|
363
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
364
|
+
'rumen-tick row not found — re-run `termdeck init --rumen`'
|
|
365
|
+
));
|
|
209
366
|
} else if (!job.rows[0].active) {
|
|
210
|
-
checks.push(
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
});
|
|
367
|
+
checks.push(failCheck(
|
|
368
|
+
'cron-job-active',
|
|
369
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
370
|
+
"rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
|
|
371
|
+
));
|
|
216
372
|
} else {
|
|
217
|
-
checks.push(
|
|
373
|
+
checks.push(passCheck('cron-job-active'));
|
|
218
374
|
}
|
|
219
375
|
} finally {
|
|
220
376
|
if (owned) {
|
|
@@ -234,15 +390,29 @@ function httpReachable(url, timeoutMs = 2000) {
|
|
|
234
390
|
try {
|
|
235
391
|
req = mod.get(url, { timeout: timeoutMs }, (res) => {
|
|
236
392
|
const ok = res.statusCode != null && res.statusCode < 500;
|
|
393
|
+
const status = res.statusCode;
|
|
237
394
|
res.resume();
|
|
238
|
-
resolve({ ok, status
|
|
395
|
+
resolve({ ok, status });
|
|
239
396
|
});
|
|
240
397
|
} catch (err) {
|
|
241
|
-
|
|
398
|
+
// Sprint 63 T3 §3.2 — surface `code` so the caller can classify into
|
|
399
|
+
// the red:* taxonomy without re-parsing the message.
|
|
400
|
+
resolve({
|
|
401
|
+
ok: false,
|
|
402
|
+
error: err && err.message ? err.message : String(err),
|
|
403
|
+
code: err && err.code,
|
|
404
|
+
});
|
|
242
405
|
return;
|
|
243
406
|
}
|
|
244
|
-
req.on('error', (err) => resolve({
|
|
245
|
-
|
|
407
|
+
req.on('error', (err) => resolve({
|
|
408
|
+
ok: false,
|
|
409
|
+
error: err && err.message ? err.message : String(err),
|
|
410
|
+
code: err && err.code,
|
|
411
|
+
}));
|
|
412
|
+
req.on('timeout', () => {
|
|
413
|
+
try { req.destroy(); } catch (_e) { /* gone */ }
|
|
414
|
+
resolve({ ok: false, error: 'timeout', code: 'TIMEOUT' });
|
|
415
|
+
});
|
|
246
416
|
});
|
|
247
417
|
}
|
|
248
418
|
|
|
@@ -250,50 +420,66 @@ async function checkMnestraWebhook(config, options) {
|
|
|
250
420
|
if (options && typeof options._mnestraWebhookProbe === 'function') {
|
|
251
421
|
try {
|
|
252
422
|
const r = await options._mnestraWebhookProbe();
|
|
253
|
-
if (r && r.ok) return
|
|
254
|
-
|
|
423
|
+
if (r && r.ok) return passCheck('mnestra-webhook');
|
|
424
|
+
const cat = classifyHttpFailure(r);
|
|
425
|
+
return warnCheck('mnestra-webhook', cat, (r && r.detail) || (r && r.error) || 'unreachable');
|
|
255
426
|
} catch (err) {
|
|
256
|
-
|
|
427
|
+
const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
|
|
428
|
+
return warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
|
|
257
429
|
}
|
|
258
430
|
}
|
|
259
431
|
const rag = (config && config.rag) || {};
|
|
260
432
|
if (!rag.mnestraWebhookUrl) {
|
|
261
|
-
|
|
433
|
+
// Sprint 63 T3 §3.2 — URL not configured = init-failed (operator never
|
|
434
|
+
// set up the webhook), not unreachable. Log-once so a 30s poll on an
|
|
435
|
+
// unconfigured install doesn't flood warns.
|
|
436
|
+
logInitFailedOnce('mnestra-webhook', 'rag.mnestraWebhookUrl not configured');
|
|
437
|
+
return warnCheck('mnestra-webhook', CATEGORIES.INIT_FAILED, 'webhook URL not configured');
|
|
262
438
|
}
|
|
263
439
|
const healthUrl = String(rag.mnestraWebhookUrl).replace(/\/mnestra\/?$/, '/healthz');
|
|
264
440
|
const r = await httpReachable(healthUrl, 2000);
|
|
265
|
-
if (r.ok) return
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
status: '
|
|
269
|
-
|
|
270
|
-
|
|
441
|
+
if (r.ok) return passCheck('mnestra-webhook');
|
|
442
|
+
const cat = classifyHttpFailure(r);
|
|
443
|
+
const why = r.error
|
|
444
|
+
? `${r.error}${typeof r.status === 'number' ? ` (HTTP ${r.status})` : ''}`
|
|
445
|
+
: `HTTP ${r.status || '???'}`;
|
|
446
|
+
return warnCheck('mnestra-webhook', cat, why);
|
|
271
447
|
}
|
|
272
448
|
|
|
273
449
|
async function checkRumenPool(config, options) {
|
|
274
450
|
if (options && typeof options._rumenPoolProbe === 'function') {
|
|
275
451
|
try {
|
|
276
452
|
const r = await options._rumenPoolProbe();
|
|
277
|
-
if (r && r.ok) return
|
|
278
|
-
|
|
453
|
+
if (r && r.ok) return passCheck('rumen-pool');
|
|
454
|
+
// Test-seam probe should pass `category` if it has one; else infer.
|
|
455
|
+
const cat = (r && r.category) || classifyDbFailure(r || {});
|
|
456
|
+
return warnCheck('rumen-pool', cat, (r && r.detail) || (r && r.error) || 'unreachable (best-effort)');
|
|
279
457
|
} catch (err) {
|
|
280
|
-
|
|
458
|
+
const cat = classifyDbFailure(err);
|
|
459
|
+
return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
281
460
|
}
|
|
282
461
|
}
|
|
283
462
|
let pg;
|
|
284
463
|
try { pg = require('pg'); } catch (_e) { pg = null; }
|
|
285
|
-
if (!pg)
|
|
464
|
+
if (!pg) {
|
|
465
|
+
logInitFailedOnce('rumen-pool', 'pg module not installed');
|
|
466
|
+
return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'pg module not installed');
|
|
467
|
+
}
|
|
286
468
|
|
|
287
469
|
const dbUrl = (config && config.rag && config.rag.databaseUrl) || process.env.DATABASE_URL;
|
|
288
|
-
if (!dbUrl)
|
|
470
|
+
if (!dbUrl) {
|
|
471
|
+
logInitFailedOnce('rumen-pool', 'DATABASE_URL not set');
|
|
472
|
+
return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'DATABASE_URL not set');
|
|
473
|
+
}
|
|
289
474
|
|
|
290
475
|
const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 3000 });
|
|
291
476
|
try {
|
|
292
477
|
const res = await pool.query('SELECT 1 AS ok');
|
|
293
|
-
if (res.rows[0] && res.rows[0].ok === 1) return
|
|
294
|
-
return
|
|
478
|
+
if (res.rows[0] && res.rows[0].ok === 1) return passCheck('rumen-pool');
|
|
479
|
+
return warnCheck('rumen-pool', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
|
|
295
480
|
} catch (err) {
|
|
296
|
-
|
|
481
|
+
const cat = classifyDbFailure(err);
|
|
482
|
+
return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
297
483
|
} finally {
|
|
298
484
|
try { await pool.end(); } catch (_e) { /* ignore */ }
|
|
299
485
|
}
|
|
@@ -316,35 +502,82 @@ async function getFullHealth(config = {}, options = {}) {
|
|
|
316
502
|
|
|
317
503
|
const checks = [];
|
|
318
504
|
|
|
505
|
+
// Sprint 63 T3 §3.2 outer-catch hardening (T4-CODEX AUDIT-CONCERN 13:27 ET):
|
|
506
|
+
// every probe is independently wrapped here so an unexpected throw in a
|
|
507
|
+
// single probe path can't sink the whole report. Pre-Sprint-63 these four
|
|
508
|
+
// catches emitted raw `{ status: 'fail'|'warn', detail }` with no
|
|
509
|
+
// `category` field — operators triaging "why is the dashboard red?" still
|
|
510
|
+
// had to read prose. The whole point of the taxonomy is that there is no
|
|
511
|
+
// such thing as an uncategorized non-pass row. Every fallback now runs the
|
|
512
|
+
// captured `err` through `classifyDbFailure` / `classifyHttpFailure` and
|
|
513
|
+
// composes a normal `failCheck` / `warnCheck` envelope. When the
|
|
514
|
+
// classifier can't infer (truly opaque throw — bug in the probe itself,
|
|
515
|
+
// not in the dependency), each classifier falls back to its own default:
|
|
516
|
+
// `classifyDbFailure` → `red:dependency-down`, `classifyHttpFailure` →
|
|
517
|
+
// `red:unreachable`. Either way the operator's first action is to inspect
|
|
518
|
+
// the peer / its config, not the local handle.
|
|
519
|
+
|
|
520
|
+
// Sprint 63 T3 §3.2 — `_throwIn` test seam. The probe functions each have
|
|
521
|
+
// their own try/catch so unreached-by-design inputs can't throw out into
|
|
522
|
+
// the outer catches below. The fence tests need a way to simulate "a
|
|
523
|
+
// probe's path threw before its own catch caught it" — i.e., the
|
|
524
|
+
// belt-and-suspenders outer catch. Set `_throwIn` to one of
|
|
525
|
+
// `'sqlite' | 'pg' | 'webhook' | 'rumen-pool'` to inject a synthetic
|
|
526
|
+
// throw at the corresponding outer-try entry. Never set in production —
|
|
527
|
+
// ignored if the value is falsy.
|
|
528
|
+
const throwIn = options._throwIn || null;
|
|
529
|
+
const synth = (where) => new Error(`test-fence: simulated throw in ${where} probe path`);
|
|
530
|
+
|
|
319
531
|
// 1. SQLite (sync — small DB, no risk of blocking)
|
|
320
|
-
try {
|
|
321
|
-
|
|
532
|
+
try {
|
|
533
|
+
if (throwIn === 'sqlite') throw synth('sqlite');
|
|
534
|
+
checks.push(checkSqlite(db));
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
const cat = classifyDbFailure(err);
|
|
538
|
+
checks.push(failCheck('sqlite', cat, err && err.message ? err.message : String(err)));
|
|
539
|
+
}
|
|
322
540
|
|
|
323
541
|
// 2-7. Postgres-side suite
|
|
324
542
|
let pgChecks;
|
|
325
|
-
try {
|
|
543
|
+
try {
|
|
544
|
+
if (throwIn === 'pg') throw synth('pg');
|
|
545
|
+
pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient });
|
|
546
|
+
}
|
|
326
547
|
catch (err) {
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
548
|
+
const cat = classifyDbFailure(err);
|
|
549
|
+
const why = err && err.message ? err.message : String(err);
|
|
550
|
+
pgChecks = [failCheck('mnestra-pg', cat, why)];
|
|
551
|
+
// Dependents inherit the same category — see runPgChecks header for the
|
|
552
|
+
// rationale (one root-cause row, not 6 independent-looking REDs).
|
|
332
553
|
for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
|
|
333
|
-
pgChecks.push(
|
|
554
|
+
pgChecks.push(failCheck(name, cat, 'pg suite aborted'));
|
|
334
555
|
}
|
|
335
556
|
}
|
|
336
557
|
for (const c of pgChecks) checks.push(c);
|
|
337
558
|
|
|
338
559
|
// 8. Mnestra webhook (warn)
|
|
339
560
|
let webhook;
|
|
340
|
-
try {
|
|
341
|
-
|
|
561
|
+
try {
|
|
562
|
+
if (throwIn === 'webhook') throw synth('webhook');
|
|
563
|
+
webhook = await checkMnestraWebhook(config, options);
|
|
564
|
+
}
|
|
565
|
+
catch (err) {
|
|
566
|
+
const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
|
|
567
|
+
webhook = warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
|
|
568
|
+
}
|
|
342
569
|
checks.push(webhook);
|
|
343
570
|
|
|
344
571
|
// 9. Rumen pool (warn)
|
|
345
572
|
let pool;
|
|
346
|
-
try {
|
|
347
|
-
|
|
573
|
+
try {
|
|
574
|
+
if (throwIn === 'rumen-pool') throw synth('rumen-pool');
|
|
575
|
+
pool = await checkRumenPool(config, options);
|
|
576
|
+
}
|
|
577
|
+
catch (err) {
|
|
578
|
+
const cat = classifyDbFailure(err);
|
|
579
|
+
pool = warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
580
|
+
}
|
|
348
581
|
checks.push(pool);
|
|
349
582
|
|
|
350
583
|
const ok = checks
|
|
@@ -370,8 +603,19 @@ function _resetCache() {
|
|
|
370
603
|
_cachedAt = 0;
|
|
371
604
|
}
|
|
372
605
|
|
|
606
|
+
// Sprint 63 T3 §3.2 — clear the init-failed log-once memory so each test
|
|
607
|
+
// case starts fresh. Without this, the first test that exercises a null-db
|
|
608
|
+
// path would silence the log on subsequent tests in the same process.
|
|
609
|
+
/**
 * Test seam: empty the `_initLoggedOnce` map so the next
 * `logInitFailedOnce` call for any probe name emits its warn line again.
 *
 * @returns {void}
 */
function _resetInitLogged() {
  _initLoggedOnce.clear();
}
|
|
612
|
+
|
|
373
613
|
module.exports = {
|
|
374
614
|
getFullHealth,
|
|
375
615
|
REQUIRED_CHECKS,
|
|
376
|
-
|
|
616
|
+
CATEGORIES,
|
|
617
|
+
classifyHttpFailure,
|
|
618
|
+
classifyDbFailure,
|
|
619
|
+
_resetCache,
|
|
620
|
+
_resetInitLogged,
|
|
377
621
|
};
|