@jhizzard/termdeck 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,33 @@
20
20
  // checks (mnestra-webhook, rumen-pool) are best-effort: a failure surfaces
21
21
  // as `warn` with detail, but does not flip `ok`.
22
22
  //
23
+ // Failure taxonomy (Sprint 63 T3 §3.2 — Brad r730 cascade 2026-05-11)
24
+ // ──────────────────────────────────────────────────────────────────
25
+ // Pre-Sprint-63 every check that didn't return `pass` collapsed to `fail`
26
+ // with a free-text `detail` string. Operators triaging "why is the install
27
+ // red?" had to read each detail and guess. The cost was real: on 2026-05-11
28
+ // a SQLite ABI mismatch left `db = null` at boot; the resulting
29
+ // `red: timeout` strings (from probes that timed-out trying to use the null
30
+ // handle indirectly) masked the actual `init-failed` root cause for hours.
31
+ //
32
+ // Every non-pass check now carries a `category` field with one of:
33
+ // `red:unreachable` — network/socket level (ECONNREFUSED / EHOSTUNREACH
34
+ // / ENETUNREACH / ENOTFOUND on connect)
35
+ // `red:timeout` — request issued, no response in the window
36
+ // (AbortError / req timeout / pg ETIMEDOUT)
37
+ // `red:dependency-down` — peer responded but the dependency is unhealthy
38
+ // (HTTP 5xx / SQL schema error from a reachable DB)
39
+ // `red:init-failed` — local handle the probe needs was never initialized
40
+ // (db === null at boot / DATABASE_URL not set)
41
+ //
42
+ // `detail` strings are prefixed with the category for human readability:
43
+ // `red:unreachable (could not connect to Postgres using DATABASE_URL)`.
44
+ //
45
+ // init-failed surfaces use a log-once gate so a 30s-poll cycle on a process
46
+ // with a missing handle (e.g. better-sqlite3 not loaded) writes ONE warn at
47
+ // boot, not 2880 warns/day. Probes still emit `red:init-failed` per cycle
48
+ // in the JSON report — only the log emission is gated.
49
+ //
23
50
  // Caching
24
51
  // ───────
25
52
  // Reports cached in module scope for 30s. `getFullHealth(config, { refresh: true })`
@@ -29,8 +56,9 @@
29
56
  // Error handling
30
57
  // ──────────────
31
58
  // Every check is wrapped: any unexpected error downgrades that single check
32
- // to `fail` (or `warn` for warn-checks) with the error message in `detail`.
33
- // `getFullHealth()` always resolves with a structured report — never throws.
59
+ // to `fail` (or `warn` for warn-checks) with the error message in `detail`
60
+ // and a `category` from the taxonomy above. `getFullHealth()` always
61
+ // resolves with a structured report — never throws.
34
62
 
35
63
  'use strict';
36
64
 
@@ -50,21 +78,104 @@ const REQUIRED_CHECKS = new Set([
50
78
  'cron-job-active'
51
79
  ]);
52
80
 
81
+ // Sprint 63 T3 §3.2 — stable taxonomy strings. Exported so dashboard / doctor
82
+ // / external graders can filter by category instead of pattern-matching the
83
+ // detail prose. Frozen object so callers can rely on `CATEGORIES.UNREACHABLE`
84
+ // without accidentally rebinding.
85
+ const CATEGORIES = Object.freeze({
86
+ UNREACHABLE: 'red:unreachable',
87
+ TIMEOUT: 'red:timeout',
88
+ DEPENDENCY_DOWN: 'red:dependency-down',
89
+ INIT_FAILED: 'red:init-failed',
90
+ });
91
+
53
92
  let _cache = null;
54
93
  let _cachedAt = 0;
55
94
 
95
+ // Sprint 63 T3 §3.2 — log-once gate for init-failed surfaces. A 30s health
96
+ // poll on a process with a missing handle would otherwise log every cycle
97
+ // (~2880 warn lines/day per missing handle). Probes that detect a null
98
+ // handle at boot call `logInitFailedOnce(name, reason)`; the first call
99
+ // emits a warn line, subsequent calls are silent for the lifetime of the
100
+ // process. Probes still emit `red:init-failed` in the JSON report on every
101
+ // cycle — only the log line is gated. Reset via `_resetInitLogged()` test
102
+ // seam between cases.
103
const _initLoggedOnce = new Map();

/**
 * Emit one warn line the first time a probe reports a missing handle.
 *
 * The gate is keyed by probe name only: once a name has been recorded, every
 * later call for that name is silent, even if the reason differs.
 *
 * @param {string} probeName - Name of the probe whose handle is missing.
 * @param {string} reason - Human-readable cause, appended to the warn line.
 * @returns {void}
 */
function logInitFailedOnce(probeName, reason) {
  const alreadyLogged = _initLoggedOnce.has(probeName);
  if (!alreadyLogged) {
    // Record before logging so the gate is closed even if the log call throws.
    _initLoggedOnce.set(probeName, reason);
    const message = `[health] ${probeName} handle null at boot — probes will return ${CATEGORIES.INIT_FAILED} until next process start; reason: ${reason}`;
    // eslint-disable-next-line no-console
    console.warn(message);
  }
}
113
+
114
/**
 * Classify an HTTP-side failure envelope `{ ok, status, error, code }` (the
 * shape resolved by `httpReachable`) into one of the red:* categories.
 *
 * Precedence, first match wins:
 *   1. missing envelope                      -> red:unreachable
 *   2. timeout codes or `error === 'timeout'` -> red:timeout
 *   3. connect-level network codes           -> red:unreachable
 *   4. any numeric HTTP status               -> red:dependency-down
 *   5. anything else                         -> red:unreachable
 *
 * @param {{ok?: boolean, status?: number, error?: string, code?: string}|null|undefined} r
 *   Failure envelope; may be null/undefined.
 * @returns {string} One of the `CATEGORIES` values.
 */
function classifyHttpFailure(r) {
  if (!r) return CATEGORIES.UNREACHABLE;

  // ETIMEDOUT is the code Node reports when the socket itself times out.
  // It belongs with the other timeout codes so HTTP and DB probes agree on
  // what `red:timeout` means.
  const timeoutCodes = ['TIMEOUT', 'ABORT_ERR', 'ERR_TIMEOUT', 'ETIMEDOUT'];
  if (timeoutCodes.includes(r.code) || r.error === 'timeout') {
    return CATEGORIES.TIMEOUT;
  }

  const unreachableCodes = ['ECONNREFUSED', 'EHOSTUNREACH', 'ENETUNREACH', 'ENOTFOUND'];
  if (unreachableCodes.includes(r.code)) {
    return CATEGORIES.UNREACHABLE;
  }

  // A numeric status means the peer answered; a non-ok answer is treated as
  // an unhealthy dependency whatever the status class.
  if (typeof r.status === 'number') return CATEGORIES.DEPENDENCY_DOWN;

  return CATEGORIES.UNREACHABLE;
}
128
+
129
/**
 * Classify a database / Node-side failure into one of the red:* categories.
 *
 * Accepts a raw Error, a `{ error, code }` envelope (as produced by
 * `safeQueryRow` / `safeQueryRows`), or any other value, which is
 * stringified for the message check.
 *
 * @param {*} errOrEnvelope - Error, envelope, or arbitrary thrown value.
 * @returns {string} One of the `CATEGORIES` values; `red:dependency-down`
 *   when nothing more specific can be inferred.
 */
function classifyDbFailure(errOrEnvelope) {
  if (!errOrEnvelope) return CATEGORIES.DEPENDENCY_DOWN;

  // Prefer the top-level code; fall back to a nested `_err.code` if present.
  let code = errOrEnvelope.code;
  if (!code && errOrEnvelope._err) code = errOrEnvelope._err.code;

  const text = String(errOrEnvelope.message || errOrEnvelope.error || errOrEnvelope);

  switch (code) {
    case 'ECONNREFUSED':
    case 'EHOSTUNREACH':
    case 'ENETUNREACH':
    case 'ENOTFOUND':
      return CATEGORIES.UNREACHABLE;
    case 'ETIMEDOUT':
    case 'ERR_TIMEOUT':
      return CATEGORIES.TIMEOUT;
    default:
      // No recognised code: a message mentioning "timeout" still counts as a
      // timeout. Everything else (e.g. SQL errors such as 42703 / 42P01)
      // means the dependency answered but is unhealthy.
      return /\btimeout\b/i.test(text) ? CATEGORIES.TIMEOUT : CATEGORIES.DEPENDENCY_DOWN;
  }
}
147
+
148
+ // Helpers to compose check results with a category-prefixed detail. Keeps
149
+ // each call site readable + ensures the prefix is consistent across probes.
150
/**
 * Build a `fail` check result whose detail is prefixed with its category.
 *
 * @param {string} name - Check name.
 * @param {string} category - Taxonomy category for the failure.
 * @param {string} why - Explanation, placed in parentheses after the category.
 * @returns {{name: string, status: string, category: string, detail: string}}
 */
function failCheck(name, category, why) {
  const detail = `${category} (${why})`;
  return { name, status: 'fail', category, detail };
}
153
/**
 * Build a `warn` check result whose detail is prefixed with its category.
 *
 * @param {string} name - Check name.
 * @param {string} category - Taxonomy category for the warning.
 * @param {string} why - Explanation, placed in parentheses after the category.
 * @returns {{name: string, status: string, category: string, detail: string}}
 */
function warnCheck(name, category, why) {
  const detail = `${category} (${why})`;
  return { name, status: 'warn', category, detail };
}
156
/**
 * Build a `pass` check result. Passing checks carry no category or detail.
 *
 * @param {string} name - Check name.
 * @returns {{name: string, status: string}}
 */
function passCheck(name) {
  const result = { name, status: 'pass' };
  return result;
}
159
+
56
160
  // ── SQLite check ────────────────────────────────────────────────────────────
57
161
 
58
162
  function checkSqlite(db) {
59
163
  if (!db) {
60
- return { name: 'sqlite', status: 'fail', detail: 'better-sqlite3 not initialized' };
164
+ // Sprint 63 T3 §3.2 `db === null` is `red:init-failed`, NOT `red:timeout`.
165
+ // The v1.1.1 fail-fast on SQLite ABI mismatch makes this surface rare in
166
+ // practice, but the probe must still classify correctly because future
167
+ // optional deps may be allowed to be null. Log-once gate prevents the
168
+ // 30s poll from flooding logs.
169
+ logInitFailedOnce('sqlite', 'better-sqlite3 not initialized');
170
+ return failCheck('sqlite', CATEGORIES.INIT_FAILED, 'better-sqlite3 not initialized');
61
171
  }
62
172
  try {
63
173
  const row = db.prepare('SELECT 1 AS ok').get();
64
- if (row && row.ok === 1) return { name: 'sqlite', status: 'pass' };
65
- return { name: 'sqlite', status: 'fail', detail: 'SELECT 1 returned unexpected result' };
174
+ if (row && row.ok === 1) return passCheck('sqlite');
175
+ return failCheck('sqlite', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
66
176
  } catch (err) {
67
- return { name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) };
177
+ const cat = classifyDbFailure(err);
178
+ return failCheck('sqlite', cat, err && err.message ? err.message : String(err));
68
179
  }
69
180
  }
70
181
 
@@ -81,7 +192,12 @@ async function safeQueryRow(client, sql) {
81
192
  if (r.rows && r.rows.length > 0 && r.rows[0].ok) return { ok: true };
82
193
  return { ok: false };
83
194
  } catch (err) {
84
- return { error: err && err.message ? err.message : String(err) };
195
+ // Surface `code` so the caller can classify into the red:* taxonomy
196
+ // without re-parsing the message string.
197
+ return {
198
+ error: err && err.message ? err.message : String(err),
199
+ code: err && err.code,
200
+ };
85
201
  }
86
202
  }
87
203
 
@@ -90,34 +206,80 @@ async function safeQueryRows(client, sql) {
90
206
  const r = await client.query(sql);
91
207
  return { rows: r.rows || [] };
92
208
  } catch (err) {
93
- return { error: err && err.message ? err.message : String(err) };
209
+ return {
210
+ error: err && err.message ? err.message : String(err),
211
+ code: err && err.code,
212
+ };
94
213
  }
95
214
  }
96
215
 
216
+ // Sprint 63 T3 §3.2 — track whether the most recent connect attempt timed out
217
+ // vs. was outright unreachable. The pg client doesn't expose this from inside
218
+ // the helper, so the helper records it in a return envelope.
97
219
  async function openPgClient(databaseUrl) {
98
- if (!databaseUrl) return null;
220
+ if (!databaseUrl) return { client: null, reason: 'no-url' };
99
221
  let pgRunner;
100
- try { pgRunner = require('./setup/pg-runner'); } catch (_e) { return null; }
101
- try { return await pgRunner.connect(databaseUrl); } catch (_e) { return null; }
222
+ try { pgRunner = require('./setup/pg-runner'); }
223
+ catch (_e) { return { client: null, reason: 'pg-runner-unavailable' }; }
224
+ try {
225
+ const client = await pgRunner.connect(databaseUrl);
226
+ return { client, reason: null };
227
+ } catch (err) {
228
+ return {
229
+ client: null,
230
+ reason: 'connect-failed',
231
+ error: err && err.message ? err.message : String(err),
232
+ code: err && err.code,
233
+ };
234
+ }
235
+ }
236
+
237
+ // Sprint 63 T3 §3.2 — dependent-checks shape when there's no client. Pre-
238
+ // Sprint-63 these collapsed to status:'fail', detail:'pg unavailable' with
239
+ // no category; operators couldn't distinguish "DATABASE_URL not set"
240
+ // (`init-failed` — fix the .env) from "Postgres unreachable" (`unreachable`
241
+ // — fix the network) from "Postgres took 5s and gave up" (`timeout` — bump
242
+ // timeout or check pgbouncer). Each downstream check now carries the same
243
+ // category as the connect attempt so the dashboard can render one row
244
+ // "Postgres unreachable" and dim the six dependents instead of six
245
+ // independent-looking RED rows.
246
/**
 * Append the full Postgres suite as failures when no client is available:
 * one row for the primary check, then one row per dependent check, all
 * sharing the same category.
 *
 * @param {Array<object>} checks - Result list, mutated in place.
 * @param {string} primaryName - Name of the primary (connection) check.
 * @param {string} category - Category applied to every pushed row.
 * @param {string} primaryDetail - Explanation for the primary row.
 * @param {string} dependentDetail - Explanation shared by the dependent rows.
 * @returns {void}
 */
function pushPgUnavailableChecks(checks, primaryName, category, primaryDetail, dependentDetail) {
  const dependents = ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active'];

  checks.push(failCheck(primaryName, category, primaryDetail));
  dependents.forEach((dependent) => {
    checks.push(failCheck(dependent, category, dependentDetail));
  });
}
103
252
 
104
253
  async function runPgChecks({ databaseUrl, _pgClient }) {
105
254
  const checks = [];
106
- const client = _pgClient || (await openPgClient(databaseUrl));
107
- const owned = !_pgClient;
108
255
 
256
+ let client = _pgClient || null;
257
+ let owned = false;
258
+ let connectEnvelope = null;
109
259
  if (!client) {
110
- checks.push({
111
- name: 'mnestra-pg',
112
- status: 'fail',
113
- detail: databaseUrl
114
- ? 'could not connect to Postgres using DATABASE_URL'
115
- : 'DATABASE_URL not configured (set in ~/.termdeck/secrets.env)'
116
- });
117
- // Dependent checks can't run without a connection surface them as
118
- // fail rather than silently skipping so the report is complete.
119
- for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
120
- checks.push({ name, status: 'fail', detail: 'pg unavailable' });
260
+ connectEnvelope = await openPgClient(databaseUrl);
261
+ client = connectEnvelope.client;
262
+ owned = client != null;
263
+ }
264
+
265
+ if (!client) {
266
+ if (!databaseUrl) {
267
+ // No URL → init-failed (operator never set DATABASE_URL). Log-once.
268
+ logInitFailedOnce('mnestra-pg', 'DATABASE_URL not configured');
269
+ pushPgUnavailableChecks(
270
+ checks,
271
+ 'mnestra-pg',
272
+ CATEGORIES.INIT_FAILED,
273
+ 'DATABASE_URL not configured — set in ~/.termdeck/secrets.env',
274
+ 'pg unavailable — DATABASE_URL not configured'
275
+ );
276
+ } else {
277
+ // URL set but connect failed → classify by code (timeout vs unreachable).
278
+ const cat = classifyDbFailure(connectEnvelope || {});
279
+ const why = connectEnvelope && connectEnvelope.error
280
+ ? `could not connect to Postgres using DATABASE_URL — ${connectEnvelope.error}`
281
+ : 'could not connect to Postgres using DATABASE_URL';
282
+ pushPgUnavailableChecks(checks, 'mnestra-pg', cat, why, 'pg unavailable — connect failed');
121
283
  }
122
284
  return checks;
123
285
  }
@@ -125,11 +287,11 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
125
287
  try {
126
288
  const ping = await safeQueryRow(client, 'SELECT 1 AS ok');
127
289
  if (ping.error) {
128
- checks.push({ name: 'mnestra-pg', status: 'fail', detail: ping.error });
290
+ checks.push(failCheck('mnestra-pg', classifyDbFailure(ping), ping.error));
129
291
  } else if (!ping.ok) {
130
- checks.push({ name: 'mnestra-pg', status: 'fail', detail: 'SELECT 1 returned no row' });
292
+ checks.push(failCheck('mnestra-pg', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned no row'));
131
293
  } else {
132
- checks.push({ name: 'mnestra-pg', status: 'pass' });
294
+ checks.push(passCheck('mnestra-pg'));
133
295
  }
134
296
 
135
297
  // memory_items.source_session_id — the v0.6.5 column from Brad's saga.
@@ -137,84 +299,78 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
137
299
  "SELECT 1 AS ok FROM information_schema.columns " +
138
300
  "WHERE table_schema = 'public' AND table_name = 'memory_items' AND column_name = 'source_session_id'");
139
301
  if (col.error) {
140
- checks.push({ name: 'memory-items-col', status: 'fail', detail: col.error });
302
+ checks.push(failCheck('memory-items-col', classifyDbFailure(col), col.error));
141
303
  } else if (!col.ok) {
142
- checks.push({
143
- name: 'memory-items-col',
144
- status: 'fail',
145
- detail:
146
- 'memory_items.source_session_id missing re-run termdeck init --mnestra --yes ' +
147
- '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
148
- });
304
+ checks.push(failCheck(
305
+ 'memory-items-col',
306
+ CATEGORIES.DEPENDENCY_DOWN,
307
+ 'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
308
+ '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
309
+ ));
149
310
  } else {
150
- checks.push({ name: 'memory-items-col', status: 'pass' });
311
+ checks.push(passCheck('memory-items-col'));
151
312
  }
152
313
 
153
314
  const cron = await safeQueryRow(client,
154
315
  "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_cron'");
155
316
  if (cron.error) {
156
- checks.push({ name: 'pg-cron-ext', status: 'fail', detail: cron.error });
317
+ checks.push(failCheck('pg-cron-ext', classifyDbFailure(cron), cron.error));
157
318
  } else if (!cron.ok) {
158
- checks.push({
159
- name: 'pg-cron-ext',
160
- status: 'fail',
161
- detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
162
- });
319
+ checks.push(failCheck(
320
+ 'pg-cron-ext',
321
+ CATEGORIES.DEPENDENCY_DOWN,
322
+ 'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
323
+ ));
163
324
  } else {
164
- checks.push({ name: 'pg-cron-ext', status: 'pass' });
325
+ checks.push(passCheck('pg-cron-ext'));
165
326
  }
166
327
 
167
328
  const net = await safeQueryRow(client,
168
329
  "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_net'");
169
330
  if (net.error) {
170
- checks.push({ name: 'pg-net-ext', status: 'fail', detail: net.error });
331
+ checks.push(failCheck('pg-net-ext', classifyDbFailure(net), net.error));
171
332
  } else if (!net.ok) {
172
- checks.push({
173
- name: 'pg-net-ext',
174
- status: 'fail',
175
- detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
176
- });
333
+ checks.push(failCheck(
334
+ 'pg-net-ext',
335
+ CATEGORIES.DEPENDENCY_DOWN,
336
+ 'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
337
+ ));
177
338
  } else {
178
- checks.push({ name: 'pg-net-ext', status: 'pass' });
339
+ checks.push(passCheck('pg-net-ext'));
179
340
  }
180
341
 
181
342
  const vault = await safeQueryRow(client,
182
343
  "SELECT 1 AS ok FROM vault.decrypted_secrets WHERE name = 'rumen_service_role_key'");
183
344
  if (vault.error) {
184
- checks.push({
185
- name: 'vault-secret',
186
- status: 'fail',
187
- detail: `vault.decrypted_secrets unreadable — ${vault.error}`
188
- });
345
+ checks.push(failCheck('vault-secret', classifyDbFailure(vault), `vault.decrypted_secrets unreadable — ${vault.error}`));
189
346
  } else if (!vault.ok) {
190
- checks.push({
191
- name: 'vault-secret',
192
- status: 'fail',
193
- detail: 'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
194
- });
347
+ checks.push(failCheck(
348
+ 'vault-secret',
349
+ CATEGORIES.DEPENDENCY_DOWN,
350
+ 'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
351
+ ));
195
352
  } else {
196
- checks.push({ name: 'vault-secret', status: 'pass' });
353
+ checks.push(passCheck('vault-secret'));
197
354
  }
198
355
 
199
356
  const job = await safeQueryRows(client,
200
357
  "SELECT active FROM cron.job WHERE jobname = 'rumen-tick'");
201
358
  if (job.error) {
202
- checks.push({ name: 'cron-job-active', status: 'fail', detail: `cron.job unreadable — ${job.error}` });
359
+ checks.push(failCheck('cron-job-active', classifyDbFailure(job), `cron.job unreadable — ${job.error}`));
203
360
  } else if (!job.rows || job.rows.length === 0) {
204
- checks.push({
205
- name: 'cron-job-active',
206
- status: 'fail',
207
- detail: 'rumen-tick row not found — re-run `termdeck init --rumen`'
208
- });
361
+ checks.push(failCheck(
362
+ 'cron-job-active',
363
+ CATEGORIES.DEPENDENCY_DOWN,
364
+ 'rumen-tick row not found — re-run `termdeck init --rumen`'
365
+ ));
209
366
  } else if (!job.rows[0].active) {
210
- checks.push({
211
- name: 'cron-job-active',
212
- status: 'fail',
213
- detail:
214
- "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
215
- });
367
+ checks.push(failCheck(
368
+ 'cron-job-active',
369
+ CATEGORIES.DEPENDENCY_DOWN,
370
+ "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
371
+ ));
216
372
  } else {
217
- checks.push({ name: 'cron-job-active', status: 'pass' });
373
+ checks.push(passCheck('cron-job-active'));
218
374
  }
219
375
  } finally {
220
376
  if (owned) {
@@ -234,15 +390,29 @@ function httpReachable(url, timeoutMs = 2000) {
234
390
  try {
235
391
  req = mod.get(url, { timeout: timeoutMs }, (res) => {
236
392
  const ok = res.statusCode != null && res.statusCode < 500;
393
+ const status = res.statusCode;
237
394
  res.resume();
238
- resolve({ ok, status: res.statusCode });
395
+ resolve({ ok, status });
239
396
  });
240
397
  } catch (err) {
241
- resolve({ ok: false, error: err && err.message ? err.message : String(err) });
398
+ // Sprint 63 T3 §3.2 — surface `code` so the caller can classify into
399
+ // the red:* taxonomy without re-parsing the message.
400
+ resolve({
401
+ ok: false,
402
+ error: err && err.message ? err.message : String(err),
403
+ code: err && err.code,
404
+ });
242
405
  return;
243
406
  }
244
- req.on('error', (err) => resolve({ ok: false, error: err && err.message ? err.message : String(err) }));
245
- req.on('timeout', () => { try { req.destroy(); } catch (_e) { /* gone */ } resolve({ ok: false, error: 'timeout' }); });
407
+ req.on('error', (err) => resolve({
408
+ ok: false,
409
+ error: err && err.message ? err.message : String(err),
410
+ code: err && err.code,
411
+ }));
412
+ req.on('timeout', () => {
413
+ try { req.destroy(); } catch (_e) { /* gone */ }
414
+ resolve({ ok: false, error: 'timeout', code: 'TIMEOUT' });
415
+ });
246
416
  });
247
417
  }
248
418
 
@@ -250,50 +420,66 @@ async function checkMnestraWebhook(config, options) {
250
420
  if (options && typeof options._mnestraWebhookProbe === 'function') {
251
421
  try {
252
422
  const r = await options._mnestraWebhookProbe();
253
- if (r && r.ok) return { name: 'mnestra-webhook', status: 'pass' };
254
- return { name: 'mnestra-webhook', status: 'warn', detail: (r && r.detail) || 'unreachable' };
423
+ if (r && r.ok) return passCheck('mnestra-webhook');
424
+ const cat = classifyHttpFailure(r);
425
+ return warnCheck('mnestra-webhook', cat, (r && r.detail) || (r && r.error) || 'unreachable');
255
426
  } catch (err) {
256
- return { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) };
427
+ const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
428
+ return warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
257
429
  }
258
430
  }
259
431
  const rag = (config && config.rag) || {};
260
432
  if (!rag.mnestraWebhookUrl) {
261
- return { name: 'mnestra-webhook', status: 'warn', detail: 'webhook URL not configured' };
433
+ // Sprint 63 T3 §3.2 — URL not configured = init-failed (operator never
434
+ // set up the webhook), not unreachable. Log-once so a 30s poll on an
435
+ // unconfigured install doesn't flood warns.
436
+ logInitFailedOnce('mnestra-webhook', 'rag.mnestraWebhookUrl not configured');
437
+ return warnCheck('mnestra-webhook', CATEGORIES.INIT_FAILED, 'webhook URL not configured');
262
438
  }
263
439
  const healthUrl = String(rag.mnestraWebhookUrl).replace(/\/mnestra\/?$/, '/healthz');
264
440
  const r = await httpReachable(healthUrl, 2000);
265
- if (r.ok) return { name: 'mnestra-webhook', status: 'pass' };
266
- return {
267
- name: 'mnestra-webhook',
268
- status: 'warn',
269
- detail: r.error ? `unreachable — ${r.error}` : `HTTP ${r.status || '???'}`
270
- };
441
+ if (r.ok) return passCheck('mnestra-webhook');
442
+ const cat = classifyHttpFailure(r);
443
+ const why = r.error
444
+ ? `${r.error}${typeof r.status === 'number' ? ` (HTTP ${r.status})` : ''}`
445
+ : `HTTP ${r.status || '???'}`;
446
+ return warnCheck('mnestra-webhook', cat, why);
271
447
  }
272
448
 
273
449
  async function checkRumenPool(config, options) {
274
450
  if (options && typeof options._rumenPoolProbe === 'function') {
275
451
  try {
276
452
  const r = await options._rumenPoolProbe();
277
- if (r && r.ok) return { name: 'rumen-pool', status: 'pass' };
278
- return { name: 'rumen-pool', status: 'warn', detail: (r && r.detail) || 'unreachable (best-effort)' };
453
+ if (r && r.ok) return passCheck('rumen-pool');
454
+ // Test-seam probe should pass `category` if it has one; else infer.
455
+ const cat = (r && r.category) || classifyDbFailure(r || {});
456
+ return warnCheck('rumen-pool', cat, (r && r.detail) || (r && r.error) || 'unreachable (best-effort)');
279
457
  } catch (err) {
280
- return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
458
+ const cat = classifyDbFailure(err);
459
+ return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
281
460
  }
282
461
  }
283
462
  let pg;
284
463
  try { pg = require('pg'); } catch (_e) { pg = null; }
285
- if (!pg) return { name: 'rumen-pool', status: 'warn', detail: 'pg module not installed' };
464
+ if (!pg) {
465
+ logInitFailedOnce('rumen-pool', 'pg module not installed');
466
+ return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'pg module not installed');
467
+ }
286
468
 
287
469
  const dbUrl = (config && config.rag && config.rag.databaseUrl) || process.env.DATABASE_URL;
288
- if (!dbUrl) return { name: 'rumen-pool', status: 'warn', detail: 'DATABASE_URL not set' };
470
+ if (!dbUrl) {
471
+ logInitFailedOnce('rumen-pool', 'DATABASE_URL not set');
472
+ return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'DATABASE_URL not set');
473
+ }
289
474
 
290
475
  const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 3000 });
291
476
  try {
292
477
  const res = await pool.query('SELECT 1 AS ok');
293
- if (res.rows[0] && res.rows[0].ok === 1) return { name: 'rumen-pool', status: 'pass' };
294
- return { name: 'rumen-pool', status: 'warn', detail: 'SELECT 1 returned unexpected result' };
478
+ if (res.rows[0] && res.rows[0].ok === 1) return passCheck('rumen-pool');
479
+ return warnCheck('rumen-pool', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
295
480
  } catch (err) {
296
- return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
481
+ const cat = classifyDbFailure(err);
482
+ return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
297
483
  } finally {
298
484
  try { await pool.end(); } catch (_e) { /* ignore */ }
299
485
  }
@@ -316,35 +502,82 @@ async function getFullHealth(config = {}, options = {}) {
316
502
 
317
503
  const checks = [];
318
504
 
505
+ // Sprint 63 T3 §3.2 — outer-catch hardening (T4-CODEX AUDIT-CONCERN 13:27 ET):
506
+ // every probe is independently wrapped here so an unexpected throw in a
507
+ // single probe path can't sink the whole report. Pre-Sprint-63 these four
508
+ // catches emitted raw `{ status: 'fail'|'warn', detail }` with no
509
+ // `category` field — operators triaging "why is the dashboard red?" still
510
+ // had to read prose. The whole point of the taxonomy is that there is no
511
+ // such thing as an uncategorized non-pass row. Every fallback now runs the
512
+ // captured `err` through `classifyDbFailure` / `classifyHttpFailure` and
513
+ // composes a normal `failCheck` / `warnCheck` envelope. When the
514
+ // classifier can't infer (truly opaque throw — bug in the probe itself,
515
+ // not in the dependency), the fallbacks differ: `classifyDbFailure`
516
+ // returns `red:dependency-down`, while `classifyHttpFailure` returns
517
+ // `red:unreachable`. Either way the operator's first action is to
518
+ // inspect the peer / its config, not the local handle.
519
+
520
+ // Sprint 63 T3 §3.2 — `_throwIn` test seam. The probe functions each have
521
+ // their own try/catch so unreached-by-design inputs can't throw out into
522
+ // the outer catches below. The fence tests need a way to simulate "a
523
+ // probe's path threw before its own catch caught it" — i.e., the
524
+ // belt-and-suspenders outer catch. Set `_throwIn` to one of
525
+ // `'sqlite' | 'pg' | 'webhook' | 'rumen-pool'` to inject a synthetic
526
+ // throw at the corresponding outer-try entry. Never set in production —
527
+ // ignored if the value is falsy.
528
+ const throwIn = options._throwIn || null;
529
+ const synth = (where) => new Error(`test-fence: simulated throw in ${where} probe path`);
530
+
319
531
  // 1. SQLite (sync — small DB, no risk of blocking)
320
- try { checks.push(checkSqlite(db)); }
321
- catch (err) { checks.push({ name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) }); }
532
+ try {
533
+ if (throwIn === 'sqlite') throw synth('sqlite');
534
+ checks.push(checkSqlite(db));
535
+ }
536
+ catch (err) {
537
+ const cat = classifyDbFailure(err);
538
+ checks.push(failCheck('sqlite', cat, err && err.message ? err.message : String(err)));
539
+ }
322
540
 
323
541
  // 2-7. Postgres-side suite
324
542
  let pgChecks;
325
- try { pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient }); }
543
+ try {
544
+ if (throwIn === 'pg') throw synth('pg');
545
+ pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient });
546
+ }
326
547
  catch (err) {
327
- pgChecks = [{
328
- name: 'mnestra-pg',
329
- status: 'fail',
330
- detail: err && err.message ? err.message : String(err)
331
- }];
548
+ const cat = classifyDbFailure(err);
549
+ const why = err && err.message ? err.message : String(err);
550
+ pgChecks = [failCheck('mnestra-pg', cat, why)];
551
+ // Dependents inherit the same category — see runPgChecks header for the
552
+ // rationale (one root-cause row, not 6 independent-looking REDs).
332
553
  for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
333
- pgChecks.push({ name, status: 'fail', detail: 'pg suite aborted' });
554
+ pgChecks.push(failCheck(name, cat, 'pg suite aborted'));
334
555
  }
335
556
  }
336
557
  for (const c of pgChecks) checks.push(c);
337
558
 
338
559
  // 8. Mnestra webhook (warn)
339
560
  let webhook;
340
- try { webhook = await checkMnestraWebhook(config, options); }
341
- catch (err) { webhook = { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
561
+ try {
562
+ if (throwIn === 'webhook') throw synth('webhook');
563
+ webhook = await checkMnestraWebhook(config, options);
564
+ }
565
+ catch (err) {
566
+ const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
567
+ webhook = warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
568
+ }
342
569
  checks.push(webhook);
343
570
 
344
571
  // 9. Rumen pool (warn)
345
572
  let pool;
346
- try { pool = await checkRumenPool(config, options); }
347
- catch (err) { pool = { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
573
+ try {
574
+ if (throwIn === 'rumen-pool') throw synth('rumen-pool');
575
+ pool = await checkRumenPool(config, options);
576
+ }
577
+ catch (err) {
578
+ const cat = classifyDbFailure(err);
579
+ pool = warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
580
+ }
348
581
  checks.push(pool);
349
582
 
350
583
  const ok = checks
@@ -370,8 +603,19 @@ function _resetCache() {
370
603
  _cachedAt = 0;
371
604
  }
372
605
 
606
+ // Sprint 63 T3 §3.2 — clear the init-failed log-once memory so each test
607
+ // case starts fresh. Without this, the first test that exercises a null-db
608
+ // path would silence the log on subsequent tests in the same process.
609
/**
 * Test seam: forget which probes have already emitted their init-failed
 * warn line, so the next `logInitFailedOnce` call for any probe logs again.
 *
 * @returns {void}
 */
function _resetInitLogged() {
  _initLoggedOnce.clear();
}
612
+
373
613
  module.exports = {
374
614
  getFullHealth,
375
615
  REQUIRED_CHECKS,
376
- _resetCache
616
+ CATEGORIES,
617
+ classifyHttpFailure,
618
+ classifyDbFailure,
619
+ _resetCache,
620
+ _resetInitLogged,
377
621
  };