@jhizzard/termdeck 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jhizzard/termdeck",
3
- "version": "1.1.1",
3
+ "version": "1.2.0",
4
4
  "description": "Browser-based terminal multiplexer with metadata overlays, panel flashback memory recall, and AI-aware session management",
5
5
  "bin": {
6
6
  "termdeck": "./packages/cli/src/index.js"
@@ -410,7 +410,14 @@ async function checkRumen() {
410
410
  }
411
411
  const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 5000 });
412
412
  try {
413
- const r = await pool.query("SELECT to_char(NOW() - MAX(created_at), 'HH24:MI:SS') AS ago FROM rumen_jobs");
413
+ // Sprint 63 T3 §3.1 — `rumen_jobs` has `started_at` (migration 001), NOT
414
+ // `created_at`. Pre-Sprint-63 this probed `created_at` and threw a
415
+ // generic WARN that doctor's same-DB check did not (Sprint 35 doctor fix
416
+ // landed RUMEN_TIME_COL.rumen_jobs='started_at' but never propagated
417
+ // here). Brad reproduced on r730 2026-05-11; doctor 23/23 GREEN while
418
+ // launcher Step 3 emitted `WARN (query failed: column "created_at"
419
+ // does not exist)`. Aligned both probes to the same column.
420
+ const r = await pool.query("SELECT to_char(NOW() - MAX(started_at), 'HH24:MI:SS') AS ago FROM rumen_jobs");
414
421
  const ago = r.rows[0] && r.rows[0].ago;
415
422
  if (ago) {
416
423
  stepLine('3/4', 'Checking Rumen', 'OK', `(last job ${ago} ago)`);
@@ -419,10 +426,20 @@ async function checkRumen() {
419
426
  stepLine('3/4', 'Checking Rumen', 'WARN', '(no jobs yet — try termdeck init --rumen)');
420
427
  return { ago: null };
421
428
  } catch (err) {
422
- if (/relation .*rumen_jobs.* does not exist/i.test(String(err.message))) {
429
+ const msg = String(err && err.message ? err.message : err);
430
+ if (/relation .*rumen_jobs.* does not exist/i.test(msg)) {
423
431
  stepLine('3/4', 'Checking Rumen', 'SKIP', '(rumen_jobs table not present — run termdeck init --rumen)');
424
432
  } else {
425
- stepLine('3/4', 'Checking Rumen', 'WARN', `(query failed: ${err.message})`);
433
+ const colMatch = msg.match(/column "([^"]+)" does not exist/i);
434
+ if (colMatch) {
435
+ // Schema drift — rumen_jobs is missing the column we queried. Naming
436
+ // the column + remediation beats a bare `query failed` that operators
437
+ // learn to filter out (Brad's r730, 2026-05-11).
438
+ stepLine('3/4', 'Checking Rumen', 'WARN',
439
+ `(rumen_jobs.${colMatch[1]} column missing — re-run \`termdeck init --rumen\` to apply migration 001)`);
440
+ } else {
441
+ stepLine('3/4', 'Checking Rumen', 'WARN', `(query failed: ${err.message})`);
442
+ }
426
443
  }
427
444
  return { ago: null };
428
445
  } finally {
@@ -50,13 +50,18 @@ function statusFor(data) {
50
50
  // resolveTranscriptPath — Sprint 50 T1.
51
51
  //
52
52
  // Gemini CLI persists chats at
53
- // ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.json
54
- // (single-JSON-object shape that matches parseGeminiJson, verified
55
- // 2026-05-02 substrate probe). Pick the most recently modified file whose
56
- // mtime is at-or-after `session.meta.createdAt`. Falls back to walking
57
- // every project directory under `~/.gemini/tmp/*/chats/` if the basename
58
- // heuristic produces no candidate (e.g., Gemini renormalized the project
59
- // name to deduplicate against an existing one).
53
+ // ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.{json,jsonl}
54
+ // (single-JSON-object shape that matches parseGeminiJson for the .json
55
+ // flavor, verified 2026-05-02 substrate probe; .jsonl flavor introduced
56
+ // some time between 2026-05-02 and 2026-05-08, surfaced by Sprint 63 T2
57
+ // acceptance — see docs/sprint-63-wave-2/EXIT-CAPTURE-VERIFICATION.md
58
+ // Finding #2. The extension filter accepts both shapes; downstream parser
59
+ // handling of JSONL deltas is a Sprint 64 candidate). Pick the most
60
+ // recently modified file whose mtime is at-or-after
61
+ // `session.meta.createdAt`. Falls back to walking every project directory
62
+ // under `~/.gemini/tmp/*/chats/` if the basename heuristic produces no
63
+ // candidate (e.g., Gemini renormalized the project name to deduplicate
64
+ // against an existing one).
60
65
  // ──────────────────────────────────────────────────────────────────────────
61
66
 
62
67
  async function resolveTranscriptPath(session) {
@@ -83,7 +88,8 @@ async function resolveTranscriptPath(session) {
83
88
  let entries;
84
89
  try { entries = fs.readdirSync(dir); } catch (_) { return; }
85
90
  for (const name of entries) {
86
- if (!name.startsWith('session-') || !name.endsWith('.json')) continue;
91
+ if (!name.startsWith('session-')) continue;
92
+ if (!name.endsWith('.json') && !name.endsWith('.jsonl')) continue;
87
93
  const full = path.join(dir, name);
88
94
  let st;
89
95
  try { st = fs.statSync(full); } catch (_) { continue; }
@@ -20,6 +20,33 @@
20
20
  // checks (mnestra-webhook, rumen-pool) are best-effort: a failure surfaces
21
21
  // as `warn` with detail, but does not flip `ok`.
22
22
  //
23
+ // Failure taxonomy (Sprint 63 T3 §3.2 — Brad r730 cascade 2026-05-11)
24
+ // ──────────────────────────────────────────────────────────────────
25
+ // Pre-Sprint-63 every check that didn't return `pass` collapsed to `fail`
26
+ // with a free-text `detail` string. Operators triaging "why is the install
27
+ // red?" had to read each detail and guess. The cost was real: on 2026-05-11
28
+ // a SQLite ABI mismatch left `db = null` at boot; the resulting
29
+ // `red: timeout` strings (from probes that timed-out trying to use the null
30
+ // handle indirectly) masked the actual `init-failed` root cause for hours.
31
+ //
32
+ // Every non-pass check now carries a `category` field with one of:
33
+ // `red:unreachable` — network/socket level (ECONNREFUSED / EHOSTUNREACH
34
+ // / ENETUNREACH / ENOTFOUND on connect)
35
+ // `red:timeout` — request issued, no response in the window
36
+ // (AbortError / req timeout / pg ETIMEDOUT)
37
+ // `red:dependency-down` — peer responded but the dependency is unhealthy
38
+ // (HTTP 5xx / SQL schema error from a reachable DB)
39
+ // `red:init-failed` — local handle the probe needs was never initialized
40
+ // (db === null at boot / DATABASE_URL not set)
41
+ //
42
+ // `detail` strings are prefixed with the category for human readability:
43
+ // `red:unreachable (could not connect to Postgres using DATABASE_URL)`.
44
+ //
45
+ // init-failed surfaces use a log-once gate so a 30s-poll cycle on a process
46
+ // with a missing handle (e.g. better-sqlite3 not loaded) writes ONE warn at
47
+ // boot, not 2880 warns/day. Probes still emit `red:init-failed` per cycle
48
+ // in the JSON report — only the log emission is gated.
49
+ //
23
50
  // Caching
24
51
  // ───────
25
52
  // Reports cached in module scope for 30s. `getFullHealth(config, { refresh: true })`
@@ -29,8 +56,9 @@
29
56
  // Error handling
30
57
  // ──────────────
31
58
  // Every check is wrapped: any unexpected error downgrades that single check
32
- // to `fail` (or `warn` for warn-checks) with the error message in `detail`.
33
- // `getFullHealth()` always resolves with a structured report — never throws.
59
+ // to `fail` (or `warn` for warn-checks) with the error message in `detail`
60
+ // and a `category` from the taxonomy above. `getFullHealth()` always
61
+ // resolves with a structured report — never throws.
34
62
 
35
63
  'use strict';
36
64
 
@@ -50,21 +78,104 @@ const REQUIRED_CHECKS = new Set([
50
78
  'cron-job-active'
51
79
  ]);
52
80
 
81
+ // Sprint 63 T3 §3.2 — stable taxonomy strings. Exported so dashboard / doctor
82
+ // / external graders can filter by category instead of pattern-matching the
83
+ // detail prose. Frozen object so callers can rely on `CATEGORIES.UNREACHABLE`
84
+ // without accidentally rebinding.
85
+ const CATEGORIES = Object.freeze({
86
+ UNREACHABLE: 'red:unreachable',
87
+ TIMEOUT: 'red:timeout',
88
+ DEPENDENCY_DOWN: 'red:dependency-down',
89
+ INIT_FAILED: 'red:init-failed',
90
+ });
91
+
53
92
  let _cache = null;
54
93
  let _cachedAt = 0;
55
94
 
95
+ // Sprint 63 T3 §3.2 — log-once gate for init-failed surfaces. A 30s health
96
+ // poll on a process with a missing handle would otherwise log every cycle
97
+ // (~2880 warn lines/day per missing handle). Probes that detect a null
98
+ // handle at boot call `logInitFailedOnce(name, reason)`; the first call
99
+ // emits a warn line, subsequent calls are silent for the lifetime of the
100
+ // process. Probes still emit `red:init-failed` in the JSON report on every
101
+ // cycle — only the log line is gated. Reset via `_resetInitLogged()` test
102
+ // seam between cases.
103
+ const _initLoggedOnce = new Map();
104
+ function logInitFailedOnce(probeName, reason) {
105
+ if (_initLoggedOnce.has(probeName)) return;
106
+ _initLoggedOnce.set(probeName, reason);
107
+ // eslint-disable-next-line no-console
108
+ console.warn(
109
+ `[health] ${probeName} handle null at boot — probes will return ` +
110
+ `${CATEGORIES.INIT_FAILED} until next process start; reason: ${reason}`
111
+ );
112
+ }
113
+
114
+ // Classify an HTTP-side failure shape `{ ok, status, error, code }` (as
115
+ // returned by `httpReachable`) into one of the four red:* categories.
116
+ function classifyHttpFailure(r) {
117
+ if (!r) return CATEGORIES.UNREACHABLE;
118
+ if (r.code === 'TIMEOUT' || r.code === 'ABORT_ERR' || r.code === 'ERR_TIMEOUT' || r.error === 'timeout') {
119
+ return CATEGORIES.TIMEOUT;
120
+ }
121
+ if (r.code === 'ECONNREFUSED' || r.code === 'EHOSTUNREACH' || r.code === 'ENETUNREACH' || r.code === 'ENOTFOUND') {
122
+ return CATEGORIES.UNREACHABLE;
123
+ }
124
+ if (typeof r.status === 'number' && r.status >= 500) return CATEGORIES.DEPENDENCY_DOWN;
125
+ if (typeof r.status === 'number') return CATEGORIES.DEPENDENCY_DOWN; // any non-2xx-3xx-4xx-network is "peer responded badly"
126
+ return CATEGORIES.UNREACHABLE;
127
+ }
128
+
129
+ // Classify a database / Node-side failure into one of the four categories.
130
+ // Accepts either a raw Error or a `{ error, code }` envelope from
131
+ // `safeQueryRow` / `safeQueryRows`.
132
+ function classifyDbFailure(errOrEnvelope) {
133
+ if (!errOrEnvelope) return CATEGORIES.DEPENDENCY_DOWN;
134
+ const code = errOrEnvelope.code || (errOrEnvelope._err && errOrEnvelope._err.code);
135
+ const msg = String(errOrEnvelope.message || errOrEnvelope.error || errOrEnvelope);
136
+ if (code === 'ECONNREFUSED' || code === 'EHOSTUNREACH' || code === 'ENETUNREACH' || code === 'ENOTFOUND') {
137
+ return CATEGORIES.UNREACHABLE;
138
+ }
139
+ if (code === 'ETIMEDOUT' || code === 'ERR_TIMEOUT' || /\btimeout\b/i.test(msg)) {
140
+ return CATEGORIES.TIMEOUT;
141
+ }
142
+ // SQL errors (42703 column-not-exist, 42P01 relation-not-exist, etc.) →
143
+ // the dependency answered but its schema is misconfigured. That's
144
+ // "dependency unhealthy," not "unreachable" or "timeout."
145
+ return CATEGORIES.DEPENDENCY_DOWN;
146
+ }
147
+
148
+ // Helpers to compose check results with a category-prefixed detail. Keeps
149
+ // each call site readable + ensures the prefix is consistent across probes.
150
+ function failCheck(name, category, why) {
151
+ return { name, status: 'fail', category, detail: `${category} (${why})` };
152
+ }
153
+ function warnCheck(name, category, why) {
154
+ return { name, status: 'warn', category, detail: `${category} (${why})` };
155
+ }
156
+ function passCheck(name) {
157
+ return { name, status: 'pass' };
158
+ }
159
+
56
160
  // ── SQLite check ────────────────────────────────────────────────────────────
57
161
 
58
162
  function checkSqlite(db) {
59
163
  if (!db) {
60
- return { name: 'sqlite', status: 'fail', detail: 'better-sqlite3 not initialized' };
164
+ // Sprint 63 T3 §3.2 — `db === null` is `red:init-failed`, NOT `red:timeout`.
165
+ // The v1.1.1 fail-fast on SQLite ABI mismatch makes this surface rare in
166
+ // practice, but the probe must still classify correctly because future
167
+ // optional deps may be allowed to be null. Log-once gate prevents the
168
+ // 30s poll from flooding logs.
169
+ logInitFailedOnce('sqlite', 'better-sqlite3 not initialized');
170
+ return failCheck('sqlite', CATEGORIES.INIT_FAILED, 'better-sqlite3 not initialized');
61
171
  }
62
172
  try {
63
173
  const row = db.prepare('SELECT 1 AS ok').get();
64
- if (row && row.ok === 1) return { name: 'sqlite', status: 'pass' };
65
- return { name: 'sqlite', status: 'fail', detail: 'SELECT 1 returned unexpected result' };
174
+ if (row && row.ok === 1) return passCheck('sqlite');
175
+ return failCheck('sqlite', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
66
176
  } catch (err) {
67
- return { name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) };
177
+ const cat = classifyDbFailure(err);
178
+ return failCheck('sqlite', cat, err && err.message ? err.message : String(err));
68
179
  }
69
180
  }
70
181
 
@@ -81,7 +192,12 @@ async function safeQueryRow(client, sql) {
81
192
  if (r.rows && r.rows.length > 0 && r.rows[0].ok) return { ok: true };
82
193
  return { ok: false };
83
194
  } catch (err) {
84
- return { error: err && err.message ? err.message : String(err) };
195
+ // Surface `code` so the caller can classify into the red:* taxonomy
196
+ // without re-parsing the message string.
197
+ return {
198
+ error: err && err.message ? err.message : String(err),
199
+ code: err && err.code,
200
+ };
85
201
  }
86
202
  }
87
203
 
@@ -90,34 +206,80 @@ async function safeQueryRows(client, sql) {
90
206
  const r = await client.query(sql);
91
207
  return { rows: r.rows || [] };
92
208
  } catch (err) {
93
- return { error: err && err.message ? err.message : String(err) };
209
+ return {
210
+ error: err && err.message ? err.message : String(err),
211
+ code: err && err.code,
212
+ };
94
213
  }
95
214
  }
96
215
 
216
+ // Sprint 63 T3 §3.2 — track whether the most recent connect attempt timed out
217
+ // vs. was outright unreachable. The pg client doesn't expose this from inside
218
+ // the helper, so the helper records it in a return envelope.
97
219
  async function openPgClient(databaseUrl) {
98
- if (!databaseUrl) return null;
220
+ if (!databaseUrl) return { client: null, reason: 'no-url' };
99
221
  let pgRunner;
100
- try { pgRunner = require('./setup/pg-runner'); } catch (_e) { return null; }
101
- try { return await pgRunner.connect(databaseUrl); } catch (_e) { return null; }
222
+ try { pgRunner = require('./setup/pg-runner'); }
223
+ catch (_e) { return { client: null, reason: 'pg-runner-unavailable' }; }
224
+ try {
225
+ const client = await pgRunner.connect(databaseUrl);
226
+ return { client, reason: null };
227
+ } catch (err) {
228
+ return {
229
+ client: null,
230
+ reason: 'connect-failed',
231
+ error: err && err.message ? err.message : String(err),
232
+ code: err && err.code,
233
+ };
234
+ }
235
+ }
236
+
237
+ // Sprint 63 T3 §3.2 — dependent-checks shape when there's no client. Pre-
238
+ // Sprint-63 these collapsed to status:'fail', detail:'pg unavailable' with
239
+ // no category; operators couldn't distinguish "DATABASE_URL not set"
240
+ // (`init-failed` — fix the .env) from "Postgres unreachable" (`unreachable`
241
+ // — fix the network) from "Postgres took 5s and gave up" (`timeout` — bump
242
+ // timeout or check pgbouncer). Each downstream check now carries the same
243
+ // category as the connect attempt so the dashboard can render one row
244
+ // "Postgres unreachable" and dim the six dependents instead of six
245
+ // independent-looking RED rows.
246
+ function pushPgUnavailableChecks(checks, primaryName, category, primaryDetail, dependentDetail) {
247
+ checks.push(failCheck(primaryName, category, primaryDetail));
248
+ for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
249
+ checks.push(failCheck(name, category, dependentDetail));
250
+ }
102
251
  }
103
252
 
104
253
  async function runPgChecks({ databaseUrl, _pgClient }) {
105
254
  const checks = [];
106
- const client = _pgClient || (await openPgClient(databaseUrl));
107
- const owned = !_pgClient;
108
255
 
256
+ let client = _pgClient || null;
257
+ let owned = false;
258
+ let connectEnvelope = null;
109
259
  if (!client) {
110
- checks.push({
111
- name: 'mnestra-pg',
112
- status: 'fail',
113
- detail: databaseUrl
114
- ? 'could not connect to Postgres using DATABASE_URL'
115
- : 'DATABASE_URL not configured (set in ~/.termdeck/secrets.env)'
116
- });
117
- // Dependent checks can't run without a connection — surface them as
118
- // fail rather than silently skipping so the report is complete.
119
- for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
120
- checks.push({ name, status: 'fail', detail: 'pg unavailable' });
260
+ connectEnvelope = await openPgClient(databaseUrl);
261
+ client = connectEnvelope.client;
262
+ owned = client != null;
263
+ }
264
+
265
+ if (!client) {
266
+ if (!databaseUrl) {
267
+ // No URL → init-failed (operator never set DATABASE_URL). Log-once.
268
+ logInitFailedOnce('mnestra-pg', 'DATABASE_URL not configured');
269
+ pushPgUnavailableChecks(
270
+ checks,
271
+ 'mnestra-pg',
272
+ CATEGORIES.INIT_FAILED,
273
+ 'DATABASE_URL not configured — set in ~/.termdeck/secrets.env',
274
+ 'pg unavailable — DATABASE_URL not configured'
275
+ );
276
+ } else {
277
+ // URL set but connect failed → classify by code (timeout vs unreachable).
278
+ const cat = classifyDbFailure(connectEnvelope || {});
279
+ const why = connectEnvelope && connectEnvelope.error
280
+ ? `could not connect to Postgres using DATABASE_URL — ${connectEnvelope.error}`
281
+ : 'could not connect to Postgres using DATABASE_URL';
282
+ pushPgUnavailableChecks(checks, 'mnestra-pg', cat, why, 'pg unavailable — connect failed');
121
283
  }
122
284
  return checks;
123
285
  }
@@ -125,11 +287,11 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
125
287
  try {
126
288
  const ping = await safeQueryRow(client, 'SELECT 1 AS ok');
127
289
  if (ping.error) {
128
- checks.push({ name: 'mnestra-pg', status: 'fail', detail: ping.error });
290
+ checks.push(failCheck('mnestra-pg', classifyDbFailure(ping), ping.error));
129
291
  } else if (!ping.ok) {
130
- checks.push({ name: 'mnestra-pg', status: 'fail', detail: 'SELECT 1 returned no row' });
292
+ checks.push(failCheck('mnestra-pg', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned no row'));
131
293
  } else {
132
- checks.push({ name: 'mnestra-pg', status: 'pass' });
294
+ checks.push(passCheck('mnestra-pg'));
133
295
  }
134
296
 
135
297
  // memory_items.source_session_id — the v0.6.5 column from Brad's saga.
@@ -137,84 +299,78 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
137
299
  "SELECT 1 AS ok FROM information_schema.columns " +
138
300
  "WHERE table_schema = 'public' AND table_name = 'memory_items' AND column_name = 'source_session_id'");
139
301
  if (col.error) {
140
- checks.push({ name: 'memory-items-col', status: 'fail', detail: col.error });
302
+ checks.push(failCheck('memory-items-col', classifyDbFailure(col), col.error));
141
303
  } else if (!col.ok) {
142
- checks.push({
143
- name: 'memory-items-col',
144
- status: 'fail',
145
- detail:
146
- 'memory_items.source_session_id missing re-run termdeck init --mnestra --yes ' +
147
- '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
148
- });
304
+ checks.push(failCheck(
305
+ 'memory-items-col',
306
+ CATEGORIES.DEPENDENCY_DOWN,
307
+ 'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
308
+ '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
309
+ ));
149
310
  } else {
150
- checks.push({ name: 'memory-items-col', status: 'pass' });
311
+ checks.push(passCheck('memory-items-col'));
151
312
  }
152
313
 
153
314
  const cron = await safeQueryRow(client,
154
315
  "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_cron'");
155
316
  if (cron.error) {
156
- checks.push({ name: 'pg-cron-ext', status: 'fail', detail: cron.error });
317
+ checks.push(failCheck('pg-cron-ext', classifyDbFailure(cron), cron.error));
157
318
  } else if (!cron.ok) {
158
- checks.push({
159
- name: 'pg-cron-ext',
160
- status: 'fail',
161
- detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
162
- });
319
+ checks.push(failCheck(
320
+ 'pg-cron-ext',
321
+ CATEGORIES.DEPENDENCY_DOWN,
322
+ 'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
323
+ ));
163
324
  } else {
164
- checks.push({ name: 'pg-cron-ext', status: 'pass' });
325
+ checks.push(passCheck('pg-cron-ext'));
165
326
  }
166
327
 
167
328
  const net = await safeQueryRow(client,
168
329
  "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_net'");
169
330
  if (net.error) {
170
- checks.push({ name: 'pg-net-ext', status: 'fail', detail: net.error });
331
+ checks.push(failCheck('pg-net-ext', classifyDbFailure(net), net.error));
171
332
  } else if (!net.ok) {
172
- checks.push({
173
- name: 'pg-net-ext',
174
- status: 'fail',
175
- detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
176
- });
333
+ checks.push(failCheck(
334
+ 'pg-net-ext',
335
+ CATEGORIES.DEPENDENCY_DOWN,
336
+ 'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
337
+ ));
177
338
  } else {
178
- checks.push({ name: 'pg-net-ext', status: 'pass' });
339
+ checks.push(passCheck('pg-net-ext'));
179
340
  }
180
341
 
181
342
  const vault = await safeQueryRow(client,
182
343
  "SELECT 1 AS ok FROM vault.decrypted_secrets WHERE name = 'rumen_service_role_key'");
183
344
  if (vault.error) {
184
- checks.push({
185
- name: 'vault-secret',
186
- status: 'fail',
187
- detail: `vault.decrypted_secrets unreadable — ${vault.error}`
188
- });
345
+ checks.push(failCheck('vault-secret', classifyDbFailure(vault), `vault.decrypted_secrets unreadable — ${vault.error}`));
189
346
  } else if (!vault.ok) {
190
- checks.push({
191
- name: 'vault-secret',
192
- status: 'fail',
193
- detail: 'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
194
- });
347
+ checks.push(failCheck(
348
+ 'vault-secret',
349
+ CATEGORIES.DEPENDENCY_DOWN,
350
+ 'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
351
+ ));
195
352
  } else {
196
- checks.push({ name: 'vault-secret', status: 'pass' });
353
+ checks.push(passCheck('vault-secret'));
197
354
  }
198
355
 
199
356
  const job = await safeQueryRows(client,
200
357
  "SELECT active FROM cron.job WHERE jobname = 'rumen-tick'");
201
358
  if (job.error) {
202
- checks.push({ name: 'cron-job-active', status: 'fail', detail: `cron.job unreadable — ${job.error}` });
359
+ checks.push(failCheck('cron-job-active', classifyDbFailure(job), `cron.job unreadable — ${job.error}`));
203
360
  } else if (!job.rows || job.rows.length === 0) {
204
- checks.push({
205
- name: 'cron-job-active',
206
- status: 'fail',
207
- detail: 'rumen-tick row not found — re-run `termdeck init --rumen`'
208
- });
361
+ checks.push(failCheck(
362
+ 'cron-job-active',
363
+ CATEGORIES.DEPENDENCY_DOWN,
364
+ 'rumen-tick row not found — re-run `termdeck init --rumen`'
365
+ ));
209
366
  } else if (!job.rows[0].active) {
210
- checks.push({
211
- name: 'cron-job-active',
212
- status: 'fail',
213
- detail:
214
- "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
215
- });
367
+ checks.push(failCheck(
368
+ 'cron-job-active',
369
+ CATEGORIES.DEPENDENCY_DOWN,
370
+ "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
371
+ ));
216
372
  } else {
217
- checks.push({ name: 'cron-job-active', status: 'pass' });
373
+ checks.push(passCheck('cron-job-active'));
218
374
  }
219
375
  } finally {
220
376
  if (owned) {
@@ -234,15 +390,29 @@ function httpReachable(url, timeoutMs = 2000) {
234
390
  try {
235
391
  req = mod.get(url, { timeout: timeoutMs }, (res) => {
236
392
  const ok = res.statusCode != null && res.statusCode < 500;
393
+ const status = res.statusCode;
237
394
  res.resume();
238
- resolve({ ok, status: res.statusCode });
395
+ resolve({ ok, status });
239
396
  });
240
397
  } catch (err) {
241
- resolve({ ok: false, error: err && err.message ? err.message : String(err) });
398
+ // Sprint 63 T3 §3.2 — surface `code` so the caller can classify into
399
+ // the red:* taxonomy without re-parsing the message.
400
+ resolve({
401
+ ok: false,
402
+ error: err && err.message ? err.message : String(err),
403
+ code: err && err.code,
404
+ });
242
405
  return;
243
406
  }
244
- req.on('error', (err) => resolve({ ok: false, error: err && err.message ? err.message : String(err) }));
245
- req.on('timeout', () => { try { req.destroy(); } catch (_e) { /* gone */ } resolve({ ok: false, error: 'timeout' }); });
407
+ req.on('error', (err) => resolve({
408
+ ok: false,
409
+ error: err && err.message ? err.message : String(err),
410
+ code: err && err.code,
411
+ }));
412
+ req.on('timeout', () => {
413
+ try { req.destroy(); } catch (_e) { /* gone */ }
414
+ resolve({ ok: false, error: 'timeout', code: 'TIMEOUT' });
415
+ });
246
416
  });
247
417
  }
248
418
 
@@ -250,50 +420,66 @@ async function checkMnestraWebhook(config, options) {
250
420
  if (options && typeof options._mnestraWebhookProbe === 'function') {
251
421
  try {
252
422
  const r = await options._mnestraWebhookProbe();
253
- if (r && r.ok) return { name: 'mnestra-webhook', status: 'pass' };
254
- return { name: 'mnestra-webhook', status: 'warn', detail: (r && r.detail) || 'unreachable' };
423
+ if (r && r.ok) return passCheck('mnestra-webhook');
424
+ const cat = classifyHttpFailure(r);
425
+ return warnCheck('mnestra-webhook', cat, (r && r.detail) || (r && r.error) || 'unreachable');
255
426
  } catch (err) {
256
- return { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) };
427
+ const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
428
+ return warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
257
429
  }
258
430
  }
259
431
  const rag = (config && config.rag) || {};
260
432
  if (!rag.mnestraWebhookUrl) {
261
- return { name: 'mnestra-webhook', status: 'warn', detail: 'webhook URL not configured' };
433
+ // Sprint 63 T3 §3.2 — URL not configured = init-failed (operator never
434
+ // set up the webhook), not unreachable. Log-once so a 30s poll on an
435
+ // unconfigured install doesn't flood warns.
436
+ logInitFailedOnce('mnestra-webhook', 'rag.mnestraWebhookUrl not configured');
437
+ return warnCheck('mnestra-webhook', CATEGORIES.INIT_FAILED, 'webhook URL not configured');
262
438
  }
263
439
  const healthUrl = String(rag.mnestraWebhookUrl).replace(/\/mnestra\/?$/, '/healthz');
264
440
  const r = await httpReachable(healthUrl, 2000);
265
- if (r.ok) return { name: 'mnestra-webhook', status: 'pass' };
266
- return {
267
- name: 'mnestra-webhook',
268
- status: 'warn',
269
- detail: r.error ? `unreachable — ${r.error}` : `HTTP ${r.status || '???'}`
270
- };
441
+ if (r.ok) return passCheck('mnestra-webhook');
442
+ const cat = classifyHttpFailure(r);
443
+ const why = r.error
444
+ ? `${r.error}${typeof r.status === 'number' ? ` (HTTP ${r.status})` : ''}`
445
+ : `HTTP ${r.status || '???'}`;
446
+ return warnCheck('mnestra-webhook', cat, why);
271
447
  }
272
448
 
273
449
  async function checkRumenPool(config, options) {
274
450
  if (options && typeof options._rumenPoolProbe === 'function') {
275
451
  try {
276
452
  const r = await options._rumenPoolProbe();
277
- if (r && r.ok) return { name: 'rumen-pool', status: 'pass' };
278
- return { name: 'rumen-pool', status: 'warn', detail: (r && r.detail) || 'unreachable (best-effort)' };
453
+ if (r && r.ok) return passCheck('rumen-pool');
454
+ // Test-seam probe should pass `category` if it has one; else infer.
455
+ const cat = (r && r.category) || classifyDbFailure(r || {});
456
+ return warnCheck('rumen-pool', cat, (r && r.detail) || (r && r.error) || 'unreachable (best-effort)');
279
457
  } catch (err) {
280
- return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
458
+ const cat = classifyDbFailure(err);
459
+ return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
281
460
  }
282
461
  }
283
462
  let pg;
284
463
  try { pg = require('pg'); } catch (_e) { pg = null; }
285
- if (!pg) return { name: 'rumen-pool', status: 'warn', detail: 'pg module not installed' };
464
+ if (!pg) {
465
+ logInitFailedOnce('rumen-pool', 'pg module not installed');
466
+ return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'pg module not installed');
467
+ }
286
468
 
287
469
  const dbUrl = (config && config.rag && config.rag.databaseUrl) || process.env.DATABASE_URL;
288
- if (!dbUrl) return { name: 'rumen-pool', status: 'warn', detail: 'DATABASE_URL not set' };
470
+ if (!dbUrl) {
471
+ logInitFailedOnce('rumen-pool', 'DATABASE_URL not set');
472
+ return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'DATABASE_URL not set');
473
+ }
289
474
 
290
475
  const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 3000 });
291
476
  try {
292
477
  const res = await pool.query('SELECT 1 AS ok');
293
- if (res.rows[0] && res.rows[0].ok === 1) return { name: 'rumen-pool', status: 'pass' };
294
- return { name: 'rumen-pool', status: 'warn', detail: 'SELECT 1 returned unexpected result' };
478
+ if (res.rows[0] && res.rows[0].ok === 1) return passCheck('rumen-pool');
479
+ return warnCheck('rumen-pool', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
295
480
  } catch (err) {
296
- return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
481
+ const cat = classifyDbFailure(err);
482
+ return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
297
483
  } finally {
298
484
  try { await pool.end(); } catch (_e) { /* ignore */ }
299
485
  }
@@ -316,35 +502,82 @@ async function getFullHealth(config = {}, options = {}) {
316
502
 
317
503
  const checks = [];
318
504
 
505
+ // Sprint 63 T3 §3.2 outer-catch hardening (T4-CODEX AUDIT-CONCERN 13:27 ET):
506
+ // every probe is independently wrapped here so an unexpected throw in a
507
+ // single probe path can't sink the whole report. Pre-Sprint-63 these four
508
+ // catches emitted raw `{ status: 'fail'|'warn', detail }` with no
509
+ // `category` field — operators triaging "why is the dashboard red?" still
510
+ // had to read prose. The whole point of the taxonomy is that there is no
511
+ // such thing as an uncategorized non-pass row. Every fallback now runs the
512
+ // captured `err` through `classifyDbFailure` / `classifyHttpFailure` and
513
+ // composes a normal `failCheck` / `warnCheck` envelope. When the
514
+ // classifier can't infer (truly opaque throw — bug in the probe itself,
515
+ // not in the dependency), the default branch in each classifier returns
516
+ // `red:dependency-down`, which is the right-by-default category for "the
517
+ // probe's path is broken" — operator's first action is to inspect the
518
+ // peer / its config, not the local handle.
519
+
520
+ // Sprint 63 T3 §3.2 — `_throwIn` test seam. The probe functions each have
521
+ // their own try/catch so unreached-by-design inputs can't throw out into
522
+ // the outer catches below. The fence tests need a way to simulate "a
523
+ // probe's path threw before its own catch caught it" — i.e., the
524
+ // belt-and-suspenders outer catch. Set `_throwIn` to one of
525
+ // `'sqlite' | 'pg' | 'webhook' | 'rumen-pool'` to inject a synthetic
526
+ // throw at the corresponding outer-try entry. Never set in production —
527
+ // ignored if the value is falsy.
528
+ const throwIn = options._throwIn || null;
529
+ const synth = (where) => new Error(`test-fence: simulated throw in ${where} probe path`);
530
+
319
531
  // 1. SQLite (sync — small DB, no risk of blocking)
320
- try { checks.push(checkSqlite(db)); }
321
- catch (err) { checks.push({ name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) }); }
532
+ try {
533
+ if (throwIn === 'sqlite') throw synth('sqlite');
534
+ checks.push(checkSqlite(db));
535
+ }
536
+ catch (err) {
537
+ const cat = classifyDbFailure(err);
538
+ checks.push(failCheck('sqlite', cat, err && err.message ? err.message : String(err)));
539
+ }
322
540
 
323
541
  // 2-7. Postgres-side suite
324
542
  let pgChecks;
325
- try { pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient }); }
543
+ try {
544
+ if (throwIn === 'pg') throw synth('pg');
545
+ pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient });
546
+ }
326
547
  catch (err) {
327
- pgChecks = [{
328
- name: 'mnestra-pg',
329
- status: 'fail',
330
- detail: err && err.message ? err.message : String(err)
331
- }];
548
+ const cat = classifyDbFailure(err);
549
+ const why = err && err.message ? err.message : String(err);
550
+ pgChecks = [failCheck('mnestra-pg', cat, why)];
551
+ // Dependents inherit the same category — see runPgChecks header for the
552
+ // rationale (one root-cause row, not 6 independent-looking REDs).
332
553
  for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
333
- pgChecks.push({ name, status: 'fail', detail: 'pg suite aborted' });
554
+ pgChecks.push(failCheck(name, cat, 'pg suite aborted'));
334
555
  }
335
556
  }
336
557
  for (const c of pgChecks) checks.push(c);
337
558
 
338
559
  // 8. Mnestra webhook (warn)
339
560
  let webhook;
340
- try { webhook = await checkMnestraWebhook(config, options); }
341
- catch (err) { webhook = { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
561
+ try {
562
+ if (throwIn === 'webhook') throw synth('webhook');
563
+ webhook = await checkMnestraWebhook(config, options);
564
+ }
565
+ catch (err) {
566
+ const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
567
+ webhook = warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
568
+ }
342
569
  checks.push(webhook);
343
570
 
344
571
  // 9. Rumen pool (warn)
345
572
  let pool;
346
- try { pool = await checkRumenPool(config, options); }
347
- catch (err) { pool = { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
573
+ try {
574
+ if (throwIn === 'rumen-pool') throw synth('rumen-pool');
575
+ pool = await checkRumenPool(config, options);
576
+ }
577
+ catch (err) {
578
+ const cat = classifyDbFailure(err);
579
+ pool = warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
580
+ }
348
581
  checks.push(pool);
349
582
 
350
583
  const ok = checks
@@ -370,8 +603,19 @@ function _resetCache() {
370
603
  _cachedAt = 0;
371
604
  }
372
605
 
606
+ // Sprint 63 T3 §3.2 — clear the init-failed log-once memory so each test
607
+ // case starts fresh. Without this, the first test that exercises a null-db
608
+ // path would silence the log on subsequent tests in the same process.
609
+ function _resetInitLogged() {
610
+ _initLoggedOnce.clear();
611
+ }
612
+
373
613
  module.exports = {
374
614
  getFullHealth,
375
615
  REQUIRED_CHECKS,
376
- _resetCache
616
+ CATEGORIES,
617
+ classifyHttpFailure,
618
+ classifyDbFailure,
619
+ _resetCache,
620
+ _resetInitLogged,
377
621
  };
@@ -37,7 +37,7 @@ try {
37
37
  }
38
38
  try { pg = require('pg'); } catch { pg = null; }
39
39
 
40
- // Module-level singleton Postgres pool for rumen_insights (petvetbid DB).
40
+ // Module-level singleton Postgres pool for rumen_insights (the daily-driver DB).
41
41
  // Lazy-initialized on first rumen endpoint hit so startup stays fast and
42
42
  // servers without DATABASE_URL never pay the connection cost.
43
43
  //
@@ -292,31 +292,42 @@ function _termdeckVersion() {
292
292
  // `pty.resize()` ioctls a stale fd. The error is race-expected, not a bug,
293
293
  // but the noisy console.error trace pollutes diagnostics and obscures real
294
294
  // errors. This helper guards against the race and downgrades the known
295
- // race-class errors (EBADF, ENOTTY, generic "ioctl failed" message shape) to
296
- // a silent return. Set TERMDECK_DEBUG_PTY_RACES=1 to log to console.debug
297
- // for diagnostics.
295
+ // race-class errors (EBADF, ENOTTY) to a silent return. Set
296
+ // TERMDECK_DEBUG_PTY_RACES=1 to log to console.debug for diagnostics.
297
+ //
298
+ // Sprint 63 T1 — `isPtyRaceError(err)` extracted so the WS message-handler
299
+ // outer catch can also downgrade race-class errors that escape the helper's
300
+ // own catch (e.g. if `pty.write` ever races the close, or in future code paths).
301
+ // `session.pty._destroyed` short-circuit added as belt-and-suspenders for the
302
+ // `term.kill()` → before-`term.onExit`-fires window: the DELETE handler now
303
+ // stamps `_destroyed = true` immediately after kill(), so resize attempts in
304
+ // that interval short-circuit without an ioctl call.
305
+ function isPtyRaceError(err) {
306
+ if (!err) return false;
307
+ const msg = (err.message) || '';
308
+ const code = err.code;
309
+ return code === 'EBADF' ||
310
+ code === 'ENOTTY' ||
311
+ /\b(?:EBADF|ENOTTY)\b/.test(msg);
312
+ }
313
+
298
314
  function safelyResizePty(session, cols, rows) {
299
315
  if (!session || !session.pty) return false;
316
+ if (session.pty._destroyed) return false;
300
317
  if (session.meta && session.meta.status === 'exited') return false;
301
318
  try {
302
319
  session.pty.resize(cols || 120, rows || 30);
303
320
  return true;
304
321
  } catch (err) {
305
- const msg = (err && err.message) || '';
306
- const code = err && err.code;
307
322
  // Sprint 60 v1.0.14 + T4-CODEX AUDIT-CONCERN narrowing: race classifier
308
323
  // requires explicit EBADF or ENOTTY (in code OR message). The earlier
309
324
  // shape — any "ioctl(N) failed" message — was too broad: it would have
310
325
  // silently dropped a non-race ioctl failure (e.g. EINTR, EFAULT) that
311
326
  // might indicate a real bug. Now: only the specific race-class signals
312
327
  // get suppressed; everything else rethrows so it surfaces in logs.
313
- const isRace =
314
- code === 'EBADF' ||
315
- code === 'ENOTTY' ||
316
- /\b(?:EBADF|ENOTTY)\b/.test(msg);
317
- if (isRace) {
328
+ if (isPtyRaceError(err)) {
318
329
  if (process.env.TERMDECK_DEBUG_PTY_RACES) {
319
- console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${code || msg}`);
330
+ console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${err.code || err.message}`);
320
331
  }
321
332
  return false;
322
333
  }
@@ -324,6 +335,35 @@ function safelyResizePty(session, cols, rows) {
324
335
  }
325
336
  }
326
337
 
338
+ // Sprint 63 T1 (Item 1.3) — body-parser hardening. The pre-existing
339
+ // `entity.verify.failed` / `entity.parse.failed` handler logged the error
340
+ // message but not WHICH bytes triggered the parse failure. Operators on
341
+ // Brad's r730 saw 9× SyntaxError flood over 13h with no fingerprint to
342
+ // identify the offending caller. `hexEscapePrefix` renders a 32-byte
343
+ // prefix of the raw body in a single-line, log-safe form: printable ASCII
344
+ // kept verbatim, non-printables rendered as `\xNN`, backslash escaped as
345
+ // `\\`. PII-conservative because we cap at 32 bytes (truncation marker `…`
346
+ // appended if more). The error middleware injects this into the existing
347
+ // `console.warn` line so the log signature is identifiable without
348
+ // dumping the full body.
349
+ function hexEscapePrefix(buf, maxBytes = 32) {
350
+ if (!buf || buf.length === 0) return '<no-body>';
351
+ const len = Math.min(buf.length, maxBytes);
352
+ let out = '';
353
+ for (let i = 0; i < len; i++) {
354
+ const b = buf[i];
355
+ if (b === 0x5c) {
356
+ out += '\\\\';
357
+ } else if (b >= 0x20 && b < 0x7f) {
358
+ out += String.fromCharCode(b);
359
+ } else {
360
+ out += '\\x' + b.toString(16).padStart(2, '0');
361
+ }
362
+ }
363
+ if (buf.length > maxBytes) out += '…';
364
+ return out;
365
+ }
366
+
327
367
  function createServer(config) {
328
368
  const app = express();
329
369
  const server = http.createServer(app);
@@ -346,6 +386,13 @@ function createServer(config) {
346
386
  // logs so real errors aren't drowned in noise.
347
387
  app.use(express.json({
348
388
  verify: (req, res, buf) => {
389
+ // Sprint 63 T1 (Item 1.3) — capture a stable copy of the raw body so
390
+ // the error middleware below can render a 32-byte hex-escaped prefix.
391
+ // `Buffer.from(buf)` copies because express may pool the underlying
392
+ // accumulator across requests; without the copy the error handler
393
+ // could see bytes from a later request.
394
+ req.rawBody = Buffer.from(buf);
395
+
349
396
  // O(N) single-pass scan. Only checks bytes inside double-quoted string
350
397
  // regions so structural whitespace doesn't trigger false positives.
351
398
  let inString = false;
@@ -390,7 +437,13 @@ function createServer(config) {
390
437
  err.type === 'entity.verify.failed' ||
391
438
  err instanceof SyntaxError
392
439
  )) {
393
- console.warn(`[body-parser] ${err.code || err.type || 'parse-error'}: ${err.message} (${req.method} ${req.path})`);
440
+ // Sprint 63 T1 (Item 1.3) — append a 32-byte hex-escaped prefix of the
441
+ // raw body so the operator can identify which caller is sending bad
442
+ // JSON without exposing the full payload. Falls through to `<no-body>`
443
+ // if the verify callback never ran (parse error before verify, or no
444
+ // body at all).
445
+ const prefix = hexEscapePrefix(req.rawBody);
446
+ console.warn(`[body-parser] ${err.code || err.type || 'parse-error'}: ${err.message} (${req.method} ${req.path}) prefix="${prefix}"`);
394
447
  return res.status(400).json({
395
448
  error: 'Malformed JSON body',
396
449
  detail: err.message,
@@ -1189,6 +1242,18 @@ function createServer(config) {
1189
1242
  const sessUploadDir = path.join(os.tmpdir(), 'termdeck-uploads', session.id);
1190
1243
  fs.rmSync(sessUploadDir, { recursive: true, force: true });
1191
1244
  } catch (_err) { /* non-blocking */ }
1245
+
1246
+ // Sprint 63 T1 (Item 1.1) — null `session.pty` so the wrapper is
1247
+ // eligible for GC and downstream `if (session.pty)` guards correctly
1248
+ // identify the exited state. Root cause of Joshua's 2026-05-08/09
1249
+ // overnight `kern.tty.ptmx_max=511` exhaustion (516 fds for 4 panels):
1250
+ // without this nulling, node-pty's wrapper stayed pinned by onData /
1251
+ // onExit closures even after the child exited, holding the master
1252
+ // fd until next GC pass. Set AFTER `onPanelClose` fires (fire-and-
1253
+ // forget; reads `session.meta` + `session.id`, not `session.pty`) and
1254
+ // AFTER the upload-dir cleanup so any sync reader above this line
1255
+ // sees the original wrapper.
1256
+ session.pty = null;
1192
1257
  });
1193
1258
 
1194
1259
  // Wire command logging to SQLite + RAG
@@ -1346,7 +1411,7 @@ function createServer(config) {
1346
1411
  });
1347
1412
 
1348
1413
  // Graph endpoints (Sprint 38 T4) — knowledge-graph view backing graph.html.
1349
- // Reuses the petvetbid pg pool (same DATABASE_URL serves memory_items +
1414
+ // Reuses the daily-driver pg pool (same DATABASE_URL serves memory_items +
1350
1415
  // memory_relationships alongside rumen_*). Graceful-degrades when the pool
1351
1416
  // is absent.
1352
1417
  createGraphRoutes({
@@ -1376,6 +1441,14 @@ function createServer(config) {
1376
1441
  // Kill PTY process
1377
1442
  if (session.pty) {
1378
1443
  try { session.pty.kill(); } catch (err) { console.error('[pty] kill failed for session', req.params.id + ':', err); }
1444
+ // Sprint 63 T1 (Item 1.2) — stamp `_destroyed = true` on the pty wrapper
1445
+ // so `safelyResizePty` can short-circuit any resize attempts that arrive
1446
+ // in the kill()→onExit window. node-pty's `kill()` only signals the
1447
+ // child; onExit fires asynchronously once the child reaps. Without this
1448
+ // marker, a WS resize message in that window would ioctl a fd whose
1449
+ // child has just SIGHUP'd, surfacing as EBADF/ENOTTY. node-pty doesn't
1450
+ // set this property itself; the convention is owned by TermDeck.
1451
+ session.pty._destroyed = true;
1379
1452
  }
1380
1453
 
1381
1454
  sessions.remove(req.params.id);
@@ -1595,15 +1668,23 @@ function createServer(config) {
1595
1668
  });
1596
1669
 
1597
1670
  // POST /api/sessions/:id/resize - resize terminal
1671
+ // Sprint 63 T1 (Item 1.2) — distinguish "session never existed" (404) from
1672
+ // "session exists but PTY has exited" (410 Gone). Pre-Sprint-63 both paths
1673
+ // collapsed to 404 (when session.pty was null after the PTY-leak fix) or
1674
+ // 409 (when safelyResizePty returned false). 410 is the semantically
1675
+ // correct response: the resource was here, the resource is now gone.
1598
1676
  app.post('/api/sessions/:id/resize', (req, res) => {
1599
1677
  const session = sessions.get(req.params.id);
1600
- if (!session?.pty) return res.status(404).json({ error: 'Session not found' });
1678
+ if (!session) return res.status(404).json({ error: 'Session not found' });
1679
+ if (!session.pty || (session.meta && session.meta.status === 'exited')) {
1680
+ return res.status(410).json({ error: 'PTY is gone (session exited)' });
1681
+ }
1601
1682
 
1602
1683
  const { cols, rows } = req.body;
1603
1684
  try {
1604
1685
  const resized = safelyResizePty(session, cols, rows);
1605
1686
  if (!resized) {
1606
- return res.status(409).json({ error: 'Session is exited or its PTY is no longer alive' });
1687
+ return res.status(410).json({ error: 'PTY is gone (session exited)' });
1607
1688
  }
1608
1689
  res.json({ ok: true, cols, rows });
1609
1690
  } catch (err) {
@@ -2027,7 +2108,7 @@ function createServer(config) {
2027
2108
  });
2028
2109
 
2029
2110
  // ==================== Rumen insights (Sprint 4 T2) ====================
2030
- // Read-only access to rumen_insights + rumen_jobs in the petvetbid Postgres
2111
+ // Read-only access to rumen_insights + rumen_jobs in the daily-driver Postgres
2031
2112
  // instance. Contract frozen in docs/sprint-4-rumen-integration/API-CONTRACT.md.
2032
2113
 
2033
2114
  function rumenUnreachable(res) {
@@ -2268,7 +2349,7 @@ function createServer(config) {
2268
2349
 
2269
2350
  switch (parsed.type) {
2270
2351
  case 'input':
2271
- if (session.pty) {
2352
+ if (session.pty && !session.pty._destroyed) {
2272
2353
  session.pty.write(parsed.data);
2273
2354
  session.trackInput(parsed.data);
2274
2355
  }
@@ -2289,7 +2370,21 @@ function createServer(config) {
2289
2370
  }));
2290
2371
  break;
2291
2372
  }
2292
- } catch (err) { console.error('[ws] message handler error:', err); }
2373
+ } catch (err) {
2374
+ // Sprint 63 T1 (Item 1.2) — belt-and-suspenders: if a race-class
2375
+ // ioctl error somehow escapes safelyResizePty's own catch (or comes
2376
+ // from a future write/ioctl path), downgrade to console.debug
2377
+ // instead of polluting stderr with the noisy ws-message-handler
2378
+ // error log. safelyResizePty itself already catches the resize
2379
+ // path; this catches any other race-class shape that bubbles here.
2380
+ if (isPtyRaceError(err)) {
2381
+ if (process.env.TERMDECK_DEBUG_PTY_RACES) {
2382
+ console.debug(`[ws] message handler race-class (suppressed): ${err.code || err.message}`);
2383
+ }
2384
+ } else {
2385
+ console.error('[ws] message handler error:', err);
2386
+ }
2387
+ }
2293
2388
  });
2294
2389
 
2295
2390
  ws.on('close', () => {
@@ -2599,6 +2694,11 @@ module.exports = {
2599
2694
  // helper instead of re-implementing it. T4-CODEX AUDIT-CONCERN flagged that
2600
2695
  // the prior re-implementation pattern in the test could drift silently.
2601
2696
  safelyResizePty,
2697
+ // Sprint 63 T1 (Item 1.2 + 1.3) — race-class classifier + raw-body hex
2698
+ // prefix renderer exported so fence tests can import the production
2699
+ // helpers instead of re-implementing them.
2700
+ isPtyRaceError,
2701
+ hexEscapePrefix,
2602
2702
  // Sprint 48 T4 — exported for unit testing the secrets.env → PTY env merge.
2603
2703
  readTermdeckSecretsForPty,
2604
2704
  _resetTermdeckSecretsCache,
@@ -261,7 +261,13 @@ async function checkShellSanity() {
261
261
  let output = '';
262
262
  let resolved = false;
263
263
 
264
- const proc = ptyMod.spawn(shell, ['-l', '-c', 'echo TERMDECK_OK'], {
264
+ // Sprint 63 T3 §3.3 — drop `-l` (login mode). `-l` sources ~/.bash_profile
265
+ // / ~/.zprofile and friends, which on heavy profiles (nvm, conda, plugin
266
+ // managers — Brad's r730 has conda) routinely exceeds the 3s timeout
267
+ // budget below. A PTY-spawn health check answers "can $SHELL spawn a
268
+ // PTY and emit output?" — not "does the user's interactive profile
269
+ // complete fast?" Login-mode startup time is unrelated to PTY health.
270
+ const proc = ptyMod.spawn(shell, ['-c', 'echo TERMDECK_OK'], {
265
271
  name: 'xterm-256color',
266
272
  cols: 80,
267
273
  rows: 24,