npm - @jhizzard/termdeck - Versions diffs - 1.1.1 → 1.2.0 - Mend

@jhizzard/termdeck 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/package.json +1 -1
package/packages/cli/src/stack.js +20 -3
package/packages/server/src/agent-adapters/gemini.js +14 -8
package/packages/server/src/health.js +354 -110
package/packages/server/src/index.js +119 -19
package/packages/server/src/preflight.js +7 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jhizzard/termdeck",
-  "version": "1.1.1",
+  "version": "1.2.0",
   "description": "Browser-based terminal multiplexer with metadata overlays, panel flashback memory recall, and AI-aware session management",
   "bin": {
     "termdeck": "./packages/cli/src/index.js"

package/packages/cli/src/stack.js CHANGED Viewed

@@ -410,7 +410,14 @@ async function checkRumen() {
   }
   const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 5000 });
   try {
-    const r = await pool.query("SELECT to_char(NOW() - MAX(created_at), 'HH24:MI:SS') AS ago FROM rumen_jobs");
+    // Sprint 63 T3 §3.1 — `rumen_jobs` has `started_at` (migration 001), NOT
+    // `created_at`. Pre-Sprint-63 this probed `created_at` and threw a
+    // generic WARN that doctor's same-DB check did not (Sprint 35 doctor fix
+    // landed RUMEN_TIME_COL.rumen_jobs='started_at' but never propagated
+    // here). Brad reproduced on r730 2026-05-11; doctor 23/23 GREEN while
+    // launcher Step 3 emitted `WARN  (query failed: column "created_at"
+    // does not exist)`. Aligned both probes to the same column.
+    const r = await pool.query("SELECT to_char(NOW() - MAX(started_at), 'HH24:MI:SS') AS ago FROM rumen_jobs");
     const ago = r.rows[0] && r.rows[0].ago;
     if (ago) {
       stepLine('3/4', 'Checking Rumen', 'OK', `(last job ${ago} ago)`);
@@ -419,10 +426,20 @@ async function checkRumen() {
     stepLine('3/4', 'Checking Rumen', 'WARN', '(no jobs yet — try termdeck init --rumen)');
     return { ago: null };
   } catch (err) {
-    if (/relation .*rumen_jobs.* does not exist/i.test(String(err.message))) {
+    const msg = String(err && err.message ? err.message : err);
+    if (/relation .*rumen_jobs.* does not exist/i.test(msg)) {
       stepLine('3/4', 'Checking Rumen', 'SKIP', '(rumen_jobs table not present — run termdeck init --rumen)');
     } else {
-      stepLine('3/4', 'Checking Rumen', 'WARN', `(query failed: ${err.message})`);
+      const colMatch = msg.match(/column "([^"]+)" does not exist/i);
+      if (colMatch) {
+        // Schema drift — rumen_jobs is missing the column we queried. Naming
+        // the column + remediation beats a bare `query failed` that operators
+        // learn to filter out (Brad's r730, 2026-05-11).
+        stepLine('3/4', 'Checking Rumen', 'WARN',
+          `(rumen_jobs.${colMatch[1]} column missing — re-run \`termdeck init --rumen\` to apply migration 001)`);
+      } else {
+        stepLine('3/4', 'Checking Rumen', 'WARN', `(query failed: ${err.message})`);
+      }
     }
     return { ago: null };
   } finally {

package/packages/server/src/agent-adapters/gemini.js CHANGED Viewed

@@ -50,13 +50,18 @@ function statusFor(data) {
 // resolveTranscriptPath — Sprint 50 T1.
 //
 // Gemini CLI persists chats at
-//   ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.json
-// (single-JSON-object shape that matches parseGeminiJson, verified
-// 2026-05-02 substrate probe). Pick the most recently modified file whose
-// mtime is at-or-after `session.meta.createdAt`. Falls back to walking
-// every project directory under `~/.gemini/tmp/*/chats/` if the basename
-// heuristic produces no candidate (e.g., Gemini renormalized the project
-// name to deduplicate against an existing one).
+//   ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.{json,jsonl}
+// (single-JSON-object shape that matches parseGeminiJson for the .json
+// flavor, verified 2026-05-02 substrate probe; .jsonl flavor introduced
+// some time between 2026-05-02 and 2026-05-08, surfaced by Sprint 63 T2
+// acceptance — see docs/sprint-63-wave-2/EXIT-CAPTURE-VERIFICATION.md
+// Finding #2. The extension filter accepts both shapes; downstream parser
+// handling of JSONL deltas is a Sprint 64 candidate). Pick the most
+// recently modified file whose mtime is at-or-after
+// `session.meta.createdAt`. Falls back to walking every project directory
+// under `~/.gemini/tmp/*/chats/` if the basename heuristic produces no
+// candidate (e.g., Gemini renormalized the project name to deduplicate
+// against an existing one).
 // ──────────────────────────────────────────────────────────────────────────
 async function resolveTranscriptPath(session) {
@@ -83,7 +88,8 @@ async function resolveTranscriptPath(session) {
     let entries;
     try { entries = fs.readdirSync(dir); } catch (_) { return; }
     for (const name of entries) {
-      if (!name.startsWith('session-') || !name.endsWith('.json')) continue;
+      if (!name.startsWith('session-')) continue;
+      if (!name.endsWith('.json') && !name.endsWith('.jsonl')) continue;
       const full = path.join(dir, name);
       let st;
       try { st = fs.statSync(full); } catch (_) { continue; }

package/packages/server/src/health.js CHANGED Viewed

@@ -20,6 +20,33 @@
 // checks (mnestra-webhook, rumen-pool) are best-effort: a failure surfaces
 // as `warn` with detail, but does not flip `ok`.
 //
+// Failure taxonomy (Sprint 63 T3 §3.2 — Brad r730 cascade 2026-05-11)
+// ──────────────────────────────────────────────────────────────────
+// Pre-Sprint-63 every check that didn't return `pass` collapsed to `fail`
+// with a free-text `detail` string. Operators triaging "why is the install
+// red?" had to read each detail and guess. The cost was real: on 2026-05-11
+// a SQLite ABI mismatch left `db = null` at boot; the resulting
+// `red: timeout` strings (from probes that timed out trying to use the null
+// handle indirectly) masked the actual `init-failed` root cause for hours.
+//
+// Every non-pass check now carries a `category` field with one of:
+//   `red:unreachable`     — network/socket level (ECONNREFUSED / EHOSTUNREACH
+//                            / ENETUNREACH / ENOTFOUND on connect)
+//   `red:timeout`         — request issued, no response in the window
+//                            (AbortError / req timeout / pg ETIMEDOUT)
+//   `red:dependency-down` — peer responded but the dependency is unhealthy
+//                            (HTTP 5xx / SQL schema error from a reachable DB)
+//   `red:init-failed`     — local handle the probe needs was never initialized
+//                            (db === null at boot / DATABASE_URL not set)
+//
+// `detail` strings are prefixed with the category for human readability:
+// `red:unreachable (could not connect to Postgres using DATABASE_URL)`.
+//
+// init-failed surfaces use a log-once gate so a 30s-poll cycle on a process
+// with a missing handle (e.g. better-sqlite3 not loaded) writes ONE warn at
+// boot, not 2880 warns/day. Probes still emit `red:init-failed` per cycle
+// in the JSON report — only the log emission is gated.
+//
 // Caching
 // ───────
 // Reports cached in module scope for 30s. `getFullHealth(config, { refresh: true })`
@@ -29,8 +56,9 @@
 // Error handling
 // ──────────────
 // Every check is wrapped: any unexpected error downgrades that single check
-// to `fail` (or `warn` for warn-checks) with the error message in `detail`.
-// `getFullHealth()` always resolves with a structured report — never throws.
+// to `fail` (or `warn` for warn-checks) with the error message in `detail`
+// and a `category` from the taxonomy above. `getFullHealth()` always
+// resolves with a structured report — never throws.
 'use strict';
@@ -50,21 +78,104 @@ const REQUIRED_CHECKS = new Set([
   'cron-job-active'
 ]);
+// Sprint 63 T3 §3.2 — stable taxonomy strings. Exported so dashboard / doctor
+// / external graders can filter by category instead of pattern-matching the
+// detail prose. Frozen so callers can rely on `CATEGORIES.UNREACHABLE`
+// without any caller accidentally mutating the shared table at runtime.
+const CATEGORIES = Object.freeze({
+  UNREACHABLE: 'red:unreachable',
+  TIMEOUT: 'red:timeout',
+  DEPENDENCY_DOWN: 'red:dependency-down',
+  INIT_FAILED: 'red:init-failed',
+});
 let _cache = null;
 let _cachedAt = 0;
+// Sprint 63 T3 §3.2 — log-once gate for init-failed surfaces. A 30s health
+// poll on a process with a missing handle would otherwise log every cycle
+// (~2880 warn lines/day per missing handle). Probes that detect a null
+// handle at boot call `logInitFailedOnce(name, reason)`; the first call
+// emits a warn line, subsequent calls are silent for the lifetime of the
+// process. Probes still emit `red:init-failed` in the JSON report on every
+// cycle — only the log line is gated. Reset via `_resetInitLogged()` test
+// seam between cases.
+const _initLoggedOnce = new Map();
+/**
+ * Warn about a missing handle the first time a given probe reports it.
+ *
+ * The probe name is recorded in `_initLoggedOnce` before the line is written,
+ * so any later call with the same name returns without logging.
+ *
+ * @param {string} probeName - Key used for the once-only gate and in the log line.
+ * @param {string} reason - Explanation appended to the log line.
+ * @returns {void}
+ */
+function logInitFailedOnce(probeName, reason) {
+  const alreadyLogged = _initLoggedOnce.has(probeName);
+  if (!alreadyLogged) {
+    _initLoggedOnce.set(probeName, reason);
+    const line = `[health] ${probeName} handle null at boot — probes will return ${CATEGORIES.INIT_FAILED} until next process start; reason: ${reason}`;
+    // eslint-disable-next-line no-console
+    console.warn(line);
+  }
+}
+// Classify an HTTP-side failure shape `{ ok, status, error, code }` (as
+// returned by `httpReachable`) into one of the four red:* categories.
+//
+// Precedence, first match wins:
+//   1. no envelope at all                     → red:unreachable
+//   2. timeout-style code or `error: 'timeout'` → red:timeout
+//   3. connect-level socket/DNS code          → red:unreachable
+//   4. any numeric HTTP status                → red:dependency-down
+//   5. nothing recognisable                   → red:unreachable
+//
+// @param {{ ok?: boolean, status?: number, error?: string, code?: string }} r
+// @returns {string} one of the `CATEGORIES` values
+function classifyHttpFailure(r) {
+  if (!r) return CATEGORIES.UNREACHABLE;
+  // `ETIMEDOUT` is listed here so a socket-level timeout lands in the same
+  // category that `classifyDbFailure` assigns to it, instead of falling
+  // through to the unreachable / dependency-down branches below.
+  if (
+    r.code === 'TIMEOUT' ||
+    r.code === 'ABORT_ERR' ||
+    r.code === 'ERR_TIMEOUT' ||
+    r.code === 'ETIMEDOUT' ||
+    r.error === 'timeout'
+  ) {
+    return CATEGORIES.TIMEOUT;
+  }
+  if (r.code === 'ECONNREFUSED' || r.code === 'EHOSTUNREACH' || r.code === 'ENETUNREACH' || r.code === 'ENOTFOUND') {
+    return CATEGORIES.UNREACHABLE;
+  }
+  // A numeric status on a failed probe means the peer did answer, so the
+  // failure is on the dependency's side rather than on the network path.
+  if (typeof r.status === 'number') return CATEGORIES.DEPENDENCY_DOWN;
+  return CATEGORIES.UNREACHABLE;
+}
+// Map a database / Node-side failure onto one of the red:* categories.
+// The argument may be a raw Error or a `{ error, code }` envelope such as
+// the ones produced by `safeQueryRow` / `safeQueryRows`.
+//
+// A missing argument and any failure that is neither a connect-level code
+// nor a timeout both resolve to red:dependency-down.
+function classifyDbFailure(errOrEnvelope) {
+  if (!errOrEnvelope) return CATEGORIES.DEPENDENCY_DOWN;
+
+  let code = errOrEnvelope.code;
+  if (!code && errOrEnvelope._err) code = errOrEnvelope._err.code;
+  const text = String(errOrEnvelope.message || errOrEnvelope.error || errOrEnvelope);
+
+  const connectCodes = ['ECONNREFUSED', 'EHOSTUNREACH', 'ENETUNREACH', 'ENOTFOUND'];
+  if (connectCodes.includes(code)) return CATEGORIES.UNREACHABLE;
+
+  const timedOut = code === 'ETIMEDOUT' || code === 'ERR_TIMEOUT' || /\btimeout\b/i.test(text);
+  // Everything else (e.g. SQL errors such as 42703 column-not-exist or
+  // 42P01 relation-not-exist) means the dependency answered but is
+  // unhealthy, which is neither "unreachable" nor "timeout".
+  return timedOut ? CATEGORIES.TIMEOUT : CATEGORIES.DEPENDENCY_DOWN;
+}
+// Result builders shared by every probe. Non-pass rows carry the category
+// twice: as its own field and as the prefix of `detail`, formatted as
+// `<category> (<why>)`. A pass row carries only `name` and `status`.
+function failCheck(name, category, why) {
+  const detail = `${category} (${why})`;
+  return { name, status: 'fail', category, detail };
+}
+function warnCheck(name, category, why) {
+  const detail = `${category} (${why})`;
+  return { name, status: 'warn', category, detail };
+}
+function passCheck(name) {
+  const status = 'pass';
+  return { name, status };
+}
 // ── SQLite check ────────────────────────────────────────────────────────────
 function checkSqlite(db) {
   if (!db) {
-    return { name: 'sqlite', status: 'fail', detail: 'better-sqlite3 not initialized' };
+    // Sprint 63 T3 §3.2 — `db === null` is `red:init-failed`, NOT `red:timeout`.
+    // The v1.1.1 fail-fast on SQLite ABI mismatch makes this surface rare in
+    // practice, but the probe must still classify correctly because future
+    // optional deps may be allowed to be null. Log-once gate prevents the
+    // 30s poll from flooding logs.
+    logInitFailedOnce('sqlite', 'better-sqlite3 not initialized');
+    return failCheck('sqlite', CATEGORIES.INIT_FAILED, 'better-sqlite3 not initialized');
   }
   try {
     const row = db.prepare('SELECT 1 AS ok').get();
-    if (row && row.ok === 1) return { name: 'sqlite', status: 'pass' };
-    return { name: 'sqlite', status: 'fail', detail: 'SELECT 1 returned unexpected result' };
+    if (row && row.ok === 1) return passCheck('sqlite');
+    return failCheck('sqlite', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
   } catch (err) {
-    return { name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) };
+    const cat = classifyDbFailure(err);
+    return failCheck('sqlite', cat, err && err.message ? err.message : String(err));
   }
 }
@@ -81,7 +192,12 @@ async function safeQueryRow(client, sql) {
     if (r.rows && r.rows.length > 0 && r.rows[0].ok) return { ok: true };
     return { ok: false };
   } catch (err) {
-    return { error: err && err.message ? err.message : String(err) };
+    // Surface `code` so the caller can classify into the red:* taxonomy
+    // without re-parsing the message string.
+    return {
+      error: err && err.message ? err.message : String(err),
+      code: err && err.code,
+    };
   }
 }
@@ -90,34 +206,80 @@ async function safeQueryRows(client, sql) {
     const r = await client.query(sql);
     return { rows: r.rows || [] };
   } catch (err) {
-    return { error: err && err.message ? err.message : String(err) };
+    return {
+      error: err && err.message ? err.message : String(err),
+      code: err && err.code,
+    };
   }
 }
+// Sprint 63 T3 §3.2 — track whether the most recent connect attempt timed out
+// vs. was outright unreachable. The pg client doesn't expose this from inside
+// the helper, so the helper records it in a return envelope.
 async function openPgClient(databaseUrl) {
-  if (!databaseUrl) return null;
+  if (!databaseUrl) return { client: null, reason: 'no-url' };
   let pgRunner;
-  try { pgRunner = require('./setup/pg-runner'); } catch (_e) { return null; }
-  try { return await pgRunner.connect(databaseUrl); } catch (_e) { return null; }
+  try { pgRunner = require('./setup/pg-runner'); }
+  catch (_e) { return { client: null, reason: 'pg-runner-unavailable' }; }
+  try {
+    const client = await pgRunner.connect(databaseUrl);
+    return { client, reason: null };
+  } catch (err) {
+    return {
+      client: null,
+      reason: 'connect-failed',
+      error: err && err.message ? err.message : String(err),
+      code: err && err.code,
+    };
+  }
+}
+// Sprint 63 T3 §3.2 — dependent-checks shape when there's no client. Pre-
+// Sprint-63 these collapsed to status:'fail', detail:'pg unavailable' with
+// no category; operators couldn't distinguish "DATABASE_URL not set"
+// (`init-failed` — fix the .env) from "Postgres unreachable" (`unreachable`
+// — fix the network) from "Postgres took 5s and gave up" (`timeout` — bump
+// timeout or check pgbouncer). Each downstream check now carries the same
+// category as the connect attempt so the dashboard can render one row
+// "Postgres unreachable" and dim the five dependents instead of showing six
+// independent-looking RED rows.
+/**
+ * Append one failing row for the primary Postgres check followed by one
+ * failing row for each check that depends on the connection, all sharing
+ * the same category.
+ *
+ * @param {Array<object>} checks - Report array that is appended to in place.
+ * @param {string} primaryName - Name of the primary check row.
+ * @param {string} category - Category applied to every appended row.
+ * @param {string} primaryDetail - Explanation used for the primary row.
+ * @param {string} dependentDetail - Explanation used for each dependent row.
+ * @returns {void}
+ */
+function pushPgUnavailableChecks(checks, primaryName, category, primaryDetail, dependentDetail) {
+  const dependentNames = ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active'];
+  // Primary row first, then the dependents in their fixed order.
+  const rows = [primaryName, ...dependentNames].map((name, index) => (
+    failCheck(name, category, index === 0 ? primaryDetail : dependentDetail)
+  ));
+  checks.push(...rows);
+}
 async function runPgChecks({ databaseUrl, _pgClient }) {
   const checks = [];
-  const client = _pgClient || (await openPgClient(databaseUrl));
-  const owned = !_pgClient;
+  let client = _pgClient || null;
+  let owned = false;
+  let connectEnvelope = null;
   if (!client) {
-    checks.push({
-      name: 'mnestra-pg',
-      status: 'fail',
-      detail: databaseUrl
-        ? 'could not connect to Postgres using DATABASE_URL'
-        : 'DATABASE_URL not configured (set in ~/.termdeck/secrets.env)'
-    });
-    // Dependent checks can't run without a connection — surface them as
-    // fail rather than silently skipping so the report is complete.
-    for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
-      checks.push({ name, status: 'fail', detail: 'pg unavailable' });
+    connectEnvelope = await openPgClient(databaseUrl);
+    client = connectEnvelope.client;
+    owned = client != null;
+  }
+  if (!client) {
+    if (!databaseUrl) {
+      // No URL → init-failed (operator never set DATABASE_URL). Log-once.
+      logInitFailedOnce('mnestra-pg', 'DATABASE_URL not configured');
+      pushPgUnavailableChecks(
+        checks,
+        'mnestra-pg',
+        CATEGORIES.INIT_FAILED,
+        'DATABASE_URL not configured — set in ~/.termdeck/secrets.env',
+        'pg unavailable — DATABASE_URL not configured'
+      );
+    } else {
+      // URL set but connect failed → classify by code (timeout vs unreachable).
+      const cat = classifyDbFailure(connectEnvelope || {});
+      const why = connectEnvelope && connectEnvelope.error
+        ? `could not connect to Postgres using DATABASE_URL — ${connectEnvelope.error}`
+        : 'could not connect to Postgres using DATABASE_URL';
+      pushPgUnavailableChecks(checks, 'mnestra-pg', cat, why, 'pg unavailable — connect failed');
     }
     return checks;
   }
@@ -125,11 +287,11 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
   try {
     const ping = await safeQueryRow(client, 'SELECT 1 AS ok');
     if (ping.error) {
-      checks.push({ name: 'mnestra-pg', status: 'fail', detail: ping.error });
+      checks.push(failCheck('mnestra-pg', classifyDbFailure(ping), ping.error));
     } else if (!ping.ok) {
-      checks.push({ name: 'mnestra-pg', status: 'fail', detail: 'SELECT 1 returned no row' });
+      checks.push(failCheck('mnestra-pg', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned no row'));
     } else {
-      checks.push({ name: 'mnestra-pg', status: 'pass' });
+      checks.push(passCheck('mnestra-pg'));
     }
     // memory_items.source_session_id — the v0.6.5 column from Brad's saga.
@@ -137,84 +299,78 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
       "SELECT 1 AS ok FROM information_schema.columns " +
       "WHERE table_schema = 'public' AND table_name = 'memory_items' AND column_name = 'source_session_id'");
     if (col.error) {
-      checks.push({ name: 'memory-items-col', status: 'fail', detail: col.error });
+      checks.push(failCheck('memory-items-col', classifyDbFailure(col), col.error));
     } else if (!col.ok) {
-      checks.push({
-        name: 'memory-items-col',
-        status: 'fail',
-        detail:
-          'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
-          '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
-      });
+      checks.push(failCheck(
+        'memory-items-col',
+        CATEGORIES.DEPENDENCY_DOWN,
+        'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
+        '(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
+      ));
     } else {
-      checks.push({ name: 'memory-items-col', status: 'pass' });
+      checks.push(passCheck('memory-items-col'));
     }
     const cron = await safeQueryRow(client,
       "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_cron'");
     if (cron.error) {
-      checks.push({ name: 'pg-cron-ext', status: 'fail', detail: cron.error });
+      checks.push(failCheck('pg-cron-ext', classifyDbFailure(cron), cron.error));
     } else if (!cron.ok) {
-      checks.push({
-        name: 'pg-cron-ext',
-        status: 'fail',
-        detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
-      });
+      checks.push(failCheck(
+        'pg-cron-ext',
+        CATEGORIES.DEPENDENCY_DOWN,
+        'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
+      ));
     } else {
-      checks.push({ name: 'pg-cron-ext', status: 'pass' });
+      checks.push(passCheck('pg-cron-ext'));
     }
     const net = await safeQueryRow(client,
       "SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_net'");
     if (net.error) {
-      checks.push({ name: 'pg-net-ext', status: 'fail', detail: net.error });
+      checks.push(failCheck('pg-net-ext', classifyDbFailure(net), net.error));
     } else if (!net.ok) {
-      checks.push({
-        name: 'pg-net-ext',
-        status: 'fail',
-        detail: 'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
-      });
+      checks.push(failCheck(
+        'pg-net-ext',
+        CATEGORIES.DEPENDENCY_DOWN,
+        'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
+      ));
     } else {
-      checks.push({ name: 'pg-net-ext', status: 'pass' });
+      checks.push(passCheck('pg-net-ext'));
     }
     const vault = await safeQueryRow(client,
       "SELECT 1 AS ok FROM vault.decrypted_secrets WHERE name = 'rumen_service_role_key'");
     if (vault.error) {
-      checks.push({
-        name: 'vault-secret',
-        status: 'fail',
-        detail: `vault.decrypted_secrets unreadable — ${vault.error}`
-      });
+      checks.push(failCheck('vault-secret', classifyDbFailure(vault), `vault.decrypted_secrets unreadable — ${vault.error}`));
     } else if (!vault.ok) {
-      checks.push({
-        name: 'vault-secret',
-        status: 'fail',
-        detail: 'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
-      });
+      checks.push(failCheck(
+        'vault-secret',
+        CATEGORIES.DEPENDENCY_DOWN,
+        'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
+      ));
     } else {
-      checks.push({ name: 'vault-secret', status: 'pass' });
+      checks.push(passCheck('vault-secret'));
     }
     const job = await safeQueryRows(client,
       "SELECT active FROM cron.job WHERE jobname = 'rumen-tick'");
     if (job.error) {
-      checks.push({ name: 'cron-job-active', status: 'fail', detail: `cron.job unreadable — ${job.error}` });
+      checks.push(failCheck('cron-job-active', classifyDbFailure(job), `cron.job unreadable — ${job.error}`));
     } else if (!job.rows || job.rows.length === 0) {
-      checks.push({
-        name: 'cron-job-active',
-        status: 'fail',
-        detail: 'rumen-tick row not found — re-run `termdeck init --rumen`'
-      });
+      checks.push(failCheck(
+        'cron-job-active',
+        CATEGORIES.DEPENDENCY_DOWN,
+        'rumen-tick row not found — re-run `termdeck init --rumen`'
+      ));
     } else if (!job.rows[0].active) {
-      checks.push({
-        name: 'cron-job-active',
-        status: 'fail',
-        detail:
-          "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
-      });
+      checks.push(failCheck(
+        'cron-job-active',
+        CATEGORIES.DEPENDENCY_DOWN,
+        "rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
+      ));
     } else {
-      checks.push({ name: 'cron-job-active', status: 'pass' });
+      checks.push(passCheck('cron-job-active'));
     }
   } finally {
     if (owned) {
@@ -234,15 +390,29 @@ function httpReachable(url, timeoutMs = 2000) {
     try {
       req = mod.get(url, { timeout: timeoutMs }, (res) => {
         const ok = res.statusCode != null && res.statusCode < 500;
+        const status = res.statusCode;
         res.resume();
-        resolve({ ok, status: res.statusCode });
+        resolve({ ok, status });
       });
     } catch (err) {
-      resolve({ ok: false, error: err && err.message ? err.message : String(err) });
+      // Sprint 63 T3 §3.2 — surface `code` so the caller can classify into
+      // the red:* taxonomy without re-parsing the message.
+      resolve({
+        ok: false,
+        error: err && err.message ? err.message : String(err),
+        code: err && err.code,
+      });
       return;
     }
-    req.on('error', (err) => resolve({ ok: false, error: err && err.message ? err.message : String(err) }));
-    req.on('timeout', () => { try { req.destroy(); } catch (_e) { /* gone */ } resolve({ ok: false, error: 'timeout' }); });
+    req.on('error', (err) => resolve({
+      ok: false,
+      error: err && err.message ? err.message : String(err),
+      code: err && err.code,
+    }));
+    req.on('timeout', () => {
+      try { req.destroy(); } catch (_e) { /* gone */ }
+      resolve({ ok: false, error: 'timeout', code: 'TIMEOUT' });
+    });
   });
 }
@@ -250,50 +420,66 @@ async function checkMnestraWebhook(config, options) {
   if (options && typeof options._mnestraWebhookProbe === 'function') {
     try {
       const r = await options._mnestraWebhookProbe();
-      if (r && r.ok) return { name: 'mnestra-webhook', status: 'pass' };
-      return { name: 'mnestra-webhook', status: 'warn', detail: (r && r.detail) || 'unreachable' };
+      if (r && r.ok) return passCheck('mnestra-webhook');
+      const cat = classifyHttpFailure(r);
+      return warnCheck('mnestra-webhook', cat, (r && r.detail) || (r && r.error) || 'unreachable');
     } catch (err) {
-      return { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) };
+      const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
+      return warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
     }
   }
   const rag = (config && config.rag) || {};
   if (!rag.mnestraWebhookUrl) {
-    return { name: 'mnestra-webhook', status: 'warn', detail: 'webhook URL not configured' };
+    // Sprint 63 T3 §3.2 — URL not configured = init-failed (operator never
+    // set up the webhook), not unreachable. Log-once so a 30s poll on an
+    // unconfigured install doesn't flood warns.
+    logInitFailedOnce('mnestra-webhook', 'rag.mnestraWebhookUrl not configured');
+    return warnCheck('mnestra-webhook', CATEGORIES.INIT_FAILED, 'webhook URL not configured');
   }
   const healthUrl = String(rag.mnestraWebhookUrl).replace(/\/mnestra\/?$/, '/healthz');
   const r = await httpReachable(healthUrl, 2000);
-  if (r.ok) return { name: 'mnestra-webhook', status: 'pass' };
-  return {
-    name: 'mnestra-webhook',
-    status: 'warn',
-    detail: r.error ? `unreachable — ${r.error}` : `HTTP ${r.status || '???'}`
-  };
+  if (r.ok) return passCheck('mnestra-webhook');
+  const cat = classifyHttpFailure(r);
+  const why = r.error
+    ? `${r.error}${typeof r.status === 'number' ? ` (HTTP ${r.status})` : ''}`
+    : `HTTP ${r.status || '???'}`;
+  return warnCheck('mnestra-webhook', cat, why);
 }
 async function checkRumenPool(config, options) {
   if (options && typeof options._rumenPoolProbe === 'function') {
     try {
       const r = await options._rumenPoolProbe();
-      if (r && r.ok) return { name: 'rumen-pool', status: 'pass' };
-      return { name: 'rumen-pool', status: 'warn', detail: (r && r.detail) || 'unreachable (best-effort)' };
+      if (r && r.ok) return passCheck('rumen-pool');
+      // Test-seam probe should pass `category` if it has one; else infer.
+      const cat = (r && r.category) || classifyDbFailure(r || {});
+      return warnCheck('rumen-pool', cat, (r && r.detail) || (r && r.error) || 'unreachable (best-effort)');
     } catch (err) {
-      return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
+      const cat = classifyDbFailure(err);
+      return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
     }
   }
   let pg;
   try { pg = require('pg'); } catch (_e) { pg = null; }
-  if (!pg) return { name: 'rumen-pool', status: 'warn', detail: 'pg module not installed' };
+  if (!pg) {
+    logInitFailedOnce('rumen-pool', 'pg module not installed');
+    return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'pg module not installed');
+  }
   const dbUrl = (config && config.rag && config.rag.databaseUrl) || process.env.DATABASE_URL;
-  if (!dbUrl) return { name: 'rumen-pool', status: 'warn', detail: 'DATABASE_URL not set' };
+  if (!dbUrl) {
+    logInitFailedOnce('rumen-pool', 'DATABASE_URL not set');
+    return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'DATABASE_URL not set');
+  }
   const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 3000 });
   try {
     const res = await pool.query('SELECT 1 AS ok');
-    if (res.rows[0] && res.rows[0].ok === 1) return { name: 'rumen-pool', status: 'pass' };
-    return { name: 'rumen-pool', status: 'warn', detail: 'SELECT 1 returned unexpected result' };
+    if (res.rows[0] && res.rows[0].ok === 1) return passCheck('rumen-pool');
+    return warnCheck('rumen-pool', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
   } catch (err) {
-    return { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) };
+    const cat = classifyDbFailure(err);
+    return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
   } finally {
     try { await pool.end(); } catch (_e) { /* ignore */ }
   }
@@ -316,35 +502,82 @@ async function getFullHealth(config = {}, options = {}) {
   const checks = [];
+  // Sprint 63 T3 §3.2 outer-catch hardening (T4-CODEX AUDIT-CONCERN 13:27 ET):
+  // every probe is independently wrapped here so an unexpected throw in a
+  // single probe path can't sink the whole report. Pre-Sprint-63 these four
+  // catches emitted raw `{ status: 'fail'|'warn', detail }` with no
+  // `category` field — operators triaging "why is the dashboard red?" still
+  // had to read prose. The whole point of the taxonomy is that there is no
+  // such thing as an uncategorized non-pass row. Every fallback now runs the
+  // captured `err` through `classifyDbFailure` / `classifyHttpFailure` and
+  // composes a normal `failCheck` / `warnCheck` envelope. When the
+  // classifier can't infer (truly opaque throw — bug in the probe itself,
+  // not in the dependency), the default branch in each classifier returns
+  // `red:dependency-down`, which is the right-by-default category for "the
+  // probe's path is broken" — operator's first action is to inspect the
+  // peer / its config, not the local handle.
+  // Sprint 63 T3 §3.2 — `_throwIn` test seam. The probe functions each have
+  // their own try/catch so unreached-by-design inputs can't throw out into
+  // the outer catches below. The fence tests need a way to simulate "a
+  // probe's path threw before its own catch caught it" — i.e., the
+  // belt-and-suspenders outer catch. Set `_throwIn` to one of
+  // `'sqlite' | 'pg' | 'webhook' | 'rumen-pool'` to inject a synthetic
+  // throw at the corresponding outer-try entry. Never set in production —
+  // ignored if the value is falsy.
+  const throwIn = options._throwIn || null;
+  const synth = (where) => new Error(`test-fence: simulated throw in ${where} probe path`);
   // 1. SQLite (sync — small DB, no risk of blocking)
-  try { checks.push(checkSqlite(db)); }
-  catch (err) { checks.push({ name: 'sqlite', status: 'fail', detail: err && err.message ? err.message : String(err) }); }
+  try {
+    if (throwIn === 'sqlite') throw synth('sqlite');
+    checks.push(checkSqlite(db));
+  }
+  catch (err) {
+    const cat = classifyDbFailure(err);
+    checks.push(failCheck('sqlite', cat, err && err.message ? err.message : String(err)));
+  }
   // 2-7. Postgres-side suite
   let pgChecks;
-  try { pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient }); }
+  try {
+    if (throwIn === 'pg') throw synth('pg');
+    pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient });
+  }
   catch (err) {
-    pgChecks = [{
-      name: 'mnestra-pg',
-      status: 'fail',
-      detail: err && err.message ? err.message : String(err)
-    }];
+    const cat = classifyDbFailure(err);
+    const why = err && err.message ? err.message : String(err);
+    pgChecks = [failCheck('mnestra-pg', cat, why)];
+    // Dependents inherit the same category — see runPgChecks header for the
+    // rationale (one root-cause row, not 6 independent-looking REDs).
     for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
-      pgChecks.push({ name, status: 'fail', detail: 'pg suite aborted' });
+      pgChecks.push(failCheck(name, cat, 'pg suite aborted'));
     }
   }
   for (const c of pgChecks) checks.push(c);
   // 8. Mnestra webhook (warn)
   let webhook;
-  try { webhook = await checkMnestraWebhook(config, options); }
-  catch (err) { webhook = { name: 'mnestra-webhook', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
+  try {
+    if (throwIn === 'webhook') throw synth('webhook');
+    webhook = await checkMnestraWebhook(config, options);
+  }
+  catch (err) {
+    const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
+    webhook = warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
+  }
   checks.push(webhook);
   // 9. Rumen pool (warn)
   let pool;
-  try { pool = await checkRumenPool(config, options); }
-  catch (err) { pool = { name: 'rumen-pool', status: 'warn', detail: err && err.message ? err.message : String(err) }; }
+  try {
+    if (throwIn === 'rumen-pool') throw synth('rumen-pool');
+    pool = await checkRumenPool(config, options);
+  }
+  catch (err) {
+    const cat = classifyDbFailure(err);
+    pool = warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
+  }
   checks.push(pool);
   const ok = checks
@@ -370,8 +603,19 @@ function _resetCache() {
   _cachedAt = 0;
 }
+// Sprint 63 T3 §3.2 — clear the init-failed log-once memory so each test
+// case starts fresh. Without this, the first test that exercises a null-db
+// path would silence the log on subsequent tests in the same process.
+function _resetInitLogged() {
+  _initLoggedOnce.clear();
+}
 module.exports = {
   getFullHealth,
   REQUIRED_CHECKS,
-  _resetCache
+  CATEGORIES,
+  classifyHttpFailure,
+  classifyDbFailure,
+  _resetCache,
+  _resetInitLogged,
 };

package/packages/server/src/index.js CHANGED Viewed

@@ -37,7 +37,7 @@ try {
 }
 try { pg = require('pg'); } catch { pg = null; }
-// Module-level singleton Postgres pool for rumen_insights (petvetbid DB).
+// Module-level singleton Postgres pool for rumen_insights (the daily-driver DB).
 // Lazy-initialized on first rumen endpoint hit so startup stays fast and
 // servers without DATABASE_URL never pay the connection cost.
 //
@@ -292,31 +292,42 @@ function _termdeckVersion() {
 // `pty.resize()` ioctls a stale fd. The error is race-expected, not a bug,
 // but the noisy console.error trace pollutes diagnostics and obscures real
 // errors. This helper guards against the race and downgrades the known
-// race-class errors (EBADF, ENOTTY, generic "ioctl failed" message shape) to
-// a silent return. Set TERMDECK_DEBUG_PTY_RACES=1 to log to console.debug
-// for diagnostics.
+// race-class errors (EBADF, ENOTTY) to a silent return. Set
+// TERMDECK_DEBUG_PTY_RACES=1 to log to console.debug for diagnostics.
+//
+// Sprint 63 T1 — `isPtyRaceError(err)` extracted so the WS message-handler
+// outer catch can also downgrade race-class errors that escape the helper's
+// own catch (e.g. if `pty.write` ever races the close, or in future code paths).
+// `session.pty._destroyed` short-circuit added as belt-and-suspenders for the
+// `term.kill()` → before-`term.onExit`-fires window: the DELETE handler now
+// stamps `_destroyed = true` immediately after kill(), so resize attempts in
+// that interval short-circuit without an ioctl call.
+function isPtyRaceError(err) {
+  if (!err) return false;
+  const msg = (err.message) || '';
+  const code = err.code;
+  return code === 'EBADF' ||
+    code === 'ENOTTY' ||
+    /\b(?:EBADF|ENOTTY)\b/.test(msg);
+}
 function safelyResizePty(session, cols, rows) {
   if (!session || !session.pty) return false;
+  if (session.pty._destroyed) return false;
   if (session.meta && session.meta.status === 'exited') return false;
   try {
     session.pty.resize(cols || 120, rows || 30);
     return true;
   } catch (err) {
-    const msg = (err && err.message) || '';
-    const code = err && err.code;
     // Sprint 60 v1.0.14 + T4-CODEX AUDIT-CONCERN narrowing: race classifier
     // requires explicit EBADF or ENOTTY (in code OR message). The earlier
     // shape — any "ioctl(N) failed" message — was too broad: it would have
     // silently dropped a non-race ioctl failure (e.g. EINTR, EFAULT) that
     // might indicate a real bug. Now: only the specific race-class signals
     // get suppressed; everything else rethrows so it surfaces in logs.
-    const isRace =
-      code === 'EBADF' ||
-      code === 'ENOTTY' ||
-      /\b(?:EBADF|ENOTTY)\b/.test(msg);
-    if (isRace) {
+    if (isPtyRaceError(err)) {
       if (process.env.TERMDECK_DEBUG_PTY_RACES) {
-        console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${code || msg}`);
+        console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${err.code || err.message}`);
       }
       return false;
     }
@@ -324,6 +335,35 @@ function safelyResizePty(session, cols, rows) {
   }
 }
+// Sprint 63 T1 (Item 1.3) — body-parser hardening. The pre-existing
+// `entity.verify.failed` / `entity.parse.failed` handler logged the error
+// message but not WHICH bytes triggered the parse failure. Operators on
+// Brad's r730 saw 9× SyntaxError flood over 13h with no fingerprint to
+// identify the offending caller. `hexEscapePrefix` renders a 32-byte
+// prefix of the raw body in a single-line, log-safe form: printable ASCII
+// kept verbatim, non-printables rendered as `\xNN`, backslash escaped as
+// `\\`. PII-conservative because we cap at 32 bytes (truncation marker `…`
+// appended if more). The error middleware injects this into the existing
+// `console.warn` line so the log signature is identifiable without
+// dumping the full body.
+function hexEscapePrefix(buf, maxBytes = 32) {
+  if (!buf || buf.length === 0) return '<no-body>';
+  const len = Math.min(buf.length, maxBytes);
+  let out = '';
+  for (let i = 0; i < len; i++) {
+    const b = buf[i];
+    if (b === 0x5c) {
+      out += '\\\\';
+    } else if (b >= 0x20 && b < 0x7f) {
+      out += String.fromCharCode(b);
+    } else {
+      out += '\\x' + b.toString(16).padStart(2, '0');
+    }
+  }
+  if (buf.length > maxBytes) out += '…';
+  return out;
+}
 function createServer(config) {
   const app = express();
   const server = http.createServer(app);
@@ -346,6 +386,13 @@ function createServer(config) {
   // logs so real errors aren't drowned in noise.
   app.use(express.json({
     verify: (req, res, buf) => {
+      // Sprint 63 T1 (Item 1.3) — capture a stable copy of the raw body so
+      // the error middleware below can render a 32-byte hex-escaped prefix.
+      // `Buffer.from(buf)` copies because express may pool the underlying
+      // accumulator across requests; without the copy the error handler
+      // could see bytes from a later request.
+      req.rawBody = Buffer.from(buf);
       // O(N) single-pass scan. Only checks bytes inside double-quoted string
       // regions so structural whitespace doesn't trigger false positives.
       let inString = false;
@@ -390,7 +437,13 @@ function createServer(config) {
       err.type === 'entity.verify.failed' ||
       err instanceof SyntaxError
     )) {
-      console.warn(`[body-parser] ${err.code || err.type || 'parse-error'}: ${err.message} (${req.method} ${req.path})`);
+      // Sprint 63 T1 (Item 1.3) — append a 32-byte hex-escaped prefix of the
+      // raw body so the operator can identify which caller is sending bad
+      // JSON without exposing the full payload. Falls through to `<no-body>`
+      // if the verify callback never ran (parse error before verify, or no
+      // body at all).
+      const prefix = hexEscapePrefix(req.rawBody);
+      console.warn(`[body-parser] ${err.code || err.type || 'parse-error'}: ${err.message} (${req.method} ${req.path}) prefix="${prefix}"`);
       return res.status(400).json({
         error: 'Malformed JSON body',
         detail: err.message,
@@ -1189,6 +1242,18 @@ function createServer(config) {
             const sessUploadDir = path.join(os.tmpdir(), 'termdeck-uploads', session.id);
             fs.rmSync(sessUploadDir, { recursive: true, force: true });
           } catch (_err) { /* non-blocking */ }
+          // Sprint 63 T1 (Item 1.1) — null `session.pty` so the wrapper is
+          // eligible for GC and downstream `if (session.pty)` guards correctly
+          // identify the exited state. Root cause of Joshua's 2026-05-08/09
+          // overnight `kern.tty.ptmx_max=511` exhaustion (516 fds for 4 panels):
+          // without this nulling, node-pty's wrapper stayed pinned by onData /
+          // onExit closures even after the child exited, holding the master
+          // fd until next GC pass. Set AFTER `onPanelClose` fires (fire-and-
+          // forget; reads `session.meta` + `session.id`, not `session.pty`) and
+          // AFTER the upload-dir cleanup so any sync reader above this line
+          // sees the original wrapper.
+          session.pty = null;
         });
         // Wire command logging to SQLite + RAG
@@ -1346,7 +1411,7 @@ function createServer(config) {
   });
   // Graph endpoints (Sprint 38 T4) — knowledge-graph view backing graph.html.
-  // Reuses the petvetbid pg pool (same DATABASE_URL serves memory_items +
+  // Reuses the daily-driver pg pool (same DATABASE_URL serves memory_items +
   // memory_relationships alongside rumen_*). Graceful-degrades when the pool
   // is absent.
   createGraphRoutes({
@@ -1376,6 +1441,14 @@ function createServer(config) {
     // Kill PTY process
     if (session.pty) {
       try { session.pty.kill(); } catch (err) { console.error('[pty] kill failed for session', req.params.id + ':', err); }
+      // Sprint 63 T1 (Item 1.2) — stamp `_destroyed = true` on the pty wrapper
+      // so `safelyResizePty` can short-circuit any resize attempts that arrive
+      // in the kill()→onExit window. node-pty's `kill()` only signals the
+      // child; onExit fires asynchronously once the child reaps. Without this
+      // marker, a WS resize message in that window would ioctl a fd whose
+      // child has just SIGHUP'd, surfacing as EBADF/ENOTTY. node-pty doesn't
+      // set this property itself; the convention is owned by TermDeck.
+      session.pty._destroyed = true;
     }
     sessions.remove(req.params.id);
@@ -1595,15 +1668,23 @@ function createServer(config) {
   });
   // POST /api/sessions/:id/resize - resize terminal
+  // Sprint 63 T1 (Item 1.2) — distinguish "session never existed" (404) from
+  // "session exists but PTY has exited" (410 Gone). Pre-Sprint-63 both paths
+  // collapsed to 404 (when session.pty was null after the PTY-leak fix) or
+  // 409 (when safelyResizePty returned false). 410 is the semantically
+  // correct response: the resource was here, the resource is now gone.
   app.post('/api/sessions/:id/resize', (req, res) => {
     const session = sessions.get(req.params.id);
-    if (!session?.pty) return res.status(404).json({ error: 'Session not found' });
+    if (!session) return res.status(404).json({ error: 'Session not found' });
+    if (!session.pty || (session.meta && session.meta.status === 'exited')) {
+      return res.status(410).json({ error: 'PTY is gone (session exited)' });
+    }
     const { cols, rows } = req.body;
     try {
       const resized = safelyResizePty(session, cols, rows);
       if (!resized) {
-        return res.status(409).json({ error: 'Session is exited or its PTY is no longer alive' });
+        return res.status(410).json({ error: 'PTY is gone (session exited)' });
       }
       res.json({ ok: true, cols, rows });
     } catch (err) {
@@ -2027,7 +2108,7 @@ function createServer(config) {
   });
   // ==================== Rumen insights (Sprint 4 T2) ====================
-  // Read-only access to rumen_insights + rumen_jobs in the petvetbid Postgres
+  // Read-only access to rumen_insights + rumen_jobs in the daily-driver Postgres
   // instance. Contract frozen in docs/sprint-4-rumen-integration/API-CONTRACT.md.
   function rumenUnreachable(res) {
@@ -2268,7 +2349,7 @@ function createServer(config) {
         switch (parsed.type) {
           case 'input':
-            if (session.pty) {
+            if (session.pty && !session.pty._destroyed) {
               session.pty.write(parsed.data);
               session.trackInput(parsed.data);
             }
@@ -2289,7 +2370,21 @@ function createServer(config) {
             }));
             break;
         }
-      } catch (err) { console.error('[ws] message handler error:', err); }
+      } catch (err) {
+        // Sprint 63 T1 (Item 1.2) — belt-and-suspenders: if a race-class
+        // ioctl error somehow escapes safelyResizePty's own catch (or comes
+        // from a future write/ioctl path), downgrade to console.debug
+        // instead of polluting stderr with the noisy ws-message-handler
+        // error log. safelyResizePty itself already catches the resize
+        // path; this catches any other race-class shape that bubbles here.
+        if (isPtyRaceError(err)) {
+          if (process.env.TERMDECK_DEBUG_PTY_RACES) {
+            console.debug(`[ws] message handler race-class (suppressed): ${err.code || err.message}`);
+          }
+        } else {
+          console.error('[ws] message handler error:', err);
+        }
+      }
     });
     ws.on('close', () => {
@@ -2599,6 +2694,11 @@ module.exports = {
   // helper instead of re-implementing it. T4-CODEX AUDIT-CONCERN flagged that
   // the prior re-implementation pattern in the test could drift silently.
   safelyResizePty,
+  // Sprint 63 T1 (Item 1.2 + 1.3) — race-class classifier + raw-body hex
+  // prefix renderer exported so fence tests can import the production
+  // helpers instead of re-implementing them.
+  isPtyRaceError,
+  hexEscapePrefix,
   // Sprint 48 T4 — exported for unit testing the secrets.env → PTY env merge.
   readTermdeckSecretsForPty,
   _resetTermdeckSecretsCache,

package/packages/server/src/preflight.js CHANGED Viewed

@@ -261,7 +261,13 @@ async function checkShellSanity() {
     let output = '';
     let resolved = false;
-    const proc = ptyMod.spawn(shell, ['-l', '-c', 'echo TERMDECK_OK'], {
+    // Sprint 63 T3 §3.3 — drop `-l` (login mode). `-l` sources ~/.bash_profile
+    // / ~/.zprofile and friends, which on heavy profiles (nvm, conda, plugin
+    // managers — Brad's r730 has conda) routinely exceeds the 3s timeout
+    // budget below. A PTY-spawn health check answers "can $SHELL spawn a
+    // PTY and emit output?" — not "does the user's interactive profile
+    // complete fast?" Login-mode startup time is unrelated to PTY health.
+    const proc = ptyMod.spawn(shell, ['-c', 'echo TERMDECK_OK'], {
       name: 'xterm-256color',
       cols: 80,
       rows: 24,