@jhizzard/termdeck 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jhizzard/termdeck",
|
|
3
|
-
"version": "1.1.1",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Browser-based terminal multiplexer with metadata overlays, panel flashback memory recall, and AI-aware session management",
|
|
5
5
|
"bin": {
|
|
6
6
|
"termdeck": "./packages/cli/src/index.js"
|
|
@@ -410,7 +410,14 @@ async function checkRumen() {
|
|
|
410
410
|
}
|
|
411
411
|
const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 5000 });
|
|
412
412
|
try {
|
|
413
|
-
|
|
413
|
+
// Sprint 63 T3 §3.1 — `rumen_jobs` has `started_at` (migration 001), NOT
|
|
414
|
+
// `created_at`. Pre-Sprint-63 this probed `created_at` and threw a
|
|
415
|
+
// generic WARN that doctor's same-DB check did not (Sprint 35 doctor fix
|
|
416
|
+
// landed RUMEN_TIME_COL.rumen_jobs='started_at' but never propagated
|
|
417
|
+
// here). Brad reproduced on r730 2026-05-11; doctor 23/23 GREEN while
|
|
418
|
+
// launcher Step 3 emitted `WARN (query failed: column "created_at"
|
|
419
|
+
// does not exist)`. Aligned both probes to the same column.
|
|
420
|
+
const r = await pool.query("SELECT to_char(NOW() - MAX(started_at), 'HH24:MI:SS') AS ago FROM rumen_jobs");
|
|
414
421
|
const ago = r.rows[0] && r.rows[0].ago;
|
|
415
422
|
if (ago) {
|
|
416
423
|
stepLine('3/4', 'Checking Rumen', 'OK', `(last job ${ago} ago)`);
|
|
@@ -419,10 +426,20 @@ async function checkRumen() {
|
|
|
419
426
|
stepLine('3/4', 'Checking Rumen', 'WARN', '(no jobs yet — try termdeck init --rumen)');
|
|
420
427
|
return { ago: null };
|
|
421
428
|
} catch (err) {
|
|
422
|
-
|
|
429
|
+
const msg = String(err && err.message ? err.message : err);
|
|
430
|
+
if (/relation .*rumen_jobs.* does not exist/i.test(msg)) {
|
|
423
431
|
stepLine('3/4', 'Checking Rumen', 'SKIP', '(rumen_jobs table not present — run termdeck init --rumen)');
|
|
424
432
|
} else {
|
|
425
|
-
|
|
433
|
+
const colMatch = msg.match(/column "([^"]+)" does not exist/i);
|
|
434
|
+
if (colMatch) {
|
|
435
|
+
// Schema drift — rumen_jobs is missing the column we queried. Naming
|
|
436
|
+
// the column + remediation beats a bare `query failed` that operators
|
|
437
|
+
// learn to filter out (Brad's r730, 2026-05-11).
|
|
438
|
+
stepLine('3/4', 'Checking Rumen', 'WARN',
|
|
439
|
+
`(rumen_jobs.${colMatch[1]} column missing — re-run \`termdeck init --rumen\` to apply migration 001)`);
|
|
440
|
+
} else {
|
|
441
|
+
stepLine('3/4', 'Checking Rumen', 'WARN', `(query failed: ${err.message})`);
|
|
442
|
+
}
|
|
426
443
|
}
|
|
427
444
|
return { ago: null };
|
|
428
445
|
} finally {
|
|
@@ -50,13 +50,18 @@ function statusFor(data) {
|
|
|
50
50
|
// resolveTranscriptPath — Sprint 50 T1.
|
|
51
51
|
//
|
|
52
52
|
// Gemini CLI persists chats at
|
|
53
|
-
// ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.json
|
|
54
|
-
// (single-JSON-object shape that matches parseGeminiJson
|
|
55
|
-
// 2026-05-02 substrate probe
|
|
56
|
-
//
|
|
57
|
-
//
|
|
58
|
-
//
|
|
59
|
-
//
|
|
53
|
+
// ~/.gemini/tmp/<basename(cwd)>/chats/session-<ISO-ts>-<short-id>.{json,jsonl}
|
|
54
|
+
// (single-JSON-object shape that matches parseGeminiJson for the .json
|
|
55
|
+
// flavor, verified 2026-05-02 substrate probe; .jsonl flavor introduced
|
|
56
|
+
// some time between 2026-05-02 and 2026-05-08, surfaced by Sprint 63 T2
|
|
57
|
+
// acceptance — see docs/sprint-63-wave-2/EXIT-CAPTURE-VERIFICATION.md
|
|
58
|
+
// Finding #2. The extension filter accepts both shapes; downstream parser
|
|
59
|
+
// handling of JSONL deltas is a Sprint 64 candidate). Pick the most
|
|
60
|
+
// recently modified file whose mtime is at-or-after
|
|
61
|
+
// `session.meta.createdAt`. Falls back to walking every project directory
|
|
62
|
+
// under `~/.gemini/tmp/*/chats/` if the basename heuristic produces no
|
|
63
|
+
// candidate (e.g., Gemini renormalized the project name to deduplicate
|
|
64
|
+
// against an existing one).
|
|
60
65
|
// ──────────────────────────────────────────────────────────────────────────
|
|
61
66
|
|
|
62
67
|
async function resolveTranscriptPath(session) {
|
|
@@ -83,7 +88,8 @@ async function resolveTranscriptPath(session) {
|
|
|
83
88
|
let entries;
|
|
84
89
|
try { entries = fs.readdirSync(dir); } catch (_) { return; }
|
|
85
90
|
for (const name of entries) {
|
|
86
|
-
if (!name.startsWith('session-')
|
|
91
|
+
if (!name.startsWith('session-')) continue;
|
|
92
|
+
if (!name.endsWith('.json') && !name.endsWith('.jsonl')) continue;
|
|
87
93
|
const full = path.join(dir, name);
|
|
88
94
|
let st;
|
|
89
95
|
try { st = fs.statSync(full); } catch (_) { continue; }
|
|
@@ -20,6 +20,33 @@
|
|
|
20
20
|
// checks (mnestra-webhook, rumen-pool) are best-effort: a failure surfaces
|
|
21
21
|
// as `warn` with detail, but does not flip `ok`.
|
|
22
22
|
//
|
|
23
|
+
// Failure taxonomy (Sprint 63 T3 §3.2 — Brad r730 cascade 2026-05-11)
|
|
24
|
+
// ──────────────────────────────────────────────────────────────────
|
|
25
|
+
// Pre-Sprint-63 every check that didn't return `pass` collapsed to `fail`
|
|
26
|
+
// with a free-text `detail` string. Operators triaging "why is the install
|
|
27
|
+
// red?" had to read each detail and guess. The cost was real: on 2026-05-11
|
|
28
|
+
// a SQLite ABI mismatch left `db = null` at boot; the resulting
|
|
29
|
+
// `red: timeout` strings (from probes that timed-out trying to use the null
|
|
30
|
+
// handle indirectly) masked the actual `init-failed` root cause for hours.
|
|
31
|
+
//
|
|
32
|
+
// Every non-pass check now carries a `category` field with one of:
|
|
33
|
+
// `red:unreachable` — network/socket level (ECONNREFUSED / EHOSTUNREACH
|
|
34
|
+
// / ENETUNREACH / ENOTFOUND on connect)
|
|
35
|
+
// `red:timeout` — request issued, no response in the window
|
|
36
|
+
// (AbortError / req timeout / pg ETIMEDOUT)
|
|
37
|
+
// `red:dependency-down` — peer responded but the dependency is unhealthy
|
|
38
|
+
// (HTTP 5xx / SQL schema error from a reachable DB)
|
|
39
|
+
// `red:init-failed` — local handle the probe needs was never initialized
|
|
40
|
+
// (db === null at boot / DATABASE_URL not set)
|
|
41
|
+
//
|
|
42
|
+
// `detail` strings are prefixed with the category for human readability:
|
|
43
|
+
// `red:unreachable (could not connect to Postgres using DATABASE_URL)`.
|
|
44
|
+
//
|
|
45
|
+
// init-failed surfaces use a log-once gate so a 30s-poll cycle on a process
|
|
46
|
+
// with a missing handle (e.g. better-sqlite3 not loaded) writes ONE warn at
|
|
47
|
+
// boot, not 2880 warns/day. Probes still emit `red:init-failed` per cycle
|
|
48
|
+
// in the JSON report — only the log emission is gated.
|
|
49
|
+
//
|
|
23
50
|
// Caching
|
|
24
51
|
// ───────
|
|
25
52
|
// Reports cached in module scope for 30s. `getFullHealth(config, { refresh: true })`
|
|
@@ -29,8 +56,9 @@
|
|
|
29
56
|
// Error handling
|
|
30
57
|
// ──────────────
|
|
31
58
|
// Every check is wrapped: any unexpected error downgrades that single check
|
|
32
|
-
// to `fail` (or `warn` for warn-checks) with the error message in `detail
|
|
33
|
-
// `
|
|
59
|
+
// to `fail` (or `warn` for warn-checks) with the error message in `detail`
|
|
60
|
+
// and a `category` from the taxonomy above. `getFullHealth()` always
|
|
61
|
+
// resolves with a structured report — never throws.
|
|
34
62
|
|
|
35
63
|
'use strict';
|
|
36
64
|
|
|
@@ -50,21 +78,104 @@ const REQUIRED_CHECKS = new Set([
|
|
|
50
78
|
'cron-job-active'
|
|
51
79
|
]);
|
|
52
80
|
|
|
81
|
+
// Sprint 63 T3 §3.2 — stable taxonomy strings. Exported so dashboard / doctor
|
|
82
|
+
// / external graders can filter by category instead of pattern-matching the
|
|
83
|
+
// detail prose. Frozen object so callers can rely on `CATEGORIES.UNREACHABLE`
|
|
84
|
+
// without accidentally rebinding.
|
|
85
|
+
// Stable category strings attached to every non-pass health check. The object
// is frozen, so consumers can compare against `CATEGORIES.<KEY>` without the
// table being mutated underneath them.
const CATEGORIES = Object.freeze({
  // Network/socket-level failure while connecting to the peer.
  UNREACHABLE: 'red:unreachable',
  // Request was issued but no response arrived inside the window.
  TIMEOUT: 'red:timeout',
  // Peer answered, but the dependency itself is unhealthy.
  DEPENDENCY_DOWN: 'red:dependency-down',
  // A local handle or setting the probe needs was never initialized.
  INIT_FAILED: 'red:init-failed',
});
|
|
91
|
+
|
|
53
92
|
let _cache = null;
|
|
54
93
|
let _cachedAt = 0;
|
|
55
94
|
|
|
95
|
+
// Sprint 63 T3 §3.2 — log-once gate for init-failed surfaces. A 30s health
|
|
96
|
+
// poll on a process with a missing handle would otherwise log every cycle
|
|
97
|
+
// (~2880 warn lines/day per missing handle). Probes that detect a null
|
|
98
|
+
// handle at boot call `logInitFailedOnce(name, reason)`; the first call
|
|
99
|
+
// emits a warn line, subsequent calls are silent for the lifetime of the
|
|
100
|
+
// process. Probes still emit `red:init-failed` in the JSON report on every
|
|
101
|
+
// cycle — only the log line is gated. Reset via `_resetInitLogged()` test
|
|
102
|
+
// seam between cases.
|
|
103
|
+
// Probe name → reason recorded the first time that probe reported a missing
// handle. Presence of a key is what silences later calls.
const _initLoggedOnce = new Map();

// Emit one warn line per probe name for the lifetime of the process.
//   probeName — key for the gate; also interpolated into the log line.
//   reason    — human-readable cause; stored on first call and logged.
// Later calls with the same probeName are silent, even if `reason` differs.
function logInitFailedOnce(probeName, reason) {
  const alreadyLogged = _initLoggedOnce.has(probeName);
  if (!alreadyLogged) {
    // Record before logging so the gate is closed even if logging throws.
    _initLoggedOnce.set(probeName, reason);
    const headline = `[health] ${probeName} handle null at boot — probes will return `;
    const tail = `${CATEGORIES.INIT_FAILED} until next process start; reason: ${reason}`;
    // eslint-disable-next-line no-console
    console.warn(headline + tail);
  }
}
|
|
113
|
+
|
|
114
|
+
// Classify an HTTP-side failure shape `{ ok, status, error, code }` (as
|
|
115
|
+
// returned by `httpReachable`) into one of the four red:* categories.
|
|
116
|
+
// Map an HTTP probe result onto one of the red:* categories.
//   r — `{ ok, status, error, code }`-shaped result (any field may be absent),
//       or a falsy value when the probe produced nothing at all.
// Returns one of the CATEGORIES strings. Precedence: timeout signals, then
// connect-level error codes, then "the peer sent an HTTP status". Anything
// that carries no usable signal is treated as unreachable.
function classifyHttpFailure(r) {
  if (!r) return CATEGORIES.UNREACHABLE;

  // ETIMEDOUT is grouped with the other timeout signals so an HTTP probe and
  // a database probe hitting the same stalled host agree on the category.
  const timedOut =
    r.code === 'TIMEOUT' ||
    r.code === 'ABORT_ERR' ||
    r.code === 'ERR_TIMEOUT' ||
    r.code === 'ETIMEDOUT' ||
    r.error === 'timeout';
  if (timedOut) return CATEGORIES.TIMEOUT;

  const connectFailed =
    r.code === 'ECONNREFUSED' ||
    r.code === 'EHOSTUNREACH' ||
    r.code === 'ENETUNREACH' ||
    r.code === 'ENOTFOUND';
  if (connectFailed) return CATEGORIES.UNREACHABLE;

  // Any numeric status means the peer answered; a non-ok answer (5xx or
  // otherwise) is an unhealthy dependency, not a network problem.
  if (typeof r.status === 'number') return CATEGORIES.DEPENDENCY_DOWN;

  return CATEGORIES.UNREACHABLE;
}
|
|
128
|
+
|
|
129
|
+
// Classify a database / Node-side failure into one of the four categories.
|
|
130
|
+
// Accepts either a raw Error or a `{ error, code }` envelope from
|
|
131
|
+
// `safeQueryRow` / `safeQueryRows`.
|
|
132
|
+
// Map a database / Node-side failure onto one of the red:* categories.
//   errOrEnvelope — a thrown Error, or an `{ error, code }` envelope; a falsy
//                   value is treated as "dependency unhealthy".
// Returns one of the CATEGORIES strings.
function classifyDbFailure(errOrEnvelope) {
  if (!errOrEnvelope) return CATEGORIES.DEPENDENCY_DOWN;

  // `_err` is presumably a wrapped original error — no producer of that field
  // is visible here; kept for compatibility with existing callers.
  const code = errOrEnvelope.code || (errOrEnvelope._err && errOrEnvelope._err.code);
  const msg = String(errOrEnvelope.message || errOrEnvelope.error || errOrEnvelope);

  if (code === 'ECONNREFUSED' || code === 'EHOSTUNREACH' || code === 'ENETUNREACH' || code === 'ENOTFOUND') {
    return CATEGORIES.UNREACHABLE;
  }

  // Fall back to the message when no code survived. Match both "timeout" and
  // "timed out" / "timed-out" so a code-less "Connection timed out" is not
  // misfiled as an unhealthy dependency.
  if (code === 'ETIMEDOUT' || code === 'ERR_TIMEOUT' || /\b(?:timeout|timed[ -]?out)\b/i.test(msg)) {
    return CATEGORIES.TIMEOUT;
  }

  // Everything else (e.g. SQL errors such as 42703 column-not-exist or 42P01
  // relation-not-exist) means the dependency answered but is misconfigured:
  // "dependency unhealthy", not "unreachable" or "timeout".
  return CATEGORIES.DEPENDENCY_DOWN;
}
|
|
147
|
+
|
|
148
|
+
// Helpers to compose check results with a category-prefixed detail. Keeps
|
|
149
|
+
// each call site readable + ensures the prefix is consistent across probes.
|
|
150
|
+
// Build a `fail` check result. `detail` is the category followed by the
// explanation in parentheses, so the prefix is identical at every call site.
function failCheck(name, category, why) {
  const detail = `${category} (${why})`;
  return { name, status: 'fail', category, detail };
}
|
|
153
|
+
// Build a `warn` check result; same shape as a fail result, with
// `status: 'warn'` and a category-prefixed `detail`.
function warnCheck(name, category, why) {
  const detail = `${category} (${why})`;
  return { name, status: 'warn', category, detail };
}
|
|
156
|
+
// Build a `pass` check result. Passing checks carry no category or detail.
function passCheck(name) {
  const status = 'pass';
  return { name, status };
}
|
|
159
|
+
|
|
56
160
|
// ── SQLite check ────────────────────────────────────────────────────────────
|
|
57
161
|
|
|
58
162
|
// Probe the local SQLite handle with `SELECT 1 AS ok`.
//   db — handle exposing `prepare(sql).get()`; may be null/undefined.
// Returns a check result named 'sqlite':
//   - no handle            → fail, red:init-failed (also logged once)
//   - row.ok === 1         → pass
//   - any other row/result → fail, red:dependency-down
//   - query throws         → fail, category chosen by classifyDbFailure
function checkSqlite(db) {
  const PROBE = 'sqlite';

  if (!db) {
    // A missing handle is an initialization problem, not a timeout. The
    // log-once gate keeps repeated polls from writing a line every cycle.
    const reason = 'better-sqlite3 not initialized';
    logInitFailedOnce(PROBE, reason);
    return failCheck(PROBE, CATEGORIES.INIT_FAILED, reason);
  }

  try {
    const row = db.prepare('SELECT 1 AS ok').get();
    const healthy = Boolean(row) && row.ok === 1;
    return healthy
      ? passCheck(PROBE)
      : failCheck(PROBE, CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
  } catch (err) {
    const cat = classifyDbFailure(err);
    const why = err && err.message ? err.message : String(err);
    return failCheck(PROBE, cat, why);
  }
}
|
|
70
181
|
|
|
@@ -81,7 +192,12 @@ async function safeQueryRow(client, sql) {
|
|
|
81
192
|
if (r.rows && r.rows.length > 0 && r.rows[0].ok) return { ok: true };
|
|
82
193
|
return { ok: false };
|
|
83
194
|
} catch (err) {
|
|
84
|
-
|
|
195
|
+
// Surface `code` so the caller can classify into the red:* taxonomy
|
|
196
|
+
// without re-parsing the message string.
|
|
197
|
+
return {
|
|
198
|
+
error: err && err.message ? err.message : String(err),
|
|
199
|
+
code: err && err.code,
|
|
200
|
+
};
|
|
85
201
|
}
|
|
86
202
|
}
|
|
87
203
|
|
|
@@ -90,34 +206,80 @@ async function safeQueryRows(client, sql) {
|
|
|
90
206
|
const r = await client.query(sql);
|
|
91
207
|
return { rows: r.rows || [] };
|
|
92
208
|
} catch (err) {
|
|
93
|
-
return {
|
|
209
|
+
return {
|
|
210
|
+
error: err && err.message ? err.message : String(err),
|
|
211
|
+
code: err && err.code,
|
|
212
|
+
};
|
|
94
213
|
}
|
|
95
214
|
}
|
|
96
215
|
|
|
216
|
+
// Sprint 63 T3 §3.2 — track whether the most recent connect attempt timed out
|
|
217
|
+
// vs. was outright unreachable. The pg client doesn't expose this from inside
|
|
218
|
+
// the helper, so the helper records it in a return envelope.
|
|
97
219
|
// Open a Postgres client for the health probes; never throws.
//   databaseUrl — connection string; falsy means "not configured".
// Resolves with an envelope:
//   { client, reason: null }                                  on success
//   { client: null, reason: 'no-url' }                        no URL given
//   { client: null, reason: 'pg-runner-unavailable', error, code }
//                                                              helper module failed to load
//   { client: null, reason: 'connect-failed', error, code }   connect attempt rejected
// `error` is the failure message and `code` the error's `code` (may be
// undefined), so callers can classify without re-parsing prose.
async function openPgClient(databaseUrl) {
  if (!databaseUrl) return { client: null, reason: 'no-url' };

  let pgRunner;
  try {
    pgRunner = require('./setup/pg-runner');
  } catch (err) {
    // Report why the module could not be loaded instead of discarding it;
    // otherwise callers can only say "could not connect" for what is really a
    // local load failure.
    return {
      client: null,
      reason: 'pg-runner-unavailable',
      error: err && err.message ? err.message : String(err),
      code: err && err.code,
    };
  }

  try {
    const client = await pgRunner.connect(databaseUrl);
    return { client, reason: null };
  } catch (err) {
    return {
      client: null,
      reason: 'connect-failed',
      error: err && err.message ? err.message : String(err),
      code: err && err.code,
    };
  }
}
|
|
236
|
+
|
|
237
|
+
// Sprint 63 T3 §3.2 — dependent-checks shape when there's no client. Pre-
|
|
238
|
+
// Sprint-63 these collapsed to status:'fail', detail:'pg unavailable' with
|
|
239
|
+
// no category; operators couldn't distinguish "DATABASE_URL not set"
|
|
240
|
+
// (`init-failed` — fix the .env) from "Postgres unreachable" (`unreachable`
|
|
241
|
+
// — fix the network) from "Postgres took 5s and gave up" (`timeout` — bump
|
|
242
|
+
// timeout or check pgbouncer). Each downstream check now carries the same
|
|
243
|
+
// category as the connect attempt so the dashboard can render one row
|
|
244
|
+
// "Postgres unreachable" and dim the six dependents instead of six
|
|
245
|
+
// independent-looking RED rows.
|
|
246
|
+
// Append the full "no Postgres client" result set to `checks` (mutated in
// place): one fail row for the primary check followed by one fail row per
// dependent check, all sharing the same category.
//   checks          — array to append to
//   primaryName     — name of the connection-level check
//   category        — red:* category applied to every row
//   primaryDetail   — explanation for the primary row
//   dependentDetail — explanation reused for each dependent row
function pushPgUnavailableChecks(checks, primaryName, category, primaryDetail, dependentDetail) {
  const dependents = ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active'];
  checks.push(
    failCheck(primaryName, category, primaryDetail),
    ...dependents.map((dependent) => failCheck(dependent, category, dependentDetail))
  );
}
|
|
103
252
|
|
|
104
253
|
async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
105
254
|
const checks = [];
|
|
106
|
-
const client = _pgClient || (await openPgClient(databaseUrl));
|
|
107
|
-
const owned = !_pgClient;
|
|
108
255
|
|
|
256
|
+
let client = _pgClient || null;
|
|
257
|
+
let owned = false;
|
|
258
|
+
let connectEnvelope = null;
|
|
109
259
|
if (!client) {
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
260
|
+
connectEnvelope = await openPgClient(databaseUrl);
|
|
261
|
+
client = connectEnvelope.client;
|
|
262
|
+
owned = client != null;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
if (!client) {
|
|
266
|
+
if (!databaseUrl) {
|
|
267
|
+
// No URL → init-failed (operator never set DATABASE_URL). Log-once.
|
|
268
|
+
logInitFailedOnce('mnestra-pg', 'DATABASE_URL not configured');
|
|
269
|
+
pushPgUnavailableChecks(
|
|
270
|
+
checks,
|
|
271
|
+
'mnestra-pg',
|
|
272
|
+
CATEGORIES.INIT_FAILED,
|
|
273
|
+
'DATABASE_URL not configured — set in ~/.termdeck/secrets.env',
|
|
274
|
+
'pg unavailable — DATABASE_URL not configured'
|
|
275
|
+
);
|
|
276
|
+
} else {
|
|
277
|
+
// URL set but connect failed → classify by code (timeout vs unreachable).
|
|
278
|
+
const cat = classifyDbFailure(connectEnvelope || {});
|
|
279
|
+
const why = connectEnvelope && connectEnvelope.error
|
|
280
|
+
? `could not connect to Postgres using DATABASE_URL — ${connectEnvelope.error}`
|
|
281
|
+
: 'could not connect to Postgres using DATABASE_URL';
|
|
282
|
+
pushPgUnavailableChecks(checks, 'mnestra-pg', cat, why, 'pg unavailable — connect failed');
|
|
121
283
|
}
|
|
122
284
|
return checks;
|
|
123
285
|
}
|
|
@@ -125,11 +287,11 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
|
125
287
|
try {
|
|
126
288
|
const ping = await safeQueryRow(client, 'SELECT 1 AS ok');
|
|
127
289
|
if (ping.error) {
|
|
128
|
-
checks.push(
|
|
290
|
+
checks.push(failCheck('mnestra-pg', classifyDbFailure(ping), ping.error));
|
|
129
291
|
} else if (!ping.ok) {
|
|
130
|
-
checks.push(
|
|
292
|
+
checks.push(failCheck('mnestra-pg', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned no row'));
|
|
131
293
|
} else {
|
|
132
|
-
checks.push(
|
|
294
|
+
checks.push(passCheck('mnestra-pg'));
|
|
133
295
|
}
|
|
134
296
|
|
|
135
297
|
// memory_items.source_session_id — the v0.6.5 column from Brad's saga.
|
|
@@ -137,84 +299,78 @@ async function runPgChecks({ databaseUrl, _pgClient }) {
|
|
|
137
299
|
"SELECT 1 AS ok FROM information_schema.columns " +
|
|
138
300
|
"WHERE table_schema = 'public' AND table_name = 'memory_items' AND column_name = 'source_session_id'");
|
|
139
301
|
if (col.error) {
|
|
140
|
-
checks.push(
|
|
302
|
+
checks.push(failCheck('memory-items-col', classifyDbFailure(col), col.error));
|
|
141
303
|
} else if (!col.ok) {
|
|
142
|
-
checks.push(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
});
|
|
304
|
+
checks.push(failCheck(
|
|
305
|
+
'memory-items-col',
|
|
306
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
307
|
+
'memory_items.source_session_id missing — re-run termdeck init --mnestra --yes ' +
|
|
308
|
+
'(if loader picked up a stale set, first: npm cache clean --force && npm i -g @jhizzard/termdeck@latest)'
|
|
309
|
+
));
|
|
149
310
|
} else {
|
|
150
|
-
checks.push(
|
|
311
|
+
checks.push(passCheck('memory-items-col'));
|
|
151
312
|
}
|
|
152
313
|
|
|
153
314
|
const cron = await safeQueryRow(client,
|
|
154
315
|
"SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_cron'");
|
|
155
316
|
if (cron.error) {
|
|
156
|
-
checks.push(
|
|
317
|
+
checks.push(failCheck('pg-cron-ext', classifyDbFailure(cron), cron.error));
|
|
157
318
|
} else if (!cron.ok) {
|
|
158
|
-
checks.push(
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
319
|
+
checks.push(failCheck(
|
|
320
|
+
'pg-cron-ext',
|
|
321
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
322
|
+
'extension not enabled — Supabase dashboard → Database → Extensions → pg_cron'
|
|
323
|
+
));
|
|
163
324
|
} else {
|
|
164
|
-
checks.push(
|
|
325
|
+
checks.push(passCheck('pg-cron-ext'));
|
|
165
326
|
}
|
|
166
327
|
|
|
167
328
|
const net = await safeQueryRow(client,
|
|
168
329
|
"SELECT 1 AS ok FROM pg_extension WHERE extname = 'pg_net'");
|
|
169
330
|
if (net.error) {
|
|
170
|
-
checks.push(
|
|
331
|
+
checks.push(failCheck('pg-net-ext', classifyDbFailure(net), net.error));
|
|
171
332
|
} else if (!net.ok) {
|
|
172
|
-
checks.push(
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
333
|
+
checks.push(failCheck(
|
|
334
|
+
'pg-net-ext',
|
|
335
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
336
|
+
'extension not enabled — Supabase dashboard → Database → Extensions → pg_net'
|
|
337
|
+
));
|
|
177
338
|
} else {
|
|
178
|
-
checks.push(
|
|
339
|
+
checks.push(passCheck('pg-net-ext'));
|
|
179
340
|
}
|
|
180
341
|
|
|
181
342
|
const vault = await safeQueryRow(client,
|
|
182
343
|
"SELECT 1 AS ok FROM vault.decrypted_secrets WHERE name = 'rumen_service_role_key'");
|
|
183
344
|
if (vault.error) {
|
|
184
|
-
checks.push({
|
|
185
|
-
name: 'vault-secret',
|
|
186
|
-
status: 'fail',
|
|
187
|
-
detail: `vault.decrypted_secrets unreadable — ${vault.error}`
|
|
188
|
-
});
|
|
345
|
+
checks.push(failCheck('vault-secret', classifyDbFailure(vault), `vault.decrypted_secrets unreadable — ${vault.error}`));
|
|
189
346
|
} else if (!vault.ok) {
|
|
190
|
-
checks.push(
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
347
|
+
checks.push(failCheck(
|
|
348
|
+
'vault-secret',
|
|
349
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
350
|
+
'rumen_service_role_key missing — Supabase dashboard → Project Settings → Vault → New secret'
|
|
351
|
+
));
|
|
195
352
|
} else {
|
|
196
|
-
checks.push(
|
|
353
|
+
checks.push(passCheck('vault-secret'));
|
|
197
354
|
}
|
|
198
355
|
|
|
199
356
|
const job = await safeQueryRows(client,
|
|
200
357
|
"SELECT active FROM cron.job WHERE jobname = 'rumen-tick'");
|
|
201
358
|
if (job.error) {
|
|
202
|
-
checks.push(
|
|
359
|
+
checks.push(failCheck('cron-job-active', classifyDbFailure(job), `cron.job unreadable — ${job.error}`));
|
|
203
360
|
} else if (!job.rows || job.rows.length === 0) {
|
|
204
|
-
checks.push(
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
361
|
+
checks.push(failCheck(
|
|
362
|
+
'cron-job-active',
|
|
363
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
364
|
+
'rumen-tick row not found — re-run `termdeck init --rumen`'
|
|
365
|
+
));
|
|
209
366
|
} else if (!job.rows[0].active) {
|
|
210
|
-
checks.push(
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
});
|
|
367
|
+
checks.push(failCheck(
|
|
368
|
+
'cron-job-active',
|
|
369
|
+
CATEGORIES.DEPENDENCY_DOWN,
|
|
370
|
+
"rumen-tick paused — SELECT cron.alter_job((SELECT jobid FROM cron.job WHERE jobname = 'rumen-tick'), active := true);"
|
|
371
|
+
));
|
|
216
372
|
} else {
|
|
217
|
-
checks.push(
|
|
373
|
+
checks.push(passCheck('cron-job-active'));
|
|
218
374
|
}
|
|
219
375
|
} finally {
|
|
220
376
|
if (owned) {
|
|
@@ -234,15 +390,29 @@ function httpReachable(url, timeoutMs = 2000) {
|
|
|
234
390
|
try {
|
|
235
391
|
req = mod.get(url, { timeout: timeoutMs }, (res) => {
|
|
236
392
|
const ok = res.statusCode != null && res.statusCode < 500;
|
|
393
|
+
const status = res.statusCode;
|
|
237
394
|
res.resume();
|
|
238
|
-
resolve({ ok, status
|
|
395
|
+
resolve({ ok, status });
|
|
239
396
|
});
|
|
240
397
|
} catch (err) {
|
|
241
|
-
|
|
398
|
+
// Sprint 63 T3 §3.2 — surface `code` so the caller can classify into
|
|
399
|
+
// the red:* taxonomy without re-parsing the message.
|
|
400
|
+
resolve({
|
|
401
|
+
ok: false,
|
|
402
|
+
error: err && err.message ? err.message : String(err),
|
|
403
|
+
code: err && err.code,
|
|
404
|
+
});
|
|
242
405
|
return;
|
|
243
406
|
}
|
|
244
|
-
req.on('error', (err) => resolve({
|
|
245
|
-
|
|
407
|
+
req.on('error', (err) => resolve({
|
|
408
|
+
ok: false,
|
|
409
|
+
error: err && err.message ? err.message : String(err),
|
|
410
|
+
code: err && err.code,
|
|
411
|
+
}));
|
|
412
|
+
req.on('timeout', () => {
|
|
413
|
+
try { req.destroy(); } catch (_e) { /* gone */ }
|
|
414
|
+
resolve({ ok: false, error: 'timeout', code: 'TIMEOUT' });
|
|
415
|
+
});
|
|
246
416
|
});
|
|
247
417
|
}
|
|
248
418
|
|
|
@@ -250,50 +420,66 @@ async function checkMnestraWebhook(config, options) {
|
|
|
250
420
|
if (options && typeof options._mnestraWebhookProbe === 'function') {
|
|
251
421
|
try {
|
|
252
422
|
const r = await options._mnestraWebhookProbe();
|
|
253
|
-
if (r && r.ok) return
|
|
254
|
-
|
|
423
|
+
if (r && r.ok) return passCheck('mnestra-webhook');
|
|
424
|
+
const cat = classifyHttpFailure(r);
|
|
425
|
+
return warnCheck('mnestra-webhook', cat, (r && r.detail) || (r && r.error) || 'unreachable');
|
|
255
426
|
} catch (err) {
|
|
256
|
-
|
|
427
|
+
const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
|
|
428
|
+
return warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
|
|
257
429
|
}
|
|
258
430
|
}
|
|
259
431
|
const rag = (config && config.rag) || {};
|
|
260
432
|
if (!rag.mnestraWebhookUrl) {
|
|
261
|
-
|
|
433
|
+
// Sprint 63 T3 §3.2 — URL not configured = init-failed (operator never
|
|
434
|
+
// set up the webhook), not unreachable. Log-once so a 30s poll on an
|
|
435
|
+
// unconfigured install doesn't flood warns.
|
|
436
|
+
logInitFailedOnce('mnestra-webhook', 'rag.mnestraWebhookUrl not configured');
|
|
437
|
+
return warnCheck('mnestra-webhook', CATEGORIES.INIT_FAILED, 'webhook URL not configured');
|
|
262
438
|
}
|
|
263
439
|
const healthUrl = String(rag.mnestraWebhookUrl).replace(/\/mnestra\/?$/, '/healthz');
|
|
264
440
|
const r = await httpReachable(healthUrl, 2000);
|
|
265
|
-
if (r.ok) return
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
status: '
|
|
269
|
-
|
|
270
|
-
|
|
441
|
+
if (r.ok) return passCheck('mnestra-webhook');
|
|
442
|
+
const cat = classifyHttpFailure(r);
|
|
443
|
+
const why = r.error
|
|
444
|
+
? `${r.error}${typeof r.status === 'number' ? ` (HTTP ${r.status})` : ''}`
|
|
445
|
+
: `HTTP ${r.status || '???'}`;
|
|
446
|
+
return warnCheck('mnestra-webhook', cat, why);
|
|
271
447
|
}
|
|
272
448
|
|
|
273
449
|
async function checkRumenPool(config, options) {
|
|
274
450
|
if (options && typeof options._rumenPoolProbe === 'function') {
|
|
275
451
|
try {
|
|
276
452
|
const r = await options._rumenPoolProbe();
|
|
277
|
-
if (r && r.ok) return
|
|
278
|
-
|
|
453
|
+
if (r && r.ok) return passCheck('rumen-pool');
|
|
454
|
+
// Test-seam probe should pass `category` if it has one; else infer.
|
|
455
|
+
const cat = (r && r.category) || classifyDbFailure(r || {});
|
|
456
|
+
return warnCheck('rumen-pool', cat, (r && r.detail) || (r && r.error) || 'unreachable (best-effort)');
|
|
279
457
|
} catch (err) {
|
|
280
|
-
|
|
458
|
+
const cat = classifyDbFailure(err);
|
|
459
|
+
return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
281
460
|
}
|
|
282
461
|
}
|
|
283
462
|
let pg;
|
|
284
463
|
try { pg = require('pg'); } catch (_e) { pg = null; }
|
|
285
|
-
if (!pg)
|
|
464
|
+
if (!pg) {
|
|
465
|
+
logInitFailedOnce('rumen-pool', 'pg module not installed');
|
|
466
|
+
return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'pg module not installed');
|
|
467
|
+
}
|
|
286
468
|
|
|
287
469
|
const dbUrl = (config && config.rag && config.rag.databaseUrl) || process.env.DATABASE_URL;
|
|
288
|
-
if (!dbUrl)
|
|
470
|
+
if (!dbUrl) {
|
|
471
|
+
logInitFailedOnce('rumen-pool', 'DATABASE_URL not set');
|
|
472
|
+
return warnCheck('rumen-pool', CATEGORIES.INIT_FAILED, 'DATABASE_URL not set');
|
|
473
|
+
}
|
|
289
474
|
|
|
290
475
|
const pool = new pg.Pool({ connectionString: dbUrl, max: 1, connectionTimeoutMillis: 3000 });
|
|
291
476
|
try {
|
|
292
477
|
const res = await pool.query('SELECT 1 AS ok');
|
|
293
|
-
if (res.rows[0] && res.rows[0].ok === 1) return
|
|
294
|
-
return
|
|
478
|
+
if (res.rows[0] && res.rows[0].ok === 1) return passCheck('rumen-pool');
|
|
479
|
+
return warnCheck('rumen-pool', CATEGORIES.DEPENDENCY_DOWN, 'SELECT 1 returned unexpected result');
|
|
295
480
|
} catch (err) {
|
|
296
|
-
|
|
481
|
+
const cat = classifyDbFailure(err);
|
|
482
|
+
return warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
297
483
|
} finally {
|
|
298
484
|
try { await pool.end(); } catch (_e) { /* ignore */ }
|
|
299
485
|
}
|
|
@@ -316,35 +502,82 @@ async function getFullHealth(config = {}, options = {}) {
|
|
|
316
502
|
|
|
317
503
|
const checks = [];
|
|
318
504
|
|
|
505
|
+
// Sprint 63 T3 §3.2 outer-catch hardening (T4-CODEX AUDIT-CONCERN 13:27 ET):
|
|
506
|
+
// every probe is independently wrapped here so an unexpected throw in a
|
|
507
|
+
// single probe path can't sink the whole report. Pre-Sprint-63 these four
|
|
508
|
+
// catches emitted raw `{ status: 'fail'|'warn', detail }` with no
|
|
509
|
+
// `category` field — operators triaging "why is the dashboard red?" still
|
|
510
|
+
// had to read prose. The whole point of the taxonomy is that there is no
|
|
511
|
+
// such thing as an uncategorized non-pass row. Every fallback now runs the
|
|
512
|
+
// captured `err` through `classifyDbFailure` / `classifyHttpFailure` and
|
|
513
|
+
// composes a normal `failCheck` / `warnCheck` envelope. When the
|
|
514
|
+
// classifier can't infer (truly opaque throw — bug in the probe itself,
|
|
515
|
+
// not in the dependency), the default branch of `classifyDbFailure` returns
|
|
516
|
+
// `red:dependency-down` (`classifyHttpFailure` defaults to `red:unreachable`) — the right-by-default category for "the
|
|
517
|
+
// probe's path is broken" — operator's first action is to inspect the
|
|
518
|
+
// peer / its config, not the local handle.
|
|
519
|
+
|
|
520
|
+
// Sprint 63 T3 §3.2 — `_throwIn` test seam. The probe functions each have
|
|
521
|
+
// their own try/catch so unreached-by-design inputs can't throw out into
|
|
522
|
+
// the outer catches below. The fence tests need a way to simulate "a
|
|
523
|
+
// probe's path threw before its own catch caught it" — i.e., the
|
|
524
|
+
// belt-and-suspenders outer catch. Set `_throwIn` to one of
|
|
525
|
+
// `'sqlite' | 'pg' | 'webhook' | 'rumen-pool'` to inject a synthetic
|
|
526
|
+
// throw at the corresponding outer-try entry. Never set in production —
|
|
527
|
+
// ignored if the value is falsy.
|
|
528
|
+
const throwIn = options._throwIn || null;
|
|
529
|
+
const synth = (where) => new Error(`test-fence: simulated throw in ${where} probe path`);
|
|
530
|
+
|
|
319
531
|
// 1. SQLite (sync — small DB, no risk of blocking)
|
|
320
|
-
try {
|
|
321
|
-
|
|
532
|
+
try {
|
|
533
|
+
if (throwIn === 'sqlite') throw synth('sqlite');
|
|
534
|
+
checks.push(checkSqlite(db));
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
const cat = classifyDbFailure(err);
|
|
538
|
+
checks.push(failCheck('sqlite', cat, err && err.message ? err.message : String(err)));
|
|
539
|
+
}
|
|
322
540
|
|
|
323
541
|
// 2-7. Postgres-side suite
|
|
324
542
|
let pgChecks;
|
|
325
|
-
try {
|
|
543
|
+
try {
|
|
544
|
+
if (throwIn === 'pg') throw synth('pg');
|
|
545
|
+
pgChecks = await runPgChecks({ databaseUrl, _pgClient: options._pgClient });
|
|
546
|
+
}
|
|
326
547
|
catch (err) {
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
548
|
+
const cat = classifyDbFailure(err);
|
|
549
|
+
const why = err && err.message ? err.message : String(err);
|
|
550
|
+
pgChecks = [failCheck('mnestra-pg', cat, why)];
|
|
551
|
+
// Dependents inherit the same category — see runPgChecks header for the
|
|
552
|
+
// rationale (one root-cause row, not 6 independent-looking REDs).
|
|
332
553
|
for (const name of ['memory-items-col', 'pg-cron-ext', 'pg-net-ext', 'vault-secret', 'cron-job-active']) {
|
|
333
|
-
pgChecks.push(
|
|
554
|
+
pgChecks.push(failCheck(name, cat, 'pg suite aborted'));
|
|
334
555
|
}
|
|
335
556
|
}
|
|
336
557
|
for (const c of pgChecks) checks.push(c);
|
|
337
558
|
|
|
338
559
|
// 8. Mnestra webhook (warn)
|
|
339
560
|
let webhook;
|
|
340
|
-
try {
|
|
341
|
-
|
|
561
|
+
try {
|
|
562
|
+
if (throwIn === 'webhook') throw synth('webhook');
|
|
563
|
+
webhook = await checkMnestraWebhook(config, options);
|
|
564
|
+
}
|
|
565
|
+
catch (err) {
|
|
566
|
+
const cat = classifyHttpFailure({ code: err && err.code, error: err && err.message });
|
|
567
|
+
webhook = warnCheck('mnestra-webhook', cat, err && err.message ? err.message : String(err));
|
|
568
|
+
}
|
|
342
569
|
checks.push(webhook);
|
|
343
570
|
|
|
344
571
|
// 9. Rumen pool (warn)
|
|
345
572
|
let pool;
|
|
346
|
-
try {
|
|
347
|
-
|
|
573
|
+
try {
|
|
574
|
+
if (throwIn === 'rumen-pool') throw synth('rumen-pool');
|
|
575
|
+
pool = await checkRumenPool(config, options);
|
|
576
|
+
}
|
|
577
|
+
catch (err) {
|
|
578
|
+
const cat = classifyDbFailure(err);
|
|
579
|
+
pool = warnCheck('rumen-pool', cat, err && err.message ? err.message : String(err));
|
|
580
|
+
}
|
|
348
581
|
checks.push(pool);
|
|
349
582
|
|
|
350
583
|
const ok = checks
|
|
@@ -370,8 +603,19 @@ function _resetCache() {
|
|
|
370
603
|
_cachedAt = 0;
|
|
371
604
|
}
|
|
372
605
|
|
|
606
|
+
// Sprint 63 T3 §3.2 — clear the init-failed log-once memory so each test
|
|
607
|
+
// case starts fresh. Without this, the first test that exercises a null-db
|
|
608
|
+
// path would silence the log on subsequent tests in the same process.
|
|
609
|
+
function _resetInitLogged() {
|
|
610
|
+
_initLoggedOnce.clear();
|
|
611
|
+
}
|
|
612
|
+
|
|
373
613
|
module.exports = {
|
|
374
614
|
getFullHealth,
|
|
375
615
|
REQUIRED_CHECKS,
|
|
376
|
-
|
|
616
|
+
CATEGORIES,
|
|
617
|
+
classifyHttpFailure,
|
|
618
|
+
classifyDbFailure,
|
|
619
|
+
_resetCache,
|
|
620
|
+
_resetInitLogged,
|
|
377
621
|
};
|
|
@@ -37,7 +37,7 @@ try {
|
|
|
37
37
|
}
|
|
38
38
|
try { pg = require('pg'); } catch { pg = null; }
|
|
39
39
|
|
|
40
|
-
// Module-level singleton Postgres pool for rumen_insights (
|
|
40
|
+
// Module-level singleton Postgres pool for rumen_insights (the daily-driver DB).
|
|
41
41
|
// Lazy-initialized on first rumen endpoint hit so startup stays fast and
|
|
42
42
|
// servers without DATABASE_URL never pay the connection cost.
|
|
43
43
|
//
|
|
@@ -292,31 +292,42 @@ function _termdeckVersion() {
|
|
|
292
292
|
// `pty.resize()` ioctls a stale fd. The error is race-expected, not a bug,
|
|
293
293
|
// but the noisy console.error trace pollutes diagnostics and obscures real
|
|
294
294
|
// errors. This helper guards against the race and downgrades the known
|
|
295
|
-
// race-class errors (EBADF, ENOTTY
|
|
296
|
-
//
|
|
297
|
-
//
|
|
295
|
+
// race-class errors (EBADF, ENOTTY) to a silent return. Set
|
|
296
|
+
// TERMDECK_DEBUG_PTY_RACES=1 to log to console.debug for diagnostics.
|
|
297
|
+
//
|
|
298
|
+
// Sprint 63 T1 — `isPtyRaceError(err)` extracted so the WS message-handler
|
|
299
|
+
// outer catch can also downgrade race-class errors that escape the helper's
|
|
300
|
+
// own catch (e.g. if `pty.write` ever races the close, or in future code paths).
|
|
301
|
+
// `session.pty._destroyed` short-circuit added as belt-and-suspenders for the
|
|
302
|
+
// `term.kill()` → before-`term.onExit`-fires window: the DELETE handler now
|
|
303
|
+
// stamps `_destroyed = true` immediately after kill(), so resize attempts in
|
|
304
|
+
// that interval short-circuit without an ioctl call.
|
|
305
|
+
function isPtyRaceError(err) {
|
|
306
|
+
if (!err) return false;
|
|
307
|
+
const msg = (err.message) || '';
|
|
308
|
+
const code = err.code;
|
|
309
|
+
return code === 'EBADF' ||
|
|
310
|
+
code === 'ENOTTY' ||
|
|
311
|
+
/\b(?:EBADF|ENOTTY)\b/.test(msg);
|
|
312
|
+
}
|
|
313
|
+
|
|
298
314
|
function safelyResizePty(session, cols, rows) {
|
|
299
315
|
if (!session || !session.pty) return false;
|
|
316
|
+
if (session.pty._destroyed) return false;
|
|
300
317
|
if (session.meta && session.meta.status === 'exited') return false;
|
|
301
318
|
try {
|
|
302
319
|
session.pty.resize(cols || 120, rows || 30);
|
|
303
320
|
return true;
|
|
304
321
|
} catch (err) {
|
|
305
|
-
const msg = (err && err.message) || '';
|
|
306
|
-
const code = err && err.code;
|
|
307
322
|
// Sprint 60 v1.0.14 + T4-CODEX AUDIT-CONCERN narrowing: race classifier
|
|
308
323
|
// requires explicit EBADF or ENOTTY (in code OR message). The earlier
|
|
309
324
|
// shape — any "ioctl(N) failed" message — was too broad: it would have
|
|
310
325
|
// silently dropped a non-race ioctl failure (e.g. EINTR, EFAULT) that
|
|
311
326
|
// might indicate a real bug. Now: only the specific race-class signals
|
|
312
327
|
// get suppressed; everything else rethrows so it surfaces in logs.
|
|
313
|
-
|
|
314
|
-
code === 'EBADF' ||
|
|
315
|
-
code === 'ENOTTY' ||
|
|
316
|
-
/\b(?:EBADF|ENOTTY)\b/.test(msg);
|
|
317
|
-
if (isRace) {
|
|
328
|
+
if (isPtyRaceError(err)) {
|
|
318
329
|
if (process.env.TERMDECK_DEBUG_PTY_RACES) {
|
|
319
|
-
console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${code ||
|
|
330
|
+
console.debug(`[ws] resize-after-pty-exit (race-expected): session=${session.id} ${err.code || err.message}`);
|
|
320
331
|
}
|
|
321
332
|
return false;
|
|
322
333
|
}
|
|
@@ -324,6 +335,35 @@ function safelyResizePty(session, cols, rows) {
|
|
|
324
335
|
}
|
|
325
336
|
}
|
|
326
337
|
|
|
338
|
+
// Sprint 63 T1 (Item 1.3) — body-parser hardening. The pre-existing
|
|
339
|
+
// `entity.verify.failed` / `entity.parse.failed` handler logged the error
|
|
340
|
+
// message but not WHICH bytes triggered the parse failure. Operators on
|
|
341
|
+
// Brad's r730 saw 9× SyntaxError flood over 13h with no fingerprint to
|
|
342
|
+
// identify the offending caller. `hexEscapePrefix` renders a 32-byte
|
|
343
|
+
// prefix of the raw body in a single-line, log-safe form: printable ASCII
|
|
344
|
+
// kept verbatim, non-printables rendered as `\xNN`, backslash escaped as
|
|
345
|
+
// `\\`. PII-conservative because we cap at 32 bytes (truncation marker `…`
|
|
346
|
+
// appended if more). The error middleware injects this into the existing
|
|
347
|
+
// `console.warn` line so the log signature is identifiable without
|
|
348
|
+
// dumping the full body.
|
|
349
|
+
function hexEscapePrefix(buf, maxBytes = 32) {
|
|
350
|
+
if (!buf || buf.length === 0) return '<no-body>';
|
|
351
|
+
const len = Math.min(buf.length, maxBytes);
|
|
352
|
+
let out = '';
|
|
353
|
+
for (let i = 0; i < len; i++) {
|
|
354
|
+
const b = buf[i];
|
|
355
|
+
if (b === 0x5c) {
|
|
356
|
+
out += '\\\\';
|
|
357
|
+
} else if (b >= 0x20 && b < 0x7f) {
|
|
358
|
+
out += String.fromCharCode(b);
|
|
359
|
+
} else {
|
|
360
|
+
out += '\\x' + b.toString(16).padStart(2, '0');
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
if (buf.length > maxBytes) out += '…';
|
|
364
|
+
return out;
|
|
365
|
+
}
|
|
366
|
+
|
|
327
367
|
function createServer(config) {
|
|
328
368
|
const app = express();
|
|
329
369
|
const server = http.createServer(app);
|
|
@@ -346,6 +386,13 @@ function createServer(config) {
|
|
|
346
386
|
// logs so real errors aren't drowned in noise.
|
|
347
387
|
app.use(express.json({
|
|
348
388
|
verify: (req, res, buf) => {
|
|
389
|
+
// Sprint 63 T1 (Item 1.3) — capture a stable copy of the raw body so
|
|
390
|
+
// the error middleware below can render a 32-byte hex-escaped prefix.
|
|
391
|
+
// `Buffer.from(buf)` copies because express may pool the underlying
|
|
392
|
+
// accumulator across requests; without the copy the error handler
|
|
393
|
+
// could see bytes from a later request.
|
|
394
|
+
req.rawBody = Buffer.from(buf);
|
|
395
|
+
|
|
349
396
|
// O(N) single-pass scan. Only checks bytes inside double-quoted string
|
|
350
397
|
// regions so structural whitespace doesn't trigger false positives.
|
|
351
398
|
let inString = false;
|
|
@@ -390,7 +437,13 @@ function createServer(config) {
|
|
|
390
437
|
err.type === 'entity.verify.failed' ||
|
|
391
438
|
err instanceof SyntaxError
|
|
392
439
|
)) {
|
|
393
|
-
|
|
440
|
+
// Sprint 63 T1 (Item 1.3) — append a 32-byte hex-escaped prefix of the
|
|
441
|
+
// raw body so the operator can identify which caller is sending bad
|
|
442
|
+
// JSON without exposing the full payload. Falls back to `<no-body>`
|
|
443
|
+
// if the verify callback never ran (parse error before verify, or no
|
|
444
|
+
// body at all).
|
|
445
|
+
const prefix = hexEscapePrefix(req.rawBody);
|
|
446
|
+
console.warn(`[body-parser] ${err.code || err.type || 'parse-error'}: ${err.message} (${req.method} ${req.path}) prefix="${prefix}"`);
|
|
394
447
|
return res.status(400).json({
|
|
395
448
|
error: 'Malformed JSON body',
|
|
396
449
|
detail: err.message,
|
|
@@ -1189,6 +1242,18 @@ function createServer(config) {
|
|
|
1189
1242
|
const sessUploadDir = path.join(os.tmpdir(), 'termdeck-uploads', session.id);
|
|
1190
1243
|
fs.rmSync(sessUploadDir, { recursive: true, force: true });
|
|
1191
1244
|
} catch (_err) { /* non-blocking */ }
|
|
1245
|
+
|
|
1246
|
+
// Sprint 63 T1 (Item 1.1) — null `session.pty` so the wrapper is
|
|
1247
|
+
// eligible for GC and downstream `if (session.pty)` guards correctly
|
|
1248
|
+
// identify the exited state. Root cause of Joshua's 2026-05-08/09
|
|
1249
|
+
// overnight `kern.tty.ptmx_max=511` exhaustion (516 fds for 4 panels):
|
|
1250
|
+
// without this nulling, node-pty's wrapper stayed pinned by onData /
|
|
1251
|
+
// onExit closures even after the child exited, holding the master
|
|
1252
|
+
// fd until next GC pass. Set AFTER `onPanelClose` fires (fire-and-
|
|
1253
|
+
// forget; reads `session.meta` + `session.id`, not `session.pty`) and
|
|
1254
|
+
// AFTER the upload-dir cleanup so any sync reader above this line
|
|
1255
|
+
// sees the original wrapper.
|
|
1256
|
+
session.pty = null;
|
|
1192
1257
|
});
|
|
1193
1258
|
|
|
1194
1259
|
// Wire command logging to SQLite + RAG
|
|
@@ -1346,7 +1411,7 @@ function createServer(config) {
|
|
|
1346
1411
|
});
|
|
1347
1412
|
|
|
1348
1413
|
// Graph endpoints (Sprint 38 T4) — knowledge-graph view backing graph.html.
|
|
1349
|
-
// Reuses the
|
|
1414
|
+
// Reuses the daily-driver pg pool (same DATABASE_URL serves memory_items +
|
|
1350
1415
|
// memory_relationships alongside rumen_*). Graceful-degrades when the pool
|
|
1351
1416
|
// is absent.
|
|
1352
1417
|
createGraphRoutes({
|
|
@@ -1376,6 +1441,14 @@ function createServer(config) {
|
|
|
1376
1441
|
// Kill PTY process
|
|
1377
1442
|
if (session.pty) {
|
|
1378
1443
|
try { session.pty.kill(); } catch (err) { console.error('[pty] kill failed for session', req.params.id + ':', err); }
|
|
1444
|
+
// Sprint 63 T1 (Item 1.2) — stamp `_destroyed = true` on the pty wrapper
|
|
1445
|
+
// so `safelyResizePty` can short-circuit any resize attempts that arrive
|
|
1446
|
+
// in the kill()→onExit window. node-pty's `kill()` only signals the
|
|
1447
|
+
// child; onExit fires asynchronously once the child reaps. Without this
|
|
1448
|
+
// marker, a WS resize message in that window would ioctl a fd whose
|
|
1449
|
+
// child has just SIGHUP'd, surfacing as EBADF/ENOTTY. node-pty doesn't
|
|
1450
|
+
// set this property itself; the convention is owned by TermDeck.
|
|
1451
|
+
session.pty._destroyed = true;
|
|
1379
1452
|
}
|
|
1380
1453
|
|
|
1381
1454
|
sessions.remove(req.params.id);
|
|
@@ -1595,15 +1668,23 @@ function createServer(config) {
|
|
|
1595
1668
|
});
|
|
1596
1669
|
|
|
1597
1670
|
// POST /api/sessions/:id/resize - resize terminal
|
|
1671
|
+
// Sprint 63 T1 (Item 1.2) — distinguish "session never existed" (404) from
|
|
1672
|
+
// "session exists but PTY has exited" (410 Gone). Pre-Sprint-63 both paths
|
|
1673
|
+
// collapsed to 404 (when session.pty was null after the PTY-leak fix) or
|
|
1674
|
+
// 409 (when safelyResizePty returned false). 410 is the semantically
|
|
1675
|
+
// correct response: the resource was here, the resource is now gone.
|
|
1598
1676
|
app.post('/api/sessions/:id/resize', (req, res) => {
|
|
1599
1677
|
const session = sessions.get(req.params.id);
|
|
1600
|
-
if (!session
|
|
1678
|
+
if (!session) return res.status(404).json({ error: 'Session not found' });
|
|
1679
|
+
if (!session.pty || (session.meta && session.meta.status === 'exited')) {
|
|
1680
|
+
return res.status(410).json({ error: 'PTY is gone (session exited)' });
|
|
1681
|
+
}
|
|
1601
1682
|
|
|
1602
1683
|
const { cols, rows } = req.body;
|
|
1603
1684
|
try {
|
|
1604
1685
|
const resized = safelyResizePty(session, cols, rows);
|
|
1605
1686
|
if (!resized) {
|
|
1606
|
-
return res.status(
|
|
1687
|
+
return res.status(410).json({ error: 'PTY is gone (session exited)' });
|
|
1607
1688
|
}
|
|
1608
1689
|
res.json({ ok: true, cols, rows });
|
|
1609
1690
|
} catch (err) {
|
|
@@ -2027,7 +2108,7 @@ function createServer(config) {
|
|
|
2027
2108
|
});
|
|
2028
2109
|
|
|
2029
2110
|
// ==================== Rumen insights (Sprint 4 T2) ====================
|
|
2030
|
-
// Read-only access to rumen_insights + rumen_jobs in the
|
|
2111
|
+
// Read-only access to rumen_insights + rumen_jobs in the daily-driver Postgres
|
|
2031
2112
|
// instance. Contract frozen in docs/sprint-4-rumen-integration/API-CONTRACT.md.
|
|
2032
2113
|
|
|
2033
2114
|
function rumenUnreachable(res) {
|
|
@@ -2268,7 +2349,7 @@ function createServer(config) {
|
|
|
2268
2349
|
|
|
2269
2350
|
switch (parsed.type) {
|
|
2270
2351
|
case 'input':
|
|
2271
|
-
if (session.pty) {
|
|
2352
|
+
if (session.pty && !session.pty._destroyed) {
|
|
2272
2353
|
session.pty.write(parsed.data);
|
|
2273
2354
|
session.trackInput(parsed.data);
|
|
2274
2355
|
}
|
|
@@ -2289,7 +2370,21 @@ function createServer(config) {
|
|
|
2289
2370
|
}));
|
|
2290
2371
|
break;
|
|
2291
2372
|
}
|
|
2292
|
-
} catch (err) {
|
|
2373
|
+
} catch (err) {
|
|
2374
|
+
// Sprint 63 T1 (Item 1.2) — belt-and-suspenders: if a race-class
|
|
2375
|
+
// ioctl error somehow escapes safelyResizePty's own catch (or comes
|
|
2376
|
+
// from a future write/ioctl path), downgrade to console.debug
|
|
2377
|
+
// instead of polluting stderr with the noisy ws-message-handler
|
|
2378
|
+
// error log. safelyResizePty itself already catches the resize
|
|
2379
|
+
// path; this catches any other race-class shape that bubbles here.
|
|
2380
|
+
if (isPtyRaceError(err)) {
|
|
2381
|
+
if (process.env.TERMDECK_DEBUG_PTY_RACES) {
|
|
2382
|
+
console.debug(`[ws] message handler race-class (suppressed): ${err.code || err.message}`);
|
|
2383
|
+
}
|
|
2384
|
+
} else {
|
|
2385
|
+
console.error('[ws] message handler error:', err);
|
|
2386
|
+
}
|
|
2387
|
+
}
|
|
2293
2388
|
});
|
|
2294
2389
|
|
|
2295
2390
|
ws.on('close', () => {
|
|
@@ -2599,6 +2694,11 @@ module.exports = {
|
|
|
2599
2694
|
// helper instead of re-implementing it. T4-CODEX AUDIT-CONCERN flagged that
|
|
2600
2695
|
// the prior re-implementation pattern in the test could drift silently.
|
|
2601
2696
|
safelyResizePty,
|
|
2697
|
+
// Sprint 63 T1 (Item 1.2 + 1.3) — race-class classifier + raw-body hex
|
|
2698
|
+
// prefix renderer exported so fence tests can import the production
|
|
2699
|
+
// helpers instead of re-implementing them.
|
|
2700
|
+
isPtyRaceError,
|
|
2701
|
+
hexEscapePrefix,
|
|
2602
2702
|
// Sprint 48 T4 — exported for unit testing the secrets.env → PTY env merge.
|
|
2603
2703
|
readTermdeckSecretsForPty,
|
|
2604
2704
|
_resetTermdeckSecretsCache,
|
|
@@ -261,7 +261,13 @@ async function checkShellSanity() {
|
|
|
261
261
|
let output = '';
|
|
262
262
|
let resolved = false;
|
|
263
263
|
|
|
264
|
-
|
|
264
|
+
// Sprint 63 T3 §3.3 — drop `-l` (login mode). `-l` sources ~/.bash_profile
|
|
265
|
+
// / ~/.zprofile and friends, which on heavy profiles (nvm, conda, plugin
|
|
266
|
+
// managers — Brad's r730 has conda) routinely exceeds the 3s timeout
|
|
267
|
+
// budget below. A PTY-spawn health check answers "can $SHELL spawn a
|
|
268
|
+
// PTY and emit output?" — not "does the user's interactive profile
|
|
269
|
+
// complete fast?" Login-mode startup time is unrelated to PTY health.
|
|
270
|
+
const proc = ptyMod.spawn(shell, ['-c', 'echo TERMDECK_OK'], {
|
|
265
271
|
name: 'xterm-256color',
|
|
266
272
|
cols: 80,
|
|
267
273
|
rows: 24,
|