ai-or-die 0.1.76 → 0.1.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/supervisor.js +51 -1
- package/package.json +2 -1
- package/src/base-bridge.js +115 -2
- package/src/job-guard.js +249 -0
- package/src/server.js +45 -3
- package/src/utils/process-tree.js +95 -0
package/bin/supervisor.js
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
const { spawn } = require('child_process');
|
|
6
6
|
const path = require('path');
|
|
7
7
|
const { RESTART_EXIT_CODE } = require('../src/restart-manager');
|
|
8
|
+
const jobGuard = require('../src/job-guard');
|
|
8
9
|
|
|
9
10
|
// ---------------------------------------------------------------------------
|
|
10
11
|
// Tunables — all overridable via env vars so the regression test can shrink
|
|
@@ -14,7 +15,12 @@ const { RESTART_EXIT_CODE } = require('../src/restart-manager');
|
|
|
14
15
|
|
|
15
16
|
const RESTART_DELAY_MS = parseInt(process.env.RESTART_DELAY_MS, 10) || 1000; // clean RESTART_EXIT_CODE respawn
|
|
16
17
|
const CRASH_RESTART_DELAY_MS = parseInt(process.env.CRASH_RESTART_DELAY_MS, 10) || 3000; // normal crash respawn
|
|
17
|
-
|
|
18
|
+
// Must stay strictly GREATER than the server's own 15s force-exit budget
|
|
19
|
+
// (src/server.js handleShutdown) so a hung graceful shutdown lets the server
|
|
20
|
+
// finish (or self-force-exit) its own ordered teardown — including killing its
|
|
21
|
+
// PTY trees — before the supervisor hard-kills it. A supervisor SIGKILL that
|
|
22
|
+
// preempted the server would orphan the server's PTY/grandchild tree.
|
|
23
|
+
const SHUTDOWN_TIMEOUT_MS = parseInt(process.env.SHUTDOWN_TIMEOUT_MS, 10) || 20000; // SIGINT/SIGTERM hard-kill fallback
|
|
18
24
|
|
|
19
25
|
// Tier 1 — tight crash loop. 3 crashes in 30 s historically tripped a hard
|
|
20
26
|
// process.exit(1). The fix replaces that with an extended restart delay
|
|
@@ -49,6 +55,12 @@ let spawnCount = 0;
|
|
|
49
55
|
let crashTimestamps = [];
|
|
50
56
|
let pendingRestartTimer = null;
|
|
51
57
|
|
|
58
|
+
// Windows Job Object guard. Set up once at startup (below, before the first spawn).
|
|
59
|
+
// `jobGuardHandle` is held for the supervisor's entire life and intentionally NEVER
|
|
60
|
+
// closed by us — process death closes it, which is exactly what fires KILL_ON_JOB_CLOSE.
|
|
61
|
+
let jobGuardHandle = null;
|
|
62
|
+
let jobGuardActive = false;
|
|
63
|
+
|
|
52
64
|
// Queued IPC message delivered to the NEXT spawned child once its IPC channel
|
|
53
65
|
// is open. Used to forward tier-2 escalation downstream so the in-process
|
|
54
66
|
// server can surface it to the browser ("supervisor is throttling restarts").
|
|
@@ -121,6 +133,9 @@ function startServer() {
|
|
|
121
133
|
env: {
|
|
122
134
|
...process.env,
|
|
123
135
|
SUPERVISED: '1',
|
|
136
|
+
// Tell the server whether the kill-on-close job guard is active, so it can
|
|
137
|
+
// surface the unprotected state in /api/diagnostics (jobGuard:false).
|
|
138
|
+
AOD_JOB_GUARD: jobGuardActive ? '1' : '0',
|
|
124
139
|
...(isRestart ? { AOD_SUPERVISOR_RESTART: '1' } : {})
|
|
125
140
|
}
|
|
126
141
|
});
|
|
@@ -252,4 +267,39 @@ process.on('message', (msg) => {
|
|
|
252
267
|
if (msg && msg.type === 'shutdown') shutdownGracefully();
|
|
253
268
|
});
|
|
254
269
|
|
|
270
|
+
// Establish the Windows Job Object guard BEFORE the first spawn so the server and its
|
|
271
|
+
// entire future tree (PTYs, the CLI's node/bun MCP grandchildren) auto-join the job.
|
|
272
|
+
// AssignProcessToJobObject is forward-looking: only processes created AFTER the call join, so the supervisor
|
|
273
|
+
// must be assigned while it still has no descendants — i.e. before startServer(). When
|
|
274
|
+
// the supervisor dies for ANY reason (Ctrl+C, crash, taskkill /F, console close), the OS
|
|
275
|
+
// closes the in-process handle and the kernel atomically terminates the whole job.
|
|
276
|
+
// No-op on non-Windows / when koffi is unavailable (jobGuardActive stays false →
|
|
277
|
+
// best-effort teardown, surfaced as jobGuard:false in /api/diagnostics).
|
|
278
|
+
function setupJobGuard() {
|
|
279
|
+
try {
|
|
280
|
+
if (!jobGuard.isAvailable()) {
|
|
281
|
+
if (process.platform === 'win32') {
|
|
282
|
+
console.warn('[supervisor] ⚠ process-guard: koffi unavailable; using best-effort teardown. ' +
|
|
283
|
+
'Child node/bun processes may survive an uncatchable kill (taskkill /F).');
|
|
284
|
+
}
|
|
285
|
+
return;
|
|
286
|
+
}
|
|
287
|
+
jobGuardHandle = jobGuard.createKillOnCloseJob();
|
|
288
|
+
if (jobGuardHandle && jobGuard.assignSelf(jobGuardHandle)) {
|
|
289
|
+
jobGuardActive = true;
|
|
290
|
+
console.log('[supervisor] process-guard: kill-on-close job active — the whole tree dies with the supervisor.');
|
|
291
|
+
} else {
|
|
292
|
+
jobGuardHandle = null;
|
|
293
|
+
console.warn('[supervisor] ⚠ process-guard: could not create/assign the job object ' +
|
|
294
|
+
'(outer job UI limits / EDR / silo?); using best-effort teardown. ' +
|
|
295
|
+
'Child node/bun processes may survive an uncatchable kill.');
|
|
296
|
+
}
|
|
297
|
+
} catch (e) {
|
|
298
|
+
jobGuardHandle = null;
|
|
299
|
+
jobGuardActive = false;
|
|
300
|
+
console.warn('[supervisor] ⚠ process-guard: setup failed (' + (e && e.message) + '); continuing without it.');
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
setupJobGuard();
|
|
255
305
|
startServer();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ai-or-die",
|
|
3
|
-
"version": "0.1.76",
|
|
3
|
+
"version": "0.1.77",
|
|
4
4
|
"description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
|
|
5
5
|
"main": "src/server.js",
|
|
6
6
|
"bin": {
|
|
@@ -47,6 +47,7 @@
|
|
|
47
47
|
"cors": "^2.8.5",
|
|
48
48
|
"express": "^4.19.2",
|
|
49
49
|
"fuzzysort": "^3.1.0",
|
|
50
|
+
"koffi": "^3.0.2",
|
|
50
51
|
"open": "^10.1.0",
|
|
51
52
|
"selfsigned": "^2.4.1",
|
|
52
53
|
"sherpa-onnx-node": "^1.12.24",
|
package/src/base-bridge.js
CHANGED
|
@@ -3,6 +3,8 @@ const { execFile } = require('child_process');
|
|
|
3
3
|
const path = require('path');
|
|
4
4
|
const fs = require('fs');
|
|
5
5
|
const os = require('os');
|
|
6
|
+
const jobGuard = require('./job-guard');
|
|
7
|
+
const { killProcessTreeSync } = require('./utils/process-tree');
|
|
6
8
|
|
|
7
9
|
/** Chunk size for PTY writes — safely below ConPTY ~16KB kernel buffer */
|
|
8
10
|
const PTY_WRITE_CHUNK_SIZE = 4096;
|
|
@@ -314,6 +316,11 @@ class BaseBridge {
|
|
|
314
316
|
|
|
315
317
|
this.sessions.set(sessionId, session);
|
|
316
318
|
|
|
319
|
+
// Windows: enclose the PTY in its own kill-on-close Job Object now, before the CLI
|
|
320
|
+
// boots, so the CLI's future node/bun MCP grandchildren auto-join and can be reaped
|
|
321
|
+
// atomically on stopSession. No-op elsewhere.
|
|
322
|
+
this._attachPtyJob(session, ptyProcess);
|
|
323
|
+
|
|
317
324
|
// Spawn watchdog: if no data, exit, or error arrives within 30s, treat as failure
|
|
318
325
|
let receivedLifeSign = false;
|
|
319
326
|
const ptyStartedAt = Date.now();
|
|
@@ -334,6 +341,14 @@ class BaseBridge {
|
|
|
334
341
|
// the kill() below succeeds.
|
|
335
342
|
this._disposePtyDisposables(session, sessionId);
|
|
336
343
|
try { ptyProcess.kill(); } catch (e) { /* ignore */ }
|
|
344
|
+
// Reap the PTY subtree (Windows job close / POSIX group kill) so a hung-at-startup
|
|
345
|
+
// shell + any children it already spawned don't leak.
|
|
346
|
+
{
|
|
347
|
+
const jobClosed = this._disposePtyJob(session);
|
|
348
|
+
if (!jobClosed && ptyProcess && ptyProcess.pid) {
|
|
349
|
+
try { killProcessTreeSync(ptyProcess.pid); } catch (_) { /* best-effort */ }
|
|
350
|
+
}
|
|
351
|
+
}
|
|
337
352
|
onError(new Error(`${this.toolName} process did not respond within ${SPAWN_TIMEOUT_MS / 1000} seconds. The command may not be installed or may have hung during startup.`));
|
|
338
353
|
}
|
|
339
354
|
}, SPAWN_TIMEOUT_MS);
|
|
@@ -390,6 +405,10 @@ class BaseBridge {
|
|
|
390
405
|
// still hold references to the data-buffer closures. Skip onExit
|
|
391
406
|
// self-disposal (node-pty already removed it on fire).
|
|
392
407
|
this._disposePtyDisposables(session, sessionId);
|
|
408
|
+
// The PTY exited on its own, but the CLI's node/bun grandchildren may still be
|
|
409
|
+
// alive (node-pty doesn't walk the console process list). Closing the per-PTY
|
|
410
|
+
// kill-on-close job reaps them and frees the handle (Windows; no-op on POSIX).
|
|
411
|
+
this._disposePtyJob(session);
|
|
393
412
|
if (this.sessions.has(sessionId)) {
|
|
394
413
|
session.active = false;
|
|
395
414
|
this.sessions.delete(sessionId);
|
|
@@ -444,6 +463,14 @@ class BaseBridge {
|
|
|
444
463
|
// the shell is alive but unreadable) doesn't leak as a zombie process /
|
|
445
464
|
// FD — mirrors the spawn-watchdog teardown. Harmless if already dead.
|
|
446
465
|
try { ptyProcess.kill(); } catch (e) { /* ignore — may already be dead */ }
|
|
466
|
+
// Reap the subtree (Windows job close / POSIX group kill) so the shell's
|
|
467
|
+
// node/bun grandchildren don't outlive the failed session.
|
|
468
|
+
{
|
|
469
|
+
const jobClosed = this._disposePtyJob(session);
|
|
470
|
+
if (!jobClosed && ptyProcess && ptyProcess.pid) {
|
|
471
|
+
try { killProcessTreeSync(ptyProcess.pid); } catch (_) { /* best-effort */ }
|
|
472
|
+
}
|
|
473
|
+
}
|
|
447
474
|
if (this.sessions.has(sessionId)) {
|
|
448
475
|
session.active = false;
|
|
449
476
|
this.sessions.delete(sessionId);
|
|
@@ -584,6 +611,53 @@ class BaseBridge {
|
|
|
584
611
|
}
|
|
585
612
|
}
|
|
586
613
|
|
|
614
|
+
/**
|
|
615
|
+
* Windows only: put a freshly-spawned PTY in its OWN kill-on-close Job Object so the
|
|
616
|
+
* PTY and the CLI's node/bun MCP grandchildren can be torn down atomically per session
|
|
617
|
+
* (see _disposePtyJob). Assigned right after spawn — before the CLI boots and spawns its
|
|
618
|
+
* children — so those future grandchildren auto-join the job. No-op on POSIX / when the
|
|
619
|
+
* job guard is unavailable (then teardown falls back to process-group / taskkill).
|
|
620
|
+
* Defence in depth: the PTY is also in the supervisor-level job, so supervisor death
|
|
621
|
+
* reaps it regardless.
|
|
622
|
+
* @private
|
|
623
|
+
*/
|
|
624
|
+
_attachPtyJob(session, ptyProcess) {
|
|
625
|
+
if (process.platform !== 'win32' || !session) return;
|
|
626
|
+
try {
|
|
627
|
+
if (!jobGuard.isAvailable()) return;
|
|
628
|
+
const pid = ptyProcess && ptyProcess.pid;
|
|
629
|
+
if (!pid) return;
|
|
630
|
+
// Defensive: if a handle already exists for this session (re-entrant call), close it
|
|
631
|
+
// first so we never overwrite a live kernel handle and leak it.
|
|
632
|
+
if (session.jobHandle) this._disposePtyJob(session);
|
|
633
|
+
const handle = jobGuard.createKillOnCloseJob();
|
|
634
|
+
if (!handle) return;
|
|
635
|
+
if (jobGuard.assignPid(handle, pid)) {
|
|
636
|
+
session.jobHandle = handle;
|
|
637
|
+
} else {
|
|
638
|
+
// Assign failed (already-orphaned / access denied) — closing an empty job is harmless.
|
|
639
|
+
jobGuard.closeJob(handle);
|
|
640
|
+
}
|
|
641
|
+
} catch (_) { /* best-effort; never break session start */ }
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
/**
|
|
645
|
+
* Close a session's per-PTY job handle. On Windows this is the deterministic teardown:
|
|
646
|
+
* KILL_ON_JOB_CLOSE terminates the PTY + every descendant still in the job (the node/bun
|
|
647
|
+
* grandchildren). Also frees the kernel handle. Idempotent; no-op when no handle exists.
|
|
648
|
+
* Returns true when a job was actually closed (the subtree is reaped deterministically),
|
|
649
|
+
* false otherwise — callers use this to decide whether to escalate to the best-effort
|
|
650
|
+
* fallback (taskkill /T /F on Windows / process-group kill on POSIX) for the degraded
|
|
651
|
+
* path where no job exists (koffi unavailable: SEA binary, EDR/CLM-blocked, or POSIX).
|
|
652
|
+
* @private
|
|
653
|
+
*/
|
|
654
|
+
_disposePtyJob(session) {
|
|
655
|
+
if (!session || !session.jobHandle) return false;
|
|
656
|
+
const h = session.jobHandle;
|
|
657
|
+
session.jobHandle = null;
|
|
658
|
+
try { return !!jobGuard.closeJob(h); } catch (_) { return false; }
|
|
659
|
+
}
|
|
660
|
+
|
|
587
661
|
async stopSession(sessionId) {
|
|
588
662
|
const session = this.sessions.get(sessionId);
|
|
589
663
|
if (!session) {
|
|
@@ -608,13 +682,20 @@ class BaseBridge {
|
|
|
608
682
|
// runs.
|
|
609
683
|
this._disposePtyDisposables(session, sessionId);
|
|
610
684
|
|
|
611
|
-
|
|
685
|
+
// No live process to wait on — close the per-PTY job (reaps any lingering grandchildren
|
|
686
|
+
// and frees the kernel handle) before returning, so this path can't leak the handle.
|
|
687
|
+
if (!session.process) { this._disposePtyJob(session); return; }
|
|
688
|
+
|
|
689
|
+
// Capture the pid before kill(): node-pty may clear it on exit, and we need it
|
|
690
|
+
// for the best-effort tree-kill fallback below (taskkill on Windows, process-group kill on POSIX).
|
|
691
|
+
const ptyPid = session.process.pid;
|
|
612
692
|
|
|
613
693
|
// Return a promise that resolves when the PTY process actually exits
|
|
614
694
|
// (or after a bounded timeout), so callers can await clean shutdown.
|
|
615
695
|
return new Promise((resolve) => {
|
|
616
696
|
let settled = false;
|
|
617
697
|
let waitDisposable = null;
|
|
698
|
+
let exited = false;
|
|
618
699
|
|
|
619
700
|
const cleanup = () => {
|
|
620
701
|
if (settled) return;
|
|
@@ -628,11 +709,24 @@ class BaseBridge {
|
|
|
628
709
|
if (waitDisposable && typeof waitDisposable.dispose === 'function') {
|
|
629
710
|
try { waitDisposable.dispose(); } catch (_) { /* ignore */ }
|
|
630
711
|
}
|
|
712
|
+
// Deterministic subtree teardown. On Windows with the job guard, closing the
|
|
713
|
+
// per-PTY kill-on-close job terminates the shell + its node/bun grandchildren and
|
|
714
|
+
// frees the handle — what node-pty's own kill() cannot do. When no job was closed
|
|
715
|
+
// (degraded: koffi unavailable in a SEA binary / EDR-blocked, or POSIX) AND the PTY
|
|
716
|
+
// did not exit on its own, escalate to the best-effort fallback: taskkill /T /F on
|
|
717
|
+
// Windows, or a process-group kill on POSIX (node-pty PTYs are session leaders). We
|
|
718
|
+
// skip the fallback after a clean exit to sidestep any pid/pgid-reuse window.
|
|
719
|
+
{
|
|
720
|
+
const jobClosed = this._disposePtyJob(session);
|
|
721
|
+
if (!jobClosed && !exited && ptyPid) {
|
|
722
|
+
try { killProcessTreeSync(ptyPid); } catch (_) { /* best-effort */ }
|
|
723
|
+
}
|
|
724
|
+
}
|
|
631
725
|
resolve();
|
|
632
726
|
};
|
|
633
727
|
|
|
634
728
|
try {
|
|
635
|
-
const handle = session.process.onExit(() => cleanup());
|
|
729
|
+
const handle = session.process.onExit(() => { exited = true; cleanup(); });
|
|
636
730
|
// node-pty returns an IDisposable; older mocks may return undefined.
|
|
637
731
|
if (handle && typeof handle.dispose === 'function') waitDisposable = handle;
|
|
638
732
|
} catch (_) {
|
|
@@ -672,6 +766,25 @@ class BaseBridge {
|
|
|
672
766
|
await this.stopSession(sessionId);
|
|
673
767
|
}
|
|
674
768
|
}
|
|
769
|
+
|
|
770
|
+
/**
|
|
771
|
+
* Synchronous, best-effort reap of EVERY live PTY subtree this bridge owns. For the
|
|
772
|
+
* uncaughtException and supervisor-death (IPC disconnect) paths, where we are about to
|
|
773
|
+
* exit and cannot await async teardown. Windows: close each per-PTY kill-on-close job
|
|
774
|
+
* (terminates the shell + node/bun grandchildren). POSIX: process-group kill of each PTY.
|
|
775
|
+
* Never throws.
|
|
776
|
+
*/
|
|
777
|
+
killAllSubtreesSync() {
|
|
778
|
+
for (const [, session] of this.sessions) {
|
|
779
|
+
let jobClosed = false;
|
|
780
|
+
try { jobClosed = this._disposePtyJob(session); } catch (_) { jobClosed = false; }
|
|
781
|
+
// Degraded fallback (no job closed): taskkill /T /F on Windows, process-group kill
|
|
782
|
+
// on POSIX. When the job WAS closed the kernel already reaped the subtree.
|
|
783
|
+
if (!jobClosed && session && session.process && session.process.pid) {
|
|
784
|
+
try { killProcessTreeSync(session.process.pid); } catch (_) { /* ignore */ }
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
}
|
|
675
788
|
}
|
|
676
789
|
|
|
677
790
|
module.exports = BaseBridge;
|
package/src/job-guard.js
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Windows Job Object guard — deterministic process-tree teardown.
|
|
4
|
+
//
|
|
5
|
+
// The core mechanism for "no zombie node/bun processes": a Win32 Job Object with
|
|
6
|
+
// JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. When the last handle to such a job closes
|
|
7
|
+
// (the holding process dies for ANY reason — Ctrl+C, crash, taskkill /F, console
|
|
8
|
+
// close), the kernel terminates EVERY process in the job atomically, with no user
|
|
9
|
+
// code running. This is the only Windows mechanism that survives an uncatchable kill.
|
|
10
|
+
//
|
|
11
|
+
// Two uses (see docs/specs/process-shutdown.md, ADR-00NN):
|
|
12
|
+
// 1. Supervisor-level job: bin/supervisor.js creates a kill-on-close job and assigns
|
|
13
|
+
// ITSELF before forking the server. AssignProcessToJobObject is forward-looking, so
|
|
14
|
+
// every future descendant (server, PTYs, the CLI's node/bun MCP grandchildren) joins
|
|
15
|
+
// the job. Supervisor death closes the in-process handle → the whole tree dies. The
|
|
16
|
+
// job persists across server restarts (only supervisor death closes the handle), so
|
|
17
|
+
// the legitimate exit-75 memory restart is unaffected.
|
|
18
|
+
// 2. Per-PTY nested job: src/base-bridge.js puts each PTY in its own kill-on-close job;
|
|
19
|
+
// closing that handle on stopSession atomically kills the PTY + its grandchildren —
|
|
20
|
+
// deterministic per-session teardown that also satisfies "restart independently".
|
|
21
|
+
//
|
|
22
|
+
// Held IN-PROCESS via the koffi FFI (not an external helper, which would be a single
|
|
23
|
+
// point of failure, and not PowerShell, whose Add-Type→csc.exe is blocked by CLM/WDAC/
|
|
24
|
+
// AMSI on the hardened corporate boxes that are the primary audience). The job's
|
|
25
|
+
// BREAKAWAY_OK flag is deliberately left OFF so a child requesting CREATE_BREAKAWAY_FROM_JOB
|
|
26
|
+
// is kept in the job rather than escaping it.
|
|
27
|
+
//
|
|
28
|
+
// Windows-only by design; a no-op on macOS/Linux and whenever koffi is unavailable
|
|
29
|
+
// (e.g. under Bun, or a locked-down box). Never throws into the caller — the guard must
|
|
30
|
+
// never break startup; failure degrades to best-effort taskkill (jobGuard:false).
|
|
31
|
+
|
|
32
|
+
const IS_WIN = process.platform === 'win32';
|
|
33
|
+
|
|
34
|
+
// JOBOBJECTINFOCLASS
|
|
35
|
+
const JobObjectExtendedLimitInformation = 9;
|
|
36
|
+
// JOBOBJECT_BASIC_LIMIT_INFORMATION.LimitFlags
|
|
37
|
+
const JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000;
|
|
38
|
+
// OpenProcess access rights needed to assign a foreign pid to a job
|
|
39
|
+
const PROCESS_TERMINATE = 0x0001;
|
|
40
|
+
const PROCESS_SET_QUOTA = 0x0100;
|
|
41
|
+
|
|
42
|
+
let _koffi = null;
|
|
43
|
+
let _api = null;
|
|
44
|
+
let _loadError = null;
|
|
45
|
+
|
|
46
|
+
// Lazily bind kernel32 via koffi. Returns the bound API or null (cached).
|
|
47
|
+
function _ensureApi() {
|
|
48
|
+
if (_api || _loadError) return _api;
|
|
49
|
+
if (!IS_WIN) { _loadError = new Error('not win32'); return null; }
|
|
50
|
+
try {
|
|
51
|
+
_koffi = require('koffi');
|
|
52
|
+
|
|
53
|
+
const JOBOBJECT_BASIC_LIMIT_INFORMATION = _koffi.struct('JOBOBJECT_BASIC_LIMIT_INFORMATION', {
|
|
54
|
+
PerProcessUserTimeLimit: 'int64',
|
|
55
|
+
PerJobUserTimeLimit: 'int64',
|
|
56
|
+
LimitFlags: 'uint32',
|
|
57
|
+
MinimumWorkingSetSize: 'size_t',
|
|
58
|
+
MaximumWorkingSetSize: 'size_t',
|
|
59
|
+
ActiveProcessLimit: 'uint32',
|
|
60
|
+
Affinity: 'size_t',
|
|
61
|
+
PriorityClass: 'uint32',
|
|
62
|
+
SchedulingClass: 'uint32',
|
|
63
|
+
});
|
|
64
|
+
const IO_COUNTERS = _koffi.struct('IO_COUNTERS', {
|
|
65
|
+
ReadOperationCount: 'uint64',
|
|
66
|
+
WriteOperationCount: 'uint64',
|
|
67
|
+
OtherOperationCount: 'uint64',
|
|
68
|
+
ReadTransferCount: 'uint64',
|
|
69
|
+
WriteTransferCount: 'uint64',
|
|
70
|
+
OtherTransferCount: 'uint64',
|
|
71
|
+
});
|
|
72
|
+
const JOBOBJECT_EXTENDED_LIMIT_INFORMATION = _koffi.struct('JOBOBJECT_EXTENDED_LIMIT_INFORMATION', {
|
|
73
|
+
BasicLimitInformation: JOBOBJECT_BASIC_LIMIT_INFORMATION,
|
|
74
|
+
IoInfo: IO_COUNTERS,
|
|
75
|
+
ProcessMemoryLimit: 'size_t',
|
|
76
|
+
JobMemoryLimit: 'size_t',
|
|
77
|
+
PeakProcessMemoryUsed: 'size_t',
|
|
78
|
+
PeakJobMemoryUsed: 'size_t',
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
const k = _koffi.load('kernel32.dll');
|
|
82
|
+
_api = {
|
|
83
|
+
sizeofExtLimit: _koffi.sizeof(JOBOBJECT_EXTENDED_LIMIT_INFORMATION),
|
|
84
|
+
CreateJobObjectW: k.func('void* __stdcall CreateJobObjectW(void* lpJobAttributes, void* lpName)'),
|
|
85
|
+
// The struct is registered in koffi's type registry under its name, so the C
|
|
86
|
+
// prototype can reference it by that name (passed by pointer → marshaled from a JS object).
|
|
87
|
+
SetInformationJobObject: k.func('int __stdcall SetInformationJobObject(void* hJob, int JobObjectInformationClass, ' +
|
|
88
|
+
'JOBOBJECT_EXTENDED_LIMIT_INFORMATION* lpJobObjectInformation, uint32 cbJobObjectInformationLength)'),
|
|
89
|
+
AssignProcessToJobObject: k.func('int __stdcall AssignProcessToJobObject(void* hJob, void* hProcess)'),
|
|
90
|
+
OpenProcess: k.func('void* __stdcall OpenProcess(uint32 dwDesiredAccess, int bInheritHandle, uint32 dwProcessId)'),
|
|
91
|
+
GetCurrentProcess: k.func('void* __stdcall GetCurrentProcess()'),
|
|
92
|
+
CloseHandle: k.func('int __stdcall CloseHandle(void* hObject)'),
|
|
93
|
+
GetLastError: k.func('uint32 __stdcall GetLastError()'),
|
|
94
|
+
};
|
|
95
|
+
} catch (err) {
|
|
96
|
+
_loadError = err;
|
|
97
|
+
_api = null;
|
|
98
|
+
}
|
|
99
|
+
return _api;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// True only when the koffi-backed Win32 binding is usable on this platform.
|
|
103
|
+
// `AOD_DISABLE_JOB_GUARD=1` forces it off (operator escape hatch if koffi/the FFI ever
|
|
104
|
+
// misbehaves, and the hook that lets tests exercise the best-effort degraded teardown).
|
|
105
|
+
// In the SEA single-file binary koffi is externalized out of the bundle and there is no
|
|
106
|
+
// node_modules, so it can never load — short-circuit to degraded mode without attempting
|
|
107
|
+
// the require (keeps the PTY-start hot path free of a doomed module lookup).
|
|
108
|
+
function isAvailable() {
|
|
109
|
+
if (process.env.AOD_DISABLE_JOB_GUARD === '1') return false;
|
|
110
|
+
if (typeof global !== 'undefined' && global.__SEA_MODE__) return false;
|
|
111
|
+
return !!_ensureApi();
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Explicit NULL-handle predicate. koffi 3.x returns JS `null` for a NULL pointer and a
|
|
115
|
+
// BigInt for a valid HANDLE (verified on koffi 3.0.2), so a bare `!h` already works; this
|
|
116
|
+
// guards the common shapes explicitly so a future koffi representation of NULL (0 / 0n /
|
|
117
|
+
// undefined) can't slip an invalid handle into a WinAPI call.
|
|
118
|
+
function _isNullHandle(h) {
|
|
119
|
+
return h === null || h === undefined || h === 0 || h === 0n;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Create a job object with KILL_ON_JOB_CLOSE set (BREAKAWAY_OK deliberately OFF).
|
|
123
|
+
// Returns the job handle (opaque, pass back to assign*/closeJob) or null on any failure.
|
|
124
|
+
function createKillOnCloseJob() {
|
|
125
|
+
const api = _ensureApi();
|
|
126
|
+
if (!api) return null;
|
|
127
|
+
let job = null;
|
|
128
|
+
try {
|
|
129
|
+
job = api.CreateJobObjectW(null, null);
|
|
130
|
+
if (_isNullHandle(job)) return null;
|
|
131
|
+
const info = {
|
|
132
|
+
BasicLimitInformation: {
|
|
133
|
+
PerProcessUserTimeLimit: 0n,
|
|
134
|
+
PerJobUserTimeLimit: 0n,
|
|
135
|
+
LimitFlags: JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
|
|
136
|
+
MinimumWorkingSetSize: 0,
|
|
137
|
+
MaximumWorkingSetSize: 0,
|
|
138
|
+
ActiveProcessLimit: 0,
|
|
139
|
+
Affinity: 0,
|
|
140
|
+
PriorityClass: 0,
|
|
141
|
+
SchedulingClass: 0,
|
|
142
|
+
},
|
|
143
|
+
IoInfo: {
|
|
144
|
+
ReadOperationCount: 0n, WriteOperationCount: 0n, OtherOperationCount: 0n,
|
|
145
|
+
ReadTransferCount: 0n, WriteTransferCount: 0n, OtherTransferCount: 0n,
|
|
146
|
+
},
|
|
147
|
+
ProcessMemoryLimit: 0,
|
|
148
|
+
JobMemoryLimit: 0,
|
|
149
|
+
PeakProcessMemoryUsed: 0,
|
|
150
|
+
PeakJobMemoryUsed: 0,
|
|
151
|
+
};
|
|
152
|
+
const ok = api.SetInformationJobObject(job, JobObjectExtendedLimitInformation, info, api.sizeofExtLimit);
|
|
153
|
+
if (!ok) {
|
|
154
|
+
// Could not arm kill-on-close — a job without it is useless (worse: it would
|
|
155
|
+
// hold processes without ever reaping them). Close and fail closed to null.
|
|
156
|
+
try { api.CloseHandle(job); } catch (_) { /* ignore */ }
|
|
157
|
+
return null;
|
|
158
|
+
}
|
|
159
|
+
return job;
|
|
160
|
+
} catch (_) {
|
|
161
|
+
if (job) { try { api.CloseHandle(job); } catch (__) { /* ignore */ } }
|
|
162
|
+
return null;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// Assign the CURRENT process to the job (used by the supervisor before forking).
|
|
167
|
+
// Returns true on success. After this, all future descendants auto-join the job.
|
|
168
|
+
function assignSelf(job) {
|
|
169
|
+
const api = _ensureApi();
|
|
170
|
+
if (!api || _isNullHandle(job)) return false;
|
|
171
|
+
try {
|
|
172
|
+
const self = api.GetCurrentProcess(); // pseudo-handle (-1); valid for AssignProcessToJobObject
|
|
173
|
+
return !!api.AssignProcessToJobObject(job, self);
|
|
174
|
+
} catch (_) {
|
|
175
|
+
return false;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// Assign a foreign process (by pid) to the job (used per-PTY). Opens a scoped handle
|
|
180
|
+
// with exactly the rights needed, assigns, then closes that process handle (NOT the job).
|
|
181
|
+
// Returns true on success.
|
|
182
|
+
//
|
|
183
|
+
// PID-reuse safety: callers must pass the pid of a process they KNOW is currently alive
|
|
184
|
+
// and call this synchronously after spawning it (base-bridge._attachPtyJob runs in the same
|
|
185
|
+
// synchronous tick as node-pty's spawn() that produced the pid), so there is no async window
|
|
186
|
+
// in which the pid could be recycled before OpenProcess. Do NOT call this with a pid that
|
|
187
|
+
// may have already exited.
|
|
188
|
+
function assignPid(job, pid) {
|
|
189
|
+
const api = _ensureApi();
|
|
190
|
+
if (!api || _isNullHandle(job) || !pid) return false;
|
|
191
|
+
let h = null;
|
|
192
|
+
try {
|
|
193
|
+
h = api.OpenProcess(PROCESS_TERMINATE | PROCESS_SET_QUOTA, 0, pid >>> 0);
|
|
194
|
+
if (_isNullHandle(h)) return false;
|
|
195
|
+
return !!api.AssignProcessToJobObject(job, h);
|
|
196
|
+
} catch (_) {
|
|
197
|
+
return false;
|
|
198
|
+
} finally {
|
|
199
|
+
if (!_isNullHandle(h)) { try { api.CloseHandle(h); } catch (_) { /* ignore */ } }
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Close a job handle. For a per-PTY kill-on-close job this is the teardown trigger:
|
|
204
|
+
// it terminates every process still in the job. Idempotent-safe to call with null.
|
|
205
|
+
// NEVER call this on the supervisor-level job (its close = kill the whole tree); the
|
|
206
|
+
// supervisor holds it for life and lets process death close it.
|
|
207
|
+
function closeJob(job) {
|
|
208
|
+
const api = _ensureApi();
|
|
209
|
+
if (!api || _isNullHandle(job)) return false;
|
|
210
|
+
try {
|
|
211
|
+
return !!api.CloseHandle(job);
|
|
212
|
+
} catch (_) {
|
|
213
|
+
return false;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
module.exports = {
|
|
218
|
+
isAvailable,
|
|
219
|
+
createKillOnCloseJob,
|
|
220
|
+
assignSelf,
|
|
221
|
+
assignPid,
|
|
222
|
+
closeJob,
|
|
223
|
+
// exposed for diagnostics/tests
|
|
224
|
+
_loadError: () => _loadError,
|
|
225
|
+
};
|
|
226
|
+
|
|
227
|
+
// --- self-test: `node src/job-guard.js` ------------------------------------------
|
|
228
|
+
// Proves the koffi bindings + struct marshaling work end to end on this host:
|
|
229
|
+
// create a kill-on-close job, assign a spawned child, close the job, assert the child dies.
|
|
230
|
+
if (require.main === module) {
|
|
231
|
+
if (!IS_WIN) { console.log('job-guard self-test: non-win32, no-op OK'); process.exit(0); }
|
|
232
|
+
const { spawn } = require('child_process');
|
|
233
|
+
console.log('koffi available:', isAvailable(), 'loadError:', _loadError && _loadError.message);
|
|
234
|
+
const job = createKillOnCloseJob();
|
|
235
|
+
console.log('createKillOnCloseJob ->', job ? 'OK' : 'FAIL');
|
|
236
|
+
if (!job) process.exit(1);
|
|
237
|
+
// Long-lived child that does nothing but stay alive.
|
|
238
|
+
const child = spawn(process.execPath, ['-e', 'setInterval(()=>{}, 1e9)'], { stdio: 'ignore' });
|
|
239
|
+
console.log('spawned child pid', child.pid);
|
|
240
|
+
setTimeout(() => {
|
|
241
|
+
const assigned = assignPid(job, child.pid);
|
|
242
|
+
console.log('assignPid ->', assigned ? 'OK' : 'FAIL');
|
|
243
|
+
closeJob(job);
|
|
244
|
+
console.log('closeJob called; waiting to see if child dies...');
|
|
245
|
+
let exited = false;
|
|
246
|
+
child.on('exit', (code, sig) => { exited = true; console.log(`child exited code=${code} sig=${sig} -> KILL-ON-CLOSE OK`); process.exit(0); });
|
|
247
|
+
setTimeout(() => { if (!exited) { console.log('child STILL ALIVE -> FAIL'); try { child.kill(); } catch (_) {} process.exit(2); } }, 2000);
|
|
248
|
+
}, 300);
|
|
249
|
+
}
|
package/src/server.js
CHANGED
|
@@ -384,6 +384,10 @@ class ClaudeCodeWebServer {
|
|
|
384
384
|
} catch (saveErr) {
|
|
385
385
|
console.error('Failed to save sessions on crash:', saveErr);
|
|
386
386
|
}
|
|
387
|
+
// Reap PTY subtrees so the CLI's node/bun grandchildren don't outlive this crashed
|
|
388
|
+
// server. Synchronous (the event loop is unsafe here). Windows closes each per-PTY
|
|
389
|
+
// job; POSIX group-kills. Best-effort; never rethrows.
|
|
390
|
+
try { this._reapAllPtySubtreesSync(); } catch (_) { /* ignore */ }
|
|
387
391
|
process.exit(1);
|
|
388
392
|
});
|
|
389
393
|
process.on('unhandledRejection', (reason) => {
|
|
@@ -401,10 +405,21 @@ class ClaudeCodeWebServer {
|
|
|
401
405
|
this.handleShutdown();
|
|
402
406
|
}
|
|
403
407
|
});
|
|
404
|
-
// If the supervisor
|
|
408
|
+
// If the supervisor's IPC channel drops, the supervisor died. Per the
|
|
409
|
+
// "everything dies when the main process dies" contract, this server must NOT
|
|
410
|
+
// keep running standalone (the old behavior) — it tears down its own PTY trees
|
|
411
|
+
// (incl. the CLI's node/bun grandchildren) and shuts down.
|
|
405
412
|
process.on('disconnect', () => {
|
|
406
|
-
|
|
407
|
-
|
|
413
|
+
// Expected channel close: a graceful shutdown / memory-restart we initiated is
|
|
414
|
+
// already in flight (the supervisor sent {type:'shutdown'} or we exited 75). No-op.
|
|
415
|
+
if (this.isShuttingDown) return;
|
|
416
|
+
console.warn('IPC channel disconnected (supervisor died). Tearing down this server and its process tree.');
|
|
417
|
+
// Reap PTY subtrees synchronously FIRST so the node/bun grandchildren die immediately,
|
|
418
|
+
// even if the ordered handleShutdown below is slow. On Windows with the job guard
|
|
419
|
+
// active the kernel has usually already killed us; this is the cross-platform /
|
|
420
|
+
// degraded-mode backstop.
|
|
421
|
+
try { this._reapAllPtySubtreesSync(); } catch (_) { /* best-effort */ }
|
|
422
|
+
this.handleShutdown(0);
|
|
408
423
|
});
|
|
409
424
|
}
|
|
410
425
|
|
|
@@ -3937,6 +3952,24 @@ class ClaudeCodeWebServer {
|
|
|
3937
3952
|
return bridges[agentType] || null;
|
|
3938
3953
|
}
|
|
3939
3954
|
|
|
3955
|
+
/**
|
|
3956
|
+
* Synchronously reap every PTY subtree across all bridges. For the crash / supervisor-
|
|
3957
|
+
* death paths where we are exiting and cannot await async teardown. Windows closes each
|
|
3958
|
+
* per-PTY kill-on-close job (terminates the shell + node/bun grandchildren); POSIX
|
|
3959
|
+
* process-group kills. Best-effort; never throws.
|
|
3960
|
+
*/
|
|
3961
|
+
_reapAllPtySubtreesSync() {
|
|
3962
|
+
const bridges = [
|
|
3963
|
+
this.claudeBridge, this.codexBridge, this.copilotBridge,
|
|
3964
|
+
this.geminiBridge, this.terminalBridge,
|
|
3965
|
+
];
|
|
3966
|
+
for (const b of bridges) {
|
|
3967
|
+
if (b && typeof b.killAllSubtreesSync === 'function') {
|
|
3968
|
+
try { b.killAllSubtreesSync(); } catch (_) { /* ignore */ }
|
|
3969
|
+
}
|
|
3970
|
+
}
|
|
3971
|
+
}
|
|
3972
|
+
|
|
3940
3973
|
async startToolSession(wsId, toolName, bridge, options, cols, rows) {
|
|
3941
3974
|
const wsInfo = this.webSocketConnections.get(wsId);
|
|
3942
3975
|
if (!wsInfo) {
|
|
@@ -5226,6 +5259,15 @@ class ClaudeCodeWebServer {
|
|
|
5226
5259
|
.filter(s => s._voiceUploadTimestamps && s._voiceUploadTimestamps.length).length,
|
|
5227
5260
|
activity_broadcast_timestamps: (this.activityBroadcastTimestamps && this.activityBroadcastTimestamps.size) || 0,
|
|
5228
5261
|
},
|
|
5262
|
+
// Deterministic-shutdown guard status. On win32, job_guard_active reflects whether
|
|
5263
|
+
// the supervisor established the kill-on-close Job Object (false ⇒ degraded:
|
|
5264
|
+
// EDR/CLM/koffi unavailable ⇒ best-effort taskkill teardown). null off win32 (the
|
|
5265
|
+
// job mechanism is Windows-only; POSIX uses process-group teardown). See
|
|
5266
|
+
// docs/specs/process-shutdown.md.
|
|
5267
|
+
process_guard: {
|
|
5268
|
+
job_guard_active: process.platform === 'win32' ? (process.env.AOD_JOB_GUARD === '1') : null,
|
|
5269
|
+
supervised: !!this.supervised,
|
|
5270
|
+
},
|
|
5229
5271
|
// DISK-02/03: cached disk usage sample (60 s TTL, never blocks the
|
|
5230
5272
|
// event loop). Populated by _sampleDiskUsage() — see method
|
|
5231
5273
|
// comment for the time-budget contract.
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Cross-platform best-effort process-tree teardown.
|
|
4
|
+
//
|
|
5
|
+
// This is the FALLBACK layer, used when the deterministic mechanism is unavailable:
|
|
6
|
+
// - Windows: the per-PTY / supervisor Job Object is the real teardown. taskkill /T /F
|
|
7
|
+
// is only the degraded-mode backstop (jobGuard:false — EDR/CLM blocked the job).
|
|
8
|
+
// - POSIX: there is no job-object equivalent, so process-group kill IS the primary
|
|
9
|
+
// teardown for PTYs. node-pty's Unix backend runs each PTY through forkpty→setsid,
|
|
10
|
+
// so the PTY is a session/group leader and its pid == pgid; killing the negative pid
|
|
11
|
+
// targets that whole group. Honest limitation: a grandchild that calls setsid() (some
|
|
12
|
+
// daemonized MCP servers) starts its own group and escapes -pgid; only cgroup v2
|
|
13
|
+
// delegation closes that gap (see docs/specs/process-shutdown.md).
|
|
14
|
+
//
|
|
15
|
+
// Never throws into the caller — teardown must not break shutdown.
|
|
16
|
+
|
|
17
|
+
const childProcess = require('child_process');
|
|
18
|
+
|
|
19
|
+
// True on Windows, where tree teardown shells out to taskkill instead of signalling a process group.
const IS_WIN = process.platform === 'win32';
|
|
20
|
+
|
|
21
|
+
// Windows degraded-mode tree kill via taskkill. Async (spawns a child); resolves true
// once taskkill exits 0, false otherwise (non-zero exit, spawn error, thrown spawn, or
// timeout). windowsHide + no shell (taskkill is a real exe).
//
//   pid       - root of the process tree to kill; stringified into taskkill's argv.
//   spawnImpl - child_process.spawn-compatible function (injectable for unit tests).
//   timeoutMs - how long to wait for taskkill before giving up (default 4000 ms).
//
// The returned promise never rejects. On timeout the taskkill child itself is killed
// (best-effort) so a hung taskkill is not left behind, and the timer is cleared as soon
// as the outcome is known.
function _taskkillTree(pid, spawnImpl, timeoutMs = 4000) {
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    const done = (ok) => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      resolve(ok);
    };
    try {
      const proc = spawnImpl('taskkill', ['/T', '/F', '/PID', String(pid)], {
        windowsHide: true,
        stdio: 'ignore',
        shell: false,
      });
      proc.on('exit', (code) => done(code === 0));
      proc.on('error', () => done(false));
      // A spawn implementation may report the outcome synchronously; only arm the
      // timer when we are still waiting.
      if (!settled) {
        // Bound the wait so a hung taskkill can't stall shutdown.
        timer = setTimeout(() => {
          if (settled) return;
          try { proc.kill(); } catch (_) { /* best-effort: it may already be gone */ }
          done(false);
        }, timeoutMs);
        // Do not let this timer alone keep the process alive.
        if (timer.unref) timer.unref();
      }
    } catch (_) {
      done(false);
    }
  });
}
|
|
43
|
+
|
|
44
|
+
// POSIX: kill the process group led by `pid` (negative-pid), then the pid itself as a
// fallback in case it is not actually a group leader.
//
//   pid      - process id expected to be a group leader (pid == pgid).
//   signal   - signal name/number forwarded to killImpl.
//   killImpl - process.kill-compatible function (injectable for unit tests).
//
// Returns true when at least one of the two kill calls did not throw, false otherwise.
// Never throws.
//
// Pids that are not integers greater than 1 are refused without signalling anything:
// kill(0) (which -0 degrades to) targets the caller's own process group, and kill(-1)
// targets every process the caller is permitted to signal.
function _killGroup(pid, signal, killImpl) {
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 1) return false;
  let delivered = false;
  for (const victim of [-target, target]) {
    try {
      killImpl(victim, signal);
      delivered = true;
    } catch (_) {
      /* ESRCH / EPERM, or the process may already be gone */
    }
  }
  return delivered;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Best-effort tree-kill of `pid` and its descendants.
 * Windows uses taskkill /T /F; POSIX kills the process group.
 *
 * @param {number|string} pid - Root process id. Must coerce to a positive integer;
 *   anything else resolves false without signalling. On POSIX pid 1 is also refused,
 *   because the group kill would become kill(-1), which targets every process the
 *   caller may signal.
 * @param {object} [opts]
 * @param {string|number} [opts.signal='SIGKILL'] - Signal for the POSIX path.
 * @param {Function} [opts.spawn] - child_process.spawn replacement (unit tests).
 * @param {Function} [opts.kill] - process.kill replacement (unit tests).
 * @returns {Promise<boolean>} Resolves with the outcome reported by the platform
 *   helper; false when `pid` is rejected.
 */
function killProcessTree(pid, opts = {}) {
  const signal = opts.signal || 'SIGKILL';
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return Promise.resolve(false);
  if (IS_WIN) {
    return _taskkillTree(target, opts.spawn || childProcess.spawn);
  }
  // Never let the negative-pid group kill turn into the kill(-1) broadcast.
  if (target === 1) return Promise.resolve(false);
  return Promise.resolve(_killGroup(target, signal, opts.kill || process.kill.bind(process)));
}
|
|
66
|
+
|
|
67
|
+
/**
 * Synchronous best-effort tree-kill, intended for exit paths (e.g. uncaughtException)
 * where the event loop is unsafe to rely on. Windows uses spawnSync(taskkill) with a
 * 3 s timeout; POSIX kills the process group synchronously. Never throws.
 *
 * @param {number|string} pid - Root process id. Must coerce to a positive integer;
 *   anything else returns false without signalling. On POSIX pid 1 is also refused,
 *   because the group kill would become kill(-1), which targets every process the
 *   caller may signal.
 * @param {object} [opts]
 * @param {string|number} [opts.signal='SIGKILL'] - Signal for the POSIX path.
 * @param {Function} [opts.spawnSync] - child_process.spawnSync replacement (unit tests).
 * @param {Function} [opts.kill] - process.kill replacement (unit tests).
 * @returns {boolean} On Windows, true only when taskkill exited with status 0; on
 *   POSIX, whatever the group-kill helper reports; false when `pid` is rejected.
 */
function killProcessTreeSync(pid, opts = {}) {
  const signal = opts.signal || 'SIGKILL';
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return false;
  if (IS_WIN) {
    try {
      const spawnSync = opts.spawnSync || childProcess.spawnSync;
      const r = spawnSync('taskkill', ['/T', '/F', '/PID', String(target)], {
        windowsHide: true, stdio: 'ignore', shell: false, timeout: 3000,
      });
      return !!r && r.status === 0;
    } catch (_) {
      return false;
    }
  }
  // Never let the negative-pid group kill turn into the kill(-1) broadcast.
  if (target === 1) return false;
  return _killGroup(target, signal, opts.kill || process.kill.bind(process));
}
|
|
88
|
+
|
|
89
|
+
module.exports = {
|
|
90
|
+
killProcessTree,
|
|
91
|
+
killProcessTreeSync,
|
|
92
|
+
// internal, exposed for tests
|
|
93
|
+
_killGroup,
|
|
94
|
+
_taskkillTree,
|
|
95
|
+
};
|