ai-or-die 0.1.76 → 0.1.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/ai-or-die.js CHANGED
@@ -43,7 +43,7 @@ program
43
43
  .option('--no-sticky-notes', 'disable per-tab AI session summaries + auto tab titles (on by default)')
44
44
  .option('--sticky-notes-model-dir <path>', 'custom directory for the sticky-note model file')
45
45
  .option('--sticky-notes-model <url>', 'override the sticky-note model GGUF download URL')
46
- .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto, max 4)')
46
+ .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto — three-quarters of the cores on CPU, gentle on GPU)')
47
47
  .option('--no-keepalive', 'disable keeping the machine awake while the server runs (Windows only; on by default)')
48
48
  .option('--keepalive-display', 'also keep the display on (default keeps the system awake but lets the monitor sleep)');
49
49
 
package/bin/supervisor.js CHANGED
@@ -5,6 +5,7 @@
5
5
  const { spawn } = require('child_process');
6
6
  const path = require('path');
7
7
  const { RESTART_EXIT_CODE } = require('../src/restart-manager');
8
+ const jobGuard = require('../src/job-guard');
8
9
 
9
10
  // ---------------------------------------------------------------------------
10
11
  // Tunables — all overridable via env vars so the regression test can shrink
@@ -14,7 +15,12 @@ const { RESTART_EXIT_CODE } = require('../src/restart-manager');
14
15
 
15
16
  const RESTART_DELAY_MS = parseInt(process.env.RESTART_DELAY_MS, 10) || 1000; // clean RESTART_EXIT_CODE respawn
16
17
  const CRASH_RESTART_DELAY_MS = parseInt(process.env.CRASH_RESTART_DELAY_MS, 10) || 3000; // normal crash respawn
17
- const SHUTDOWN_TIMEOUT_MS = parseInt(process.env.SHUTDOWN_TIMEOUT_MS, 10) || 10000; // SIGINT/SIGTERM hard-kill fallback
18
+ // Must stay strictly GREATER than the server's own 15s force-exit budget
19
+ // (src/server.js handleShutdown) so a hung graceful shutdown lets the server
20
+ // finish (or self-force-exit) its own ordered teardown — including killing its
21
+ // PTY trees — before the supervisor hard-kills it. A supervisor SIGKILL that
22
+ // preempted the server would orphan the server's PTY/grandchild tree.
23
+ const SHUTDOWN_TIMEOUT_MS = parseInt(process.env.SHUTDOWN_TIMEOUT_MS, 10) || 20000; // SIGINT/SIGTERM hard-kill fallback
18
24
 
19
25
  // Tier 1 — tight crash loop. 3 crashes in 30 s historically tripped a hard
20
26
  // process.exit(1). The fix replaces that with an extended restart delay
@@ -49,6 +55,12 @@ let spawnCount = 0;
49
55
  let crashTimestamps = [];
50
56
  let pendingRestartTimer = null;
51
57
 
58
+ // Windows Job Object guard. Set up once at startup (below, before the first spawn).
59
+ // `jobGuardHandle` is held for the supervisor's entire life and intentionally NEVER
60
+ // closed by us — process death closes it, which is exactly what fires KILL_ON_JOB_CLOSE.
61
+ let jobGuardHandle = null;
62
+ let jobGuardActive = false;
63
+
52
64
  // Queued IPC message delivered to the NEXT spawned child once its IPC channel
53
65
  // is open. Used to forward tier-2 escalation downstream so the in-process
54
66
  // server can surface it to the browser ("supervisor is throttling restarts").
@@ -121,6 +133,9 @@ function startServer() {
121
133
  env: {
122
134
  ...process.env,
123
135
  SUPERVISED: '1',
136
+ // Tell the server whether the kill-on-close job guard is active, so it can
137
+ // surface the unprotected state in /api/diagnostics (process_guard.job_guard_active: false).
138
+ AOD_JOB_GUARD: jobGuardActive ? '1' : '0',
124
139
  ...(isRestart ? { AOD_SUPERVISOR_RESTART: '1' } : {})
125
140
  }
126
141
  });
@@ -252,4 +267,39 @@ process.on('message', (msg) => {
252
267
  if (msg && msg.type === 'shutdown') shutdownGracefully();
253
268
  });
254
269
 
270
+ // Establish the Windows Job Object guard BEFORE the first spawn so the server and its
271
+ // entire future tree (PTYs, the CLI's node/bun MCP grandchildren) auto-join the job.
272
+ // AssignProcessToJobObject is forward-looking: future children join, so the supervisor
273
+ // must be assigned while it still has no descendants — i.e. before startServer(). When
274
+ // the supervisor dies for ANY reason (Ctrl+C, crash, taskkill /F, console close), the OS
275
+ // closes the in-process handle and the kernel atomically terminates the whole job.
276
+ // No-op on non-Windows / when koffi is unavailable (jobGuardActive stays false →
277
+ // best-effort teardown, surfaced as process_guard.job_guard_active: false in /api/diagnostics).
278
+ function setupJobGuard() {
279
+ try {
280
+ if (!jobGuard.isAvailable()) {
281
+ if (process.platform === 'win32') {
282
+ console.warn('[supervisor] ⚠ process-guard: koffi unavailable; using best-effort teardown. ' +
283
+ 'Child node/bun processes may survive an uncatchable kill (taskkill /F).');
284
+ }
285
+ return;
286
+ }
287
+ jobGuardHandle = jobGuard.createKillOnCloseJob();
288
+ if (jobGuardHandle && jobGuard.assignSelf(jobGuardHandle)) {
289
+ jobGuardActive = true;
290
+ console.log('[supervisor] process-guard: kill-on-close job active — the whole tree dies with the supervisor.');
291
+ } else {
292
+ jobGuardHandle = null;
293
+ console.warn('[supervisor] ⚠ process-guard: could not create/assign the job object ' +
294
+ '(outer job UI limits / EDR / silo?); using best-effort teardown. ' +
295
+ 'Child node/bun processes may survive an uncatchable kill.');
296
+ }
297
+ } catch (e) {
298
+ jobGuardHandle = null;
299
+ jobGuardActive = false;
300
+ console.warn('[supervisor] ⚠ process-guard: setup failed (' + (e && e.message) + '); continuing without it.');
301
+ }
302
+ }
303
+
304
+ setupJobGuard();
255
305
  startServer();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ai-or-die",
3
- "version": "0.1.76",
3
+ "version": "0.1.78",
4
4
  "description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
5
5
  "main": "src/server.js",
6
6
  "bin": {
@@ -47,6 +47,7 @@
47
47
  "cors": "^2.8.5",
48
48
  "express": "^4.19.2",
49
49
  "fuzzysort": "^3.1.0",
50
+ "koffi": "^3.0.2",
50
51
  "open": "^10.1.0",
51
52
  "selfsigned": "^2.4.1",
52
53
  "sherpa-onnx-node": "^1.12.24",
@@ -3,6 +3,8 @@ const { execFile } = require('child_process');
3
3
  const path = require('path');
4
4
  const fs = require('fs');
5
5
  const os = require('os');
6
+ const jobGuard = require('./job-guard');
7
+ const { killProcessTreeSync } = require('./utils/process-tree');
6
8
 
7
9
  /** Chunk size for PTY writes — safely below ConPTY ~16KB kernel buffer */
8
10
  const PTY_WRITE_CHUNK_SIZE = 4096;
@@ -314,6 +316,11 @@ class BaseBridge {
314
316
 
315
317
  this.sessions.set(sessionId, session);
316
318
 
319
+ // Windows: enclose the PTY in its own kill-on-close Job Object now, before the CLI
320
+ // boots, so the CLI's future node/bun MCP grandchildren auto-join and can be reaped
321
+ // atomically on stopSession. No-op elsewhere.
322
+ this._attachPtyJob(session, ptyProcess);
323
+
317
324
  // Spawn watchdog: if no data, exit, or error arrives within 30s, treat as failure
318
325
  let receivedLifeSign = false;
319
326
  const ptyStartedAt = Date.now();
@@ -334,6 +341,14 @@ class BaseBridge {
334
341
  // the kill() below succeeds.
335
342
  this._disposePtyDisposables(session, sessionId);
336
343
  try { ptyProcess.kill(); } catch (e) { /* ignore */ }
344
+ // Reap the PTY subtree (Windows job close / POSIX group kill) so a hung-at-startup
345
+ // shell + any children it already spawned don't leak.
346
+ {
347
+ const jobClosed = this._disposePtyJob(session);
348
+ if (!jobClosed && ptyProcess && ptyProcess.pid) {
349
+ try { killProcessTreeSync(ptyProcess.pid); } catch (_) { /* best-effort */ }
350
+ }
351
+ }
337
352
  onError(new Error(`${this.toolName} process did not respond within ${SPAWN_TIMEOUT_MS / 1000} seconds. The command may not be installed or may have hung during startup.`));
338
353
  }
339
354
  }, SPAWN_TIMEOUT_MS);
@@ -390,6 +405,10 @@ class BaseBridge {
390
405
  // still hold references to the data-buffer closures. Skip onExit
391
406
  // self-disposal (node-pty already removed it on fire).
392
407
  this._disposePtyDisposables(session, sessionId);
408
+ // The PTY exited on its own, but the CLI's node/bun grandchildren may still be
409
+ // alive (node-pty doesn't walk the console process list). Closing the per-PTY
410
+ // kill-on-close job reaps them and frees the handle (Windows; no-op on POSIX).
411
+ this._disposePtyJob(session);
393
412
  if (this.sessions.has(sessionId)) {
394
413
  session.active = false;
395
414
  this.sessions.delete(sessionId);
@@ -444,6 +463,14 @@ class BaseBridge {
444
463
  // the shell is alive but unreadable) doesn't leak as a zombie process /
445
464
  // FD — mirrors the spawn-watchdog teardown. Harmless if already dead.
446
465
  try { ptyProcess.kill(); } catch (e) { /* ignore — may already be dead */ }
466
+ // Reap the subtree (Windows job close / POSIX group kill) so the shell's
467
+ // node/bun grandchildren don't outlive the failed session.
468
+ {
469
+ const jobClosed = this._disposePtyJob(session);
470
+ if (!jobClosed && ptyProcess && ptyProcess.pid) {
471
+ try { killProcessTreeSync(ptyProcess.pid); } catch (_) { /* best-effort */ }
472
+ }
473
+ }
447
474
  if (this.sessions.has(sessionId)) {
448
475
  session.active = false;
449
476
  this.sessions.delete(sessionId);
@@ -584,6 +611,53 @@ class BaseBridge {
584
611
  }
585
612
  }
586
613
 
614
+ /**
615
+ * Windows only: put a freshly-spawned PTY in its OWN kill-on-close Job Object so the
616
+ * PTY and the CLI's node/bun MCP grandchildren can be torn down atomically per session
617
+ * (see _disposePtyJob). Assigned right after spawn — before the CLI boots and spawns its
618
+ * children — so those future grandchildren auto-join the job. No-op on POSIX / when the
619
+ * job guard is unavailable (then teardown falls back to process-group / taskkill).
620
+ * Defence in depth: the PTY is also in the supervisor-level job, so supervisor death
621
+ * reaps it regardless.
622
+ * @private
623
+ */
624
+ _attachPtyJob(session, ptyProcess) {
625
+ if (process.platform !== 'win32' || !session) return;
626
+ try {
627
+ if (!jobGuard.isAvailable()) return;
628
+ const pid = ptyProcess && ptyProcess.pid;
629
+ if (!pid) return;
630
+ // Defensive: if a handle already exists for this session (re-entrant call), close it
631
+ // first so we never overwrite a live kernel handle and leak it.
632
+ if (session.jobHandle) this._disposePtyJob(session);
633
+ const handle = jobGuard.createKillOnCloseJob();
634
+ if (!handle) return;
635
+ if (jobGuard.assignPid(handle, pid)) {
636
+ session.jobHandle = handle;
637
+ } else {
638
+ // Assign failed (already-orphaned / access denied) — closing an empty job is harmless.
639
+ jobGuard.closeJob(handle);
640
+ }
641
+ } catch (_) { /* best-effort; never break session start */ }
642
+ }
643
+
644
+ /**
645
+ * Close a session's per-PTY job handle. On Windows this is the deterministic teardown:
646
+ * KILL_ON_JOB_CLOSE terminates the PTY + every descendant still in the job (the node/bun
647
+ * grandchildren). Also frees the kernel handle. Idempotent; no-op when no handle exists.
648
+ * Returns true when a job was actually closed (the subtree is reaped deterministically),
649
+ * false otherwise — callers use this to decide whether to escalate to the best-effort
650
+ * fallback (taskkill /T /F on Windows / process-group kill on POSIX) for the degraded
651
+ * path where no job exists (koffi unavailable: SEA binary, EDR/CLM-blocked, or POSIX).
652
+ * @private
653
+ */
654
+ _disposePtyJob(session) {
655
+ if (!session || !session.jobHandle) return false;
656
+ const h = session.jobHandle;
657
+ session.jobHandle = null;
658
+ try { return !!jobGuard.closeJob(h); } catch (_) { return false; }
659
+ }
660
+
587
661
  async stopSession(sessionId) {
588
662
  const session = this.sessions.get(sessionId);
589
663
  if (!session) {
@@ -608,13 +682,20 @@ class BaseBridge {
608
682
  // runs.
609
683
  this._disposePtyDisposables(session, sessionId);
610
684
 
611
- if (!session.process) return;
685
+ // No live process to wait on — close the per-PTY job (reaps any lingering grandchildren
686
+ // and frees the kernel handle) before returning, so this path can't leak the handle.
687
+ if (!session.process) { this._disposePtyJob(session); return; }
688
+
689
+ // Capture the pid before kill(): node-pty may clear it on exit, and we need it
690
+ // for the POSIX process-group escalation below.
691
+ const ptyPid = session.process.pid;
612
692
 
613
693
  // Return a promise that resolves when the PTY process actually exits
614
694
  // (or after a bounded timeout), so callers can await clean shutdown.
615
695
  return new Promise((resolve) => {
616
696
  let settled = false;
617
697
  let waitDisposable = null;
698
+ let exited = false;
618
699
 
619
700
  const cleanup = () => {
620
701
  if (settled) return;
@@ -628,11 +709,24 @@ class BaseBridge {
628
709
  if (waitDisposable && typeof waitDisposable.dispose === 'function') {
629
710
  try { waitDisposable.dispose(); } catch (_) { /* ignore */ }
630
711
  }
712
+ // Deterministic subtree teardown. On Windows with the job guard, closing the
713
+ // per-PTY kill-on-close job terminates the shell + its node/bun grandchildren and
714
+ // frees the handle — what node-pty's own kill() cannot do. When no job was closed
715
+ // (degraded: koffi unavailable in a SEA binary / EDR-blocked, or POSIX) AND the PTY
716
+ // did not exit on its own, escalate to the best-effort fallback: taskkill /T /F on
717
+ // Windows, or a process-group kill on POSIX (node-pty PTYs are session leaders). We
718
+ // skip the fallback after a clean exit to sidestep any pid/pgid-reuse window.
719
+ {
720
+ const jobClosed = this._disposePtyJob(session);
721
+ if (!jobClosed && !exited && ptyPid) {
722
+ try { killProcessTreeSync(ptyPid); } catch (_) { /* best-effort */ }
723
+ }
724
+ }
631
725
  resolve();
632
726
  };
633
727
 
634
728
  try {
635
- const handle = session.process.onExit(() => cleanup());
729
+ const handle = session.process.onExit(() => { exited = true; cleanup(); });
636
730
  // node-pty returns an IDisposable; older mocks may return undefined.
637
731
  if (handle && typeof handle.dispose === 'function') waitDisposable = handle;
638
732
  } catch (_) {
@@ -672,6 +766,25 @@ class BaseBridge {
672
766
  await this.stopSession(sessionId);
673
767
  }
674
768
  }
769
+
770
+ /**
771
+ * Synchronous, best-effort reap of EVERY live PTY subtree this bridge owns. For the
772
+ * uncaughtException and supervisor-death (IPC disconnect) paths, where we are about to
773
+ * exit and cannot await async teardown. Windows: close each per-PTY kill-on-close job
774
+ * (terminates the shell + node/bun grandchildren). POSIX: process-group kill of each PTY.
775
+ * Never throws.
776
+ */
777
+ killAllSubtreesSync() {
778
+ for (const [, session] of this.sessions) {
779
+ let jobClosed = false;
780
+ try { jobClosed = this._disposePtyJob(session); } catch (_) { jobClosed = false; }
781
+ // Degraded fallback (no job closed): taskkill /T /F on Windows, process-group kill
782
+ // on POSIX. When the job WAS closed the kernel already reaped the subtree.
783
+ if (!jobClosed && session && session.process && session.process.pid) {
784
+ try { killProcessTreeSync(session.process.pid); } catch (_) { /* ignore */ }
785
+ }
786
+ }
787
+ }
675
788
  }
676
789
 
677
790
  module.exports = BaseBridge;
@@ -0,0 +1,249 @@
1
+ 'use strict';
2
+
3
+ // Windows Job Object guard — deterministic process-tree teardown.
4
+ //
5
+ // The core mechanism for "no zombie node/bun processes": a Win32 Job Object with
6
+ // JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. When the last handle to such a job closes
7
+ // (the holding process dies for ANY reason — Ctrl+C, crash, taskkill /F, console
8
+ // close), the kernel terminates EVERY process in the job atomically, with no user
9
+ // code running. This is the only Windows mechanism that survives an uncatchable kill.
10
+ //
11
+ // Two uses (see docs/specs/process-shutdown.md, ADR-00NN):
12
+ // 1. Supervisor-level job: bin/supervisor.js creates a kill-on-close job and assigns
13
+ // ITSELF before forking the server. AssignProcessToJobObject is forward-looking, so
14
+ // every future descendant (server, PTYs, the CLI's node/bun MCP grandchildren) joins
15
+ // the job. Supervisor death closes the in-process handle → the whole tree dies. The
16
+ // job persists across server restarts (only supervisor death closes the handle), so
17
+ // the legitimate exit-75 memory restart is unaffected.
18
+ // 2. Per-PTY nested job: src/base-bridge.js puts each PTY in its own kill-on-close job;
19
+ // closing that handle on stopSession atomically kills the PTY + its grandchildren —
20
+ // deterministic per-session teardown that also satisfies "restart independently".
21
+ //
22
+ // Held IN-PROCESS via the koffi FFI (not an external helper, which would be a single
23
+ // point of failure, and not PowerShell, whose Add-Type→csc.exe is blocked by CLM/WDAC/
24
+ // AMSI on the hardened corporate boxes that are the primary audience). The job's
25
+ // BREAKAWAY_OK flag is deliberately left OFF so a child requesting CREATE_BREAKAWAY_FROM_JOB
26
+ // is kept in the job rather than escaping it.
27
+ //
28
+ // Windows-only by design; a no-op on macOS/Linux and whenever koffi is unavailable
29
+ // (e.g. under Bun, or a locked-down box). Never throws into the caller — the guard must
30
+ // never break startup; failure degrades to best-effort taskkill (process_guard.job_guard_active: false).
31
+
32
+ const IS_WIN = process.platform === 'win32';
33
+
34
+ // JOBOBJECTINFOCLASS
35
+ const JobObjectExtendedLimitInformation = 9;
36
+ // JOBOBJECT_BASIC_LIMIT_INFORMATION.LimitFlags
37
+ const JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000;
38
+ // OpenProcess access rights needed to assign a foreign pid to a job
39
+ const PROCESS_TERMINATE = 0x0001;
40
+ const PROCESS_SET_QUOTA = 0x0100;
41
+
42
+ let _koffi = null;
43
+ let _api = null;
44
+ let _loadError = null;
45
+
46
+ // Lazily bind kernel32 via koffi. Returns the bound API or null (cached).
47
+ function _ensureApi() {
48
+ if (_api || _loadError) return _api;
49
+ if (!IS_WIN) { _loadError = new Error('not win32'); return null; }
50
+ try {
51
+ _koffi = require('koffi');
52
+
53
+ const JOBOBJECT_BASIC_LIMIT_INFORMATION = _koffi.struct('JOBOBJECT_BASIC_LIMIT_INFORMATION', {
54
+ PerProcessUserTimeLimit: 'int64',
55
+ PerJobUserTimeLimit: 'int64',
56
+ LimitFlags: 'uint32',
57
+ MinimumWorkingSetSize: 'size_t',
58
+ MaximumWorkingSetSize: 'size_t',
59
+ ActiveProcessLimit: 'uint32',
60
+ Affinity: 'size_t',
61
+ PriorityClass: 'uint32',
62
+ SchedulingClass: 'uint32',
63
+ });
64
+ const IO_COUNTERS = _koffi.struct('IO_COUNTERS', {
65
+ ReadOperationCount: 'uint64',
66
+ WriteOperationCount: 'uint64',
67
+ OtherOperationCount: 'uint64',
68
+ ReadTransferCount: 'uint64',
69
+ WriteTransferCount: 'uint64',
70
+ OtherTransferCount: 'uint64',
71
+ });
72
+ const JOBOBJECT_EXTENDED_LIMIT_INFORMATION = _koffi.struct('JOBOBJECT_EXTENDED_LIMIT_INFORMATION', {
73
+ BasicLimitInformation: JOBOBJECT_BASIC_LIMIT_INFORMATION,
74
+ IoInfo: IO_COUNTERS,
75
+ ProcessMemoryLimit: 'size_t',
76
+ JobMemoryLimit: 'size_t',
77
+ PeakProcessMemoryUsed: 'size_t',
78
+ PeakJobMemoryUsed: 'size_t',
79
+ });
80
+
81
+ const k = _koffi.load('kernel32.dll');
82
+ _api = {
83
+ sizeofExtLimit: _koffi.sizeof(JOBOBJECT_EXTENDED_LIMIT_INFORMATION),
84
+ CreateJobObjectW: k.func('void* __stdcall CreateJobObjectW(void* lpJobAttributes, void* lpName)'),
85
+ // The struct is registered in koffi's type registry under its name, so the C
86
+ // prototype can reference it by that name (passed by pointer → marshaled from a JS object).
87
+ SetInformationJobObject: k.func('int __stdcall SetInformationJobObject(void* hJob, int JobObjectInformationClass, ' +
88
+ 'JOBOBJECT_EXTENDED_LIMIT_INFORMATION* lpJobObjectInformation, uint32 cbJobObjectInformationLength)'),
89
+ AssignProcessToJobObject: k.func('int __stdcall AssignProcessToJobObject(void* hJob, void* hProcess)'),
90
+ OpenProcess: k.func('void* __stdcall OpenProcess(uint32 dwDesiredAccess, int bInheritHandle, uint32 dwProcessId)'),
91
+ GetCurrentProcess: k.func('void* __stdcall GetCurrentProcess()'),
92
+ CloseHandle: k.func('int __stdcall CloseHandle(void* hObject)'),
93
+ GetLastError: k.func('uint32 __stdcall GetLastError()'),
94
+ };
95
+ } catch (err) {
96
+ _loadError = err;
97
+ _api = null;
98
+ }
99
+ return _api;
100
+ }
101
+
102
+ // True only when the koffi-backed Win32 binding is usable on this platform.
103
+ // `AOD_DISABLE_JOB_GUARD=1` forces it off (operator escape hatch if koffi/the FFI ever
104
+ // misbehaves, and the hook that lets tests exercise the best-effort degraded teardown).
105
+ // In the SEA single-file binary koffi is externalized out of the bundle and there is no
106
+ // node_modules, so it can never load — short-circuit to degraded mode without attempting
107
+ // the require (keeps the PTY-start hot path free of a doomed module lookup).
108
+ function isAvailable() {
109
+ if (process.env.AOD_DISABLE_JOB_GUARD === '1') return false;
110
+ if (typeof global !== 'undefined' && global.__SEA_MODE__) return false;
111
+ return !!_ensureApi();
112
+ }
113
+
114
+ // Explicit NULL-handle predicate. koffi 3.x returns JS `null` for a NULL pointer and a
115
+ // BigInt for a valid HANDLE (verified on koffi 3.0.2), so a bare `!h` already works; this
116
+ // guards the common shapes explicitly so a future koffi representation of NULL (0 / 0n /
117
+ // undefined) can't slip an invalid handle into a WinAPI call.
118
+ function _isNullHandle(h) {
119
+ return h === null || h === undefined || h === 0 || h === 0n;
120
+ }
121
+
122
+ // Create a job object with KILL_ON_JOB_CLOSE set (BREAKAWAY_OK deliberately OFF).
123
+ // Returns the job handle (opaque, pass back to assign*/closeJob) or null on any failure.
124
+ function createKillOnCloseJob() {
125
+ const api = _ensureApi();
126
+ if (!api) return null;
127
+ let job = null;
128
+ try {
129
+ job = api.CreateJobObjectW(null, null);
130
+ if (_isNullHandle(job)) return null;
131
+ const info = {
132
+ BasicLimitInformation: {
133
+ PerProcessUserTimeLimit: 0n,
134
+ PerJobUserTimeLimit: 0n,
135
+ LimitFlags: JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
136
+ MinimumWorkingSetSize: 0,
137
+ MaximumWorkingSetSize: 0,
138
+ ActiveProcessLimit: 0,
139
+ Affinity: 0,
140
+ PriorityClass: 0,
141
+ SchedulingClass: 0,
142
+ },
143
+ IoInfo: {
144
+ ReadOperationCount: 0n, WriteOperationCount: 0n, OtherOperationCount: 0n,
145
+ ReadTransferCount: 0n, WriteTransferCount: 0n, OtherTransferCount: 0n,
146
+ },
147
+ ProcessMemoryLimit: 0,
148
+ JobMemoryLimit: 0,
149
+ PeakProcessMemoryUsed: 0,
150
+ PeakJobMemoryUsed: 0,
151
+ };
152
+ const ok = api.SetInformationJobObject(job, JobObjectExtendedLimitInformation, info, api.sizeofExtLimit);
153
+ if (!ok) {
154
+ // Could not arm kill-on-close — a job without it is useless (worse: it would
155
+ // hold processes without ever reaping them). Close and fail closed to null.
156
+ try { api.CloseHandle(job); } catch (_) { /* ignore */ }
157
+ return null;
158
+ }
159
+ return job;
160
+ } catch (_) {
161
+ if (job) { try { api.CloseHandle(job); } catch (__) { /* ignore */ } }
162
+ return null;
163
+ }
164
+ }
165
+
166
+ // Assign the CURRENT process to the job (used by the supervisor before forking).
167
+ // Returns true on success. After this, all future descendants auto-join the job.
168
+ function assignSelf(job) {
169
+ const api = _ensureApi();
170
+ if (!api || _isNullHandle(job)) return false;
171
+ try {
172
+ const self = api.GetCurrentProcess(); // pseudo-handle (-1); valid for AssignProcessToJobObject
173
+ return !!api.AssignProcessToJobObject(job, self);
174
+ } catch (_) {
175
+ return false;
176
+ }
177
+ }
178
+
179
+ // Assign a foreign process (by pid) to the job (used per-PTY). Opens a scoped handle
180
+ // with exactly the rights needed, assigns, then closes that process handle (NOT the job).
181
+ // Returns true on success.
182
+ //
183
+ // PID-reuse safety: callers must pass the pid of a process they KNOW is currently alive
184
+ // and call this synchronously after spawning it (base-bridge._attachPtyJob runs in the same
185
+ // synchronous tick as node-pty's spawn() that produced the pid), so there is no async window
186
+ // in which the pid could be recycled before OpenProcess. Do NOT call this with a pid that
187
+ // may have already exited.
188
+ function assignPid(job, pid) {
189
+ const api = _ensureApi();
190
+ if (!api || _isNullHandle(job) || !pid) return false;
191
+ let h = null;
192
+ try {
193
+ h = api.OpenProcess(PROCESS_TERMINATE | PROCESS_SET_QUOTA, 0, pid >>> 0);
194
+ if (_isNullHandle(h)) return false;
195
+ return !!api.AssignProcessToJobObject(job, h);
196
+ } catch (_) {
197
+ return false;
198
+ } finally {
199
+ if (!_isNullHandle(h)) { try { api.CloseHandle(h); } catch (_) { /* ignore */ } }
200
+ }
201
+ }
202
+
203
+ // Close a job handle. For a per-PTY kill-on-close job this is the teardown trigger:
204
+ // it terminates every process still in the job. Idempotent-safe to call with null.
205
+ // NEVER call this on the supervisor-level job (its close = kill the whole tree); the
206
+ // supervisor holds it for life and lets process death close it.
207
+ function closeJob(job) {
208
+ const api = _ensureApi();
209
+ if (!api || _isNullHandle(job)) return false;
210
+ try {
211
+ return !!api.CloseHandle(job);
212
+ } catch (_) {
213
+ return false;
214
+ }
215
+ }
216
+
217
+ module.exports = {
218
+ isAvailable,
219
+ createKillOnCloseJob,
220
+ assignSelf,
221
+ assignPid,
222
+ closeJob,
223
+ // exposed for diagnostics/tests
224
+ _loadError: () => _loadError,
225
+ };
226
+
227
+ // --- self-test: `node src/job-guard.js` ------------------------------------------
228
+ // Proves the koffi bindings + struct marshaling work end to end on this host:
229
+ // create a kill-on-close job, assign a spawned child, close the job, assert the child dies.
230
+ if (require.main === module) {
231
+ if (!IS_WIN) { console.log('job-guard self-test: non-win32, no-op OK'); process.exit(0); }
232
+ const { spawn } = require('child_process');
233
+ console.log('koffi available:', isAvailable(), 'loadError:', _loadError && _loadError.message);
234
+ const job = createKillOnCloseJob();
235
+ console.log('createKillOnCloseJob ->', job ? 'OK' : 'FAIL');
236
+ if (!job) process.exit(1);
237
+ // Long-lived child that does nothing but stay alive.
238
+ const child = spawn(process.execPath, ['-e', 'setInterval(()=>{}, 1e9)'], { stdio: 'ignore' });
239
+ console.log('spawned child pid', child.pid);
240
+ setTimeout(() => {
241
+ const assigned = assignPid(job, child.pid);
242
+ console.log('assignPid ->', assigned ? 'OK' : 'FAIL');
243
+ closeJob(job);
244
+ console.log('closeJob called; waiting to see if child dies...');
245
+ let exited = false;
246
+ child.on('exit', (code, sig) => { exited = true; console.log(`child exited code=${code} sig=${sig} -> KILL-ON-CLOSE OK`); process.exit(0); });
247
+ setTimeout(() => { if (!exited) { console.log('child STILL ALIVE -> FAIL'); try { child.kill(); } catch (_) {} process.exit(2); } }, 2000);
248
+ }, 300);
249
+ }
package/src/server.js CHANGED
@@ -384,6 +384,10 @@ class ClaudeCodeWebServer {
384
384
  } catch (saveErr) {
385
385
  console.error('Failed to save sessions on crash:', saveErr);
386
386
  }
387
+ // Reap PTY subtrees so the CLI's node/bun grandchildren don't outlive this crashed
388
+ // server. Synchronous (the event loop is unsafe here). Windows closes each per-PTY
389
+ // job; POSIX group-kills. Best-effort; never rethrows.
390
+ try { this._reapAllPtySubtreesSync(); } catch (_) { /* ignore */ }
387
391
  process.exit(1);
388
392
  });
389
393
  process.on('unhandledRejection', (reason) => {
@@ -401,10 +405,21 @@ class ClaudeCodeWebServer {
401
405
  this.handleShutdown();
402
406
  }
403
407
  });
404
- // If the supervisor crashes, continue running standalone
408
+ // If the supervisor's IPC channel drops, the supervisor died. Per the
409
+ // "everything dies when the main process dies" contract, this server must NOT
410
+ // keep running standalone (the old behavior) — it tears down its own PTY trees
411
+ // (incl. the CLI's node/bun grandchildren) and shuts down.
405
412
  process.on('disconnect', () => {
406
- console.warn('IPC channel disconnected (supervisor may have crashed). Continuing standalone.');
407
- this.supervised = false;
413
+ // Expected channel close: a graceful shutdown / memory-restart we initiated is
414
+ // already in flight (the supervisor sent {type:'shutdown'} or we exited 75). No-op.
415
+ if (this.isShuttingDown) return;
416
+ console.warn('IPC channel disconnected (supervisor died). Tearing down this server and its process tree.');
417
+ // Reap PTY subtrees synchronously FIRST so the node/bun grandchildren die immediately,
418
+ // even if the ordered handleShutdown below is slow. On Windows with the job guard
419
+ // active the kernel has usually already killed us; this is the cross-platform /
420
+ // degraded-mode backstop.
421
+ try { this._reapAllPtySubtreesSync(); } catch (_) { /* best-effort */ }
422
+ this.handleShutdown(0);
408
423
  });
409
424
  }
410
425
 
@@ -3937,6 +3952,24 @@ class ClaudeCodeWebServer {
3937
3952
  return bridges[agentType] || null;
3938
3953
  }
3939
3954
 
3955
+ /**
3956
+ * Synchronously reap every PTY subtree across all bridges. For the crash / supervisor-
3957
+ * death paths where we are exiting and cannot await async teardown. Windows closes each
3958
+ * per-PTY kill-on-close job (terminates the shell + node/bun grandchildren); POSIX
3959
+ * process-group kills. Best-effort; never throws.
3960
+ */
3961
+ _reapAllPtySubtreesSync() {
3962
+ const bridges = [
3963
+ this.claudeBridge, this.codexBridge, this.copilotBridge,
3964
+ this.geminiBridge, this.terminalBridge,
3965
+ ];
3966
+ for (const b of bridges) {
3967
+ if (b && typeof b.killAllSubtreesSync === 'function') {
3968
+ try { b.killAllSubtreesSync(); } catch (_) { /* ignore */ }
3969
+ }
3970
+ }
3971
+ }
3972
+
3940
3973
  async startToolSession(wsId, toolName, bridge, options, cols, rows) {
3941
3974
  const wsInfo = this.webSocketConnections.get(wsId);
3942
3975
  if (!wsInfo) {
@@ -4243,7 +4276,24 @@ class ClaudeCodeWebServer {
4243
4276
  percent: progress.percent,
4244
4277
  });
4245
4278
  })
4246
- .then(() => this._broadcastStickyStatus())
4279
+ .then(() => {
4280
+ // One-time visibility into the inference backend. On CPU (no GPU — common
4281
+ // on Windows when the Vulkan/CUDA prebuilt is incompatible) summaries are
4282
+ // materially slower; the worker compensates with more threads + a generous
4283
+ // watchdog timeout, but a note can still take a couple of minutes.
4284
+ const rt = this.stickyNoteEngine.getRuntimeInfo && this.stickyNoteEngine.getRuntimeInfo();
4285
+ if (this.dev && rt) {
4286
+ if (rt.gpu) {
4287
+ console.log(`[sticky-notes] engine ready (GPU backend, ${rt.threads} threads)`);
4288
+ } else {
4289
+ console.log(
4290
+ `[sticky-notes] engine ready (CPU backend, ${rt.threads} threads) — ` +
4291
+ 'summaries run on CPU and may take a couple of minutes; a Vulkan/CUDA driver would accelerate them'
4292
+ );
4293
+ }
4294
+ }
4295
+ this._broadcastStickyStatus();
4296
+ })
4247
4297
  .catch((err) => {
4248
4298
  // Allow a later AI-session start to retry after a transient failure
4249
4299
  // (download blip). A permanent failure (no binding) just fails fast.
@@ -5226,6 +5276,15 @@ class ClaudeCodeWebServer {
5226
5276
  .filter(s => s._voiceUploadTimestamps && s._voiceUploadTimestamps.length).length,
5227
5277
  activity_broadcast_timestamps: (this.activityBroadcastTimestamps && this.activityBroadcastTimestamps.size) || 0,
5228
5278
  },
5279
+ // Deterministic-shutdown guard status. On win32, job_guard_active reflects whether
5280
+ // the supervisor established the kill-on-close Job Object (false ⇒ degraded:
5281
+ // EDR/CLM/koffi unavailable ⇒ best-effort taskkill teardown). null off win32 (the
5282
+ // job mechanism is Windows-only; POSIX uses process-group teardown). See
5283
+ // docs/specs/process-shutdown.md.
5284
+ process_guard: {
5285
+ job_guard_active: process.platform === 'win32' ? (process.env.AOD_JOB_GUARD === '1') : null,
5286
+ supervised: !!this.supervised,
5287
+ },
5229
5288
  // DISK-02/03: cached disk usage sample (60 s TTL, never blocks the
5230
5289
  // event loop). Populated by _sampleDiskUsage() — see method
5231
5290
  // comment for the time-budget contract.
@@ -9,21 +9,31 @@
9
9
 
10
10
  const { Worker } = require('worker_threads');
11
11
  const path = require('path');
12
- const os = require('os');
13
12
  const GgufModelManager = require('./utils/gguf-model-manager');
14
13
  const { isBun } = require('./utils/runtime');
15
14
 
16
15
  const MAX_QUEUE_SIZE = 3;
17
- const DEFAULT_INFER_TIMEOUT_MS = 60000;
16
+ // Watchdog-grade, unconditional per-request timeout. Real grammar-constrained
17
+ // summaries on a CPU backend (no GPU — common on Windows when the Vulkan/CUDA
18
+ // prebuilt is incompatible) take ~90s on half-core threading and up to ~160s on
19
+ // 2 threads. This is a true catastrophic watchdog set well above that, NOT an
20
+ // expected boundary: correctness over speed (a slow note must still complete).
21
+ // GPU runs finish in ~7s and return immediately, so the high cap costs them
22
+ // nothing. The summarizer's backstop sits strictly above this (one timeout
23
+ // owner). An explicit inferTimeoutMs still overrides.
24
+ const DEFAULT_INFER_TIMEOUT_MS = 300000;
18
25
  const MAX_RESTART_DELAY_MS = 15000;
19
26
  const MAX_RESTART_ATTEMPTS = 5;
20
27
 
21
28
  class StickyNoteEngine {
22
29
  constructor(options = {}) {
23
30
  this._enabled = !!options.enabled;
24
- // Low thread cap keeps inference gentle so the model can't saturate CPU and
25
- // starve the terminal / AI agent. Summaries are infrequent + throttled.
26
- this._numThreads = options.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
31
+ // Thread count is auto-selected by the worker once it knows whether a GPU
32
+ // backend loaded (see sticky-note-threads.pickThreads), UNLESS the caller
33
+ // pins it explicitly (--sticky-notes-threads). Auto is signalled by leaving
34
+ // numThreads out of the worker data entirely.
35
+ this._numThreadsExplicit = Number.isFinite(Number(options.numThreads)) && Number(options.numThreads) > 0;
36
+ this._numThreads = this._numThreadsExplicit ? Math.floor(Number(options.numThreads)) : null;
27
37
  this._contextSize = options.contextSize || 8192;
28
38
  this._inferTimeoutMs = options.inferTimeoutMs || DEFAULT_INFER_TIMEOUT_MS;
29
39
  this._maxQueue = options.maxQueue || MAX_QUEUE_SIZE;
@@ -39,22 +49,30 @@ class StickyNoteEngine {
39
49
  this._stopping = false;
40
50
  this._initPromise = null;
41
51
  this._downloadProgress = null;
52
+ this._runtimeInfo = null; // { gpu, threads } reported by the worker on ready
42
53
 
43
54
  this._modelManager =
44
55
  options.modelManager ||
45
56
  new GgufModelManager({ model: options.model, modelsDir: options.modelsDir });
46
57
 
47
- // Injectable for tests; default spawns the real worker thread.
58
+ // Injectable for tests; default spawns the real worker thread. numThreads is
59
+ // included ONLY when explicitly pinned — otherwise the worker auto-picks.
48
60
  this._createWorker =
49
61
  options.createWorker ||
50
- (() =>
51
- new Worker(path.join(__dirname, 'sticky-note-worker.js'), {
52
- workerData: {
53
- modelPath: this._modelManager.getModelFile(),
54
- numThreads: this._numThreads,
55
- contextSize: this._contextSize,
56
- },
57
- }));
62
+ (() => new Worker(path.join(__dirname, 'sticky-note-worker.js'), { workerData: this._workerData() }));
63
+ }
64
+
65
+ /**
66
+ * Build the worker's workerData. numThreads is OMITTED when auto (not pinned)
67
+ * so the worker auto-selects based on the GPU backend it detects; pinning it
68
+ * here would defeat that. Kept as a method so it is unit-testable.
69
+ */
70
+ _workerData() {
71
+ return {
72
+ modelPath: this._modelManager.getModelFile(),
73
+ ...(this._numThreadsExplicit ? { numThreads: this._numThreads } : {}),
74
+ contextSize: this._contextSize,
75
+ };
58
76
  }
59
77
 
60
78
  async initialize(onProgress) {
@@ -109,6 +127,11 @@ class StickyNoteEngine {
109
127
  return this._downloadProgress;
110
128
  }
111
129
 
130
+ /** { gpu, threads } reported by the worker on ready, or null before ready. */
131
+ getRuntimeInfo() {
132
+ return this._runtimeInfo;
133
+ }
134
+
112
135
  /**
113
136
  * Run one inference. Resolves with the model's raw output string.
114
137
  * @param {string} prompt
@@ -173,6 +196,7 @@ class StickyNoteEngine {
173
196
  this._queue = [];
174
197
  this._currentRequest = null;
175
198
  this._worker = null;
199
+ this._runtimeInfo = null; // dead worker — drop its reported backend/threads
176
200
 
177
201
  if (this._stopping) {
178
202
  this._status = 'unavailable';
@@ -230,6 +254,7 @@ class StickyNoteEngine {
230
254
  this._status = 'ready';
231
255
  this._restartAttempts = 0;
232
256
  this._lastSpawnError = null;
257
+ this._runtimeInfo = { gpu: !!msg.gpu, threads: msg.threads || null };
233
258
  worker.on('message', (m) => this._onWorkerMessage(m));
234
259
  worker.on('exit', (c) => this._onWorkerExit(c));
235
260
  this._processQueue();
@@ -18,7 +18,7 @@ const DEFAULTS = {
18
18
  minIntervalMs: 20000, // floor between inferences for one session
19
19
  intervalFactor: 3, // adaptive: minInterval = max(floor, factor * lastDurationMs)
20
20
  turnDebounceMs: 1500, // (JSONL mode) coalesce a burst of appended turn lines
21
- inferTimeoutMs: 75000, // backstop ABOVE the engine's own 60s timeout, so the
21
+ inferTimeoutMs: 330000, // backstop ABOVE the engine's own 300s timeout, so the
22
22
  // engine times out first (one timeout owner); this only fires if the engine
23
23
  // promise hangs entirely. Worker-side serialisation prevents concurrent runs.
24
24
  failureThreshold: 3, // consecutive failures -> open circuit breaker
@@ -0,0 +1,25 @@
1
+ 'use strict';
2
+
3
+ // Thread-count policy for the sticky-note inference worker. Pure + dependency-
4
+ // free so it can be unit-tested without spawning a worker or loading a model.
5
+ //
6
+ // The worker decides its own thread count AFTER getLlama() reports whether a GPU
7
+ // backend actually loaded:
8
+ // - GPU present -> the GPU carries the inference (the worker also requests full
9
+ // layer offload); keep a low, gentle CPU thread count so it can't saturate CPU
10
+ // and starve the terminal / AI agent.
11
+ // - No GPU (CPU) -> common on Windows when the Vulkan/CUDA prebuilt binary is
12
+ // incompatible. At 2 threads one grammar-constrained summary takes ~160s on a
13
+ // 16-core box and blows every timeout; use THREE-QUARTERS of the cores (leaving
14
+ // a quarter for the terminal/agent) so it completes well inside the watchdog.
15
+ // An explicit override (--sticky-notes-threads) always wins, after validation.
16
+ // `explicit` is coerced with Number() so a numeric string (e.g. from a CLI/env
17
+ // arg) still counts as a valid pin rather than silently falling back to auto.
18
/**
 * Choose the CPU thread count for the sticky-note inference context.
 *
 * @param {object} [opts]
 * @param {number|string} [opts.explicit] - Caller-pinned count. Coerced with
 *   Number(), so a numeric string counts. Honoured (floored) only when it is a
 *   finite value >= 1; anything else falls back to the automatic policy.
 * @param {*} [opts.gpu] - Truthy when a GPU backend loaded.
 * @param {number} [opts.cpus] - Usable core count; a non-finite or
 *   non-positive value is treated as 1.
 * @returns {number} A positive integer thread count.
 */
function pickThreads({ explicit, gpu, cpus } = {}) {
  const pinned = Number(explicit);
  // Require >= 1 rather than > 0: a fractional pin such as 0.5 would otherwise
  // floor to 0 and hand the worker a zero thread count instead of a real pin.
  if (Number.isFinite(pinned) && pinned >= 1) return Math.floor(pinned);

  const cores = Number.isFinite(cpus) && cpus > 0 ? Math.floor(cpus) : 1;
  if (gpu) {
    // GPU carries the inference: keep CPU usage gentle (at most 2 threads).
    return Math.max(1, Math.min(2, cores - 2));
  }
  // CPU-only: three-quarters of the cores, never fewer than one.
  return Math.max(1, Math.floor((cores * 3) / 4));
}
24
+
25
+ module.exports = { pickThreads };
@@ -9,10 +9,10 @@
9
9
  const { parentPort, workerData } = require('worker_threads');
10
10
  const os = require('os');
11
11
  const { SYSTEM_PROMPT, NOTE_SCHEMA } = require('./sticky-note-prompt');
12
+ const { pickThreads } = require('./sticky-note-threads');
12
13
 
13
14
  const modelPath = workerData.modelPath;
14
15
  const contextSize = workerData.contextSize || 8192;
15
- const numThreads = workerData.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
16
16
  const maxTokens = workerData.maxTokens || 320;
17
17
 
18
18
  let llama;
@@ -42,12 +42,29 @@ async function init() {
42
42
  LlamaChatSessionCtor = LlamaChatSession;
43
43
 
44
44
  llama = await getLlama();
45
- model = await llama.loadModel({ modelPath });
45
+ // availableParallelism() reflects usable parallelism better than cpus().length
46
+ // on Windows hybrid P/E-core machines; fall back where it's unavailable.
47
+ const cpus = (typeof os.availableParallelism === 'function' ? os.availableParallelism() : 0) || os.cpus().length;
48
+ // llama.gpu is false | 'cuda' | 'vulkan' | 'metal'; any non-empty string = GPU.
49
+ const gpu = !!llama.gpu;
50
+ const numThreads = pickThreads({ explicit: workerData.numThreads, gpu, cpus });
51
+ // Use the GPU fully when present: request all layers in VRAM ('max'). If the
52
+ // GPU can't fit them, 'max' throws — fall back to the default 'auto', which
53
+ // still offloads as many layers as fit (never worse than CPU-only).
54
+ if (gpu) {
55
+ try {
56
+ model = await llama.loadModel({ modelPath, gpuLayers: 'max' });
57
+ } catch {
58
+ model = await llama.loadModel({ modelPath });
59
+ }
60
+ } else {
61
+ model = await llama.loadModel({ modelPath });
62
+ }
46
63
  context = await model.createContext({ contextSize, threads: numThreads });
47
64
  sequence = context.getSequence();
48
65
  grammar = await llama.createGrammarForJsonSchema(NOTE_SCHEMA);
49
66
 
50
- parentPort.postMessage({ type: 'ready' });
67
+ parentPort.postMessage({ type: 'ready', gpu, threads: numThreads });
51
68
  }
52
69
 
53
70
  async function handleInfer(msg) {
@@ -0,0 +1,95 @@
1
+ 'use strict';
2
+
3
+ // Cross-platform best-effort process-tree teardown.
4
+ //
5
+ // This is the FALLBACK layer, used when the deterministic mechanism is unavailable:
6
+ // - Windows: the per-PTY / supervisor Job Object is the real teardown. taskkill /T /F
7
+ // is only the degraded-mode backstop (jobGuard:false — EDR/CLM blocked the job).
8
+ // - POSIX: there is no job-object equivalent, so process-group kill IS the primary
9
+ // teardown for PTYs. node-pty's Unix backend runs each PTY through forkpty→setsid,
10
+ // so the PTY is a session/group leader and its pid == pgid; killing the negative pid
11
+ // targets that whole group. Honest limitation: a grandchild that calls setsid() (some
12
+ // daemonized MCP servers) starts its own group and escapes -pgid; only cgroup v2
13
+ // delegation closes that gap (see docs/specs/process-shutdown.md).
14
+ //
15
+ // Never throws into the caller — teardown must not break shutdown.
16
+
17
+ const childProcess = require('child_process');
18
+
19
+ const IS_WIN = process.platform === 'win32';
20
+
21
/**
 * Windows degraded-mode tree kill: runs `taskkill /T /F /PID <pid>` through the
 * supplied spawn implementation (windowsHide, stdio ignored, no shell).
 *
 * @param {number|string} pid - Root process id; passed to taskkill as a string.
 * @param {Function} spawnImpl - `child_process.spawn`-compatible function.
 * @param {number} [timeoutMs=4000] - Upper bound on the wait, so a hung
 *   taskkill cannot stall shutdown.
 * @returns {Promise<boolean>} Resolves true when taskkill exits with code 0;
 *   false on a non-zero exit, a spawn error, a thrown spawn, or a timeout.
 *   Never rejects.
 */
function _taskkillTree(pid, spawnImpl, timeoutMs = 4000) {
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    const done = (ok) => {
      if (settled) return;
      settled = true;
      // Drop the watchdog as soon as the outcome is known.
      if (timer !== null) clearTimeout(timer);
      resolve(ok);
    };
    try {
      const proc = spawnImpl('taskkill', ['/T', '/F', '/PID', String(pid)], {
        windowsHide: true,
        stdio: 'ignore',
        shell: false,
      });
      proc.on('exit', (code) => done(code === 0));
      proc.on('error', () => done(false));
      // A spawn implementation may report its outcome synchronously; only arm
      // the watchdog when the result is still pending.
      if (!settled) {
        timer = setTimeout(() => {
          // Do not leave a hung taskkill behind after giving up on it.
          try {
            if (typeof proc.kill === 'function') proc.kill();
          } catch (_) {
            /* best-effort */
          }
          done(false);
        }, timeoutMs);
        if (timer.unref) timer.unref();
      }
    } catch (_) {
      done(false);
    }
  });
}
43
+
44
/**
 * POSIX: signal the process group led by `pid` (negative pid), then `pid`
 * itself as a fallback in case it is not actually a group leader.
 *
 * @param {number|string} pid - Group leader pid; must be an integer > 1.
 * @param {string|number} signal - Signal passed through to `killImpl`.
 * @param {Function} killImpl - `process.kill`-compatible function.
 * @returns {boolean} true when at least one of the two kill calls did not
 *   throw; false otherwise or when `pid` is rejected. Never throws.
 */
function _killGroup(pid, signal, killImpl) {
  const target = Number(pid);
  // Refuse pid <= 1: negating 1 gives -1, which POSIX kill() treats as "every
  // process the caller may signal", not as a process group.
  if (!Number.isInteger(target) || target <= 1) return false;

  let any = false;
  try { killImpl(-target, signal); any = true; } catch (_) { /* ESRCH / EPERM */ }
  try { killImpl(target, signal); any = true; } catch (_) { /* may already be gone */ }
  return any;
}
52
+
53
/**
 * Best-effort tree-kill of `pid` and its descendants.
 * Windows uses taskkill /T /F; POSIX kills the process group.
 *
 * @param {number|string} pid - Root process id; must coerce to a positive integer.
 * @param {object} [opts]
 * @param {string|number} [opts.signal='SIGKILL'] - Signal for the POSIX path.
 * @param {Function} [opts.spawn] - Injectable `child_process.spawn` (unit tests).
 * @param {Function} [opts.kill] - Injectable `process.kill` (unit tests).
 * @returns {Promise<boolean>} Resolves false for a rejected pid; otherwise the
 *   outcome of the platform-specific kill.
 */
function killProcessTree(pid, opts = {}) {
  const signal = opts.signal || 'SIGKILL';
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return Promise.resolve(false);
  if (IS_WIN) {
    return _taskkillTree(target, opts.spawn || childProcess.spawn);
  }
  // Negating pid 1 gives -1, which POSIX kill() treats as "every process the
  // caller may signal" — never a legitimate teardown target.
  if (target === 1) return Promise.resolve(false);
  return Promise.resolve(_killGroup(target, signal, opts.kill || process.kill.bind(process)));
}
66
+
67
/**
 * Synchronous best-effort tree-kill for the uncaughtException path, where the event loop
 * is unsafe to rely on. Windows uses spawnSync(taskkill) with a 3000 ms timeout; POSIX
 * kills the process group synchronously.
 *
 * @param {number|string} pid - Root process id; must coerce to a positive integer.
 * @param {object} [opts]
 * @param {string|number} [opts.signal='SIGKILL'] - Signal for the POSIX path.
 * @param {Function} [opts.spawnSync] - Injectable `child_process.spawnSync` (unit tests).
 * @param {Function} [opts.kill] - Injectable `process.kill` (unit tests).
 * @returns {boolean} false for a rejected pid or a failed kill. Never throws.
 */
function killProcessTreeSync(pid, opts = {}) {
  const signal = opts.signal || 'SIGKILL';
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return false;
  if (IS_WIN) {
    try {
      const spawnSync = opts.spawnSync || childProcess.spawnSync;
      const r = spawnSync('taskkill', ['/T', '/F', '/PID', String(target)], {
        windowsHide: true, stdio: 'ignore', shell: false, timeout: 3000,
      });
      return !!r && r.status === 0;
    } catch (_) {
      return false;
    }
  }
  // Negating pid 1 gives -1, which POSIX kill() treats as "every process the
  // caller may signal" — never a legitimate teardown target.
  if (target === 1) return false;
  return _killGroup(target, signal, opts.kill || process.kill.bind(process));
}
88
+
89
+ module.exports = {
90
+ killProcessTree,
91
+ killProcessTreeSync,
92
+ // internal, exposed for tests
93
+ _killGroup,
94
+ _taskkillTree,
95
+ };