greprag 5.43.1 → 5.43.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,35 @@
1
1
  "use strict";
2
- /** Local watcher process registry + severed-ancestry orphan reaper.
2
+ /** Local watcher process registry + COUNT-CAP (the post-saga model).
3
3
  *
4
- * The architectural correction the whole monitor-resilience saga circled but
5
- * never landed (see adr/monitor-resilience.md 2026-05-28 "severed-ancestry =
6
- * the disarm signal" + the 2026-06-04 entry): a watcher's liveness lives at the
7
- * LOCAL process layer, not in a cloud registry. The consumer of a watcher is
8
- * the Claude Code Monitor task on THIS machine; the server can't see whether
9
- * anyone is still consuming the stream, so a server lease counts an orphan
10
- * (consumer dead, process still respawning) as "armed". Two local signals fix
11
- * that:
4
+ * THE LESSON (adr/monitor-resilience.md, "⭐ The watcher in one pass"): a
5
+ * functional watcher and an orphaned-but-alive one are EXTERNALLY
6
+ * INDISTINGUISHABLE — same command line, same live PID. Only the watcher itself
7
+ * can tell, via its own stdout pipe (EPIPE). So every external reaper that
8
+ * *judged* orphan-ness (ancestry, owner-pid, monitor-pid) was guessing, and the
9
+ * guesses false-killed live watchers — the 548-kill saga in one line.
12
10
  *
13
- * 1. **pidfile** (`~/.greprag/watchers/<short>.json`) written by the
14
- * supervisor on start, removed on terminal exit. `isLocallyArmed` reads it
15
- * to gate re-arming with zero cloud dependency and zero ghost-lease lag.
16
- * Paired with EPIPE-terminal in the watcher (a watcher whose consumer pipe
17
- * breaks exits and removes its own pidfile), a live pidfile means a live,
18
- * *consumed* watcher.
11
+ * This module no longer judges. Two reliable facts replace the one unknowable
12
+ * one:
19
13
  *
20
- * 2. **severed-ancestry reap** — `reapOrphanWatchers` snapshots the process
21
- * table and kills any `inbox watch` process whose ancestry no longer
22
- * reaches a live `claude.exe`. Backstop for the case EPIPE can't catch (a
23
- * consumer hard-killed without closing the pipe). SAFE BY CONSTRUCTION: the
24
- * kill set is intersected with "cmdline contains `inbox watch`", so it can
25
- * only ever terminate greprag watchers never claude.exe, an editor, or an
26
- * unrelated node server. A bug can at worst kill a watcher that then
27
- * re-arms; it can never touch the operator's work.
14
+ * 1. **pidfile registry** — `~/.greprag/watchers/<short>.json` is a LIST of every
15
+ * supervisor armed for that session (`{short,pid,startedAt,ownerPid}`). A
16
+ * supervisor appends on arm and removes its own entry on terminal exit; a
17
+ * hard kill leaves a dead entry that the alive-filter sweeps. `isLocallyArmed`
18
+ * = "any entry alive". (Multiple live supervisors per session is normal under
19
+ * the June-8 spray; the count-cap bounds them.)
28
20
  *
29
- * adr: adr/monitor-resilience.md */
21
+ * 2. **count-cap** — `reapOrphanWatchers` keeps the K freshest LIVE watchers per
22
+ * session and kills only the surplus, NEVER below K. No death-detection → no
23
+ * false-kill; a floor of K → it can never reap to zero, so the bug that broke
24
+ * liveness every time is structurally impossible. "Freshest" is a *bias*
25
+ * toward the just-armed functional one, not a guarantee — and that's fine:
26
+ * EPIPE kills a wrongly-kept orphan in ~30s and the supervisor respawns. It is
27
+ * **snapshot-free** (per-session file reads + `pidAlive()` syscalls), so the
28
+ * 06-05b conhost-OOM engine — a `powershell Get-CimInstance` per SessionStart —
29
+ * is gone from the cleanup path entirely.
30
+ *
31
+ * EPIPE-terminal (in the watcher) + the supervisor's programmatic respawn do the
32
+ * real liveness work; the count-cap only bounds the pile. adr: adr/monitor-resilience.md */
30
33
  var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
31
34
  if (k2 === undefined) k2 = k;
32
35
  var desc = Object.getOwnPropertyDescriptor(m, k);
@@ -61,43 +64,31 @@ var __importStar = (this && this.__importStar) || (function () {
61
64
  };
62
65
  })();
63
66
  Object.defineProperty(exports, "__esModule", { value: true });
67
+ exports.GC_GRACE_MS = exports.DEFAULT_WATCHER_CAP = void 0;
64
68
  exports.writeWatcherPidfile = writeWatcherPidfile;
69
+ exports.removeWatcherEntry = removeWatcherEntry;
65
70
  exports.removeWatcherPidfile = removeWatcherPidfile;
66
71
  exports.watcherAuditLog = watcherAuditLog;
67
72
  exports.isLocallyArmed = isLocallyArmed;
68
73
  exports.listLocalLiveWatchers = listLocalLiveWatchers;
69
74
  exports.resolveClaudeOwnerPid = resolveClaudeOwnerPid;
70
- exports.severedVerdict = severedVerdict;
75
+ exports.capSurplusVerdict = capSurplusVerdict;
71
76
  exports.reapOrphanWatchers = reapOrphanWatchers;
77
+ exports.gcVerdict = gcVerdict;
78
+ exports.gcDeadSessions = gcDeadSessions;
72
79
  const fs = __importStar(require("fs"));
73
80
  const path = __importStar(require("path"));
74
81
  const child_process_1 = require("child_process");
75
82
  const WATCHER_DIRNAME = 'watchers';
76
- // Matches a greprag watcher's command line in every launch shape: the npm shim
77
- // and the old bash while-loop wrapper (`greprag inbox watch …`), and the
78
- // supervisor's CreateProcess re-invocation (`node …\dist\index.js inbox watch
79
- // …`). Anchored on the INVOKED BINARY (`greprag` or `index.js`) immediately
80
- // followed by the `inbox watch` args NOT a bare `inbox watch` substring, which
81
- // would also match an unrelated shell that merely mentions the phrase (a grep, a
82
- // diagnostic, this very reaper). This is the SAFETY GUARD: only a process whose
83
- // command line is an actual watcher invocation is ever a kill candidate, so the
84
- // reaper cannot touch claude.exe, an editor, or an incidental shell.
85
- const WATCHER_CMD_RE = /(?:index\.js|greprag)["'\s]+inbox\s+watch\b/i;
83
+ // Matches a greprag watcher's command line in every launch shape (npm shim, bash
84
+ // relauncher, the supervisor's CreateProcess re-invocation). Anchored on the
85
+ // INVOKED BINARY (`greprag` or `index.js`) immediately followed by `inbox watch`
86
+ // — never a bare `inbox watch` substring. Retained for the owner-pid resolver's
87
+ // claude.exe match; the cleanup path no longer scans the process table at all.
86
88
  const CLAUDE_PROC_RE = /^claude(\.exe)?$/i;
87
- // The owning claude.exe PID, stamped into the watch command at arm time and
88
- // inherited by every process in the tree (launcher shell supervisor child).
89
- // The robust replacement for the ancestry walk: the launcher chain collapses
90
- // within seconds of arming (a backgrounded daemon's parent shell exits by
91
- // design), so "can I still walk up to a claude.exe?" yields false positives on
92
- // LIVE watchers. "Is the specific claude.exe that armed me still alive?" does
93
- // not. adr: adr/monitor-resilience.md
94
- const OWNER_PID_RE = /--owner-pid[=\s]+(\d+)/;
95
- // LIVE as of 2026-06-05 (before/after passed: keeps live watchers, still reaps
96
- // genuine orphans, falls back to the ancestry walk for legacy watchers with no
97
- // `--owner-pid`). Default ON; set GREPRAG_REAP_BY_OWNER=0 as an escape hatch to
98
- // revert to the pure ancestry walk. Read per-call (not a load-time const) so a
99
- // single process can exercise both modes. adr: adr/monitor-resilience.md
100
- function reapByOwner() { return process.env.GREPRAG_REAP_BY_OWNER !== '0'; }
89
+ /** Default per-session watcher floor: keep this many freshest live watchers, reap
90
+ * only the surplus above it, never below it. K=2 = the live one + one margin. */
91
+ exports.DEFAULT_WATCHER_CAP = 2;
101
92
  function grepragHome() {
102
93
  const home = process.env.HOME || process.env.USERPROFILE || '';
103
94
  return home ? path.join(home, '.greprag') : null;
@@ -106,109 +97,139 @@ function watchersDir() {
106
97
  const h = grepragHome();
107
98
  return h ? path.join(h, WATCHER_DIRNAME) : null;
108
99
  }
109
- /** Write the watcher pidfile (supervisor PID), keyed by the 8-hex session short.
110
- * `ownerPid` (the owning claude.exe) is recorded so the reaper can test liveness
111
- * without enumerating the process table. Best-effort: a failed write only means
112
- * `isLocallyArmed` falls back to re-arming, never a crash. */
113
- function writeWatcherPidfile(short, pid, ownerPid) {
100
+ function pidfilePath(short) {
101
+ const dir = watchersDir();
102
+ return dir ? path.join(dir, `${short}.json`) : null;
103
+ }
104
+ /** True iff `pid` is a live process. `process.kill(pid, 0)` sends no signal — it
105
+ * only probes existence. EPERM = exists but another user (alive); ESRCH = gone. */
106
+ function pidAlive(pid) {
107
+ if (!Number.isFinite(pid) || pid <= 0)
108
+ return false;
109
+ try {
110
+ process.kill(pid, 0);
111
+ return true;
112
+ }
113
+ catch (e) {
114
+ return e?.code === 'EPERM';
115
+ }
116
+ }
117
+ /** Read a session's entry LIST. Back-compat: a legacy single-object pidfile reads
118
+ * as a 1-element list. */
119
+ function readWatcherEntries(short) {
120
+ try {
121
+ const p = pidfilePath(short);
122
+ if (!p)
123
+ return [];
124
+ const parsed = JSON.parse(fs.readFileSync(p, 'utf-8'));
125
+ const arr = Array.isArray(parsed) ? parsed : [parsed];
126
+ return arr.filter((r) => !!r && typeof r.pid === 'number');
127
+ }
128
+ catch {
129
+ return [];
130
+ }
131
+ }
132
+ function writeWatcherEntries(short, entries) {
114
133
  try {
115
134
  const dir = watchersDir();
116
135
  if (!dir)
117
136
  return;
118
137
  fs.mkdirSync(dir, { recursive: true });
119
- const rec = { short, pid, startedAt: Date.now(), ownerPid };
120
- fs.writeFileSync(path.join(dir, `${short}.json`), JSON.stringify(rec));
138
+ const p = path.join(dir, `${short}.json`);
139
+ if (entries.length === 0) {
140
+ fs.rmSync(p, { force: true });
141
+ return;
142
+ }
143
+ fs.writeFileSync(p, JSON.stringify(entries));
144
+ }
145
+ catch { /* best-effort — a failed write only means re-arm, never a crash */ }
146
+ }
147
+ /** APPEND this supervisor's entry to the session's list (pruning dead entries as
148
+ * it goes). Best-effort. adr: adr/monitor-resilience.md */
149
+ function writeWatcherPidfile(short, pid, ownerPid) {
150
+ const entries = readWatcherEntries(short).filter(r => r.pid !== pid && pidAlive(r.pid));
151
+ entries.push({ short, pid, startedAt: Date.now(), ownerPid });
152
+ writeWatcherEntries(short, entries);
153
+ }
154
+ /** Remove THIS supervisor's entry (called on a supervisor's terminal exit). */
155
+ function removeWatcherEntry(short, pid) {
156
+ writeWatcherEntries(short, readWatcherEntries(short).filter(r => r.pid !== pid));
157
+ }
158
+ /** Remove a session's whole pidfile (legacy / full sweep). */
159
+ function removeWatcherPidfile(short) {
160
+ try {
161
+ const p = pidfilePath(short);
162
+ if (p)
163
+ fs.rmSync(p, { force: true });
121
164
  }
122
165
  catch { /* best-effort */ }
123
166
  }
124
- /** Every parseable watcher pidfile. The on-disk registry of live watchers — the
125
- * reaper's cheap pre-check reads THIS, not a process snapshot. */
126
- function readAllPidfiles() {
167
+ /** Every session short with a pidfile on disk. */
168
+ function allShorts() {
127
169
  try {
128
170
  const dir = watchersDir();
129
171
  if (!dir || !fs.existsSync(dir))
130
172
  return [];
131
- const out = [];
132
- for (const f of fs.readdirSync(dir)) {
133
- if (!f.endsWith('.json'))
134
- continue;
135
- try {
136
- const rec = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf-8'));
137
- if (rec && typeof rec.pid === 'number')
138
- out.push(rec);
139
- }
140
- catch { /* skip unparseable */ }
141
- }
142
- return out;
173
+ return fs.readdirSync(dir).filter(f => f.endsWith('.json')).map(f => f.slice(0, -5));
143
174
  }
144
175
  catch {
145
176
  return [];
146
177
  }
147
178
  }
148
- // ---- Owner-PID resolution cache ------------------------------------------
149
- // The owning claude.exe PID is STABLE for the life of a session — claude.exe
150
- // stays up across every turn. Resolving it via a process snapshot on every
151
- // unarmed prompt (the `notify` hook's hot path) was the dominant conhost-leak
152
- // engine: each call spawned a `powershell Get-CimInstance Win32_Process` whose
153
- // conhost orphaned under load. Cache the result keyed by session short; the hot
154
- // path becomes a file read + one `pidAlive()` syscall, no subprocess.
155
- // adr: adr/monitor-resilience.md
179
+ // ---- Owner-PID cache (vestigial under the count-cap; kept for the arm stamp) ----
180
+ // The `notify` hook still stamps `--owner-pid` into the arm command for audit
181
+ // continuity. The cap never reads it; this cache just keeps the per-prompt resolve
182
+ // cheap (a file read instead of a process snapshot) on the rare cache miss.
156
183
  function ownerCachePath(short) {
157
184
  const dir = watchersDir();
158
185
  return dir ? path.join(dir, `${short}.owner`) : null;
159
186
  }
160
- function readOwnerCache(short) {
187
+ function readSessionOwner(short) {
161
188
  try {
162
189
  const p = ownerCachePath(short);
163
190
  if (!p)
164
191
  return null;
165
- const n = Number(fs.readFileSync(p, 'utf-8').trim());
166
- return Number.isFinite(n) && n > 0 ? n : null;
192
+ const raw = fs.readFileSync(p, 'utf-8').trim();
193
+ if (/^\d+$/.test(raw)) {
194
+ const n = Number(raw);
195
+ return n > 0 ? { pid: n, ts: 0 } : null;
196
+ } // legacy bare PID
197
+ const o = JSON.parse(raw);
198
+ return (o && typeof o.pid === 'number') ? o : null;
167
199
  }
168
200
  catch {
169
201
  return null;
170
202
  }
171
203
  }
172
- function writeOwnerCache(short, ownerPid) {
204
+ function writeSessionOwner(short, o) {
173
205
  try {
174
206
  const dir = watchersDir();
175
207
  if (!dir)
176
208
  return;
177
209
  fs.mkdirSync(dir, { recursive: true });
178
- fs.writeFileSync(path.join(dir, `${short}.owner`), String(ownerPid));
210
+ fs.writeFileSync(path.join(dir, `${short}.owner`), JSON.stringify(o));
179
211
  }
180
212
  catch { /* best-effort */ }
181
213
  }
182
- function removeWatcherPidfile(short) {
214
+ function removeSessionOwner(short) {
183
215
  try {
184
- const dir = watchersDir();
185
- if (!dir)
186
- return;
187
- fs.rmSync(path.join(dir, `${short}.json`), { force: true });
216
+ const p = ownerCachePath(short);
217
+ if (p)
218
+ fs.rmSync(p, { force: true });
188
219
  }
189
220
  catch { /* best-effort */ }
190
221
  }
191
- function readWatcherPidfile(short) {
192
- try {
193
- const dir = watchersDir();
194
- if (!dir)
195
- return null;
196
- const raw = fs.readFileSync(path.join(dir, `${short}.json`), 'utf-8');
197
- const rec = JSON.parse(raw);
198
- if (rec && typeof rec.pid === 'number' && rec.short === short)
199
- return rec;
200
- return null;
201
- }
202
- catch {
203
- return null;
204
- }
222
+ // Thin wrappers for resolveClaudeOwnerPid (unchanged contract): read returns the
223
+ // current owner PID; write replaces the record with a fresh one (clearing any deadSince).
224
+ function readOwnerCache(short) {
225
+ return readSessionOwner(short)?.pid ?? null;
226
+ }
227
+ function writeOwnerCache(short, ownerPid) {
228
+ writeSessionOwner(short, { pid: ownerPid, ts: Date.now() });
205
229
  }
206
- /** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON
207
- * (not env-gated) so a watcher killed by a SIBLING session's SessionStart reap
208
- * leaves a trail its own Monitor output file is gone by the time we'd read it,
209
- * and the reaper is a different process than the watcher. Behaviour-neutral:
210
- * pure observation, never gates a kill or an arm. Cheap (a few lines per
211
- * SessionStart + per watcher arm/exit). adr: adr/monitor-resilience.md */
230
+ /** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON so
231
+ * every arm / terminal exit / cap-kill leaves a trail. Behaviour-neutral: pure
232
+ * observation, never gates a kill or an arm. adr: adr/monitor-resilience.md */
212
233
  function watcherAuditLog(event) {
213
234
  try {
214
235
  const dir = watchersDir();
@@ -219,59 +240,36 @@ function watcherAuditLog(event) {
219
240
  }
220
241
  catch { /* best-effort — visibility must never break the watcher */ }
221
242
  }
222
- /** True iff `pid` is a live process. `process.kill(pid, 0)` sends no signal
223
- * it only probes existence. EPERM means the process exists but is owned by
224
- * another user (still alive); ESRCH means gone. */
225
- function pidAlive(pid) {
226
- if (!Number.isFinite(pid) || pid <= 0)
227
- return false;
228
- try {
229
- process.kill(pid, 0);
230
- return true;
231
- }
232
- catch (e) {
233
- return e?.code === 'EPERM';
234
- }
235
- }
236
- /** Local-first arm check: is there a live watcher PROCESS for this session on
237
- * THIS machine? This is ground truth for "armed" — the consumer (the Monitor
238
- * task) is local, so a live local pidfile means a live, consumed watcher.
239
- * Replaces the server `isSessionArmed` check, which counts orphans (consumer
240
- * dead, socket still open) as armed and so both (a) suppresses re-arm when the
241
- * real watcher is gone and (b) lets re-arm stack new watchers on undead
242
- * orphans. A dead-PID pidfile is swept here so the next turn re-arms. */
243
+ /** Local-first arm check: does this session have ANY live watcher process on THIS
244
+ * machine? Ground truth for "armed" — the consumer (the Monitor task) is local,
245
+ * so a live local entry means a live, consumed watcher. Sweeps dead entries as it
246
+ * reads, so a hard-killed watcher doesn't read as armed. */
243
247
  function isLocallyArmed(short) {
244
- const rec = readWatcherPidfile(short);
245
- if (!rec)
246
- return false;
247
- if (!pidAlive(rec.pid)) {
248
- removeWatcherPidfile(short);
249
- return false;
250
- }
251
- return true;
248
+ const entries = readWatcherEntries(short);
249
+ const alive = entries.filter(r => pidAlive(r.pid));
250
+ if (alive.length !== entries.length)
251
+ writeWatcherEntries(short, alive);
252
+ return alive.length > 0;
252
253
  }
253
- /** The local source of truth: every session whose watcher process is alive on
254
- * THIS machine right now. Reads the pidfile registry and confirms each PID with
255
- * the OS (`pidAlive`), sweeping any dead-PID leftover as it goes. This is the
256
- * answer the desk-line returns when the cloud asks "what's truly active?" it
257
- * is computed on demand from ground truth, never a cached/replicated list, so it
258
- * cannot go stale the way the cloud `/attached` registry does. adr:
259
- * adr/monitor-resilience.md */
254
+ /** Every session whose watcher process is alive on THIS machine right now —
255
+ * computed on demand from the pidfile registry + OS liveness, sweeping dead
256
+ * entries. Most-recently-armed first. This is what the desk-line returns when the
257
+ * cloud asks "what's truly active?". adr: adr/monitor-resilience.md */
260
258
  function listLocalLiveWatchers() {
261
259
  const out = [];
262
- for (const rec of readAllPidfiles()) {
263
- if (pidAlive(rec.pid))
264
- out.push(rec);
265
- else
266
- removeWatcherPidfile(rec.short); // stale leftover — drop it
260
+ for (const short of allShorts()) {
261
+ const entries = readWatcherEntries(short);
262
+ const alive = entries.filter(r => pidAlive(r.pid));
263
+ if (alive.length !== entries.length)
264
+ writeWatcherEntries(short, alive);
265
+ out.push(...alive);
267
266
  }
268
- // Stable order: most-recently-armed first.
269
267
  return out.sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
270
268
  }
271
- /** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM
272
- * call (a single PowerShell invocation avoids the per-PID `wmic` calls that
273
- * hung historically, see ADR 2026-05-26); POSIX via `ps`. Best-effort: any
274
- * failure returns [] and the reaper becomes a no-op. */
269
+ /** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM call;
270
+ * POSIX via `ps`. ONLY used by `resolveClaudeOwnerPid` now (the cleanup path is
271
+ * snapshot-free); it runs at most once per session (cache miss), never per-tick.
272
+ * Best-effort: any failure returns []. */
275
273
  function snapshotProcs() {
276
274
  try {
277
275
  if (process.platform === 'win32') {
@@ -290,7 +288,6 @@ function snapshotProcs() {
290
288
  }))
291
289
  .filter(p => Number.isFinite(p.pid) && p.pid > 0);
292
290
  }
293
- // POSIX fallback.
294
291
  const out = (0, child_process_1.execFileSync)('ps', ['-eo', 'pid=,ppid=,comm=,args='], {
295
292
  timeout: 12_000, maxBuffer: 64 * 1024 * 1024,
296
293
  }).toString();
@@ -307,48 +304,11 @@ function snapshotProcs() {
307
304
  return [];
308
305
  }
309
306
  }
310
- /** Walk `start`'s parent chain in the snapshot; true iff a live `claude.exe`
311
- * appears as an ancestor. A healthy watcher is launched (via the Monitor task)
312
- * under the live claude.exe, so its chain reaches it; an orphan's owning
313
- * claude.exe died on reload, so its chain is severed (a parent PID absent from
314
- * the live snapshot) before any claude.exe. Depth- and cycle-guarded. */
315
- function hasLiveClaudeAncestor(start, byPid) {
316
- let cur = start;
317
- const seen = new Set();
318
- for (let depth = 0; cur && depth < 32; depth++) {
319
- if (seen.has(cur.pid))
320
- break; // cycle guard
321
- seen.add(cur.pid);
322
- if (CLAUDE_PROC_RE.test(cur.name))
323
- return true; // start/ancestor IS claude
324
- const parent = byPid.get(cur.ppid);
325
- if (!parent)
326
- return false; // severed — parent not in the live table
327
- cur = parent;
328
- }
329
- return false;
330
- }
331
- /** Owning claude.exe PID parsed from a watcher's command line (`--owner-pid N`),
332
- * or null if absent (a legacy watcher armed before this flag existed). */
333
- function ownerPidFromCmd(cmd) {
334
- const m = OWNER_PID_RE.exec(cmd || '');
335
- return m ? Number(m[1]) : null;
336
- }
337
- /** Resolve the owning claude.exe PID for the CURRENT process by walking its
338
- * parent chain in a fresh snapshot. MUST be called from a hook — a direct, live
339
- * descendant of the session's claude.exe whose chain is intact — NOT from the
340
- * watcher, whose launcher shell collapses within seconds of arming (proven
341
- * 2026-06-05: a supervisor's ancestry to claude.exe is already severed at arm
342
- * time). The hook stamps the result into the watch command (`--owner-pid`) so
343
- * the reaper can later ask "is that exact claude.exe still alive?" instead of
344
- * re-walking a chain that no longer exists. Returns null if no claude.exe
345
- * ancestor is found (caller omits the flag → reaper falls back to ancestry).
346
- * adr: adr/monitor-resilience.md */
307
+ /** Resolve the owning claude.exe PID for the CURRENT process by walking its parent
308
+ * chain. MUST be called from a hook (a live descendant of claude.exe). Stamped
309
+ * into the arm command (`--owner-pid`) for audit continuity. Cache hit skips the
310
+ * snapshot. Returns null if no claude.exe ancestor. adr: adr/monitor-resilience.md */
347
311
  function resolveClaudeOwnerPid(cacheShort) {
348
- // Cache hit: the owning claude.exe is still alive (stable for the session) —
349
- // skip the snapshot entirely. This is what turns the per-prompt hot path from
350
- // "spawn a powershell every turn" into "read a file + one syscall".
351
- // adr: adr/monitor-resilience.md
352
312
  if (cacheShort) {
353
313
  const cached = readOwnerCache(cacheShort);
354
314
  if (cached && pidAlive(cached))
@@ -380,80 +340,109 @@ function resolveClaudeOwnerPid(cacheShort) {
380
340
  return null;
381
341
  }
382
342
  }
383
- /** True iff `pid` is present in the snapshot AND is a claude.exe — i.e. the
384
- * owning session is still alive. */
385
- function isLiveClaude(pid, byPid) {
386
- const p = byPid.get(pid);
387
- return !!p && CLAUDE_PROC_RE.test(p.name);
343
+ // ---- The count-cap -------------------------------------------------------
344
+ /** Pure verdict (testable): given a session's LIVE entries, keep the K freshest
345
+ * (by startedAt), return the older surplus to kill. Never kills when count ≤ K;
346
+ * never kills more than count − K; never the K freshest. This is the entire
347
+ * liveness-safety guarantee — no death judgment, just a count + a floor. */
348
+ function capSurplusVerdict(entries, K) {
349
+ if (entries.length <= K)
350
+ return { keep: entries.slice(), kill: [] };
351
+ const sorted = [...entries].sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
352
+ return { keep: sorted.slice(0, K), kill: sorted.slice(K) };
388
353
  }
389
- /** Decide whether a watcher candidate is a severed orphan, and why. Under the
390
- * REAP_BY_OWNER flag, prefers the stamped owner PID (robust); falls back to the
391
- * ancestry walk for legacy watchers with no `--owner-pid`. Off-flag: pure
392
- * ancestry walk (unchanged behaviour). adr: adr/monitor-resilience.md */
393
- function severedVerdict(w, byPid) {
394
- if (reapByOwner()) {
395
- const owner = ownerPidFromCmd(w.cmd);
396
- if (owner !== null) {
397
- const live = isLiveClaude(owner, byPid);
398
- return { severed: !live, reason: `owner=${owner}:${live ? 'live-claude' : 'dead'}` };
354
+ /** Count-cap every session: keep the K freshest LIVE watchers, kill only the
355
+ * surplus. NO death-detection, NO process snapshot — per-session file reads +
356
+ * `pidAlive()`, and a `taskkill` only when there is genuine surplus. Run at
357
+ * SessionStart (via the recap hook) and exposed as `greprag inbox reap`. The name
358
+ * is retained for its callers; the behaviour is "cap", not "judge orphans".
359
+ * adr: adr/monitor-resilience.md */
360
+ function reapOrphanWatchers(K = exports.DEFAULT_WATCHER_CAP) {
361
+ let scanned = 0;
362
+ let surplus = 0;
363
+ const killedAll = [];
364
+ for (const short of allShorts()) {
365
+ const alive = readWatcherEntries(short).filter(r => pidAlive(r.pid));
366
+ scanned += alive.length;
367
+ const { keep, kill } = capSurplusVerdict(alive, K);
368
+ if (kill.length === 0) {
369
+ writeWatcherEntries(short, alive); // sweep dead entries, nothing to cap
370
+ continue;
399
371
  }
400
- return { severed: !hasLiveClaudeAncestor(w, byPid), reason: 'no-owner-flag→ancestry' };
372
+ surplus += kill.length;
373
+ const survivors = keep.slice();
374
+ for (const w of kill) {
375
+ watcherAuditLog(`cap-candidate=${w.pid} session=${short} SURPLUS→kill startedAt=${w.startedAt}`);
376
+ if (killProcessTree(w.pid))
377
+ killedAll.push(w.pid);
378
+ else
379
+ survivors.push(w); // kill failed → keep tracking it so it's retried
380
+ }
381
+ writeWatcherEntries(short, survivors);
401
382
  }
402
- return { severed: !hasLiveClaudeAncestor(w, byPid), reason: 'ancestry' };
383
+ watcherAuditLog(`cap-summary K=${K} scanned=${scanned} surplus=${surplus} killed=[${killedAll.join(',')}]`);
384
+ return { scanned, orphans: surplus, killed: killedAll };
403
385
  }
404
- /** Kill every `inbox watch` process whose ancestry no longer reaches a live
405
- * claude.exe (severed = orphan), plus sweep dead-PID pidfiles. Returns a
406
- * summary. Best-effort and bounded: a snapshot failure yields a zero result;
407
- * the kill set is guaranteed (by `WATCHER_CMD_RE`) to contain only greprag
408
- * watchers. Intended to run at SessionStart, before this session arms at that
409
- * moment every live watcher belongs to a PRIOR incarnation, so reaping is the
410
- * common path, not the exception. */
411
- function reapOrphanWatchers() {
412
- sweepDeadPidfiles();
413
- // Demand-gate the snapshot. The process-table photograph (a spawned
414
- // `powershell Get-CimInstance`) was firing on EVERY SessionStart and
415
- // SessionStart re-fires on every compaction/reload, across every session — so
416
- // the snapshot itself, not any watcher, became the conhost-leak engine. The
417
- // on-disk pidfile registry already tells us, with zero subprocesses, whether
418
- // any watcher's owning claude.exe has died: that is the only condition under
419
- // which there is anything to reap. If no pidfile reports a dead owner, there is
420
- // nothing severed → skip the snapshot entirely. The snapshot now runs ONLY when
421
- // a real orphan exists (where its cost is justified), and still performs the
422
- // full cmdline-verified, recycle-safe kill below. Legacy pidfiles without an
423
- // ownerPid force one snapshot so they're not stranded uncollected.
424
- // adr: adr/monitor-resilience.md
425
- const pidfiles = readAllPidfiles();
426
- const severedByOwner = pidfiles.some(r => typeof r.ownerPid === 'number' && !pidAlive(r.ownerPid));
427
- const hasLegacy = pidfiles.some(r => typeof r.ownerPid !== 'number');
428
- if (!severedByOwner && !hasLegacy) {
429
- watcherAuditLog(`reap-summary gated no-severed-owner pidfiles=${pidfiles.length} (snapshot skipped)`);
430
- return { scanned: 0, orphans: 0, killed: [] };
431
- }
432
- const procs = snapshotProcs();
433
- if (procs.length === 0) {
434
- watcherAuditLog('reap-summary snapshot-empty no-op');
435
- return { scanned: 0, orphans: 0, killed: [] };
436
- }
437
- const byPid = new Map();
438
- for (const p of procs)
439
- byPid.set(p.pid, p);
440
- // Candidate set — the SAFETY GUARD. Only these are ever eligible to be killed.
441
- const watchers = procs.filter(p => WATCHER_CMD_RE.test(p.cmd));
442
- const verdicts = watchers.map(w => ({ w, ...severedVerdict(w, byPid) }));
443
- const severed = verdicts.filter(v => v.severed).map(v => v.w);
444
- // Visibility: record every candidate's verdict + the reason (which signal
445
- // decided it) BEFORE any kill, so a false-positive is auditable after the
446
- // fact. adr: adr/monitor-resilience.md
447
- for (const v of verdicts) {
448
- watcherAuditLog(`reap-candidate=${v.w.pid} name=${v.w.name} ppid=${v.w.ppid} ${v.severed ? 'SEVERED→kill' : 'ok→keep'} via=${v.reason} cmd="${(v.w.cmd || '').slice(0, 90)}"`);
449
- }
386
+ // ---- Dead-session GC (the re-resolved-owner cleanup) ----------------------
387
+ // The count-cap bounds a LIVE session to K; EPIPE + the relauncher clean a
388
+ // CLEANLY-closed session. This collects the remaining tail: a session whose
389
+ // claude.exe HARD-crashes (half-open pipe, no EPIPE) leaves ≤K watchers nothing
390
+ // else reaps. It asks "is the SESSION alive?" — reliably knowable via the
391
+ // SessionStart-refreshed owner record — NOT "is this WATCHER an orphan?"
392
+ // (unknowable). The grace window + `deadSince` clock make it immune to the restart
393
+ // false-kill that doomed the owner-pid reaper: a model/Fast-mode switch re-resolves
394
+ // the owner well within grace, overwriting the record and clearing deadSince.
395
+ // adr: adr/monitor-resilience.md
396
+ /** Milliseconds a session's owner must stay CONTINUOUSLY dead before its watchers are
397
+ * GC'd. Must exceed the worst-case claude.exe restart→SessionStart window (which
398
+ * is seconds), so a restart can never be mistaken for a death. */
399
+ exports.GC_GRACE_MS = 5 * 60 * 1000;
400
+ /** Pure GC decision (testable): given a session's owner record and whether its PID
401
+ * is alive right now, decide keep / start-the-grace-clock / reap. */
402
+ function gcVerdict(owner, ownerAlive, now, graceMs) {
403
+ if (!owner)
404
+ return 'keep'; // no owner record the count-cap's job
405
+ if (ownerAlive)
406
+ return 'keep'; // session alive (even if idle)
407
+ if (owner.deadSince === undefined)
408
+ return 'mark-dead'; // first death sighting start the clock
409
+ return (now - owner.deadSince > graceMs) ? 'reap' : 'keep';
410
+ }
411
+ /** Reap every watcher of a session whose owning claude.exe has stayed dead past the
412
+ * grace window. Run at SessionStart, AFTER refreshing THIS session's own owner
413
+ * (so it is never its own victim). No process snapshot — owner-file reads +
414
+ * pidAlive() syscalls. adr: adr/monitor-resilience.md */
415
+ function gcDeadSessions(graceMs = exports.GC_GRACE_MS) {
416
+ const now = Date.now();
417
+ const reaped = [];
450
418
  const killed = [];
451
- for (const p of severed) {
452
- if (killProcessTree(p.pid))
453
- killed.push(p.pid);
419
+ for (const short of allShorts()) {
420
+ const owner = readSessionOwner(short);
421
+ const verdict = gcVerdict(owner, !!owner && pidAlive(owner.pid), now, graceMs);
422
+ if (verdict === 'keep') {
423
+ if (owner && owner.deadSince !== undefined && pidAlive(owner.pid)) {
424
+ writeSessionOwner(short, { pid: owner.pid, ts: owner.ts }); // recovered → clear the grace clock
425
+ }
426
+ continue;
427
+ }
428
+ if (verdict === 'mark-dead') {
429
+ writeSessionOwner(short, { ...owner, deadSince: now }); // start the grace clock
430
+ continue;
431
+ }
432
+ // reap: the owner stayed dead past grace → the session is truly gone.
433
+ const deadFor = Math.round((now - (owner.deadSince || now)) / 1000);
434
+ for (const e of readWatcherEntries(short)) {
435
+ watcherAuditLog(`gc-kill=${e.pid} session=${short} owner=${owner.pid}:dead-${deadFor}s`);
436
+ if (killProcessTree(e.pid))
437
+ killed.push(e.pid);
438
+ }
439
+ removeWatcherPidfile(short);
440
+ removeSessionOwner(short);
441
+ reaped.push(short);
454
442
  }
455
- watcherAuditLog(`reap-summary mode=${reapByOwner() ? 'owner-pid' : 'ancestry'} scanned=${watchers.length} severed=${severed.length} killed=[${killed.join(',')}]`);
456
- return { scanned: watchers.length, orphans: severed.length, killed };
443
+ if (reaped.length)
444
+ watcherAuditLog(`gc-summary reaped=[${reaped.join(',')}] killed=[${killed.join(',')}]`);
445
+ return { reaped, killed };
457
446
  }
458
447
  /** Force-kill a process and its descendants. `taskkill /T` reaps the tree in one
459
448
  * shot so a supervisor can't respawn its child in the race window. */
@@ -474,27 +463,4 @@ function killProcessTree(pid) {
474
463
  return false;
475
464
  }
476
465
  }
477
- /** Remove pidfiles whose recorded PID is dead. Keeps `isLocallyArmed` honest
478
- * even when a watcher died without removing its own file (hard kill / crash). */
479
- function sweepDeadPidfiles() {
480
- try {
481
- const dir = watchersDir();
482
- if (!dir || !fs.existsSync(dir))
483
- return;
484
- for (const f of fs.readdirSync(dir)) {
485
- if (!f.endsWith('.json'))
486
- continue;
487
- try {
488
- const rec = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf-8'));
489
- if (!rec || typeof rec.pid !== 'number' || !pidAlive(rec.pid)) {
490
- fs.rmSync(path.join(dir, f), { force: true });
491
- }
492
- }
493
- catch {
494
- fs.rmSync(path.join(dir, f), { force: true }); // unparseable → drop
495
- }
496
- }
497
- }
498
- catch { /* best-effort */ }
499
- }
500
466
  //# sourceMappingURL=watcher-registry.js.map