greprag 5.43.0 → 5.43.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/inbox-watch-supervisor.d.ts +0 -9
- package/dist/commands/inbox-watch-supervisor.js +23 -105
- package/dist/commands/inbox-watch-supervisor.js.map +1 -1
- package/dist/commands/watcher-registry.d.ts +93 -93
- package/dist/commands/watcher-registry.js +259 -296
- package/dist/commands/watcher-registry.js.map +1 -1
- package/dist/guard.js +16 -3
- package/dist/guard.js.map +1 -1
- package/dist/hook.js +18 -11
- package/dist/hook.js.map +1 -1
- package/dist/session-id.d.ts +18 -10
- package/dist/session-id.js +24 -22
- package/dist/session-id.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,33 +1,35 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
/** Local watcher process registry +
|
|
2
|
+
/** Local watcher process registry + COUNT-CAP (the post-saga model).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
* (consumer dead, process still respawning) as "armed". Two local signals fix
|
|
11
|
-
* that:
|
|
4
|
+
* THE LESSON (adr/monitor-resilience.md, "⭐ The watcher in one pass"): a
|
|
5
|
+
* functional watcher and an orphaned-but-alive one are EXTERNALLY
|
|
6
|
+
* INDISTINGUISHABLE — same command line, same live PID. Only the watcher itself
|
|
7
|
+
* can tell, via its own stdout pipe (EPIPE). So every external reaper that
|
|
8
|
+
* *judged* orphan-ness (ancestry, owner-pid, monitor-pid) was guessing, and the
|
|
9
|
+
* guesses false-killed live watchers — the 548-kill saga in one line.
|
|
12
10
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
* to gate re-arming with zero cloud dependency and zero ghost-lease lag.
|
|
16
|
-
* Paired with EPIPE-terminal in the watcher (a watcher whose consumer pipe
|
|
17
|
-
* breaks exits and removes its own pidfile), a live pidfile means a live,
|
|
18
|
-
* *consumed* watcher.
|
|
11
|
+
* This module no longer judges. Two reliable facts replace the one unknowable
|
|
12
|
+
* one:
|
|
19
13
|
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
* the kill set is intersected with "cmdline contains `inbox watch`", so it
|
|
27
|
-
* can only ever terminate greprag watchers — never claude.exe, an editor, or
|
|
28
|
-
* an unrelated node server.
|
|
14
|
+
* 1. **pidfile registry** — `~/.greprag/watchers/<short>.json` is a LIST of every
|
|
15
|
+
* supervisor armed for that session (`{short,pid,startedAt,ownerPid}`). A
|
|
16
|
+
* supervisor appends on arm and removes its own entry on terminal exit; a
|
|
17
|
+
* hard kill leaves a dead entry that the alive-filter sweeps. `isLocallyArmed`
|
|
18
|
+
* = "any entry alive". (Multiple live supervisors per session is normal under
|
|
19
|
+
* the June-8 spray; the count-cap bounds them.)
|
|
29
20
|
*
|
|
30
|
-
*
|
|
21
|
+
* 2. **count-cap** — `reapOrphanWatchers` keeps the K freshest LIVE watchers per
|
|
22
|
+
* session and kills only the surplus, NEVER below K. No death-detection → no
|
|
23
|
+
* false-kill; a floor of K → it can never reap to zero, so the bug that broke
|
|
24
|
+
* liveness every time is structurally impossible. "Freshest" is a *bias*
|
|
25
|
+
* toward the just-armed functional one, not a guarantee — and that's fine:
|
|
26
|
+
* EPIPE kills a wrongly-kept orphan in ~30s and the supervisor respawns. It is
|
|
27
|
+
* **snapshot-free** (per-session file reads + `pidAlive()` syscalls), so the
|
|
28
|
+
* 06-05b conhost-OOM engine — a `powershell Get-CimInstance` per SessionStart —
|
|
29
|
+
* is gone from the cleanup path entirely.
|
|
30
|
+
*
|
|
31
|
+
* EPIPE-terminal (in the watcher) + the supervisor's programmatic respawn do the
|
|
32
|
+
* real liveness work; the count-cap only bounds the pile. adr: adr/monitor-resilience.md */
|
|
31
33
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
32
34
|
if (k2 === undefined) k2 = k;
|
|
33
35
|
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
@@ -62,46 +64,31 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
62
64
|
};
|
|
63
65
|
})();
|
|
64
66
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
67
|
+
exports.GC_GRACE_MS = exports.DEFAULT_WATCHER_CAP = void 0;
|
|
65
68
|
exports.writeWatcherPidfile = writeWatcherPidfile;
|
|
69
|
+
exports.removeWatcherEntry = removeWatcherEntry;
|
|
66
70
|
exports.removeWatcherPidfile = removeWatcherPidfile;
|
|
67
71
|
exports.watcherAuditLog = watcherAuditLog;
|
|
68
72
|
exports.isLocallyArmed = isLocallyArmed;
|
|
69
73
|
exports.listLocalLiveWatchers = listLocalLiveWatchers;
|
|
70
74
|
exports.resolveClaudeOwnerPid = resolveClaudeOwnerPid;
|
|
71
|
-
exports.
|
|
75
|
+
exports.capSurplusVerdict = capSurplusVerdict;
|
|
72
76
|
exports.reapOrphanWatchers = reapOrphanWatchers;
|
|
77
|
+
exports.gcVerdict = gcVerdict;
|
|
78
|
+
exports.gcDeadSessions = gcDeadSessions;
|
|
73
79
|
const fs = __importStar(require("fs"));
|
|
74
80
|
const path = __importStar(require("path"));
|
|
75
|
-
const
|
|
81
|
+
const child_process_1 = require("child_process");
|
|
76
82
|
const WATCHER_DIRNAME = 'watchers';
|
|
77
|
-
// Matches a greprag watcher's command line in every launch shape
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
//
|
|
81
|
-
//
|
|
82
|
-
// would also match an unrelated shell that merely mentions the phrase (a grep, a
|
|
83
|
-
// diagnostic, this very reaper). This is the SAFETY GUARD: only a process whose
|
|
84
|
-
// command line is an actual watcher invocation is ever a kill candidate, so the
|
|
85
|
-
// reaper cannot touch claude.exe, an editor, or an incidental shell.
|
|
86
|
-
const WATCHER_CMD_RE = /(?:index\.js|greprag)["'\s]+inbox\s+watch\b/i;
|
|
83
|
+
// Matches the process NAME of a Claude host: `claude` or `claude.exe`,
// case-insensitive, whole name only. Used by the owner-pid resolver's claude.exe
// match. The former watcher command-line matcher (anchored on the INVOKED BINARY,
// `greprag` or `index.js`, immediately followed by `inbox watch`, never a bare
// `inbox watch` substring) is gone: the cleanup path no longer scans the process
// table at all.
|
|
87
88
|
const CLAUDE_PROC_RE = /^claude(\.exe)?$/i;
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
// within seconds of arming (a backgrounded daemon's parent shell exits by
|
|
92
|
-
// design), so "can I still walk up to a claude.exe?" yields false positives on
|
|
93
|
-
// LIVE watchers. "Is the specific claude.exe that armed me still alive?" does
|
|
94
|
-
// not. adr: adr/monitor-resilience.md
|
|
95
|
-
const OWNER_PID_RE = /--owner-pid[=\s]+(\d+)/;
|
|
96
|
-
// The --monitor-pid monitor-task backstop (ff681bb, 2026-06-10) was reverted
|
|
97
|
-
// 2026-06-11: the arm-time stamp came from /proc/$$/winpid of the transient
|
|
98
|
-
// arm-line bash, not the persistent Monitor task, so a dead stamp under a live
|
|
99
|
-
// owner false-flagged live watchers as orphans. Owner-pid liveness is the test.
|
|
100
|
-
// adr: adr/monitor-resilience.md
|
|
101
|
-
// The ancestry-walk fallback (and its GREPRAG_REAP_BY_OWNER escape hatch) was
|
|
102
|
-
// RETIRED 2026-06-13: it false-killed live backgrounded watchers (see
|
|
103
|
-
// severedVerdict). Owner-pid liveness is now the ONLY reap signal; a flagless
|
|
104
|
-
// watcher is kept. adr: adr/monitor-resilience.md
|
|
89
|
+
/** Default per-session watcher floor: keep this many freshest live watchers, reap
|
|
90
|
+
* only the surplus above it, never below it. K=2 = the live one + one margin. */
|
|
91
|
+
exports.DEFAULT_WATCHER_CAP = 2;
|
|
105
92
|
function grepragHome() {
|
|
106
93
|
const home = process.env.HOME || process.env.USERPROFILE || '';
|
|
107
94
|
return home ? path.join(home, '.greprag') : null;
|
|
@@ -110,109 +97,139 @@ function watchersDir() {
|
|
|
110
97
|
const h = grepragHome();
|
|
111
98
|
return h ? path.join(h, WATCHER_DIRNAME) : null;
|
|
112
99
|
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
100
|
+
/** Path of the pidfile (`<short>.json`) for a session inside the watchers
 * directory, or null when the watchers directory cannot be resolved. */
function pidfilePath(short) {
    const registryDir = watchersDir();
    if (!registryDir) {
        return null;
    }
    return path.join(registryDir, `${short}.json`);
}
|
|
104
|
+
/** Liveness probe for a process id. Signal 0 delivers nothing; `process.kill`
 * merely reports whether the target exists. A throw with code EPERM means the
 * process exists but is not ours to signal (counted as alive); any other throw
 * (ESRCH, invalid argument, ...) counts as not alive.
 * @param {number} pid candidate process id
 * @returns {boolean} true when a process with that id exists right now */
function pidAlive(pid) {
    const probeable = Number.isFinite(pid) && pid > 0;
    if (!probeable) {
        return false;
    }
    let alive;
    try {
        process.kill(pid, 0);
        alive = true;
    }
    catch (err) {
        alive = err?.code === 'EPERM';
    }
    return alive;
}
|
|
117
|
+
/** Load the entry LIST stored in a session's pidfile. A legacy pidfile holding a
 * single object is returned as a one-element list. Records that are falsy or
 * lack a numeric `pid` are dropped. Any failure (no path, missing file, bad
 * JSON) yields an empty list.
 * @param {string} short session short id
 * @returns {Array<object>} the usable entries, possibly empty */
function readWatcherEntries(short) {
    const hasNumericPid = (rec) => !!rec && typeof rec.pid === 'number';
    try {
        const file = pidfilePath(short);
        if (!file) {
            return [];
        }
        const decoded = JSON.parse(fs.readFileSync(file, 'utf-8'));
        if (Array.isArray(decoded)) {
            return decoded.filter(hasNumericPid);
        }
        return [decoded].filter(hasNumericPid);
    }
    catch {
        return [];
    }
}
|
|
132
|
+
/** Persist a session's entry list to `<short>.json` in the watchers directory.
 * An EMPTY list deletes the pidfile instead of writing `[]`.
 *
 * The list is written to a sibling temp file and renamed over the pidfile, so a
 * concurrent reader observes either the old list or the new one, never a
 * half-written file (which would parse-fail and read as "no watchers"). The temp
 * name ends in `.tmp`, so directory scans that select `*.json` never see it. If
 * the temp-file route fails, the list is written in place as before.
 *
 * NOTE: this only prevents torn reads; a read-modify-write by two processes can
 * still lose an update.
 *
 * Best-effort: every failure is swallowed.
 * @param {string} short session short id
 * @param {Array<object>} entries the full list to store */
function writeWatcherEntries(short, entries) {
    try {
        const dir = watchersDir();
        if (!dir)
            return;
        fs.mkdirSync(dir, { recursive: true });
        const p = path.join(dir, `${short}.json`);
        if (entries.length === 0) {
            fs.rmSync(p, { force: true });
            return;
        }
        const payload = JSON.stringify(entries);
        // Per-process temp name: two writers never share a temp file.
        const tmp = `${p}.${process.pid}.tmp`;
        try {
            fs.writeFileSync(tmp, payload);
            fs.renameSync(tmp, p);
        }
        catch {
            // Atomic route unavailable: drop the temp file and fall back to the
            // plain in-place write.
            fs.rmSync(tmp, { force: true });
            fs.writeFileSync(p, payload);
        }
    }
    catch { /* best-effort — a failed write only means re-arm, never a crash */ }
}
|
|
147
|
+
/** Register a supervisor in the session's entry list. Existing entries are kept
 * only if they belong to a different pid AND that pid is still alive; a fresh
 * `{short, pid, startedAt, ownerPid}` record stamped with the current time is
 * then appended and the list is written back.
 * @param {string} short session short id
 * @param {number} pid the supervisor's process id
 * @param {number} [ownerPid] owner pid recorded alongside the entry */
function writeWatcherPidfile(short, pid, ownerPid) {
    const survivors = [];
    for (const entry of readWatcherEntries(short)) {
        if (entry.pid === pid) {
            continue; // replaced by the fresh record below
        }
        if (pidAlive(entry.pid)) {
            survivors.push(entry);
        }
    }
    const ownEntry = { short, pid, startedAt: Date.now(), ownerPid };
    survivors.push(ownEntry);
    writeWatcherEntries(short, survivors);
}
|
|
154
|
+
/** Drop every entry recorded under `pid` from the session's list and write the
 * remainder back.
 * @param {string} short session short id
 * @param {number} pid process id whose entry is removed */
function removeWatcherEntry(short, pid) {
    const current = readWatcherEntries(short);
    const remaining = current.filter((entry) => entry.pid !== pid);
    writeWatcherEntries(short, remaining);
}
|
|
158
|
+
/** Delete the session's pidfile outright, whatever it contains. A missing file
 * or unresolvable path is a no-op; errors are swallowed.
 * @param {string} short session short id */
function removeWatcherPidfile(short) {
    try {
        const target = pidfilePath(short);
        if (!target) {
            return;
        }
        fs.rmSync(target, { force: true });
    }
    catch { /* best-effort */ }
}
|
|
128
|
-
/** Every
|
|
129
|
-
|
|
130
|
-
function readAllPidfiles() {
|
|
167
|
+
/** List the session shorts that currently have a pidfile: every `*.json` name in
 * the watchers directory with the `.json` suffix stripped. Returns an empty list
 * when the directory is unresolvable, absent, or unreadable.
 * @returns {string[]} */
function allShorts() {
    const SUFFIX = '.json';
    try {
        const registryDir = watchersDir();
        if (!registryDir || !fs.existsSync(registryDir)) {
            return [];
        }
        const shorts = [];
        for (const name of fs.readdirSync(registryDir)) {
            if (name.endsWith(SUFFIX)) {
                shorts.push(name.slice(0, -SUFFIX.length));
            }
        }
        return shorts;
    }
    catch {
        return [];
    }
}
|
|
152
|
-
// ---- Owner-PID
|
|
153
|
-
// The
|
|
154
|
-
//
|
|
155
|
-
//
|
|
156
|
-
// engine: each call spawned a `powershell Get-CimInstance Win32_Process` whose
|
|
157
|
-
// conhost orphaned under load. Cache the result keyed by session short; the hot
|
|
158
|
-
// path becomes a file read + one `pidAlive()` syscall, no subprocess.
|
|
159
|
-
// adr: adr/monitor-resilience.md
|
|
179
|
+
// ---- Owner-PID cache (vestigial under the count-cap; kept for the arm stamp) ----
|
|
180
|
+
// The `notify` hook still stamps `--owner-pid` into the arm command for audit
|
|
181
|
+
// continuity. The cap never reads it; this cache just keeps the per-prompt resolve
|
|
182
|
+
// cheap (a file read instead of a process snapshot) on the rare cache miss.
|
|
160
183
|
/** Path of the owner record (`<short>.owner`) for a session inside the watchers
 * directory, or null when the watchers directory cannot be resolved. */
function ownerCachePath(short) {
    const registryDir = watchersDir();
    if (!registryDir) {
        return null;
    }
    return path.join(registryDir, `${short}.owner`);
}
|
|
164
|
-
function
|
|
187
|
+
/** Read a session's owner record from `<short>.owner`.
 * Two on-disk shapes are accepted:
 *   - legacy: the file is just digits (a bare PID), returned as `{pid, ts: 0}`,
 *     or null when the number is not positive;
 *   - current: a JSON object, returned as-is when it has a numeric `pid`.
 * Anything else (no path, unreadable file, bad JSON, missing pid) yields null.
 * @param {string} short session short id
 * @returns {object|null} */
function readSessionOwner(short) {
    try {
        const file = ownerCachePath(short);
        if (!file) {
            return null;
        }
        const text = fs.readFileSync(file, 'utf-8').trim();
        const isBarePid = /^\d+$/.test(text);
        if (isBarePid) {
            // legacy bare PID
            const legacyPid = Number(text);
            if (legacyPid > 0) {
                return { pid: legacyPid, ts: 0 };
            }
            return null;
        }
        const record = JSON.parse(text);
        if (!record || typeof record.pid !== 'number') {
            return null;
        }
        return record;
    }
    catch {
        return null;
    }
}
|
|
176
|
-
function
|
|
204
|
+
/** Store a session's owner record as JSON in `<short>.owner`, creating the
 * watchers directory if needed. Best-effort: a no-op when the directory cannot
 * be resolved, and all errors are swallowed.
 * @param {string} short session short id
 * @param {object} o the record to serialise */
function writeSessionOwner(short, o) {
    try {
        const registryDir = watchersDir();
        if (registryDir) {
            fs.mkdirSync(registryDir, { recursive: true });
            const target = path.join(registryDir, `${short}.owner`);
            fs.writeFileSync(target, JSON.stringify(o));
        }
    }
    catch { /* best-effort */ }
}
|
|
186
|
-
function
|
|
214
|
+
/** Delete a session's owner record (`<short>.owner`). A missing file or an
 * unresolvable path is a no-op; errors are swallowed.
 * @param {string} short session short id */
function removeSessionOwner(short) {
    try {
        const target = ownerCachePath(short);
        if (!target) {
            return;
        }
        fs.rmSync(target, { force: true });
    }
    catch { /* best-effort */ }
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
if (rec && typeof rec.pid === 'number' && rec.short === short)
|
|
203
|
-
return rec;
|
|
204
|
-
return null;
|
|
205
|
-
}
|
|
206
|
-
catch {
|
|
207
|
-
return null;
|
|
208
|
-
}
|
|
222
|
+
// Thin wrappers for resolveClaudeOwnerPid (unchanged contract): read returns the
|
|
223
|
+
// current owner PID; write refreshes the record fresh (clearing any deadSince).
|
|
224
|
+
/** Owner PID stored for a session, or null when there is no usable owner record.
 * @param {string} short session short id
 * @returns {number|null} */
function readOwnerCache(short) {
    const record = readSessionOwner(short);
    if (record === null || record === undefined) {
        return null;
    }
    return record.pid ?? null;
}
|
|
227
|
+
/** Replace a session's owner record with a fresh `{pid, ts}` stamped with the
 * current time. Because the record is rebuilt from scratch, any other field the
 * old record carried is not preserved.
 * @param {string} short session short id
 * @param {number} ownerPid owner process id to store */
function writeOwnerCache(short, ownerPid) {
    const freshRecord = { pid: ownerPid, ts: Date.now() };
    writeSessionOwner(short, freshRecord);
}
|
|
210
|
-
/** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON
|
|
211
|
-
*
|
|
212
|
-
*
|
|
213
|
-
* and the reaper is a different process than the watcher. Behaviour-neutral:
|
|
214
|
-
* pure observation, never gates a kill or an arm. Cheap (a few lines per
|
|
215
|
-
* SessionStart + per watcher arm/exit). adr: adr/monitor-resilience.md */
|
|
230
|
+
/** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON so
|
|
231
|
+
* every arm / terminal exit / cap-kill leaves a trail. Behaviour-neutral: pure
|
|
232
|
+
* observation, never gates a kill or an arm. adr: adr/monitor-resilience.md */
|
|
216
233
|
function watcherAuditLog(event) {
|
|
217
234
|
try {
|
|
218
235
|
const dir = watchersDir();
|
|
@@ -223,66 +240,43 @@ function watcherAuditLog(event) {
|
|
|
223
240
|
}
|
|
224
241
|
catch { /* best-effort — visibility must never break the watcher */ }
|
|
225
242
|
}
|
|
226
|
-
/**
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
|
|
230
|
-
if (!Number.isFinite(pid) || pid <= 0)
|
|
231
|
-
return false;
|
|
232
|
-
try {
|
|
233
|
-
process.kill(pid, 0);
|
|
234
|
-
return true;
|
|
235
|
-
}
|
|
236
|
-
catch (e) {
|
|
237
|
-
return e?.code === 'EPERM';
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
/** Local-first arm check: is there a live watcher PROCESS for this session on
|
|
241
|
-
* THIS machine? This is ground truth for "armed" — the consumer (the Monitor
|
|
242
|
-
* task) is local, so a live local pidfile means a live, consumed watcher.
|
|
243
|
-
* Replaces the server `isSessionArmed` check, which counts orphans (consumer
|
|
244
|
-
* dead, socket still open) as armed and so both (a) suppresses re-arm when the
|
|
245
|
-
* real watcher is gone and (b) lets re-arm stack new watchers on undead
|
|
246
|
-
* orphans. A dead-PID pidfile is swept here so the next turn re-arms. */
|
|
243
|
+
/** Local arm check: true when at least one entry recorded for the session has a
 * live pid on this machine. Dead entries found along the way are swept: the
 * pidfile is rewritten with only the live ones (and only when something was
 * actually dropped), so a hard-killed watcher does not keep reading as armed.
 * @param {string} short session short id
 * @returns {boolean} */
function isLocallyArmed(short) {
    const recorded = readWatcherEntries(short);
    const running = [];
    for (const entry of recorded) {
        if (pidAlive(entry.pid)) {
            running.push(entry);
        }
    }
    const sawDead = running.length !== recorded.length;
    if (sawDead) {
        writeWatcherEntries(short, running);
    }
    return running.length > 0;
}
|
|
257
|
-
/**
|
|
258
|
-
*
|
|
259
|
-
*
|
|
260
|
-
*
|
|
261
|
-
* is computed on demand from ground truth, never a cached/replicated list, so it
|
|
262
|
-
* cannot go stale the way the cloud `/attached` registry does. adr:
|
|
263
|
-
* adr/monitor-resilience.md */
|
|
254
|
+
/** Collect every live watcher entry across all sessions that have a pidfile,
 * computed on demand from the pidfiles plus a liveness probe per pid. Sessions
 * whose list contained dead entries get their pidfile rewritten without them.
 * The result is ordered most-recently-armed first (`startedAt` descending, a
 * missing `startedAt` counting as 0).
 * @returns {Array<object>} */
function listLocalLiveWatchers() {
    const live = allShorts().flatMap((short) => {
        const recorded = readWatcherEntries(short);
        const running = recorded.filter((entry) => pidAlive(entry.pid));
        if (running.length !== recorded.length) {
            writeWatcherEntries(short, running);
        }
        return running;
    });
    live.sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
    return live;
}
|
|
275
|
-
/** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM
|
|
276
|
-
*
|
|
277
|
-
*
|
|
278
|
-
* failure returns []
|
|
269
|
+
/** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM call;
|
|
270
|
+
* POSIX via `ps`. ONLY used by `resolveClaudeOwnerPid` now (the cleanup path is
|
|
271
|
+
* snapshot-free); it runs at most once per session (cache miss), never per-tick.
|
|
272
|
+
* Best-effort: any failure returns []. */
|
|
279
273
|
function snapshotProcs() {
|
|
280
274
|
try {
|
|
281
275
|
if (process.platform === 'win32') {
|
|
282
276
|
const script = 'Get-CimInstance Win32_Process | ' +
|
|
283
277
|
'Select-Object ProcessId,ParentProcessId,Name,CommandLine | ' +
|
|
284
278
|
'ConvertTo-Json -Compress';
|
|
285
|
-
const out = (0,
|
|
279
|
+
const out = (0, child_process_1.execFileSync)('powershell.exe', ['-NoProfile', '-NonInteractive', '-Command', script], { timeout: 12_000, maxBuffer: 64 * 1024 * 1024, windowsHide: true }).toString();
|
|
286
280
|
const parsed = JSON.parse(out);
|
|
287
281
|
const arr = Array.isArray(parsed) ? parsed : [parsed];
|
|
288
282
|
return arr
|
|
@@ -294,8 +288,7 @@ function snapshotProcs() {
|
|
|
294
288
|
}))
|
|
295
289
|
.filter(p => Number.isFinite(p.pid) && p.pid > 0);
|
|
296
290
|
}
|
|
297
|
-
|
|
298
|
-
const out = (0, proc_1.safeExecFileSync)('ps', ['-eo', 'pid=,ppid=,comm=,args='], {
|
|
291
|
+
const out = (0, child_process_1.execFileSync)('ps', ['-eo', 'pid=,ppid=,comm=,args='], {
|
|
299
292
|
timeout: 12_000, maxBuffer: 64 * 1024 * 1024,
|
|
300
293
|
}).toString();
|
|
301
294
|
const rows = [];
|
|
@@ -311,28 +304,11 @@ function snapshotProcs() {
|
|
|
311
304
|
return [];
|
|
312
305
|
}
|
|
313
306
|
}
|
|
314
|
-
/**
|
|
315
|
-
*
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
return m ? Number(m[1]) : null;
|
|
319
|
-
}
|
|
320
|
-
/** Resolve the owning claude.exe PID for the CURRENT process by walking its
|
|
321
|
-
* parent chain in a fresh snapshot. MUST be called from a hook — a direct, live
|
|
322
|
-
* descendant of the session's claude.exe whose chain is intact — NOT from the
|
|
323
|
-
* watcher, whose launcher shell collapses within seconds of arming (proven
|
|
324
|
-
* 2026-06-05: a supervisor's ancestry to claude.exe is already severed at arm
|
|
325
|
-
* time). The hook stamps the result into the watch command (`--owner-pid`) so
|
|
326
|
-
* the reaper can later ask "is that exact claude.exe still alive?" instead of
|
|
327
|
-
* re-walking a chain that no longer exists. Returns null if no claude.exe
|
|
328
|
-
* ancestor is found (caller omits the flag → the watcher is flagless → the reaper
|
|
329
|
-
* KEEPS it, since the ancestry fallback that used to reap it was a false-kill).
|
|
330
|
-
* adr: adr/monitor-resilience.md */
|
|
307
|
+
/** Resolve the owning claude.exe PID for the CURRENT process by walking its parent
|
|
308
|
+
* chain. MUST be called from a hook (a live descendant of claude.exe). Stamped
|
|
309
|
+
* into the arm command (`--owner-pid`) for audit continuity. Cache hit skips the
|
|
310
|
+
* snapshot. Returns null if no claude.exe ancestor. adr: adr/monitor-resilience.md */
|
|
331
311
|
function resolveClaudeOwnerPid(cacheShort) {
|
|
332
|
-
// Cache hit: the owning claude.exe is still alive (stable for the session) —
|
|
333
|
-
// skip the snapshot entirely. This is what turns the per-prompt hot path from
|
|
334
|
-
// "spawn a powershell every turn" into "read a file + one syscall".
|
|
335
|
-
// adr: adr/monitor-resilience.md
|
|
336
312
|
if (cacheShort) {
|
|
337
313
|
const cached = readOwnerCache(cacheShort);
|
|
338
314
|
if (cached && pidAlive(cached))
|
|
@@ -364,106 +340,116 @@ function resolveClaudeOwnerPid(cacheShort) {
|
|
|
364
340
|
return null;
|
|
365
341
|
}
|
|
366
342
|
}
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
343
|
+
// ---- The count-cap -------------------------------------------------------
|
|
344
|
+
/** Pure verdict (testable): given a session's LIVE entries, keep the K freshest
 * (by `startedAt`, missing = 0) and return the older surplus to kill. Never kills
 * when count ≤ K; never kills more than count − K; never the K freshest.
 *
 * The floor is enforced here, not trusted from the caller: K is coerced to a
 * number, floored, and clamped to at least 1, so K = 0 or a negative K can no
 * longer empty the keep set (a negative K previously hit `slice`'s
 * from-the-end semantics). A K that is not a number at all (NaN) is treated as
 * "no usable cap" and kills nothing.
 *
 * The input array is never mutated.
 * @param {Array<object>} entries live entries of ONE session
 * @param {number} K how many of the freshest entries to keep
 * @returns {{keep: Array<object>, kill: Array<object>}} */
function capSurplusVerdict(entries, K) {
    const requested = Number(K);
    // NaN → Infinity: an unusable cap must fail safe (keep everything).
    const cap = Number.isNaN(requested) ? Infinity : Math.max(1, Math.floor(requested));
    if (entries.length <= cap)
        return { keep: entries.slice(), kill: [] };
    const sorted = [...entries].sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
    return { keep: sorted.slice(0, cap), kill: sorted.slice(cap) };
}
|
|
373
|
-
/**
|
|
374
|
-
*
|
|
375
|
-
*
|
|
376
|
-
*
|
|
377
|
-
*
|
|
378
|
-
*
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
354
|
+
/** Count-cap every session: keep the K freshest LIVE watchers, kill only the
 * surplus. NO death-detection, NO process snapshot — per-session file reads +
 * `pidAlive()`, and a process-tree kill only when there is genuine surplus. Run
 * at SessionStart (via the recap hook) and exposed as `greprag inbox reap`. The
 * name is retained for its callers; the behaviour is "cap", not "judge orphans".
 *
 * A session's pidfile is rewritten only when this pass changed something: dead
 * entries were swept, the list is empty (which removes the file), or surplus was
 * processed. An unchanged list is left untouched, matching `isLocallyArmed` /
 * `listLocalLiveWatchers`, so a routine pass cannot overwrite an entry that
 * another supervisor appended between this pass's read and write.
 *
 * @param {number} [K] per-session cap; defaults to `DEFAULT_WATCHER_CAP`
 * @returns {{scanned: number, orphans: number, killed: number[]}} `scanned` =
 *   live entries seen, `orphans` = surplus entries selected, `killed` = pids
 *   whose kill succeeded
 * adr: adr/monitor-resilience.md */
function reapOrphanWatchers(K = exports.DEFAULT_WATCHER_CAP) {
    let scanned = 0;
    let surplus = 0;
    const killedAll = [];
    for (const short of allShorts()) {
        const entries = readWatcherEntries(short);
        const alive = entries.filter(r => pidAlive(r.pid));
        scanned += alive.length;
        const { keep, kill } = capSurplusVerdict(alive, K);
        if (kill.length === 0) {
            // Nothing to cap. Touch the file only to sweep dead entries, or to
            // remove a pidfile that holds no live entry at all.
            if (alive.length === 0 || alive.length !== entries.length)
                writeWatcherEntries(short, alive);
            continue;
        }
        surplus += kill.length;
        const survivors = keep.slice();
        for (const w of kill) {
            watcherAuditLog(`cap-candidate=${w.pid} session=${short} SURPLUS→kill startedAt=${w.startedAt}`);
            if (killProcessTree(w.pid))
                killedAll.push(w.pid);
            else
                survivors.push(w); // kill failed → keep tracking it so it's retried
        }
        writeWatcherEntries(short, survivors);
    }
    watcherAuditLog(`cap-summary K=${K} scanned=${scanned} surplus=${surplus} killed=[${killedAll.join(',')}]`);
    return { scanned, orphans: surplus, killed: killedAll };
}
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
const procs = snapshotProcs();
|
|
436
|
-
if (procs.length === 0) {
|
|
437
|
-
watcherAuditLog('reap-summary snapshot-empty no-op');
|
|
438
|
-
return { scanned: 0, orphans: 0, killed: [] };
|
|
439
|
-
}
|
|
440
|
-
const byPid = new Map();
|
|
441
|
-
for (const p of procs)
|
|
442
|
-
byPid.set(p.pid, p);
|
|
443
|
-
// Candidate set — the SAFETY GUARD. Only these are ever eligible to be killed.
|
|
444
|
-
const watchers = procs.filter(p => WATCHER_CMD_RE.test(p.cmd));
|
|
445
|
-
const verdicts = watchers.map(w => ({ w, ...severedVerdict(w, byPid) }));
|
|
446
|
-
const severed = verdicts.filter(v => v.severed).map(v => v.w);
|
|
447
|
-
// Visibility: record every candidate's verdict + the reason (which signal
|
|
448
|
-
// decided it) BEFORE any kill, so a false-positive is auditable after the
|
|
449
|
-
// fact. adr: adr/monitor-resilience.md
|
|
450
|
-
for (const v of verdicts) {
|
|
451
|
-
watcherAuditLog(`reap-candidate=${v.w.pid} name=${v.w.name} ppid=${v.w.ppid} ${v.severed ? 'SEVERED→kill' : 'ok→keep'} via=${v.reason} cmd="${(v.w.cmd || '').slice(0, 90)}"`);
|
|
452
|
-
}
|
|
386
|
+
// ---- Dead-session GC (the re-resolved-owner cleanup) ----------------------
|
|
387
|
+
// The count-cap bounds a LIVE session to K; EPIPE + the relauncher clean a
|
|
388
|
+
// CLEANLY-closed session. This collects the remaining tail: a session whose
|
|
389
|
+
// claude.exe HARD-crashes (half-open pipe, no EPIPE) leaves ≤K watchers nothing
|
|
390
|
+
// else reaps. It asks "is the SESSION alive?" — reliably knowable via the
|
|
391
|
+
// SessionStart-refreshed owner record — NOT "is this WATCHER an orphan?"
|
|
392
|
+
// (unknowable). The grace window + `deadSince` clock make it immune to the restart
|
|
393
|
+
// false-kill that doomed the owner-pid reaper: a model/Fast-mode switch re-resolves
|
|
394
|
+
// the owner well within grace, overwriting the record and clearing deadSince.
|
|
395
|
+
// adr: adr/monitor-resilience.md
|
|
396
|
+
/** Grace window, in MILLISECONDS (default: 5 minutes), that a session's owner must
 * stay CONTINUOUSLY dead before its watchers are GC'd. Must exceed the worst-case
 * claude.exe restart→SessionStart window (which is seconds), so a restart can
 * never be mistaken for a death. */
|
|
399
|
+
exports.GC_GRACE_MS = 5 * 60 * 1000;
|
|
400
|
+
/** Pure GC decision for one session (no I/O, testable).
 * @param {object|null} owner the session's owner record, if any
 * @param {boolean} ownerAlive whether the owner's pid is alive right now
 * @param {number} now current time in ms
 * @param {number} graceMs how long the owner must have been dead before reaping
 * @returns {'keep'|'mark-dead'|'reap'}
 *   'keep'      — no owner record, owner alive, or still within the grace window;
 *   'mark-dead' — owner dead and no `deadSince` recorded yet (start the clock);
 *   'reap'      — owner dead for strictly longer than `graceMs`. */
function gcVerdict(owner, ownerAlive, now, graceMs) {
    if (!owner || ownerAlive) {
        return 'keep';
    }
    if (owner.deadSince === undefined) {
        return 'mark-dead';
    }
    const deadForMs = now - owner.deadSince;
    if (deadForMs > graceMs) {
        return 'reap';
    }
    return 'keep';
}
|
|
411
|
+
/** Reap the watchers of every session whose recorded owner has stayed dead past
 * the grace window. Run at SessionStart, AFTER refreshing THIS session's own
 * owner (so it is never its own victim). No process snapshot — owner-file reads +
 * `pidAlive()` syscalls.
 *
 * Per session (decision delegated to `gcVerdict`):
 *   - keep: nothing to do, except clearing a stale `deadSince` when the owner is
 *     alive again;
 *   - mark-dead: stamp `deadSince = now` to start the grace clock;
 *   - reap: kill the session's still-alive watcher entries.
 *
 * On reap, entries whose pid is already gone are skipped (no kill is issued for
 * a pid that no longer exists). If any kill FAILS, the failed entries are written
 * back and the owner record is left in place with its `deadSince`, so the next
 * run retries. Only when no watcher is left are the pidfile and owner record
 * removed and the session reported as reaped.
 *
 * @param {number} [graceMs] grace window in ms; defaults to `GC_GRACE_MS`
 * @returns {{reaped: string[], killed: number[]}} sessions fully cleaned up, and
 *   pids whose kill succeeded
 * adr: adr/monitor-resilience.md */
function gcDeadSessions(graceMs = exports.GC_GRACE_MS) {
    const now = Date.now();
    const reaped = [];
    const killed = [];
    for (const short of allShorts()) {
        const owner = readSessionOwner(short);
        // Probe once so the verdict and the recovery check agree.
        const ownerAlive = !!owner && pidAlive(owner.pid);
        const verdict = gcVerdict(owner, ownerAlive, now, graceMs);
        if (verdict === 'keep') {
            if (ownerAlive && owner.deadSince !== undefined) {
                writeSessionOwner(short, { pid: owner.pid, ts: owner.ts }); // recovered → clear the grace clock
            }
            continue;
        }
        if (verdict === 'mark-dead') {
            writeSessionOwner(short, { ...owner, deadSince: now }); // start the grace clock
            continue;
        }
        // reap: the owner stayed dead past grace → the session is truly gone.
        const deadFor = Math.round((now - (owner.deadSince || now)) / 1000);
        const unkilled = [];
        for (const e of readWatcherEntries(short)) {
            if (!pidAlive(e.pid))
                continue; // already gone — nothing to kill
            watcherAuditLog(`gc-kill=${e.pid} session=${short} owner=${owner.pid}:dead-${deadFor}s`);
            if (killProcessTree(e.pid))
                killed.push(e.pid);
            else
                unkilled.push(e); // kill failed → keep tracking it so it's retried
        }
        if (unkilled.length > 0) {
            // Leave the owner record (deadSince intact) so the next run reaps again.
            writeWatcherEntries(short, unkilled);
            continue;
        }
        removeWatcherPidfile(short);
        removeSessionOwner(short);
        reaped.push(short);
    }
    if (reaped.length)
        watcherAuditLog(`gc-summary reaped=[${reaped.join(',')}] killed=[${killed.join(',')}]`);
    return { reaped, killed };
}
|
|
461
447
|
/** Force-kill a process and its descendants. `taskkill /T` reaps the tree in one
|
|
462
448
|
* shot so a supervisor can't respawn its child in the race window. */
|
|
463
449
|
function killProcessTree(pid) {
|
|
464
450
|
try {
|
|
465
451
|
if (process.platform === 'win32') {
|
|
466
|
-
(0,
|
|
452
|
+
(0, child_process_1.execFileSync)('taskkill.exe', ['/PID', String(pid), '/T', '/F'], { timeout: 5_000, stdio: 'ignore', windowsHide: true });
|
|
467
453
|
}
|
|
468
454
|
else {
|
|
469
455
|
try {
|
|
@@ -477,27 +463,4 @@ function killProcessTree(pid) {
|
|
|
477
463
|
return false;
|
|
478
464
|
}
|
|
479
465
|
}
|
|
480
|
-
/** Remove pidfiles whose recorded PID is dead. Keeps `isLocallyArmed` honest
|
|
481
|
-
* even when a watcher died without removing its own file (hard kill / crash). */
|
|
482
|
-
function sweepDeadPidfiles() {
|
|
483
|
-
try {
|
|
484
|
-
const dir = watchersDir();
|
|
485
|
-
if (!dir || !fs.existsSync(dir))
|
|
486
|
-
return;
|
|
487
|
-
for (const f of fs.readdirSync(dir)) {
|
|
488
|
-
if (!f.endsWith('.json'))
|
|
489
|
-
continue;
|
|
490
|
-
try {
|
|
491
|
-
const rec = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf-8'));
|
|
492
|
-
if (!rec || typeof rec.pid !== 'number' || !pidAlive(rec.pid)) {
|
|
493
|
-
fs.rmSync(path.join(dir, f), { force: true });
|
|
494
|
-
}
|
|
495
|
-
}
|
|
496
|
-
catch {
|
|
497
|
-
fs.rmSync(path.join(dir, f), { force: true }); // unparseable → drop
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
}
|
|
501
|
-
catch { /* best-effort */ }
|
|
502
|
-
}
|
|
503
466
|
//# sourceMappingURL=watcher-registry.js.map
|