greprag 5.43.1 → 5.43.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/inbox-watch-supervisor.js +21 -2
- package/dist/commands/inbox-watch-supervisor.js.map +1 -1
- package/dist/commands/watcher-registry.d.ts +93 -73
- package/dist/commands/watcher-registry.js +254 -288
- package/dist/commands/watcher-registry.js.map +1 -1
- package/dist/hook.js +18 -11
- package/dist/hook.js.map +1 -1
- package/dist/session-id.d.ts +18 -10
- package/dist/session-id.js +24 -16
- package/dist/session-id.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,32 +1,35 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
/** Local watcher process registry +
|
|
2
|
+
/** Local watcher process registry + COUNT-CAP (the post-saga model).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
* (consumer dead, process still respawning) as "armed". Two local signals fix
|
|
11
|
-
* that:
|
|
4
|
+
* THE LESSON (adr/monitor-resilience.md, "⭐ The watcher in one pass"): a
|
|
5
|
+
* functional watcher and an orphaned-but-alive one are EXTERNALLY
|
|
6
|
+
* INDISTINGUISHABLE — same command line, same live PID. Only the watcher itself
|
|
7
|
+
* can tell, via its own stdout pipe (EPIPE). So every external reaper that
|
|
8
|
+
* *judged* orphan-ness (ancestry, owner-pid, monitor-pid) was guessing, and the
|
|
9
|
+
* guesses false-killed live watchers — the 548-kill saga in one line.
|
|
12
10
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
* to gate re-arming with zero cloud dependency and zero ghost-lease lag.
|
|
16
|
-
* Paired with EPIPE-terminal in the watcher (a watcher whose consumer pipe
|
|
17
|
-
* breaks exits and removes its own pidfile), a live pidfile means a live,
|
|
18
|
-
* *consumed* watcher.
|
|
11
|
+
* This module no longer judges. Two reliable facts replace the one unknowable
|
|
12
|
+
* one:
|
|
19
13
|
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
* unrelated node server. A bug can at worst kill a watcher that then
|
|
27
|
-
* re-arms; it can never touch the operator's work.
|
|
14
|
+
* 1. **pidfile registry** — `~/.greprag/watchers/<short>.json` is a LIST of every
|
|
15
|
+
* supervisor armed for that session (`{short,pid,startedAt,ownerPid}`). A
|
|
16
|
+
* supervisor appends on arm and removes its own entry on terminal exit; a
|
|
17
|
+
* hard kill leaves a dead entry that the alive-filter sweeps. `isLocallyArmed`
|
|
18
|
+
* = "any entry alive". (Multiple live supervisors per session is normal under
|
|
19
|
+
* the June-8 spray; the count-cap bounds them.)
|
|
28
20
|
*
|
|
29
|
-
*
|
|
21
|
+
* 2. **count-cap** — `reapOrphanWatchers` keeps the K freshest LIVE watchers per
|
|
22
|
+
* session and kills only the surplus, NEVER below K. No death-detection → no
|
|
23
|
+
* false-kill; a floor of K → it can never reap to zero, so the bug that broke
|
|
24
|
+
* liveness every time is structurally impossible. "Freshest" is a *bias*
|
|
25
|
+
* toward the just-armed functional one, not a guarantee — and that's fine:
|
|
26
|
+
* EPIPE kills a wrongly-kept orphan in ~30s and the supervisor respawns. It is
|
|
27
|
+
* **snapshot-free** (per-session file reads + `pidAlive()` syscalls), so the
|
|
28
|
+
* 06-05b conhost-OOM engine — a `powershell Get-CimInstance` per SessionStart —
|
|
29
|
+
* is gone from the cleanup path entirely.
|
|
30
|
+
*
|
|
31
|
+
* EPIPE-terminal (in the watcher) + the supervisor's programmatic respawn do the
|
|
32
|
+
* real liveness work; the count-cap only bounds the pile. adr: adr/monitor-resilience.md */
|
|
30
33
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
31
34
|
if (k2 === undefined) k2 = k;
|
|
32
35
|
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
@@ -61,43 +64,31 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
61
64
|
};
|
|
62
65
|
})();
|
|
63
66
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
67
|
+
exports.GC_GRACE_MS = exports.DEFAULT_WATCHER_CAP = void 0;
|
|
64
68
|
exports.writeWatcherPidfile = writeWatcherPidfile;
|
|
69
|
+
exports.removeWatcherEntry = removeWatcherEntry;
|
|
65
70
|
exports.removeWatcherPidfile = removeWatcherPidfile;
|
|
66
71
|
exports.watcherAuditLog = watcherAuditLog;
|
|
67
72
|
exports.isLocallyArmed = isLocallyArmed;
|
|
68
73
|
exports.listLocalLiveWatchers = listLocalLiveWatchers;
|
|
69
74
|
exports.resolveClaudeOwnerPid = resolveClaudeOwnerPid;
|
|
70
|
-
exports.
|
|
75
|
+
exports.capSurplusVerdict = capSurplusVerdict;
|
|
71
76
|
exports.reapOrphanWatchers = reapOrphanWatchers;
|
|
77
|
+
exports.gcVerdict = gcVerdict;
|
|
78
|
+
exports.gcDeadSessions = gcDeadSessions;
|
|
72
79
|
const fs = __importStar(require("fs"));
|
|
73
80
|
const path = __importStar(require("path"));
|
|
74
81
|
const child_process_1 = require("child_process");
|
|
75
82
|
const WATCHER_DIRNAME = 'watchers';
|
|
76
|
-
// Matches a greprag watcher's command line in every launch shape
|
|
77
|
-
//
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
//
|
|
81
|
-
// would also match an unrelated shell that merely mentions the phrase (a grep, a
|
|
82
|
-
// diagnostic, this very reaper). This is the SAFETY GUARD: only a process whose
|
|
83
|
-
// command line is an actual watcher invocation is ever a kill candidate, so the
|
|
84
|
-
// reaper cannot touch claude.exe, an editor, or an incidental shell.
|
|
85
|
-
const WATCHER_CMD_RE = /(?:index\.js|greprag)["'\s]+inbox\s+watch\b/i;
|
|
83
|
+
// Matches a bare `claude` / `claude.exe` name (case-insensitive, whole string —
// never a substring of a longer name). Retained for the owner-pid resolver's
// claude.exe match; the cleanup path no longer scans the process table at all.
|
|
86
88
|
const CLAUDE_PROC_RE = /^claude(\.exe)?$/i;
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
// within seconds of arming (a backgrounded daemon's parent shell exits by
|
|
91
|
-
// design), so "can I still walk up to a claude.exe?" yields false positives on
|
|
92
|
-
// LIVE watchers. "Is the specific claude.exe that armed me still alive?" does
|
|
93
|
-
// not. adr: adr/monitor-resilience.md
|
|
94
|
-
const OWNER_PID_RE = /--owner-pid[=\s]+(\d+)/;
|
|
95
|
-
// LIVE as of 2026-06-05 (before/after passed: keeps live watchers, still reaps
|
|
96
|
-
// genuine orphans, falls back to the ancestry walk for legacy watchers with no
|
|
97
|
-
// `--owner-pid`). Default ON; set GREPRAG_REAP_BY_OWNER=0 as an escape hatch to
|
|
98
|
-
// revert to the pure ancestry walk. Read per-call (not a load-time const) so a
|
|
99
|
-
// single process can exercise both modes. adr: adr/monitor-resilience.md
|
|
100
|
-
function reapByOwner() { return process.env.GREPRAG_REAP_BY_OWNER !== '0'; }
|
|
89
|
+
/** Default per-session watcher floor: keep this many freshest live watchers, reap
|
|
90
|
+
* only the surplus above it, never below it. K=2 = the live one + one margin. */
|
|
91
|
+
exports.DEFAULT_WATCHER_CAP = 2;
|
|
101
92
|
function grepragHome() {
|
|
102
93
|
const home = process.env.HOME || process.env.USERPROFILE || '';
|
|
103
94
|
return home ? path.join(home, '.greprag') : null;
|
|
@@ -106,109 +97,139 @@ function watchersDir() {
|
|
|
106
97
|
const h = grepragHome();
|
|
107
98
|
return h ? path.join(h, WATCHER_DIRNAME) : null;
|
|
108
99
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
100
|
+
/** Path of a session's pidfile (`<watchers dir>/<short>.json`), or null when the
 * watchers directory cannot be resolved. */
function pidfilePath(short) {
    const watchers = watchersDir();
    if (!watchers) {
        return null;
    }
    return path.join(watchers, `${short}.json`);
}
|
|
104
|
+
/** True iff `pid` is a live process. `process.kill(pid, 0)` sends no signal — it
|
|
105
|
+
* only probes existence. EPERM = exists but another user (alive); ESRCH = gone. */
|
|
106
|
+
function pidAlive(pid) {
    // Reject anything that cannot be a real PID before touching the OS.
    const plausible = Number.isFinite(pid) && pid > 0;
    if (!plausible) {
        return false;
    }
    let alive = true;
    try {
        // Signal 0 delivers nothing; the call only succeeds or throws.
        process.kill(pid, 0);
    }
    catch (err) {
        // EPERM: the process exists but we may not signal it. Anything else → not alive.
        alive = err?.code === 'EPERM';
    }
    return alive;
}
|
|
117
|
+
/** Read a session's entry LIST. Back-compat: a legacy single-object pidfile reads
|
|
118
|
+
* as a 1-element list. */
|
|
119
|
+
function readWatcherEntries(short) {
    // An entry is usable only if it is a non-null value carrying a numeric pid.
    const isEntry = (rec) => Boolean(rec) && typeof rec.pid === 'number';
    try {
        const file = pidfilePath(short);
        if (!file) {
            return [];
        }
        const text = fs.readFileSync(file, 'utf-8');
        const parsed = JSON.parse(text);
        // Legacy pidfiles hold one object rather than an array — wrap it.
        const list = Array.isArray(parsed) ? parsed : [parsed];
        return list.filter(isEntry);
    }
    catch {
        // Missing or unparseable file reads as "no entries".
        return [];
    }
}
|
|
132
|
+
/** Persist a session's entry list to `<watchers dir>/<short>.json`. An empty list
 * deletes the pidfile instead of writing `[]`. Best-effort: every failure is
 * swallowed, because a failed write only means a re-arm, never a crash.
 *
 * The list is written to a per-process temp file and renamed over the pidfile so
 * a concurrent reader never observes a truncated / half-written file (which
 * `JSON.parse` would reject and the readers would treat as "no entries"). If the
 * rename path fails, it falls back to the plain in-place write. NOTE: this does
 * not make a caller's read-modify-write sequence atomic; it only protects
 * readers from partial content.
 *
 * @param {string} short   session short id (pidfile basename)
 * @param {Array<object>} entries  entries to persist
 */
function writeWatcherEntries(short, entries) {
    let tmp = null;
    try {
        const dir = watchersDir();
        if (!dir)
            return;
        fs.mkdirSync(dir, { recursive: true });
        const p = path.join(dir, `${short}.json`);
        if (entries.length === 0) {
            fs.rmSync(p, { force: true });
            return;
        }
        const payload = JSON.stringify(entries);
        // Suffix is not `.json`, so a leftover temp file is never mistaken for a pidfile.
        tmp = `${p}.${process.pid}.tmp`;
        try {
            fs.writeFileSync(tmp, payload);
            fs.renameSync(tmp, p);
            tmp = null; // renamed away — nothing left to clean up
        }
        catch {
            // Rename path unavailable (e.g. destination locked) → plain write.
            fs.writeFileSync(p, payload);
        }
    }
    catch { /* best-effort — a failed write only means re-arm, never a crash */ }
    finally {
        if (tmp) {
            try {
                fs.rmSync(tmp, { force: true });
            }
            catch { /* best-effort */ }
        }
    }
}
|
|
147
|
+
/** APPEND this supervisor's entry to the session's list (pruning dead entries as
|
|
148
|
+
* it goes). Best-effort. adr: adr/monitor-resilience.md */
|
|
149
|
+
function writeWatcherPidfile(short, pid, ownerPid) {
    // Carry over every OTHER entry that is still alive; drop dead ones and any
    // earlier record for this same pid.
    const others = [];
    for (const rec of readWatcherEntries(short)) {
        if (rec.pid !== pid && pidAlive(rec.pid)) {
            others.push(rec);
        }
    }
    const mine = { short, pid, startedAt: Date.now(), ownerPid };
    writeWatcherEntries(short, [...others, mine]);
}
|
|
154
|
+
/** Remove THIS supervisor's entry (called on a supervisor's terminal exit). */
|
|
155
|
+
function removeWatcherEntry(short, pid) {
    // Rewrite the list without this pid; liveness of the other entries is not checked here.
    const remaining = readWatcherEntries(short).filter((rec) => rec.pid !== pid);
    writeWatcherEntries(short, remaining);
}
|
|
158
|
+
/** Remove a session's whole pidfile (legacy / full sweep). */
|
|
159
|
+
function removeWatcherPidfile(short) {
    try {
        const target = pidfilePath(short);
        if (!target) {
            return;
        }
        // `force` makes a missing file a no-op rather than an error.
        fs.rmSync(target, { force: true });
    }
    catch { /* best-effort */ }
}
|
|
124
|
-
/** Every
|
|
125
|
-
|
|
126
|
-
function readAllPidfiles() {
|
|
167
|
+
/** Every session short with a pidfile on disk. */
|
|
168
|
+
function allShorts() {
    const SUFFIX = '.json';
    try {
        const dir = watchersDir();
        if (!dir || !fs.existsSync(dir)) {
            return [];
        }
        const shorts = [];
        for (const file of fs.readdirSync(dir)) {
            // Only `*.json` files are pidfiles; the short is the name minus the suffix.
            if (file.endsWith(SUFFIX)) {
                shorts.push(file.slice(0, -SUFFIX.length));
            }
        }
        return shorts;
    }
    catch {
        return [];
    }
}
|
|
148
|
-
// ---- Owner-PID
|
|
149
|
-
// The
|
|
150
|
-
//
|
|
151
|
-
//
|
|
152
|
-
// engine: each call spawned a `powershell Get-CimInstance Win32_Process` whose
|
|
153
|
-
// conhost orphaned under load. Cache the result keyed by session short; the hot
|
|
154
|
-
// path becomes a file read + one `pidAlive()` syscall, no subprocess.
|
|
155
|
-
// adr: adr/monitor-resilience.md
|
|
179
|
+
// ---- Owner-PID cache (vestigial under the count-cap; kept for the arm stamp) ----
|
|
180
|
+
// The `notify` hook still stamps `--owner-pid` into the arm command for audit
|
|
181
|
+
// continuity. The cap never reads it; this cache just keeps the per-prompt resolve
|
|
182
|
+
// cheap (a file read instead of a process snapshot) on the rare cache miss.
|
|
156
183
|
/** Path of a session's owner record (`<watchers dir>/<short>.owner`), or null when
 * the watchers directory cannot be resolved. */
function ownerCachePath(short) {
    const watchers = watchersDir();
    if (!watchers) {
        return null;
    }
    return path.join(watchers, `${short}.owner`);
}
|
|
160
|
-
function
|
|
187
|
+
/** Read a session's owner record. Accepts two on-disk shapes: a legacy file
 * holding just the digits of a PID (returned as `{ pid, ts: 0 }`), or a JSON
 * object with a numeric `pid` (returned as parsed). Returns null when the file is
 * missing, unreadable, unparseable, or carries no usable pid. */
function readSessionOwner(short) {
    try {
        const file = ownerCachePath(short);
        if (!file) {
            return null;
        }
        const text = fs.readFileSync(file, 'utf-8').trim();
        const isBarePid = /^\d+$/.test(text);
        if (isBarePid) {
            // legacy bare PID
            const legacyPid = Number(text);
            if (legacyPid > 0) {
                return { pid: legacyPid, ts: 0 };
            }
            return null;
        }
        const record = JSON.parse(text);
        const usable = record && typeof record.pid === 'number';
        return usable ? record : null;
    }
    catch {
        return null;
    }
}
|
|
172
|
-
function
|
|
204
|
+
/** Write `o` as JSON to the session's `<short>.owner` file, creating the watchers
 * directory if needed. Best-effort: failures are swallowed. */
function writeSessionOwner(short, o) {
    try {
        const dir = watchersDir();
        if (dir) {
            fs.mkdirSync(dir, { recursive: true });
            const target = path.join(dir, `${short}.owner`);
            fs.writeFileSync(target, JSON.stringify(o));
        }
    }
    catch { /* best-effort */ }
}
|
|
182
|
-
function
|
|
214
|
+
/** Delete the session's `<short>.owner` file if it exists. Best-effort. */
function removeSessionOwner(short) {
    try {
        const target = ownerCachePath(short);
        if (!target) {
            return;
        }
        fs.rmSync(target, { force: true });
    }
    catch { /* best-effort */ }
}
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
if (rec && typeof rec.pid === 'number' && rec.short === short)
|
|
199
|
-
return rec;
|
|
200
|
-
return null;
|
|
201
|
-
}
|
|
202
|
-
catch {
|
|
203
|
-
return null;
|
|
204
|
-
}
|
|
222
|
+
// Thin wrappers for resolveClaudeOwnerPid (unchanged contract): read returns the
// current owner PID (or null); write replaces the record with a fresh
// `{ pid, ts }`, which also drops any deadSince stamp.
|
|
224
|
+
function readOwnerCache(short) {
    const record = readSessionOwner(short);
    if (record === null || record === undefined) {
        return null;
    }
    // A record without a pid still reads as "no owner".
    return record.pid ?? null;
}
|
|
227
|
+
function writeOwnerCache(short, ownerPid) {
    // A brand-new record: only pid + timestamp, so no other field is carried over.
    const fresh = { pid: ownerPid, ts: Date.now() };
    writeSessionOwner(short, fresh);
}
|
|
206
|
-
/** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON
|
|
207
|
-
*
|
|
208
|
-
*
|
|
209
|
-
* and the reaper is a different process than the watcher. Behaviour-neutral:
|
|
210
|
-
* pure observation, never gates a kill or an arm. Cheap (a few lines per
|
|
211
|
-
* SessionStart + per watcher arm/exit). adr: adr/monitor-resilience.md */
|
|
230
|
+
/** Append-only diagnostic audit log at ~/.greprag/watchers/audit.log. ALWAYS ON so
|
|
231
|
+
* every arm / terminal exit / cap-kill leaves a trail. Behaviour-neutral: pure
|
|
232
|
+
* observation, never gates a kill or an arm. adr: adr/monitor-resilience.md */
|
|
212
233
|
function watcherAuditLog(event) {
|
|
213
234
|
try {
|
|
214
235
|
const dir = watchersDir();
|
|
@@ -219,59 +240,36 @@ function watcherAuditLog(event) {
|
|
|
219
240
|
}
|
|
220
241
|
catch { /* best-effort — visibility must never break the watcher */ }
|
|
221
242
|
}
|
|
222
|
-
/**
|
|
223
|
-
*
|
|
224
|
-
*
|
|
225
|
-
|
|
226
|
-
if (!Number.isFinite(pid) || pid <= 0)
|
|
227
|
-
return false;
|
|
228
|
-
try {
|
|
229
|
-
process.kill(pid, 0);
|
|
230
|
-
return true;
|
|
231
|
-
}
|
|
232
|
-
catch (e) {
|
|
233
|
-
return e?.code === 'EPERM';
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
/** Local-first arm check: is there a live watcher PROCESS for this session on
|
|
237
|
-
* THIS machine? This is ground truth for "armed" — the consumer (the Monitor
|
|
238
|
-
* task) is local, so a live local pidfile means a live, consumed watcher.
|
|
239
|
-
* Replaces the server `isSessionArmed` check, which counts orphans (consumer
|
|
240
|
-
* dead, socket still open) as armed and so both (a) suppresses re-arm when the
|
|
241
|
-
* real watcher is gone and (b) lets re-arm stack new watchers on undead
|
|
242
|
-
* orphans. A dead-PID pidfile is swept here so the next turn re-arms. */
|
|
243
|
+
/** Local-first arm check: does this session have ANY live watcher process on THIS
|
|
244
|
+
* machine? Ground truth for "armed" — the consumer (the Monitor task) is local,
|
|
245
|
+
* so a live local entry means a live, consumed watcher. Sweeps dead entries as it
|
|
246
|
+
* reads, so a hard-killed watcher doesn't read as armed. */
|
|
243
247
|
function isLocallyArmed(short) {
    const recorded = readWatcherEntries(short);
    const live = [];
    for (const rec of recorded) {
        if (pidAlive(rec.pid)) {
            live.push(rec);
        }
    }
    // Only touch the file when dead entries were actually dropped.
    const sweptSomething = live.length !== recorded.length;
    if (sweptSomething) {
        writeWatcherEntries(short, live);
    }
    return live.length > 0;
}
|
|
253
|
-
/**
|
|
254
|
-
*
|
|
255
|
-
*
|
|
256
|
-
*
|
|
257
|
-
* is computed on demand from ground truth, never a cached/replicated list, so it
|
|
258
|
-
* cannot go stale the way the cloud `/attached` registry does. adr:
|
|
259
|
-
* adr/monitor-resilience.md */
|
|
254
|
+
/** Every session whose watcher process is alive on THIS machine right now —
|
|
255
|
+
* computed on demand from the pidfile registry + OS liveness, sweeping dead
|
|
256
|
+
* entries. Most-recently-armed first. This is what the desk-line returns when the
|
|
257
|
+
* cloud asks "what's truly active?". adr: adr/monitor-resilience.md */
|
|
260
258
|
function listLocalLiveWatchers() {
    const live = allShorts().flatMap((short) => {
        const recorded = readWatcherEntries(short);
        const alive = recorded.filter((rec) => pidAlive(rec.pid));
        // Persist the sweep only when it removed something.
        if (alive.length !== recorded.length) {
            writeWatcherEntries(short, alive);
        }
        return alive;
    });
    // Newest first; a missing startedAt sorts as 0 (oldest).
    return live.sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
}
|
|
271
|
-
/** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM
|
|
272
|
-
*
|
|
273
|
-
*
|
|
274
|
-
* failure returns []
|
|
269
|
+
/** Snapshot (pid, ppid, name, cmdline) for every process. Windows via one CIM call;
|
|
270
|
+
* POSIX via `ps`. ONLY used by `resolveClaudeOwnerPid` now (the cleanup path is
|
|
271
|
+
* snapshot-free); it runs at most once per session (cache miss), never per-tick.
|
|
272
|
+
* Best-effort: any failure returns []. */
|
|
275
273
|
function snapshotProcs() {
|
|
276
274
|
try {
|
|
277
275
|
if (process.platform === 'win32') {
|
|
@@ -290,7 +288,6 @@ function snapshotProcs() {
|
|
|
290
288
|
}))
|
|
291
289
|
.filter(p => Number.isFinite(p.pid) && p.pid > 0);
|
|
292
290
|
}
|
|
293
|
-
// POSIX fallback.
|
|
294
291
|
const out = (0, child_process_1.execFileSync)('ps', ['-eo', 'pid=,ppid=,comm=,args='], {
|
|
295
292
|
timeout: 12_000, maxBuffer: 64 * 1024 * 1024,
|
|
296
293
|
}).toString();
|
|
@@ -307,48 +304,11 @@ function snapshotProcs() {
|
|
|
307
304
|
return [];
|
|
308
305
|
}
|
|
309
306
|
}
|
|
310
|
-
/**
|
|
311
|
-
*
|
|
312
|
-
*
|
|
313
|
-
*
|
|
314
|
-
* the live snapshot) before any claude.exe. Depth- and cycle-guarded. */
|
|
315
|
-
function hasLiveClaudeAncestor(start, byPid) {
|
|
316
|
-
let cur = start;
|
|
317
|
-
const seen = new Set();
|
|
318
|
-
for (let depth = 0; cur && depth < 32; depth++) {
|
|
319
|
-
if (seen.has(cur.pid))
|
|
320
|
-
break; // cycle guard
|
|
321
|
-
seen.add(cur.pid);
|
|
322
|
-
if (CLAUDE_PROC_RE.test(cur.name))
|
|
323
|
-
return true; // start/ancestor IS claude
|
|
324
|
-
const parent = byPid.get(cur.ppid);
|
|
325
|
-
if (!parent)
|
|
326
|
-
return false; // severed — parent not in the live table
|
|
327
|
-
cur = parent;
|
|
328
|
-
}
|
|
329
|
-
return false;
|
|
330
|
-
}
|
|
331
|
-
/** Owning claude.exe PID parsed from a watcher's command line (`--owner-pid N`),
|
|
332
|
-
* or null if absent (a legacy watcher armed before this flag existed). */
|
|
333
|
-
function ownerPidFromCmd(cmd) {
|
|
334
|
-
const m = OWNER_PID_RE.exec(cmd || '');
|
|
335
|
-
return m ? Number(m[1]) : null;
|
|
336
|
-
}
|
|
337
|
-
/** Resolve the owning claude.exe PID for the CURRENT process by walking its
|
|
338
|
-
* parent chain in a fresh snapshot. MUST be called from a hook — a direct, live
|
|
339
|
-
* descendant of the session's claude.exe whose chain is intact — NOT from the
|
|
340
|
-
* watcher, whose launcher shell collapses within seconds of arming (proven
|
|
341
|
-
* 2026-06-05: a supervisor's ancestry to claude.exe is already severed at arm
|
|
342
|
-
* time). The hook stamps the result into the watch command (`--owner-pid`) so
|
|
343
|
-
* the reaper can later ask "is that exact claude.exe still alive?" instead of
|
|
344
|
-
* re-walking a chain that no longer exists. Returns null if no claude.exe
|
|
345
|
-
* ancestor is found (caller omits the flag → reaper falls back to ancestry).
|
|
346
|
-
* adr: adr/monitor-resilience.md */
|
|
307
|
+
/** Resolve the owning claude.exe PID for the CURRENT process by walking its parent
|
|
308
|
+
* chain. MUST be called from a hook (a live descendant of claude.exe). Stamped
|
|
309
|
+
* into the arm command (`--owner-pid`) for audit continuity. Cache hit skips the
|
|
310
|
+
* snapshot. Returns null if no claude.exe ancestor. adr: adr/monitor-resilience.md */
|
|
347
311
|
function resolveClaudeOwnerPid(cacheShort) {
|
|
348
|
-
// Cache hit: the owning claude.exe is still alive (stable for the session) —
|
|
349
|
-
// skip the snapshot entirely. This is what turns the per-prompt hot path from
|
|
350
|
-
// "spawn a powershell every turn" into "read a file + one syscall".
|
|
351
|
-
// adr: adr/monitor-resilience.md
|
|
352
312
|
if (cacheShort) {
|
|
353
313
|
const cached = readOwnerCache(cacheShort);
|
|
354
314
|
if (cached && pidAlive(cached))
|
|
@@ -380,80 +340,109 @@ function resolveClaudeOwnerPid(cacheShort) {
|
|
|
380
340
|
return null;
|
|
381
341
|
}
|
|
382
342
|
}
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
343
|
+
// ---- The count-cap -------------------------------------------------------
|
|
344
|
+
/** Pure verdict (testable): given a session's LIVE entries, keep the K freshest
|
|
345
|
+
* (by startedAt), return the older surplus to kill. Never kills when count ≤ K;
|
|
346
|
+
* never kills more than count − K; never the K freshest. This is the entire
|
|
347
|
+
* liveness-safety guarantee — no death judgment, just a count + a floor. */
|
|
348
|
+
function capSurplusVerdict(entries, K) {
    // A cap that is not a finite, non-negative number is a caller bug. Fail SAFE:
    // keep everything and kill nothing. (Unvalidated, `NaN`/`undefined` would put
    // every entry in `kill`, and a negative K would slice from the END of the
    // list — both break the "never below K" floor.)
    const capIsValid = typeof K === 'number' && Number.isFinite(K) && K >= 0;
    if (!capIsValid || entries.length <= K)
        return { keep: entries.slice(), kill: [] };
    // A fractional cap keeps floor(K) entries — the same count `slice` would keep.
    const floor = Math.floor(K);
    // Newest first; entries without a startedAt sort as oldest. Copy before
    // sorting so the caller's array is never reordered.
    const sorted = [...entries].sort((a, b) => (b.startedAt || 0) - (a.startedAt || 0));
    return { keep: sorted.slice(0, floor), kill: sorted.slice(floor) };
}
|
|
389
|
-
/**
|
|
390
|
-
*
|
|
391
|
-
*
|
|
392
|
-
*
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
354
|
+
/** Count-cap every session: keep the K freshest LIVE watchers, kill only the
|
|
355
|
+
* surplus. NO death-detection, NO process snapshot — per-session file reads +
|
|
356
|
+
* `pidAlive()`, and a `taskkill` only when there is genuine surplus. Run at
|
|
357
|
+
* SessionStart (via the recap hook) and exposed as `greprag inbox reap`. The name
|
|
358
|
+
* is retained for its callers; the behaviour is "cap", not "judge orphans".
|
|
359
|
+
* adr: adr/monitor-resilience.md */
|
|
360
|
+
function reapOrphanWatchers(K = exports.DEFAULT_WATCHER_CAP) {
    const killedAll = [];
    let scanned = 0;
    let surplus = 0;
    for (const short of allShorts()) {
        const live = readWatcherEntries(short).filter((rec) => pidAlive(rec.pid));
        scanned += live.length;
        const verdict = capSurplusVerdict(live, K);
        // With no surplus the live list is written back as-is, which sweeps dead entries.
        let toPersist = live;
        if (verdict.kill.length > 0) {
            surplus += verdict.kill.length;
            toPersist = [...verdict.keep];
            for (const victim of verdict.kill) {
                watcherAuditLog(`cap-candidate=${victim.pid} session=${short} SURPLUS→kill startedAt=${victim.startedAt}`);
                if (killProcessTree(victim.pid)) {
                    killedAll.push(victim.pid);
                }
                else {
                    // Kill failed → leave it in the registry so a later pass retries.
                    toPersist.push(victim);
                }
            }
        }
        writeWatcherEntries(short, toPersist);
    }
    watcherAuditLog(`cap-summary K=${K} scanned=${scanned} surplus=${surplus} killed=[${killedAll.join(',')}]`);
    // `orphans` keeps its historical key name; it now carries the surplus count.
    return { scanned, orphans: surplus, killed: killedAll };
}
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
}
|
|
437
|
-
const byPid = new Map();
|
|
438
|
-
for (const p of procs)
|
|
439
|
-
byPid.set(p.pid, p);
|
|
440
|
-
// Candidate set — the SAFETY GUARD. Only these are ever eligible to be killed.
|
|
441
|
-
const watchers = procs.filter(p => WATCHER_CMD_RE.test(p.cmd));
|
|
442
|
-
const verdicts = watchers.map(w => ({ w, ...severedVerdict(w, byPid) }));
|
|
443
|
-
const severed = verdicts.filter(v => v.severed).map(v => v.w);
|
|
444
|
-
// Visibility: record every candidate's verdict + the reason (which signal
|
|
445
|
-
// decided it) BEFORE any kill, so a false-positive is auditable after the
|
|
446
|
-
// fact. adr: adr/monitor-resilience.md
|
|
447
|
-
for (const v of verdicts) {
|
|
448
|
-
watcherAuditLog(`reap-candidate=${v.w.pid} name=${v.w.name} ppid=${v.w.ppid} ${v.severed ? 'SEVERED→kill' : 'ok→keep'} via=${v.reason} cmd="${(v.w.cmd || '').slice(0, 90)}"`);
|
|
449
|
-
}
|
|
386
|
+
// ---- Dead-session GC (the re-resolved-owner cleanup) ----------------------
|
|
387
|
+
// The count-cap bounds a LIVE session to K; EPIPE + the relauncher clean a
|
|
388
|
+
// CLEANLY-closed session. This collects the remaining tail: a session whose
|
|
389
|
+
// claude.exe HARD-crashes (half-open pipe, no EPIPE) leaves ≤K watchers nothing
|
|
390
|
+
// else reaps. It asks "is the SESSION alive?" — reliably knowable via the
|
|
391
|
+
// SessionStart-refreshed owner record — NOT "is this WATCHER an orphan?"
|
|
392
|
+
// (unknowable). The grace window + `deadSince` clock make it immune to the restart
|
|
393
|
+
// false-kill that doomed the owner-pid reaper: a model/Fast-mode switch re-resolves
|
|
394
|
+
// the owner well within grace, overwriting the record and clearing deadSince.
|
|
395
|
+
// adr: adr/monitor-resilience.md
|
|
396
|
+
/** Milliseconds a session's owner must stay CONTINUOUSLY dead before its watchers
 * are GC'd (default: 5 minutes). Must exceed the worst-case claude.exe
 * restart→SessionStart window (which is seconds), so a restart can never be
 * mistaken for a death. */
|
|
399
|
+
exports.GC_GRACE_MS = 5 * 60 * 1000;
|
|
400
|
+
/** Pure GC decision (testable): given a session's owner record and whether its PID
|
|
401
|
+
* is alive right now, decide keep / start-the-grace-clock / reap. */
|
|
402
|
+
function gcVerdict(owner, ownerAlive, now, graceMs) {
    // No owner record → the count-cap's job. Owner alive → session alive (even if idle).
    if (!owner || ownerAlive)
        return 'keep';
    const clock = owner.deadSince;
    // First death sighting → start the clock. A `deadSince` that is not a finite
    // number (absent, `null`, a string, NaN) is treated the same way: the owner
    // record is read from disk as plain JSON, and `now - null` would otherwise
    // read as "dead since the epoch" and reap immediately with no grace at all,
    // while a non-numeric value would compare as NaN and never be reaped.
    if (typeof clock !== 'number' || !Number.isFinite(clock))
        return 'mark-dead';
    // Strictly greater: exactly `graceMs` of deadness is still within grace.
    return (now - clock > graceMs) ? 'reap' : 'keep';
}
|
|
411
|
+
/** Reap every watcher of a session whose owning claude.exe has stayed dead past the
|
|
412
|
+
* grace window. Run at SessionStart, AFTER refreshing THIS session's own owner
|
|
413
|
+
* (so it is never its own victim). No process snapshot — owner-file reads +
|
|
414
|
+
* pidAlive() syscalls. adr: adr/monitor-resilience.md */
|
|
415
|
+
function gcDeadSessions(graceMs = exports.GC_GRACE_MS) {
    const now = Date.now();
    const reaped = []; // session shorts whose watchers were collected
    const killed = []; // watcher PIDs actually killed
    for (const short of allShorts()) {
        const owner = readSessionOwner(short);
        // Probe the owner ONCE so the verdict and the "recovered" check below can
        // never disagree about whether it is alive.
        const ownerAlive = !!owner && pidAlive(owner.pid);
        const verdict = gcVerdict(owner, ownerAlive, now, graceMs);
        if (verdict === 'keep') {
            if (ownerAlive && owner.deadSince !== undefined) {
                writeSessionOwner(short, { pid: owner.pid, ts: owner.ts }); // recovered → clear the grace clock
            }
            continue;
        }
        if (verdict === 'mark-dead') {
            writeSessionOwner(short, { ...owner, deadSince: now }); // start the grace clock
            continue;
        }
        // reap: the owner stayed dead past grace → the session is truly gone.
        const deadFor = Math.round((now - (owner.deadSince || now)) / 1000);
        for (const e of readWatcherEntries(short)) {
            // Skip entries whose process is already gone: nothing to kill, and it
            // avoids a pointless kill attempt per dead PID (same liveness
            // pre-filter reapOrphanWatchers applies before it kills).
            if (!pidAlive(e.pid))
                continue;
            watcherAuditLog(`gc-kill=${e.pid} session=${short} owner=${owner.pid}:dead-${deadFor}s`);
            if (killProcessTree(e.pid))
                killed.push(e.pid);
        }
        // The whole session is gone: drop both its pidfile and its owner record.
        removeWatcherPidfile(short);
        removeSessionOwner(short);
        reaped.push(short);
    }
    if (reaped.length)
        watcherAuditLog(`gc-summary reaped=[${reaped.join(',')}] killed=[${killed.join(',')}]`);
    return { reaped, killed };
}
|
|
458
447
|
/** Force-kill a process and its descendants. `taskkill /T` reaps the tree in one
|
|
459
448
|
* shot so a supervisor can't respawn its child in the race window. */
|
|
@@ -474,27 +463,4 @@ function killProcessTree(pid) {
|
|
|
474
463
|
return false;
|
|
475
464
|
}
|
|
476
465
|
}
|
|
477
|
-
/** Remove pidfiles whose recorded PID is dead. Keeps `isLocallyArmed` honest
|
|
478
|
-
* even when a watcher died without removing its own file (hard kill / crash). */
|
|
479
|
-
function sweepDeadPidfiles() {
|
|
480
|
-
try {
|
|
481
|
-
const dir = watchersDir();
|
|
482
|
-
if (!dir || !fs.existsSync(dir))
|
|
483
|
-
return;
|
|
484
|
-
for (const f of fs.readdirSync(dir)) {
|
|
485
|
-
if (!f.endsWith('.json'))
|
|
486
|
-
continue;
|
|
487
|
-
try {
|
|
488
|
-
const rec = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf-8'));
|
|
489
|
-
if (!rec || typeof rec.pid !== 'number' || !pidAlive(rec.pid)) {
|
|
490
|
-
fs.rmSync(path.join(dir, f), { force: true });
|
|
491
|
-
}
|
|
492
|
-
}
|
|
493
|
-
catch {
|
|
494
|
-
fs.rmSync(path.join(dir, f), { force: true }); // unparseable → drop
|
|
495
|
-
}
|
|
496
|
-
}
|
|
497
|
-
}
|
|
498
|
-
catch { /* best-effort */ }
|
|
499
|
-
}
|
|
500
466
|
//# sourceMappingURL=watcher-registry.js.map
|