aegis-bridge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +404 -0
- package/dashboard/dist/assets/index-BoZwGLAx.css +32 -0
- package/dashboard/dist/assets/index-C61BkKH-.js +312 -0
- package/dashboard/dist/assets/index-C61BkKH-.js.map +1 -0
- package/dashboard/dist/index.html +14 -0
- package/dist/api-contracts.d.ts +229 -0
- package/dist/api-contracts.js +7 -0
- package/dist/api-contracts.typecheck.d.ts +14 -0
- package/dist/api-contracts.typecheck.js +1 -0
- package/dist/api-error-envelope.d.ts +15 -0
- package/dist/api-error-envelope.js +80 -0
- package/dist/auth.d.ts +87 -0
- package/dist/auth.js +276 -0
- package/dist/channels/index.d.ts +8 -0
- package/dist/channels/index.js +8 -0
- package/dist/channels/manager.d.ts +47 -0
- package/dist/channels/manager.js +115 -0
- package/dist/channels/telegram-style.d.ts +118 -0
- package/dist/channels/telegram-style.js +202 -0
- package/dist/channels/telegram.d.ts +91 -0
- package/dist/channels/telegram.js +1518 -0
- package/dist/channels/types.d.ts +77 -0
- package/dist/channels/types.js +8 -0
- package/dist/channels/webhook.d.ts +60 -0
- package/dist/channels/webhook.js +216 -0
- package/dist/cli.d.ts +8 -0
- package/dist/cli.js +252 -0
- package/dist/config.d.ts +90 -0
- package/dist/config.js +214 -0
- package/dist/consensus.d.ts +16 -0
- package/dist/consensus.js +19 -0
- package/dist/continuation-pointer.d.ts +11 -0
- package/dist/continuation-pointer.js +65 -0
- package/dist/diagnostics.d.ts +27 -0
- package/dist/diagnostics.js +95 -0
- package/dist/error-categories.d.ts +39 -0
- package/dist/error-categories.js +73 -0
- package/dist/events.d.ts +133 -0
- package/dist/events.js +389 -0
- package/dist/fault-injection.d.ts +29 -0
- package/dist/fault-injection.js +115 -0
- package/dist/file-utils.d.ts +2 -0
- package/dist/file-utils.js +37 -0
- package/dist/handshake.d.ts +60 -0
- package/dist/handshake.js +124 -0
- package/dist/hook-settings.d.ts +80 -0
- package/dist/hook-settings.js +272 -0
- package/dist/hook.d.ts +19 -0
- package/dist/hook.js +231 -0
- package/dist/hooks.d.ts +32 -0
- package/dist/hooks.js +364 -0
- package/dist/jsonl-watcher.d.ts +59 -0
- package/dist/jsonl-watcher.js +166 -0
- package/dist/logger.d.ts +35 -0
- package/dist/logger.js +65 -0
- package/dist/mcp-server.d.ts +123 -0
- package/dist/mcp-server.js +869 -0
- package/dist/memory-bridge.d.ts +27 -0
- package/dist/memory-bridge.js +137 -0
- package/dist/memory-routes.d.ts +3 -0
- package/dist/memory-routes.js +100 -0
- package/dist/metrics.d.ts +126 -0
- package/dist/metrics.js +286 -0
- package/dist/model-router.d.ts +53 -0
- package/dist/model-router.js +150 -0
- package/dist/monitor.d.ts +103 -0
- package/dist/monitor.js +820 -0
- package/dist/path-utils.d.ts +11 -0
- package/dist/path-utils.js +21 -0
- package/dist/permission-evaluator.d.ts +10 -0
- package/dist/permission-evaluator.js +48 -0
- package/dist/permission-guard.d.ts +51 -0
- package/dist/permission-guard.js +196 -0
- package/dist/permission-request-manager.d.ts +12 -0
- package/dist/permission-request-manager.js +36 -0
- package/dist/permission-routes.d.ts +7 -0
- package/dist/permission-routes.js +28 -0
- package/dist/pipeline.d.ts +97 -0
- package/dist/pipeline.js +291 -0
- package/dist/process-utils.d.ts +4 -0
- package/dist/process-utils.js +73 -0
- package/dist/question-manager.d.ts +54 -0
- package/dist/question-manager.js +80 -0
- package/dist/retry.d.ts +11 -0
- package/dist/retry.js +34 -0
- package/dist/safe-json.d.ts +12 -0
- package/dist/safe-json.js +22 -0
- package/dist/screenshot.d.ts +28 -0
- package/dist/screenshot.js +60 -0
- package/dist/server.d.ts +10 -0
- package/dist/server.js +1973 -0
- package/dist/session-cleanup.d.ts +18 -0
- package/dist/session-cleanup.js +11 -0
- package/dist/session.d.ts +379 -0
- package/dist/session.js +1568 -0
- package/dist/shutdown-utils.d.ts +5 -0
- package/dist/shutdown-utils.js +24 -0
- package/dist/signal-cleanup-helper.d.ts +48 -0
- package/dist/signal-cleanup-helper.js +117 -0
- package/dist/sse-limiter.d.ts +47 -0
- package/dist/sse-limiter.js +61 -0
- package/dist/sse-writer.d.ts +31 -0
- package/dist/sse-writer.js +94 -0
- package/dist/ssrf.d.ts +102 -0
- package/dist/ssrf.js +267 -0
- package/dist/startup.d.ts +6 -0
- package/dist/startup.js +162 -0
- package/dist/suppress.d.ts +33 -0
- package/dist/suppress.js +79 -0
- package/dist/swarm-monitor.d.ts +117 -0
- package/dist/swarm-monitor.js +300 -0
- package/dist/template-store.d.ts +45 -0
- package/dist/template-store.js +142 -0
- package/dist/terminal-parser.d.ts +16 -0
- package/dist/terminal-parser.js +346 -0
- package/dist/tmux-capture-cache.d.ts +18 -0
- package/dist/tmux-capture-cache.js +34 -0
- package/dist/tmux.d.ts +183 -0
- package/dist/tmux.js +906 -0
- package/dist/tool-registry.d.ts +40 -0
- package/dist/tool-registry.js +83 -0
- package/dist/transcript.d.ts +63 -0
- package/dist/transcript.js +284 -0
- package/dist/utils/circular-buffer.d.ts +11 -0
- package/dist/utils/circular-buffer.js +37 -0
- package/dist/utils/redact-headers.d.ts +13 -0
- package/dist/utils/redact-headers.js +54 -0
- package/dist/validation.d.ts +406 -0
- package/dist/validation.js +415 -0
- package/dist/verification.d.ts +2 -0
- package/dist/verification.js +72 -0
- package/dist/worktree-lookup.d.ts +24 -0
- package/dist/worktree-lookup.js +71 -0
- package/dist/ws-terminal.d.ts +32 -0
- package/dist/ws-terminal.js +348 -0
- package/package.json +83 -0
package/dist/monitor.js
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* monitor.ts — Background monitor that polls sessions and routes events to channels.
|
|
3
|
+
*
|
|
4
|
+
* Runs a polling loop that:
|
|
5
|
+
* 1. Checks each active session for new JSONL entries
|
|
6
|
+
* 2. Detects status changes (working → idle, permission prompts, etc.)
|
|
7
|
+
* 3. Routes events to the ChannelManager (which fans out to Telegram, webhooks, etc.)
|
|
8
|
+
*/
|
|
9
|
+
import { readFile } from 'node:fs/promises';
|
|
10
|
+
import { existsSync } from 'node:fs';
|
|
11
|
+
import { join } from 'node:path';
|
|
12
|
+
import { homedir } from 'node:os';
|
|
13
|
+
import { stopSignalsSchema } from './validation.js';
|
|
14
|
+
import { suppressedCatch } from './suppress.js';
|
|
15
|
+
import { logger } from './logger.js';
|
|
16
|
+
import { maybeInjectFault } from './fault-injection.js';
|
|
17
|
+
/** Issue #89 L4: Debounce interval for status change broadcasts (ms). */
const STATUS_CHANGE_DEBOUNCE_MS = 500;
/**
 * Default tuning for the monitor's polling loops and stall thresholds.
 * Callers may pass a partial config; SessionMonitor's constructor merges
 * overrides on top of these defaults.
 */
export const DEFAULT_MONITOR_CONFIG = {
    pollIntervalMs: 30_000, // 30s base — hooks are the primary signal (Issue #169 Phase 3)
    fastPollIntervalMs: 5_000, // 5s when hooks are quiet — fallback safety net
    hookQuietMs: 60_000, // 60s without a hook → switch to fast polling
    stallThresholdMs: 2 * 60 * 1000, // 2 minutes (Issue #392: reduced from 5 min)
    stallCheckIntervalMs: 30 * 1000, // check every 30 seconds (faster for shorter thresholds)
    deadCheckIntervalMs: 10 * 1000, // check every 10 seconds (Issue M19: faster dead detection)
    permissionStallMs: 5 * 60 * 1000, // 5 min waiting for permission = stalled
    unknownStallMs: 3 * 60 * 1000, // 3 min in unknown state = stalled
    permissionTimeoutMs: 10 * 60 * 1000, // 10 min → auto-reject permission
};
|
|
30
|
+
/** Common POSIX signal numbers mapped to their conventional names. */
const SIGNAL_BY_NUMBER = {
    1: 'SIGHUP',
    2: 'SIGINT',
    3: 'SIGQUIT',
    6: 'SIGABRT',
    9: 'SIGKILL',
    11: 'SIGSEGV',
    13: 'SIGPIPE',
    14: 'SIGALRM',
    15: 'SIGTERM',
};
/**
 * Translate a process exit code into the name of the signal that killed it.
 *
 * Shells encode "killed by signal N" as exit code 128 + N, so anything
 * below 129 (or a null exit code) means no signal was involved.
 *
 * @param exitCode Process exit code, or null if unknown.
 * @returns Signal name (e.g. 'SIGKILL'), a synthetic 'SIG<n>' for unmapped
 *          numbers, or null when the code does not encode a signal.
 */
function signalFromExitCode(exitCode) {
    const isSignalExit = exitCode !== null && exitCode >= 129;
    if (!isSignalExit) {
        return null;
    }
    const signalNumber = exitCode - 128;
    return SIGNAL_BY_NUMBER[signalNumber] ?? `SIG${signalNumber}`;
}
|
|
46
|
+
export class SessionMonitor {
    /** Session store; must provide listSessions/getSession/reject/readMessagesForMonitor. */
    sessions;
    /** ChannelManager that fans events out to Telegram, webhooks, etc. */
    channels;
    /** Effective monitor config (defaults merged with constructor overrides). */
    config;
    /** True while the polling loop is active. */
    running = false;
    /** sessionId → last observed status string (e.g. 'working', 'idle'). */
    lastStatus = new Map();
    /** sessionId → { bytes, at }: last JSONL byte offset seen and when it last grew. */
    lastBytesSeen = new Map();
    // Issue #663: Nested Map for O(1) per-session stall lookup (was Set with O(n) prefix scan)
    stallNotified = new Map(); // sessionId → Set<stallType>
|
|
55
|
+
/** Issue #663: O(1) stall notification check. */
|
|
56
|
+
stallHas(sessionId, stallType) {
|
|
57
|
+
return this.stallNotified.get(sessionId)?.has(stallType) ?? false;
|
|
58
|
+
}
|
|
59
|
+
/** Issue #663: O(1) stall notification add. */
|
|
60
|
+
stallAdd(sessionId, stallType) {
|
|
61
|
+
const set = this.stallNotified.get(sessionId);
|
|
62
|
+
if (set) {
|
|
63
|
+
set.add(stallType);
|
|
64
|
+
}
|
|
65
|
+
else {
|
|
66
|
+
this.stallNotified.set(sessionId, new Set([stallType]));
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
/** Issue #663: O(1) stall notification delete. */
|
|
70
|
+
stallDelete(sessionId, stallType) {
|
|
71
|
+
this.stallNotified.get(sessionId)?.delete(stallType);
|
|
72
|
+
}
|
|
73
|
+
    /** Issue #663: Delete all stall notifications for a session (e.g. when it goes idle). */
    stallDeleteAll(sessionId) {
        this.stallNotified.delete(sessionId);
    }
|
|
77
|
+
/** Issue #663: Delete specific stall types for a session. */
|
|
78
|
+
stallDeleteTypes(sessionId, types) {
|
|
79
|
+
const set = this.stallNotified.get(sessionId);
|
|
80
|
+
if (!set)
|
|
81
|
+
return;
|
|
82
|
+
for (const t of types)
|
|
83
|
+
set.delete(t);
|
|
84
|
+
}
|
|
85
|
+
    /** Timestamp (ms) of the last stall-detection sweep. */
    lastStallCheck = 0;
    /** Timestamp (ms) of the last dead-session sweep. */
    lastDeadCheck = 0;
    idleNotified = new Set(); // prevent idle spam
    idleSince = new Map(); // debounce: when idle started
    processedStopSignals = new Set(); // Issue #15: don't re-process signals
    static MAX_PROCESSED_STOP_SIGNALS = 1000; // #220: prevent unbounded growth
    // Smart stall detection: track when each non-working state started
    stateSince = new Map(); // sessionId → { state, since } (one entry per session)
    deadNotified = new Set(); // don't spam dead session events
    prevStatusForStall = new Map(); // track previous status for stall transition detection
    rateLimitedSessions = new Set(); // sessions in rate-limit backoff
    // Issue #397: Track tmux server health for crash recovery
    tmuxWasDown = false;
    // Timestamp (ms) of the last tmux health probe.
    lastTmuxHealthCheck = 0;
    static TMUX_HEALTH_CHECK_INTERVAL_MS = 10_000; // check every 10s
    /** Issue #89 L4: Debounce status change broadcasts per session.
     * If multiple status changes happen within 500ms, only emit the last one.
     * Prevents rapid-fire notifications during state transitions. */
    statusChangeDebounce = new Map();
    /** Issue #32: Optional SSE event bus for real-time streaming. */
    eventBus;
    /** Issue #84: fs.watch-based JSONL watcher for near-instant message detection. */
    jsonlWatcher;
|
|
108
|
+
constructor(sessions, channels, config = DEFAULT_MONITOR_CONFIG) {
|
|
109
|
+
this.sessions = sessions;
|
|
110
|
+
this.channels = channels;
|
|
111
|
+
this.config = config;
|
|
112
|
+
this.config = { ...DEFAULT_MONITOR_CONFIG, ...config };
|
|
113
|
+
}
|
|
114
|
+
    /** Issue #32: Set the event bus for SSE streaming (stall/status events mirror to it). */
    setEventBus(bus) {
        this.eventBus = bus;
    }
|
|
118
|
+
    /** Issue #397: TmuxManager handle, used by the periodic tmux health check. */
    tmux;
    /** Issue #397: Set the TmuxManager reference for tmux health checks. */
    setTmuxManager(tmuxManager) {
        this.tmux = tmuxManager;
    }
|
|
123
|
+
/** Issue #84: Set the JSONL watcher for fs.watch-based message detection. */
|
|
124
|
+
setJsonlWatcher(watcher) {
|
|
125
|
+
this.jsonlWatcher = watcher;
|
|
126
|
+
watcher.onEntries((event) => {
|
|
127
|
+
this.handleWatcherEvent(event);
|
|
128
|
+
});
|
|
129
|
+
}
|
|
130
|
+
start() {
|
|
131
|
+
if (this.running)
|
|
132
|
+
return;
|
|
133
|
+
this.running = true;
|
|
134
|
+
this.loop();
|
|
135
|
+
}
|
|
136
|
+
    /** Stop the polling loop: the in-flight iteration finishes, then the loop exits. */
    stop() {
        this.running = false;
    }
|
|
139
|
+
async loop() {
|
|
140
|
+
while (this.running) {
|
|
141
|
+
try {
|
|
142
|
+
await this.poll();
|
|
143
|
+
}
|
|
144
|
+
catch (e) {
|
|
145
|
+
logger.error({
|
|
146
|
+
component: 'monitor',
|
|
147
|
+
operation: 'poll',
|
|
148
|
+
errorCode: 'MONITOR_POLL_ERROR',
|
|
149
|
+
attributes: { error: e instanceof Error ? e.message : String(e) },
|
|
150
|
+
});
|
|
151
|
+
}
|
|
152
|
+
// Issue #169 Phase 3: Adaptive polling — use fast interval if any session
|
|
153
|
+
// hasn't received a hook recently (hooks may have stopped working).
|
|
154
|
+
const interval = this.needsFastPolling() ? this.config.fastPollIntervalMs : this.config.pollIntervalMs;
|
|
155
|
+
await sleep(interval);
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
/** Check if any active session hasn't received a hook recently. */
|
|
159
|
+
needsFastPolling() {
|
|
160
|
+
const now = Date.now();
|
|
161
|
+
for (const session of this.sessions.listSessions()) {
|
|
162
|
+
const lastHook = session.lastHookAt;
|
|
163
|
+
// If a session has never received a hook, always fast-poll (hooks may not be configured)
|
|
164
|
+
if (lastHook === undefined)
|
|
165
|
+
return true;
|
|
166
|
+
// If no hook for hookQuietMs, switch to fast polling
|
|
167
|
+
if (now - lastHook > this.config.hookQuietMs)
|
|
168
|
+
return true;
|
|
169
|
+
}
|
|
170
|
+
return false;
|
|
171
|
+
}
|
|
172
|
+
async poll() {
|
|
173
|
+
const now = Date.now();
|
|
174
|
+
// Issue #397: Run tmux health checks before dead-session reaping.
|
|
175
|
+
// This prevents false "status.dead" events when tmux is temporarily
|
|
176
|
+
// unreachable and windows still exist once the server recovers.
|
|
177
|
+
if (now - this.lastTmuxHealthCheck >= SessionMonitor.TMUX_HEALTH_CHECK_INTERVAL_MS) {
|
|
178
|
+
this.lastTmuxHealthCheck = now;
|
|
179
|
+
await this.checkTmuxHealth();
|
|
180
|
+
}
|
|
181
|
+
for (const session of this.sessions.listSessions()) {
|
|
182
|
+
try {
|
|
183
|
+
// Issue #84: Start watching when jsonlPath is discovered
|
|
184
|
+
if (this.jsonlWatcher && session.jsonlPath && !this.jsonlWatcher.isWatching(session.id)) {
|
|
185
|
+
this.jsonlWatcher.watch(session.id, session.jsonlPath, session.monitorOffset);
|
|
186
|
+
}
|
|
187
|
+
await this.checkSession(session);
|
|
188
|
+
}
|
|
189
|
+
catch (e) {
|
|
190
|
+
suppressedCatch(e, 'monitor.checkSession');
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
// Stall detection: run less frequently than message polling
|
|
194
|
+
if (now - this.lastStallCheck >= this.config.stallCheckIntervalMs) {
|
|
195
|
+
this.lastStallCheck = now;
|
|
196
|
+
await this.checkForStalls(now);
|
|
197
|
+
await this.checkStopSignals();
|
|
198
|
+
}
|
|
199
|
+
// Dead session detection: independent timer (M19: 10s default)
|
|
200
|
+
if (now - this.lastDeadCheck >= this.config.deadCheckIntervalMs) {
|
|
201
|
+
this.lastDeadCheck = now;
|
|
202
|
+
await this.checkDeadSessions();
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
    /** Smart stall detection: multiple stall types with graduated thresholds.
     *
     * Detects 5 types of stalls:
     * 1. JSONL stall: "working" but no new JSONL bytes for stallThresholdMs
     * 2. Permission stall: permission_prompt/bash_approval for permissionStallMs
     *    (plus auto-reject after permissionTimeoutMs)
     * 3. Unknown stall: unknown state for unknownStallMs (CC stuck in transition)
     * 4. Extended state stall: any non-idle/non-working state for 2x stallThresholdMs
     * 5. Extended working stall: "working" for 3x stallThresholdMs regardless of output
     *
     * @param now Timestamp (ms) of this sweep; passed in so all comparisons in
     *            one sweep share the same clock reading.
     */
    async checkForStalls(now) {
        for (const session of this.sessions.listSessions()) {
            const currentStatus = this.lastStatus.get(session.id);
            const prevStallStatus = this.prevStatusForStall.get(session.id);
            // Track state transitions — one entry per session, preserving timer across
            // permission_prompt ↔ bash_approval transitions (both are "permission" states)
            if (currentStatus && currentStatus !== 'idle') {
                const entry = this.stateSince.get(session.id);
                if (!entry) {
                    this.stateSince.set(session.id, { state: currentStatus, since: now });
                }
                else if (entry.state !== currentStatus) {
                    const isPermState = (s) => s === 'permission_prompt' || s === 'bash_approval';
                    if (isPermState(entry.state) && isPermState(currentStatus)) {
                        entry.state = currentStatus; // preserve since across permission sub-type transitions
                    }
                    else {
                        this.stateSince.set(session.id, { state: currentStatus, since: now });
                    }
                }
            }
            // --- Type 1: JSONL stall (working but no output) ---
            if (currentStatus === 'working') {
                // Skip stall detection for rate-limited sessions — CC is in backoff
                // NOTE(review): this `continue` (and the first-sighting one below)
                // also skips the prevStatusForStall bookkeeping at the bottom of
                // the loop for this session — presumably intentional; verify.
                if (this.rateLimitedSessions.has(session.id)) {
                    continue;
                }
                const prev = this.lastBytesSeen.get(session.id);
                const currentBytes = session.monitorOffset;
                if (!prev) {
                    // First sighting: seed byte tracking and defer judgment.
                    this.lastBytesSeen.set(session.id, { bytes: currentBytes, at: now });
                    continue;
                }
                if (currentBytes > prev.bytes) {
                    // Output advanced — refresh tracking and clear any prior jsonl stall.
                    this.lastBytesSeen.set(session.id, { bytes: currentBytes, at: now });
                    this.stallDelete(session.id, 'jsonl');
                }
                else {
                    const stallDuration = now - prev.at;
                    // Per-session override wins over global threshold when set.
                    const threshold = session.stallThresholdMs || this.config.stallThresholdMs;
                    if (stallDuration >= threshold && !this.stallHas(session.id, 'jsonl')) {
                        this.stallAdd(session.id, 'jsonl'); // notify once per stall episode
                        const minutes = Math.round(stallDuration / 60000);
                        const detail = `Session stalled: "working" for ${minutes}min with no new output. ` +
                            `Last activity: ${new Date(session.lastActivity).toISOString()}`;
                        this.eventBus?.emitStall(session.id, 'jsonl', detail);
                        await this.channels.statusChange(this.makePayload('status.stall', session, detail));
                    }
                }
            }
            else {
                // Reset JSONL stall tracking when not working
                this.stallDelete(session.id, 'jsonl');
            }
            // --- Type 2: Permission stall (waiting for approval too long) ---
            if (currentStatus === 'permission_prompt' || currentStatus === 'bash_approval') {
                const entry = this.stateSince.get(session.id);
                const permDuration = entry ? now - entry.since : 0;
                if (permDuration >= this.config.permissionStallMs) {
                    if (!this.stallHas(session.id, 'permission')) {
                        this.stallAdd(session.id, 'permission');
                        const minutes = Math.round(permDuration / 60000);
                        const detail = `Session stalled: waiting for permission approval for ${minutes}min. ` +
                            `Auto-approve this session or POST /v1/sessions/${session.id}/approve`;
                        this.eventBus?.emitStall(session.id, 'permission', detail);
                        await this.channels.statusChange(this.makePayload('status.stall', session, detail));
                    }
                }
                // L9: Auto-reject permission after timeout
                if (permDuration >= this.config.permissionTimeoutMs) {
                    if (!this.stallHas(session.id, 'permission_timeout')) {
                        this.stallAdd(session.id, 'permission_timeout');
                        const minutes = Math.round(permDuration / 60000);
                        logger.warn({
                            component: 'monitor',
                            operation: 'permission_timeout_auto_reject',
                            sessionId: session.id,
                            errorCode: 'PERMISSION_TIMEOUT',
                            attributes: { windowName: session.windowName, timeoutMinutes: minutes },
                        });
                        try {
                            await this.sessions.reject(session.id);
                            const detail = `Permission auto-rejected after ${minutes}min timeout (session ${session.windowName})`;
                            this.eventBus?.emitStall(session.id, 'permission_timeout', detail);
                            await this.channels.statusChange(this.makePayload('status.permission_timeout', session, detail));
                        }
                        catch (e) {
                            // Reject failed — log and move on; stallNotified entry
                            // prevents retry spam until the state transitions.
                            logger.error({
                                component: 'monitor',
                                operation: 'permission_timeout_auto_reject',
                                sessionId: session.id,
                                errorCode: 'AUTO_REJECT_FAILED',
                                attributes: { error: e instanceof Error ? e.message : String(e) },
                            });
                        }
                    }
                }
            }
            // --- Type 3: Unknown stall (CC stuck in transition) ---
            if (currentStatus === 'unknown') {
                const entry = this.stateSince.get(session.id);
                const unkDuration = entry ? now - entry.since : 0;
                if (unkDuration >= this.config.unknownStallMs) {
                    if (!this.stallHas(session.id, 'unknown')) {
                        this.stallAdd(session.id, 'unknown');
                        const minutes = Math.round(unkDuration / 60000);
                        const detail = `Session stalled: in "unknown" state for ${minutes}min. ` +
                            `CC may be stuck. Try: POST /v1/sessions/${session.id}/interrupt or /kill`;
                        this.eventBus?.emitStall(session.id, 'unknown', detail);
                        await this.channels.statusChange(this.makePayload('status.stall', session, detail));
                    }
                }
            }
            // --- Type 4: Extended state stall (any state held too long) ---
            if (currentStatus && currentStatus !== 'idle' && currentStatus !== 'working') {
                const entry = this.stateSince.get(session.id);
                const stateDuration = entry ? now - entry.since : 0;
                const extendedThreshold = this.config.stallThresholdMs * 2;
                if (stateDuration >= extendedThreshold) {
                    if (!this.stallHas(session.id, 'extended')) {
                        this.stallAdd(session.id, 'extended');
                        const minutes = Math.round(stateDuration / 60000);
                        const detail = `Session stalled: "${currentStatus}" state for ${minutes}min. ` +
                            `May need intervention: /interrupt, /approve, or /kill`;
                        this.eventBus?.emitStall(session.id, 'extended', detail);
                        await this.channels.statusChange(this.makePayload('status.stall', session, detail));
                    }
                }
            }
            // --- Type 5: Extended working stall (working too long regardless of byte changes). ---
            // Catches CC stuck in an internal loop (e.g. the "Misting" state)
            // that keeps producing output so Type 1 never fires.
            if (currentStatus === 'working') {
                const entry = this.stateSince.get(session.id);
                if (entry && entry.state === 'working') {
                    const workingDuration = now - entry.since;
                    const maxWorkingMs = this.config.stallThresholdMs * 3; // 15 min default
                    if (workingDuration >= maxWorkingMs && !this.stallHas(session.id, 'extended_working')) {
                        this.stallAdd(session.id, 'extended_working');
                        const minutes = Math.round(workingDuration / 60000);
                        const detail = `Session stalled: in "working" state for ${minutes}min. ` +
                            `CC may be stuck in an internal loop (e.g., Misting). Consider: POST /v1/sessions/${session.id}/interrupt or /kill`;
                        this.eventBus?.emitStall(session.id, 'extended_working', detail);
                        await this.channels.statusChange(this.makePayload('status.stall', session, detail));
                    }
                }
            }
            // Clean up stall notifications on state transitions (using prevStallStatus)
            if (prevStallStatus && prevStallStatus !== currentStatus) {
                const exitedPermission = prevStallStatus === 'permission_prompt' || prevStallStatus === 'bash_approval';
                const exitedUnknown = prevStallStatus === 'unknown';
                if (exitedPermission) {
                    this.stallDeleteTypes(session.id, ['permission', 'permission_timeout']);
                }
                if (exitedUnknown) {
                    this.stallDelete(session.id, 'unknown');
                }
            }
            // Clean up all state tracking when idle (catch-all)
            if (currentStatus === 'idle') {
                this.rateLimitedSessions.delete(session.id);
                this.stateSince.delete(session.id);
                // Clean stall notifications (session recovered) — O(1) with Map
                this.stallDeleteAll(session.id);
            }
            // Update prevStatusForStall for next cycle
            if (currentStatus) {
                this.prevStatusForStall.set(session.id, currentStatus);
            }
            else {
                this.prevStatusForStall.delete(session.id);
            }
        }
    }
|
|
386
|
+
    /** Issue #15: Check for Stop/StopFailure signals written by hook.ts.
     *
     * Reads ~/.aegis/stop_signals.json (falling back to ~/.manus for backward
     * compat), validates it with stopSignalsSchema, matches entries to active
     * sessions by claudeSessionId, and broadcasts rate-limit / error / stopped
     * events. Each (sessionId, timestamp) pair is processed at most once.
     */
    async checkStopSignals() {
        // Check both aegis and manus dirs for backward compat
        const aegisDir = join(homedir(), '.aegis');
        const manusDir = join(homedir(), '.manus');
        // Prefer the aegis file when it exists; otherwise fall back to manus.
        const signalFile = existsSync(join(aegisDir, 'stop_signals.json'))
            ? join(aegisDir, 'stop_signals.json')
            : join(manusDir, 'stop_signals.json');
        if (!existsSync(signalFile))
            return;
        try {
            const raw = await readFile(signalFile, 'utf-8');
            const parsed = stopSignalsSchema.safeParse(JSON.parse(raw));
            if (!parsed.success) {
                // Malformed file: warn once per sweep and skip — never throw.
                logger.warn({
                    component: 'monitor',
                    operation: 'check_stop_signals',
                    errorCode: 'STOP_SIGNALS_INVALID',
                });
                return;
            }
            const signals = parsed.data;
            for (const session of this.sessions.listSessions()) {
                // Sessions without a Claude session id can't be matched to a signal.
                if (!session.claudeSessionId)
                    continue;
                const signal = signals[session.claudeSessionId];
                if (!signal)
                    continue;
                // Dedup key: same session + same signal timestamp → already handled.
                const signalKey = `${session.claudeSessionId}:${signal.timestamp}`;
                if (this.processedStopSignals.has(signalKey))
                    continue;
                this.processedStopSignals.add(signalKey);
                // #220: Prune oldest entries when Set exceeds max size
                // #510: Collect keys first, then delete — avoid mutation during iteration
                // (Set iteration order is insertion order, so the slice is the oldest keys.)
                if (this.processedStopSignals.size > SessionMonitor.MAX_PROCESSED_STOP_SIGNALS) {
                    const toRemove = this.processedStopSignals.size - SessionMonitor.MAX_PROCESSED_STOP_SIGNALS;
                    const keysToDelete = [...this.processedStopSignals].slice(0, toRemove);
                    for (const key of keysToDelete) {
                        this.processedStopSignals.delete(key);
                    }
                }
                if (signal.event === 'StopFailure') {
                    logger.warn({
                        component: 'monitor',
                        operation: 'check_stop_signals',
                        sessionId: session.id,
                        errorCode: 'STOP_FAILURE_SIGNAL',
                        attributes: {
                            stopReason: signal.stop_reason ?? null,
                            error: signal.error ?? null,
                            signalTimestamp: signal.timestamp ?? null,
                        },
                    });
                    const stopReason = signal.stop_reason || '';
                    if (stopReason === 'rate_limit' || stopReason === 'overloaded') {
                        // Mark the session rate-limited so stall detection backs off.
                        this.rateLimitedSessions.add(session.id);
                        await this.channels.statusChange(this.makePayload('status.rate_limited', session, `Claude API rate limited (${stopReason}). Session will resume when the backoff window expires.`));
                    }
                    else {
                        const errorDetail = signal.error || signal.stop_reason || 'Unknown API error';
                        await this.channels.statusChange(this.makePayload('status.error', session, `⚠️ Claude Code error: ${errorDetail}`));
                    }
                }
                else if (signal.event === 'Stop') {
                    logger.info({
                        component: 'monitor',
                        operation: 'check_stop_signals',
                        sessionId: session.id,
                        errorCode: 'STOP_SIGNAL',
                        attributes: {
                            signalTimestamp: signal.timestamp ?? null,
                        },
                    });
                    await this.channels.statusChange(this.makePayload('status.stopped', session, 'Claude Code session ended normally'));
                }
            }
        }
        catch (e) {
            // Best-effort: a bad read/parse must not break the polling loop.
            suppressedCatch(e, 'monitor.checkStopSignals.parseEntry');
        }
    }
|
|
467
|
+
    /** Issue #84: Handle new entries from the fs.watch-based JSONL watcher.
     * Forwards messages to channels and updates stall tracking.
     *
     * Stall semantics: only *parsed messages* reset the jsonl-stall timer; a
     * file that merely grows in bytes (no messages) updates the byte offset
     * but keeps the old timestamp, so a stall can still be detected.
     */
    handleWatcherEvent(event) {
        const session = this.sessions.getSession(event.sessionId);
        // Session may have been killed between the fs event and this callback.
        if (!session)
            return;
        // Update monitor offset from watcher
        session.monitorOffset = event.newOffset;
        if (event.messages.length > 0) {
            // Clear rate-limited state — CC resumed producing real output
            this.rateLimitedSessions.delete(event.sessionId);
            for (const msg of event.messages) {
                // Forward asynchronously (fire-and-forget) — catch to prevent unhandled rejection (#404)
                void this.forwardMessage(session, msg).catch(e => logger.error({
                    component: 'monitor',
                    operation: 'forward_message',
                    sessionId: session.id,
                    errorCode: 'FORWARD_MESSAGE_FAILED',
                    attributes: { error: e instanceof Error ? e.message : String(e) },
                }));
            }
            // Update last activity
            session.lastActivity = Date.now();
        }
        // Update JSONL stall tracking — only reset stall timer when real messages arrive
        // When no messages, only update bytes tracking (keep timestamp)
        const now = Date.now();
        const prev = this.lastBytesSeen.get(event.sessionId);
        if (event.newOffset > (prev?.bytes ?? -1)) {
            if (event.messages.length > 0) {
                // Real output — reset stall timer
                this.lastBytesSeen.set(event.sessionId, { bytes: event.newOffset, at: now });
                this.stallDelete(event.sessionId, 'jsonl');
            }
            else {
                // File grew but no messages — only update bytes, keep timestamp
                this.lastBytesSeen.set(event.sessionId, { bytes: event.newOffset, at: prev?.at ?? now });
            }
        }
    }
|
|
507
|
+
    /**
     * Per-session poll step: read monitor state, forward messages (fallback
     * path only), maintain idle debounce, and broadcast debounced status
     * changes.
     *
     * NOTE(review): the debounce closure captures `result.status` /
     * `prevStatus` at schedule time; a newer transition cancels the pending
     * timer, so only the last transition within the window is broadcast.
     */
    async checkSession(session) {
        // When the JSONL watcher is active, messages are forwarded via handleWatcherEvent.
        // Here we only need to capture the terminal UI state (permission prompts, idle, etc.)
        const result = await this.sessions.readMessagesForMonitor(session.id);
        const prevStatus = this.lastStatus.get(session.id);
        // Forward messages only when watcher is NOT active (fallback polling path)
        if (!this.jsonlWatcher && result.messages.length > 0) {
            // Real output arrived — clear any rate-limit backoff flag.
            this.rateLimitedSessions.delete(session.id);
            for (const msg of result.messages) {
                await this.forwardMessage(session, msg);
            }
        }
        // Idle debounce: only emit idle after 10s of continuous idle
        if (result.status === 'idle') {
            if (!this.idleSince.has(session.id)) {
                this.idleSince.set(session.id, Date.now());
            }
        }
        else {
            this.idleSince.delete(session.id);
            // Reset idle notification guard when genuinely not idle
            if (result.status === 'working' || result.status === 'unknown') {
                this.idleNotified.delete(session.id);
            }
        }
        // Detect and broadcast status changes (debounced)
        if (result.status !== prevStatus) {
            // Issue #89 L4: Debounce rapid status changes per session.
            // If multiple transitions happen within STATUS_CHANGE_DEBOUNCE_MS,
            // only the last one triggers a broadcast.
            const existing = this.statusChangeDebounce.get(session.id);
            if (existing)
                clearTimeout(existing);
            // Snapshot the values this broadcast should report (closure capture).
            const latestStatus = result.status;
            const latestPrevStatus = prevStatus;
            const latestResult = { statusText: result.statusText, interactiveContent: result.interactiveContent };
            this.statusChangeDebounce.set(session.id, setTimeout(() => {
                this.statusChangeDebounce.delete(session.id);
                // #511: Skip broadcast if session was killed while debounce was pending
                if (!this.lastStatus.has(session.id))
                    return;
                // Fire-and-forget with explicit catch to avoid unhandled rejections.
                void this.broadcastStatusChange(session, latestStatus, latestPrevStatus, latestResult)
                    .catch(e => logger.error({
                    component: 'monitor',
                    operation: 'broadcast_status_change',
                    sessionId: session.id,
                    errorCode: 'BROADCAST_STATUS_CHANGE_FAILED',
                    attributes: { error: e instanceof Error ? e.message : String(e) },
                }));
            }, STATUS_CHANGE_DEBOUNCE_MS));
        }
        this.lastStatus.set(session.id, result.status);
    }
|
|
560
|
+
async forwardMessage(session, msg) {
|
|
561
|
+
const eventMap = {
|
|
562
|
+
'user:text': 'message.user',
|
|
563
|
+
'assistant:text': 'message.assistant',
|
|
564
|
+
'assistant:thinking': 'message.thinking',
|
|
565
|
+
'assistant:tool_use': 'message.tool_use',
|
|
566
|
+
'assistant:tool_result': 'message.tool_result',
|
|
567
|
+
};
|
|
568
|
+
const key = `${msg.role}:${msg.contentType}`;
|
|
569
|
+
// Issue #89 L33: System entries get a different SSE event type
|
|
570
|
+
if (msg.role === 'system') {
|
|
571
|
+
this.eventBus?.emitSystem(session.id, msg.text, msg.contentType);
|
|
572
|
+
return;
|
|
573
|
+
}
|
|
574
|
+
const event = eventMap[key];
|
|
575
|
+
if (!event)
|
|
576
|
+
return;
|
|
577
|
+
// Issue #32: Emit SSE message event (L11: include tool metadata)
|
|
578
|
+
this.eventBus?.emitMessage(session.id, msg.role, msg.text, msg.contentType, msg.toolName || msg.toolUseId ? { tool_name: msg.toolName, tool_id: msg.toolUseId } : undefined);
|
|
579
|
+
await maybeInjectFault('monitor.forwardMessage.channels.message');
|
|
580
|
+
await this.channels.message(this.makePayload(event, session, msg.text));
|
|
581
|
+
}
|
|
582
|
+
/**
 * React to a (debounced) session status transition: emit the matching SSE
 * event and notify channels. Permission prompts may be auto-approved when the
 * session's permission mode opts out of manual approval.
 *
 * @param session - Session whose status changed.
 * @param status - Newly observed status ('permission_prompt', 'bash_approval',
 *   'plan_mode', 'idle', 'ask_question', 'working', ...).
 * @param prevStatus - Status observed on the previous poll (may be undefined).
 * @param result - Snapshot with `statusText` and `interactiveContent` taken
 *   when the transition was detected.
 */
async broadcastStatusChange(session, status, prevStatus, result) {
    await maybeInjectFault('monitor.broadcastStatusChange.start');
    if (status === 'permission_prompt' || status === 'bash_approval') {
        // Issue #32: Emit SSE approval event
        this.eventBus?.emitApproval(session.id, result.interactiveContent || 'Permission requested');
        // Auto-approve if session has a non-default permission mode
        // that auto-approves permission prompts (bypassPermissions, dontAsk,
        // acceptEdits, plan, auto all handle their own permissions).
        const AUTO_APPROVE_MODES = new Set(['bypassPermissions', 'dontAsk', 'acceptEdits', 'plan', 'auto']);
        if (session.permissionMode !== 'default' && AUTO_APPROVE_MODES.has(session.permissionMode)) {
            logger.info({
                component: 'monitor',
                operation: 'auto_approve_permission',
                sessionId: session.id,
                attributes: { windowName: session.windowName, mode: session.permissionMode },
            });
            try {
                await this.sessions.approve(session.id);
                await this.channels.statusChange(this.makePayload('status.permission', session, `[AUTO-APPROVED] ${result.interactiveContent || 'Permission auto-approved'}`));
            }
            catch (e) {
                // Auto-approval failed: log it and fall back to notifying the
                // channels so a human can still act on the prompt.
                const errMsg = e instanceof Error ? e.message : String(e);
                logger.error({
                    component: 'monitor',
                    operation: 'auto_approve_permission',
                    sessionId: session.id,
                    errorCode: 'AUTO_APPROVE_FAILED',
                    attributes: { error: errMsg },
                });
                await this.channels.statusChange(this.makePayload('status.permission', session, `[AUTO-APPROVE FAILED] ${result.interactiveContent || 'Permission requested'}: ${errMsg}`));
            }
        }
        else {
            // Default permission mode: just surface the prompt to the channels.
            await this.channels.statusChange(this.makePayload('status.permission', session, result.interactiveContent || 'Permission requested'));
        }
    }
    else if (status === 'plan_mode') {
        this.eventBus?.emitStatus(session.id, 'plan_mode', result.interactiveContent || 'Plan review requested');
        await this.channels.statusChange(this.makePayload('status.plan', session, result.interactiveContent || 'Plan review requested'));
    }
    else if (status === 'idle') {
        // idleSince is stamped in checkSession when idle is first observed.
        const idleStart = this.idleSince.get(session.id) || Date.now();
        const idleDuration = Date.now() - idleStart;
        // Only notify after 3s of continuous idle, and only once (M23: reduced from 10s)
        if (idleDuration >= 3_000 && !this.idleNotified.has(session.id)) {
            this.idleNotified.add(session.id);
            this.eventBus?.emitStatus(session.id, 'idle', result.statusText || 'Session finished working, awaiting input');
            await this.channels.statusChange(this.makePayload('status.idle', session, result.statusText || 'Session finished working, awaiting input'));
        }
    }
    else if (status === 'ask_question' && prevStatus !== 'ask_question') {
        this.eventBus?.emitStatus(session.id, 'ask_question', result.interactiveContent || 'Session is asking a question');
        await this.channels.statusChange(this.makePayload('status.question', session, result.interactiveContent || 'Session is asking a question'));
    }
    // Issue #32: Emit working status via SSE (SSE only — no channel message).
    if (status === 'working' && prevStatus !== 'working') {
        this.eventBus?.emitStatus(session.id, 'working', 'Claude is working');
    }
}
|
|
641
|
+
makePayload(event, session, detail) {
|
|
642
|
+
return {
|
|
643
|
+
event,
|
|
644
|
+
timestamp: new Date().toISOString(),
|
|
645
|
+
session: {
|
|
646
|
+
id: session.id,
|
|
647
|
+
name: session.windowName,
|
|
648
|
+
workDir: session.workDir,
|
|
649
|
+
},
|
|
650
|
+
detail: detail.slice(0, 2000),
|
|
651
|
+
};
|
|
652
|
+
}
|
|
653
|
+
/**
 * Check for dead tmux windows and notify via channels.
 *
 * For each tracked session not yet flagged dead, asks SessionManager whether
 * the tmux window is alive. When a session is found dead it:
 *   1. gathers best-effort diagnostics (window existence, pane-dead flag,
 *      pane command, exit code parsed from the pane text),
 *   2. logs a structured warning with a derived `cause`,
 *   3. emits an SSE dead event and a 'status.dead' channel notification,
 *   4. removes all monitor tracking and asks SessionManager to kill it (#262).
 */
async checkDeadSessions() {
    // Issue #397: While tmux server is down, defer dead-session cleanup.
    // tmux commands can fail transiently and make healthy sessions look dead.
    if (this.tmuxWasDown)
        return;
    const sessions = this.sessions.listSessions();
    for (const session of sessions) {
        // Each dead session is reported exactly once.
        if (this.deadNotified.has(session.id))
            continue;
        await maybeInjectFault('monitor.checkDeadSessions.isWindowAlive');
        const alive = await this.sessions.isWindowAlive(session.id);
        if (!alive) {
            // Diagnostics default to null ("unknown") when tmux is unavailable
            // or the health queries fail.
            let windowExists = null;
            let paneDead = null;
            let paneCommand = null;
            let exitCode = null;
            try {
                if (this.tmux) {
                    const health = await this.tmux.getWindowHealth(session.windowId);
                    windowExists = health.windowExists;
                    paneDead = health.paneDead;
                    paneCommand = health.paneCommand;
                    if (health.windowExists && health.paneDead) {
                        // A dead pane shows tmux's "Pane is dead (status N)" banner;
                        // scrape the exit status out of the captured pane text.
                        const paneText = await this.tmux.capturePane(session.windowId);
                        const statusMatch = paneText.match(/Pane is dead \(status\s+(\d+)\)/i);
                        if (statusMatch) {
                            const parsed = parseInt(statusMatch[1] ?? '', 10);
                            exitCode = Number.isFinite(parsed) ? parsed : null;
                        }
                    }
                }
            }
            catch {
                // best-effort diagnostics only
            }
            // Classify why the session looks dead, most specific first.
            const cause = windowExists === false
                ? 'window_missing'
                : paneDead
                    ? 'pane_dead'
                    : 'process_not_alive_or_unknown';
            logger.warn({
                component: 'monitor',
                operation: 'check_dead_sessions',
                sessionId: session.id,
                errorCode: 'SESSION_TERMINATED_UNEXPECTEDLY',
                attributes: {
                    cause,
                    windowName: session.windowName,
                    windowId: session.windowId,
                    claudeSessionId: session.claudeSessionId,
                    ccPid: session.ccPid ?? null,
                    paneCommand,
                    windowExists,
                    paneDead,
                    paneAlive: paneDead === null ? null : !paneDead,
                    exitCode,
                    signal: signalFromExitCode(exitCode),
                    uptimeMs: Date.now() - session.createdAt,
                    lastActivityAt: new Date(session.lastActivity).toISOString(),
                    detectedAt: new Date().toISOString(),
                },
            });
            this.deadNotified.add(session.id);
            // Track when the session died so the zombie reaper can clean it up
            session.lastDeadAt = Date.now();
            // NOTE(review): this message always says the window no longer exists,
            // even when cause is 'pane_dead' (window exists, pane dead) — consider
            // incorporating `cause` here.
            const detail = `Session "${session.windowName}" died — tmux window no longer exists. ` +
                `Last activity: ${new Date(session.lastActivity).toISOString()}`;
            this.eventBus?.emitDead(session.id, detail);
            await this.channels.statusChange(this.makePayload('status.dead', session, detail));
            this.removeSession(session.id);
            // #262: Also remove from SessionManager so dead sessions don't linger
            try {
                await this.sessions.killSession(session.id);
            }
            catch (e) {
                // killSession failure is non-fatal here — record and continue.
                suppressedCatch(e, 'monitor.checkDeadSessions.killSession');
            }
        }
    }
}
|
|
734
|
+
/**
 * Issue #397: Check tmux server health. Detect crashes and trigger reconciliation.
 *
 * Maintains `this.tmuxWasDown` as a down/up latch:
 *   - healthy -> down: logged once (TMUX_UNREACHABLE); while down,
 *     checkDeadSessions defers cleanup.
 *   - down -> healthy: logged (TMUX_RECOVERED), then runs
 *     `sessions.reconcileTmuxCrash()` and notifies channels about re-attached
 *     sessions.
 * Transient tmux command failures that are not server/socket failures are
 * logged (TMUX_HEALTH_CHECK_ERROR) but do NOT flip the latch.
 */
async checkTmuxHealth() {
    // No tmux adapter configured — nothing to check.
    if (!this.tmux)
        return;
    let healthy = true;
    let error = null;
    try {
        ({ healthy, error } = await this.tmux.isServerHealthy());
    }
    catch (e) {
        // The health probe itself threw — treat as unhealthy and keep the message.
        healthy = false;
        error = e instanceof Error ? e.message : String(e);
    }
    if (!healthy) {
        // Only treat known server/socket failures as "tmux down".
        // Other tmux errors can be transient command failures.
        const serverDown = this.tmux.isTmuxServerError(new Error(error ?? 'tmux unavailable'));
        if (!serverDown) {
            logger.warn({
                component: 'monitor',
                operation: 'tmux_health_check',
                errorCode: 'TMUX_HEALTH_CHECK_ERROR',
                attributes: { error: error ?? 'unknown tmux health error' },
            });
            return;
        }
        // Log the outage only on the healthy -> down transition.
        if (!this.tmuxWasDown) {
            logger.warn({
                component: 'monitor',
                operation: 'tmux_health_check',
                errorCode: 'TMUX_UNREACHABLE',
                attributes: { error: error ?? 'tmux server unavailable' },
            });
            this.tmuxWasDown = true;
        }
        return;
    }
    // Tmux is healthy now
    if (this.tmuxWasDown) {
        logger.info({
            component: 'monitor',
            operation: 'tmux_health_check',
            errorCode: 'TMUX_RECOVERED',
        });
        this.tmuxWasDown = false;
        // Trigger crash reconciliation to re-attach or mark orphaned sessions
        const result = await this.sessions.reconcileTmuxCrash();
        if (result.recovered > 0 || result.orphaned > 0) {
            logger.info({
                component: 'monitor',
                operation: 'tmux_crash_reconciliation',
                attributes: { recovered: result.recovered, orphaned: result.orphaned },
            });
            // Notify channels about recovery
            for (const session of this.sessions.listSessions()) {
                await this.channels.statusChange(this.makePayload('status.recovered', session, `tmux server recovered. Session ${session.windowName} re-attached.`));
            }
        }
    }
}
|
|
794
|
+
/** Clean up tracking for a killed session. */
|
|
795
|
+
removeSession(sessionId) {
|
|
796
|
+
// Issue #84: Stop watching JSONL file for this session
|
|
797
|
+
this.jsonlWatcher?.unwatch(sessionId);
|
|
798
|
+
this.lastStatus.delete(sessionId);
|
|
799
|
+
this.lastBytesSeen.delete(sessionId);
|
|
800
|
+
this.deadNotified.delete(sessionId);
|
|
801
|
+
this.rateLimitedSessions.delete(sessionId);
|
|
802
|
+
// Issue #89 L4: Clear pending debounce timer
|
|
803
|
+
const pending = this.statusChangeDebounce.get(sessionId);
|
|
804
|
+
if (pending) {
|
|
805
|
+
clearTimeout(pending);
|
|
806
|
+
this.statusChangeDebounce.delete(sessionId);
|
|
807
|
+
}
|
|
808
|
+
// Clean all stall notifications for this session — O(1) with Map
|
|
809
|
+
this.stallDeleteAll(sessionId);
|
|
810
|
+
this.idleNotified.delete(sessionId);
|
|
811
|
+
this.idleSince.delete(sessionId);
|
|
812
|
+
this.stateSince.delete(sessionId);
|
|
813
|
+
this.prevStatusForStall.delete(sessionId);
|
|
814
|
+
// Note: processedStopSignals uses claudeSessionId:timestamp keys, not bridge sessionId.
|
|
815
|
+
// We don't clean them here — they're small and prevent re-processing.
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
/**
 * Asynchronous delay helper.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|