aegis-bridge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/LICENSE +21 -0
  2. package/README.md +404 -0
  3. package/dashboard/dist/assets/index-BoZwGLAx.css +32 -0
  4. package/dashboard/dist/assets/index-C61BkKH-.js +312 -0
  5. package/dashboard/dist/assets/index-C61BkKH-.js.map +1 -0
  6. package/dashboard/dist/index.html +14 -0
  7. package/dist/api-contracts.d.ts +229 -0
  8. package/dist/api-contracts.js +7 -0
  9. package/dist/api-contracts.typecheck.d.ts +14 -0
  10. package/dist/api-contracts.typecheck.js +1 -0
  11. package/dist/api-error-envelope.d.ts +15 -0
  12. package/dist/api-error-envelope.js +80 -0
  13. package/dist/auth.d.ts +87 -0
  14. package/dist/auth.js +276 -0
  15. package/dist/channels/index.d.ts +8 -0
  16. package/dist/channels/index.js +8 -0
  17. package/dist/channels/manager.d.ts +47 -0
  18. package/dist/channels/manager.js +115 -0
  19. package/dist/channels/telegram-style.d.ts +118 -0
  20. package/dist/channels/telegram-style.js +202 -0
  21. package/dist/channels/telegram.d.ts +91 -0
  22. package/dist/channels/telegram.js +1518 -0
  23. package/dist/channels/types.d.ts +77 -0
  24. package/dist/channels/types.js +8 -0
  25. package/dist/channels/webhook.d.ts +60 -0
  26. package/dist/channels/webhook.js +216 -0
  27. package/dist/cli.d.ts +8 -0
  28. package/dist/cli.js +252 -0
  29. package/dist/config.d.ts +90 -0
  30. package/dist/config.js +214 -0
  31. package/dist/consensus.d.ts +16 -0
  32. package/dist/consensus.js +19 -0
  33. package/dist/continuation-pointer.d.ts +11 -0
  34. package/dist/continuation-pointer.js +65 -0
  35. package/dist/diagnostics.d.ts +27 -0
  36. package/dist/diagnostics.js +95 -0
  37. package/dist/error-categories.d.ts +39 -0
  38. package/dist/error-categories.js +73 -0
  39. package/dist/events.d.ts +133 -0
  40. package/dist/events.js +389 -0
  41. package/dist/fault-injection.d.ts +29 -0
  42. package/dist/fault-injection.js +115 -0
  43. package/dist/file-utils.d.ts +2 -0
  44. package/dist/file-utils.js +37 -0
  45. package/dist/handshake.d.ts +60 -0
  46. package/dist/handshake.js +124 -0
  47. package/dist/hook-settings.d.ts +80 -0
  48. package/dist/hook-settings.js +272 -0
  49. package/dist/hook.d.ts +19 -0
  50. package/dist/hook.js +231 -0
  51. package/dist/hooks.d.ts +32 -0
  52. package/dist/hooks.js +364 -0
  53. package/dist/jsonl-watcher.d.ts +59 -0
  54. package/dist/jsonl-watcher.js +166 -0
  55. package/dist/logger.d.ts +35 -0
  56. package/dist/logger.js +65 -0
  57. package/dist/mcp-server.d.ts +123 -0
  58. package/dist/mcp-server.js +869 -0
  59. package/dist/memory-bridge.d.ts +27 -0
  60. package/dist/memory-bridge.js +137 -0
  61. package/dist/memory-routes.d.ts +3 -0
  62. package/dist/memory-routes.js +100 -0
  63. package/dist/metrics.d.ts +126 -0
  64. package/dist/metrics.js +286 -0
  65. package/dist/model-router.d.ts +53 -0
  66. package/dist/model-router.js +150 -0
  67. package/dist/monitor.d.ts +103 -0
  68. package/dist/monitor.js +820 -0
  69. package/dist/path-utils.d.ts +11 -0
  70. package/dist/path-utils.js +21 -0
  71. package/dist/permission-evaluator.d.ts +10 -0
  72. package/dist/permission-evaluator.js +48 -0
  73. package/dist/permission-guard.d.ts +51 -0
  74. package/dist/permission-guard.js +196 -0
  75. package/dist/permission-request-manager.d.ts +12 -0
  76. package/dist/permission-request-manager.js +36 -0
  77. package/dist/permission-routes.d.ts +7 -0
  78. package/dist/permission-routes.js +28 -0
  79. package/dist/pipeline.d.ts +97 -0
  80. package/dist/pipeline.js +291 -0
  81. package/dist/process-utils.d.ts +4 -0
  82. package/dist/process-utils.js +73 -0
  83. package/dist/question-manager.d.ts +54 -0
  84. package/dist/question-manager.js +80 -0
  85. package/dist/retry.d.ts +11 -0
  86. package/dist/retry.js +34 -0
  87. package/dist/safe-json.d.ts +12 -0
  88. package/dist/safe-json.js +22 -0
  89. package/dist/screenshot.d.ts +28 -0
  90. package/dist/screenshot.js +60 -0
  91. package/dist/server.d.ts +10 -0
  92. package/dist/server.js +1973 -0
  93. package/dist/session-cleanup.d.ts +18 -0
  94. package/dist/session-cleanup.js +11 -0
  95. package/dist/session.d.ts +379 -0
  96. package/dist/session.js +1568 -0
  97. package/dist/shutdown-utils.d.ts +5 -0
  98. package/dist/shutdown-utils.js +24 -0
  99. package/dist/signal-cleanup-helper.d.ts +48 -0
  100. package/dist/signal-cleanup-helper.js +117 -0
  101. package/dist/sse-limiter.d.ts +47 -0
  102. package/dist/sse-limiter.js +61 -0
  103. package/dist/sse-writer.d.ts +31 -0
  104. package/dist/sse-writer.js +94 -0
  105. package/dist/ssrf.d.ts +102 -0
  106. package/dist/ssrf.js +267 -0
  107. package/dist/startup.d.ts +6 -0
  108. package/dist/startup.js +162 -0
  109. package/dist/suppress.d.ts +33 -0
  110. package/dist/suppress.js +79 -0
  111. package/dist/swarm-monitor.d.ts +117 -0
  112. package/dist/swarm-monitor.js +300 -0
  113. package/dist/template-store.d.ts +45 -0
  114. package/dist/template-store.js +142 -0
  115. package/dist/terminal-parser.d.ts +16 -0
  116. package/dist/terminal-parser.js +346 -0
  117. package/dist/tmux-capture-cache.d.ts +18 -0
  118. package/dist/tmux-capture-cache.js +34 -0
  119. package/dist/tmux.d.ts +183 -0
  120. package/dist/tmux.js +906 -0
  121. package/dist/tool-registry.d.ts +40 -0
  122. package/dist/tool-registry.js +83 -0
  123. package/dist/transcript.d.ts +63 -0
  124. package/dist/transcript.js +284 -0
  125. package/dist/utils/circular-buffer.d.ts +11 -0
  126. package/dist/utils/circular-buffer.js +37 -0
  127. package/dist/utils/redact-headers.d.ts +13 -0
  128. package/dist/utils/redact-headers.js +54 -0
  129. package/dist/validation.d.ts +406 -0
  130. package/dist/validation.js +415 -0
  131. package/dist/verification.d.ts +2 -0
  132. package/dist/verification.js +72 -0
  133. package/dist/worktree-lookup.d.ts +24 -0
  134. package/dist/worktree-lookup.js +71 -0
  135. package/dist/ws-terminal.d.ts +32 -0
  136. package/dist/ws-terminal.js +348 -0
  137. package/package.json +83 -0
@@ -0,0 +1,820 @@
1
+ /**
2
+ * monitor.ts — Background monitor that polls sessions and routes events to channels.
3
+ *
4
+ * Runs a polling loop that:
5
+ * 1. Checks each active session for new JSONL entries
6
+ * 2. Detects status changes (working → idle, permission prompts, etc.)
7
+ * 3. Routes events to the ChannelManager (which fans out to Telegram, webhooks, etc.)
8
+ */
9
+ import { readFile } from 'node:fs/promises';
10
+ import { existsSync } from 'node:fs';
11
+ import { join } from 'node:path';
12
+ import { homedir } from 'node:os';
13
+ import { stopSignalsSchema } from './validation.js';
14
+ import { suppressedCatch } from './suppress.js';
15
+ import { logger } from './logger.js';
16
+ import { maybeInjectFault } from './fault-injection.js';
/** Issue #89 L4: Debounce window (ms) applied to per-session status-change broadcasts. */
const STATUS_CHANGE_DEBOUNCE_MS = 500;
/**
 * Default timing configuration for the session monitor.
 * Every value is a duration in milliseconds.
 */
export const DEFAULT_MONITOR_CONFIG = {
    // Base poll cadence: 30s, because hooks are the primary signal (Issue #169 Phase 3).
    pollIntervalMs: 30 * 1000,
    // Fallback cadence: 5s, used as a safety net while hooks are quiet.
    fastPollIntervalMs: 5 * 1000,
    // 60s without a hook switches the loop to fast polling.
    hookQuietMs: 60 * 1000,
    // 2 minutes (Issue #392: reduced from 5 min).
    stallThresholdMs: 120_000,
    // Stall checks run every 30 seconds (faster, to suit the shorter thresholds).
    stallCheckIntervalMs: 30_000,
    // Dead-session checks run every 10 seconds (Issue M19: faster dead detection).
    deadCheckIntervalMs: 10_000,
    // 5 min waiting for permission counts as stalled.
    permissionStallMs: 300_000,
    // 3 min in the unknown state counts as stalled.
    unknownStallMs: 180_000,
    // 10 min waiting for permission triggers an automatic reject.
    permissionTimeoutMs: 600_000,
};
/** Signal names keyed by signal number, for the signals this module names explicitly. */
const SIGNAL_BY_NUMBER = {
    1: 'SIGHUP',
    2: 'SIGINT',
    3: 'SIGQUIT',
    6: 'SIGABRT',
    9: 'SIGKILL',
    11: 'SIGSEGV',
    13: 'SIGPIPE',
    14: 'SIGALRM',
    15: 'SIGTERM',
};
/**
 * Map a shell-style exit code (128 + signal number) to a signal name.
 *
 * @param {number|null|undefined} exitCode - Exit code of the terminated process.
 * @returns {string|null} The signal name (`SIG<n>` for numbers missing from
 *   SIGNAL_BY_NUMBER), or null when the code does not encode a signal.
 */
function signalFromExitCode(exitCode) {
    // Treat null, undefined and NaN alike: without this, `undefined < 129` is
    // false and the function would fall through and return "SIGNaN".
    const code = Number(exitCode ?? NaN);
    if (!Number.isFinite(code) || code < 129)
        return null;
    const signalNumber = code - 128;
    return SIGNAL_BY_NUMBER[signalNumber] ?? `SIG${signalNumber}`;
}
46
+ export class SessionMonitor {
47
+ sessions;
48
+ channels;
49
+ config;
50
+ running = false;
51
+ lastStatus = new Map();
52
+ lastBytesSeen = new Map();
53
+ // Issue #663: Nested Map for O(1) per-session stall lookup (was Set with O(n) prefix scan)
54
+ stallNotified = new Map(); // sessionId → Set<stallType>
55
+ /** Issue #663: O(1) stall notification check. */
56
+ stallHas(sessionId, stallType) {
57
+ return this.stallNotified.get(sessionId)?.has(stallType) ?? false;
58
+ }
59
+ /** Issue #663: O(1) stall notification add. */
60
+ stallAdd(sessionId, stallType) {
61
+ const set = this.stallNotified.get(sessionId);
62
+ if (set) {
63
+ set.add(stallType);
64
+ }
65
+ else {
66
+ this.stallNotified.set(sessionId, new Set([stallType]));
67
+ }
68
+ }
69
+ /** Issue #663: O(1) stall notification delete. */
70
+ stallDelete(sessionId, stallType) {
71
+ this.stallNotified.get(sessionId)?.delete(stallType);
72
+ }
73
+ /** Issue #663: Delete all stall notifications for a session. */
74
+ stallDeleteAll(sessionId) {
75
+ this.stallNotified.delete(sessionId);
76
+ }
77
+ /** Issue #663: Delete specific stall types for a session. */
78
+ stallDeleteTypes(sessionId, types) {
79
+ const set = this.stallNotified.get(sessionId);
80
+ if (!set)
81
+ return;
82
+ for (const t of types)
83
+ set.delete(t);
84
+ }
85
+ lastStallCheck = 0;
86
+ lastDeadCheck = 0;
87
+ idleNotified = new Set(); // prevent idle spam
88
+ idleSince = new Map(); // debounce: when idle started
89
+ processedStopSignals = new Set(); // Issue #15: don't re-process signals
90
+ static MAX_PROCESSED_STOP_SIGNALS = 1000; // #220: prevent unbounded growth
91
+ // Smart stall detection: track when each non-working state started
92
+ stateSince = new Map(); // sessionId → { state, since } (one entry per session)
93
+ deadNotified = new Set(); // don't spam dead session events
94
+ prevStatusForStall = new Map(); // track previous status for stall transition detection
95
+ rateLimitedSessions = new Set(); // sessions in rate-limit backoff
96
+ // Issue #397: Track tmux server health for crash recovery
97
+ tmuxWasDown = false;
98
+ lastTmuxHealthCheck = 0;
99
+ static TMUX_HEALTH_CHECK_INTERVAL_MS = 10_000; // check every 10s
100
+ /** Issue #89 L4: Debounce status change broadcasts per session.
101
+ * If multiple status changes happen within 500ms, only emit the last one.
102
+ * Prevents rapid-fire notifications during state transitions. */
103
+ statusChangeDebounce = new Map();
104
+ /** Issue #32: Optional SSE event bus for real-time streaming. */
105
+ eventBus;
106
+ /** Issue #84: fs.watch-based JSONL watcher for near-instant message detection. */
107
+ jsonlWatcher;
108
+ constructor(sessions, channels, config = DEFAULT_MONITOR_CONFIG) {
109
+ this.sessions = sessions;
110
+ this.channels = channels;
111
+ this.config = config;
112
+ this.config = { ...DEFAULT_MONITOR_CONFIG, ...config };
113
+ }
114
+ /** Issue #32: Set the event bus for SSE streaming. */
115
+ setEventBus(bus) {
116
+ this.eventBus = bus;
117
+ }
118
+ /** Issue #397: Set the TmuxManager reference for tmux health checks. */
119
+ tmux;
120
+ setTmuxManager(tmuxManager) {
121
+ this.tmux = tmuxManager;
122
+ }
123
+ /** Issue #84: Set the JSONL watcher for fs.watch-based message detection. */
124
+ setJsonlWatcher(watcher) {
125
+ this.jsonlWatcher = watcher;
126
+ watcher.onEntries((event) => {
127
+ this.handleWatcherEvent(event);
128
+ });
129
+ }
130
+ start() {
131
+ if (this.running)
132
+ return;
133
+ this.running = true;
134
+ this.loop();
135
+ }
136
+ stop() {
137
+ this.running = false;
138
+ }
139
+ async loop() {
140
+ while (this.running) {
141
+ try {
142
+ await this.poll();
143
+ }
144
+ catch (e) {
145
+ logger.error({
146
+ component: 'monitor',
147
+ operation: 'poll',
148
+ errorCode: 'MONITOR_POLL_ERROR',
149
+ attributes: { error: e instanceof Error ? e.message : String(e) },
150
+ });
151
+ }
152
+ // Issue #169 Phase 3: Adaptive polling — use fast interval if any session
153
+ // hasn't received a hook recently (hooks may have stopped working).
154
+ const interval = this.needsFastPolling() ? this.config.fastPollIntervalMs : this.config.pollIntervalMs;
155
+ await sleep(interval);
156
+ }
157
+ }
158
+ /** Check if any active session hasn't received a hook recently. */
159
+ needsFastPolling() {
160
+ const now = Date.now();
161
+ for (const session of this.sessions.listSessions()) {
162
+ const lastHook = session.lastHookAt;
163
+ // If a session has never received a hook, always fast-poll (hooks may not be configured)
164
+ if (lastHook === undefined)
165
+ return true;
166
+ // If no hook for hookQuietMs, switch to fast polling
167
+ if (now - lastHook > this.config.hookQuietMs)
168
+ return true;
169
+ }
170
+ return false;
171
+ }
172
+ async poll() {
173
+ const now = Date.now();
174
+ // Issue #397: Run tmux health checks before dead-session reaping.
175
+ // This prevents false "status.dead" events when tmux is temporarily
176
+ // unreachable and windows still exist once the server recovers.
177
+ if (now - this.lastTmuxHealthCheck >= SessionMonitor.TMUX_HEALTH_CHECK_INTERVAL_MS) {
178
+ this.lastTmuxHealthCheck = now;
179
+ await this.checkTmuxHealth();
180
+ }
181
+ for (const session of this.sessions.listSessions()) {
182
+ try {
183
+ // Issue #84: Start watching when jsonlPath is discovered
184
+ if (this.jsonlWatcher && session.jsonlPath && !this.jsonlWatcher.isWatching(session.id)) {
185
+ this.jsonlWatcher.watch(session.id, session.jsonlPath, session.monitorOffset);
186
+ }
187
+ await this.checkSession(session);
188
+ }
189
+ catch (e) {
190
+ suppressedCatch(e, 'monitor.checkSession');
191
+ }
192
+ }
193
+ // Stall detection: run less frequently than message polling
194
+ if (now - this.lastStallCheck >= this.config.stallCheckIntervalMs) {
195
+ this.lastStallCheck = now;
196
+ await this.checkForStalls(now);
197
+ await this.checkStopSignals();
198
+ }
199
+ // Dead session detection: independent timer (M19: 10s default)
200
+ if (now - this.lastDeadCheck >= this.config.deadCheckIntervalMs) {
201
+ this.lastDeadCheck = now;
202
+ await this.checkDeadSessions();
203
+ }
204
+ }
205
+ /** Smart stall detection: multiple stall types with graduated thresholds.
206
+ *
207
+ * Detects 4 types of stalls:
208
+ * 1. JSONL stall: "working" but no new JSONL bytes for stallThresholdMs
209
+ * 2. Permission stall: permission_prompt/bash_approval for permissionStallMs
210
+ * 3. Unknown stall: unknown state for unknownStallMs (CC stuck in transition)
211
+ * 4. State duration stall: any non-idle state for 2x its threshold
212
+ */
213
+ async checkForStalls(now) {
214
+ for (const session of this.sessions.listSessions()) {
215
+ const currentStatus = this.lastStatus.get(session.id);
216
+ const prevStallStatus = this.prevStatusForStall.get(session.id);
217
+ // Track state transitions — one entry per session, preserving timer across
218
+ // permission_prompt ↔ bash_approval transitions (both are "permission" states)
219
+ if (currentStatus && currentStatus !== 'idle') {
220
+ const entry = this.stateSince.get(session.id);
221
+ if (!entry) {
222
+ this.stateSince.set(session.id, { state: currentStatus, since: now });
223
+ }
224
+ else if (entry.state !== currentStatus) {
225
+ const isPermState = (s) => s === 'permission_prompt' || s === 'bash_approval';
226
+ if (isPermState(entry.state) && isPermState(currentStatus)) {
227
+ entry.state = currentStatus; // preserve since across permission sub-type transitions
228
+ }
229
+ else {
230
+ this.stateSince.set(session.id, { state: currentStatus, since: now });
231
+ }
232
+ }
233
+ }
234
+ // --- Type 1: JSONL stall (working but no output) ---
235
+ if (currentStatus === 'working') {
236
+ // Skip stall detection for rate-limited sessions — CC is in backoff
237
+ if (this.rateLimitedSessions.has(session.id)) {
238
+ continue;
239
+ }
240
+ const prev = this.lastBytesSeen.get(session.id);
241
+ const currentBytes = session.monitorOffset;
242
+ if (!prev) {
243
+ this.lastBytesSeen.set(session.id, { bytes: currentBytes, at: now });
244
+ continue;
245
+ }
246
+ if (currentBytes > prev.bytes) {
247
+ this.lastBytesSeen.set(session.id, { bytes: currentBytes, at: now });
248
+ this.stallDelete(session.id, 'jsonl');
249
+ }
250
+ else {
251
+ const stallDuration = now - prev.at;
252
+ const threshold = session.stallThresholdMs || this.config.stallThresholdMs;
253
+ if (stallDuration >= threshold && !this.stallHas(session.id, 'jsonl')) {
254
+ this.stallAdd(session.id, 'jsonl');
255
+ const minutes = Math.round(stallDuration / 60000);
256
+ const detail = `Session stalled: "working" for ${minutes}min with no new output. ` +
257
+ `Last activity: ${new Date(session.lastActivity).toISOString()}`;
258
+ this.eventBus?.emitStall(session.id, 'jsonl', detail);
259
+ await this.channels.statusChange(this.makePayload('status.stall', session, detail));
260
+ }
261
+ }
262
+ }
263
+ else {
264
+ // Reset JSONL stall tracking when not working
265
+ this.stallDelete(session.id, 'jsonl');
266
+ }
267
+ // --- Type 2: Permission stall (waiting for approval too long) ---
268
+ if (currentStatus === 'permission_prompt' || currentStatus === 'bash_approval') {
269
+ const entry = this.stateSince.get(session.id);
270
+ const permDuration = entry ? now - entry.since : 0;
271
+ if (permDuration >= this.config.permissionStallMs) {
272
+ if (!this.stallHas(session.id, 'permission')) {
273
+ this.stallAdd(session.id, 'permission');
274
+ const minutes = Math.round(permDuration / 60000);
275
+ const detail = `Session stalled: waiting for permission approval for ${minutes}min. ` +
276
+ `Auto-approve this session or POST /v1/sessions/${session.id}/approve`;
277
+ this.eventBus?.emitStall(session.id, 'permission', detail);
278
+ await this.channels.statusChange(this.makePayload('status.stall', session, detail));
279
+ }
280
+ }
281
+ // L9: Auto-reject permission after timeout
282
+ if (permDuration >= this.config.permissionTimeoutMs) {
283
+ if (!this.stallHas(session.id, 'permission_timeout')) {
284
+ this.stallAdd(session.id, 'permission_timeout');
285
+ const minutes = Math.round(permDuration / 60000);
286
+ logger.warn({
287
+ component: 'monitor',
288
+ operation: 'permission_timeout_auto_reject',
289
+ sessionId: session.id,
290
+ errorCode: 'PERMISSION_TIMEOUT',
291
+ attributes: { windowName: session.windowName, timeoutMinutes: minutes },
292
+ });
293
+ try {
294
+ await this.sessions.reject(session.id);
295
+ const detail = `Permission auto-rejected after ${minutes}min timeout (session ${session.windowName})`;
296
+ this.eventBus?.emitStall(session.id, 'permission_timeout', detail);
297
+ await this.channels.statusChange(this.makePayload('status.permission_timeout', session, detail));
298
+ }
299
+ catch (e) {
300
+ logger.error({
301
+ component: 'monitor',
302
+ operation: 'permission_timeout_auto_reject',
303
+ sessionId: session.id,
304
+ errorCode: 'AUTO_REJECT_FAILED',
305
+ attributes: { error: e instanceof Error ? e.message : String(e) },
306
+ });
307
+ }
308
+ }
309
+ }
310
+ }
311
+ // --- Type 3: Unknown stall (CC stuck in transition) ---
312
+ if (currentStatus === 'unknown') {
313
+ const entry = this.stateSince.get(session.id);
314
+ const unkDuration = entry ? now - entry.since : 0;
315
+ if (unkDuration >= this.config.unknownStallMs) {
316
+ if (!this.stallHas(session.id, 'unknown')) {
317
+ this.stallAdd(session.id, 'unknown');
318
+ const minutes = Math.round(unkDuration / 60000);
319
+ const detail = `Session stalled: in "unknown" state for ${minutes}min. ` +
320
+ `CC may be stuck. Try: POST /v1/sessions/${session.id}/interrupt or /kill`;
321
+ this.eventBus?.emitStall(session.id, 'unknown', detail);
322
+ await this.channels.statusChange(this.makePayload('status.stall', session, detail));
323
+ }
324
+ }
325
+ }
326
+ // --- Type 4: Extended state stall (any state held too long) ---
327
+ if (currentStatus && currentStatus !== 'idle' && currentStatus !== 'working') {
328
+ const entry = this.stateSince.get(session.id);
329
+ const stateDuration = entry ? now - entry.since : 0;
330
+ const extendedThreshold = this.config.stallThresholdMs * 2;
331
+ if (stateDuration >= extendedThreshold) {
332
+ if (!this.stallHas(session.id, 'extended')) {
333
+ this.stallAdd(session.id, 'extended');
334
+ const minutes = Math.round(stateDuration / 60000);
335
+ const detail = `Session stalled: "${currentStatus}" state for ${minutes}min. ` +
336
+ `May need intervention: /interrupt, /approve, or /kill`;
337
+ this.eventBus?.emitStall(session.id, 'extended', detail);
338
+ await this.channels.statusChange(this.makePayload('status.stall', session, detail));
339
+ }
340
+ }
341
+ }
342
+ // --- Type 5: Extended working stall (working too long regardless of byte changes, ---
343
+ // Catches CC stuck in "Misting" state where internal loop detection
344
+ if (currentStatus === 'working') {
345
+ const entry = this.stateSince.get(session.id);
346
+ if (entry && entry.state === 'working') {
347
+ const workingDuration = now - entry.since;
348
+ const maxWorkingMs = this.config.stallThresholdMs * 3; // 15 min default
349
+ if (workingDuration >= maxWorkingMs && !this.stallHas(session.id, 'extended_working')) {
350
+ this.stallAdd(session.id, 'extended_working');
351
+ const minutes = Math.round(workingDuration / 60000);
352
+ const detail = `Session stalled: in "working" state for ${minutes}min. ` +
353
+ `CC may be stuck in an internal loop (e.g., Misting). Consider: POST /v1/sessions/${session.id}/interrupt or /kill`;
354
+ this.eventBus?.emitStall(session.id, 'extended_working', detail);
355
+ await this.channels.statusChange(this.makePayload('status.stall', session, detail));
356
+ }
357
+ }
358
+ }
359
+ // Clean up stall notifications on state transitions (using prevStallStatus)
360
+ if (prevStallStatus && prevStallStatus !== currentStatus) {
361
+ const exitedPermission = prevStallStatus === 'permission_prompt' || prevStallStatus === 'bash_approval';
362
+ const exitedUnknown = prevStallStatus === 'unknown';
363
+ if (exitedPermission) {
364
+ this.stallDeleteTypes(session.id, ['permission', 'permission_timeout']);
365
+ }
366
+ if (exitedUnknown) {
367
+ this.stallDelete(session.id, 'unknown');
368
+ }
369
+ }
370
+ // Clean up all state tracking when idle (catch-all)
371
+ if (currentStatus === 'idle') {
372
+ this.rateLimitedSessions.delete(session.id);
373
+ this.stateSince.delete(session.id);
374
+ // Clean stall notifications (session recovered) — O(1) with Map
375
+ this.stallDeleteAll(session.id);
376
+ }
377
+ // Update prevStatusForStall for next cycle
378
+ if (currentStatus) {
379
+ this.prevStatusForStall.set(session.id, currentStatus);
380
+ }
381
+ else {
382
+ this.prevStatusForStall.delete(session.id);
383
+ }
384
+ }
385
+ }
386
+ /** Issue #15: Check for Stop/StopFailure signals written by hook.ts. */
387
+ async checkStopSignals() {
388
+ // Check both aegis and manus dirs for backward compat
389
+ const aegisDir = join(homedir(), '.aegis');
390
+ const manusDir = join(homedir(), '.manus');
391
+ const signalFile = existsSync(join(aegisDir, 'stop_signals.json'))
392
+ ? join(aegisDir, 'stop_signals.json')
393
+ : join(manusDir, 'stop_signals.json');
394
+ if (!existsSync(signalFile))
395
+ return;
396
+ try {
397
+ const raw = await readFile(signalFile, 'utf-8');
398
+ const parsed = stopSignalsSchema.safeParse(JSON.parse(raw));
399
+ if (!parsed.success) {
400
+ logger.warn({
401
+ component: 'monitor',
402
+ operation: 'check_stop_signals',
403
+ errorCode: 'STOP_SIGNALS_INVALID',
404
+ });
405
+ return;
406
+ }
407
+ const signals = parsed.data;
408
+ for (const session of this.sessions.listSessions()) {
409
+ if (!session.claudeSessionId)
410
+ continue;
411
+ const signal = signals[session.claudeSessionId];
412
+ if (!signal)
413
+ continue;
414
+ const signalKey = `${session.claudeSessionId}:${signal.timestamp}`;
415
+ if (this.processedStopSignals.has(signalKey))
416
+ continue;
417
+ this.processedStopSignals.add(signalKey);
418
+ // #220: Prune oldest entries when Set exceeds max size
419
+ // #510: Collect keys first, then delete — avoid mutation during iteration
420
+ if (this.processedStopSignals.size > SessionMonitor.MAX_PROCESSED_STOP_SIGNALS) {
421
+ const toRemove = this.processedStopSignals.size - SessionMonitor.MAX_PROCESSED_STOP_SIGNALS;
422
+ const keysToDelete = [...this.processedStopSignals].slice(0, toRemove);
423
+ for (const key of keysToDelete) {
424
+ this.processedStopSignals.delete(key);
425
+ }
426
+ }
427
+ if (signal.event === 'StopFailure') {
428
+ logger.warn({
429
+ component: 'monitor',
430
+ operation: 'check_stop_signals',
431
+ sessionId: session.id,
432
+ errorCode: 'STOP_FAILURE_SIGNAL',
433
+ attributes: {
434
+ stopReason: signal.stop_reason ?? null,
435
+ error: signal.error ?? null,
436
+ signalTimestamp: signal.timestamp ?? null,
437
+ },
438
+ });
439
+ const stopReason = signal.stop_reason || '';
440
+ if (stopReason === 'rate_limit' || stopReason === 'overloaded') {
441
+ this.rateLimitedSessions.add(session.id);
442
+ await this.channels.statusChange(this.makePayload('status.rate_limited', session, `Claude API rate limited (${stopReason}). Session will resume when the backoff window expires.`));
443
+ }
444
+ else {
445
+ const errorDetail = signal.error || signal.stop_reason || 'Unknown API error';
446
+ await this.channels.statusChange(this.makePayload('status.error', session, `⚠️ Claude Code error: ${errorDetail}`));
447
+ }
448
+ }
449
+ else if (signal.event === 'Stop') {
450
+ logger.info({
451
+ component: 'monitor',
452
+ operation: 'check_stop_signals',
453
+ sessionId: session.id,
454
+ errorCode: 'STOP_SIGNAL',
455
+ attributes: {
456
+ signalTimestamp: signal.timestamp ?? null,
457
+ },
458
+ });
459
+ await this.channels.statusChange(this.makePayload('status.stopped', session, 'Claude Code session ended normally'));
460
+ }
461
+ }
462
+ }
463
+ catch (e) {
464
+ suppressedCatch(e, 'monitor.checkStopSignals.parseEntry');
465
+ }
466
+ }
467
+ /** Issue #84: Handle new entries from the fs.watch-based JSONL watcher.
468
+ * Forwards messages to channels and updates stall tracking. */
469
+ handleWatcherEvent(event) {
470
+ const session = this.sessions.getSession(event.sessionId);
471
+ if (!session)
472
+ return;
473
+ // Update monitor offset from watcher
474
+ session.monitorOffset = event.newOffset;
475
+ if (event.messages.length > 0) {
476
+ // Clear rate-limited state — CC resumed producing real output
477
+ this.rateLimitedSessions.delete(event.sessionId);
478
+ for (const msg of event.messages) {
479
+ // Forward asynchronously (fire-and-forget) — catch to prevent unhandled rejection (#404)
480
+ void this.forwardMessage(session, msg).catch(e => logger.error({
481
+ component: 'monitor',
482
+ operation: 'forward_message',
483
+ sessionId: session.id,
484
+ errorCode: 'FORWARD_MESSAGE_FAILED',
485
+ attributes: { error: e instanceof Error ? e.message : String(e) },
486
+ }));
487
+ }
488
+ // Update last activity
489
+ session.lastActivity = Date.now();
490
+ }
491
+ // Update JSONL stall tracking — only reset stall timer when real messages arrive
492
+ // When no messages, only update bytes tracking (keep timestamp)
493
+ const now = Date.now();
494
+ const prev = this.lastBytesSeen.get(event.sessionId);
495
+ if (event.newOffset > (prev?.bytes ?? -1)) {
496
+ if (event.messages.length > 0) {
497
+ // Real output — reset stall timer
498
+ this.lastBytesSeen.set(event.sessionId, { bytes: event.newOffset, at: now });
499
+ this.stallDelete(event.sessionId, 'jsonl');
500
+ }
501
+ else {
502
+ // File grew but no messages — only update bytes, keep timestamp
503
+ this.lastBytesSeen.set(event.sessionId, { bytes: event.newOffset, at: prev?.at ?? now });
504
+ }
505
+ }
506
+ }
507
+ async checkSession(session) {
508
+ // When the JSONL watcher is active, messages are forwarded via handleWatcherEvent.
509
+ // Here we only need to capture the terminal UI state (permission prompts, idle, etc.)
510
+ const result = await this.sessions.readMessagesForMonitor(session.id);
511
+ const prevStatus = this.lastStatus.get(session.id);
512
+ // Forward messages only when watcher is NOT active (fallback polling path)
513
+ if (!this.jsonlWatcher && result.messages.length > 0) {
514
+ this.rateLimitedSessions.delete(session.id);
515
+ for (const msg of result.messages) {
516
+ await this.forwardMessage(session, msg);
517
+ }
518
+ }
519
+ // Idle debounce: only emit idle after 10s of continuous idle
520
+ if (result.status === 'idle') {
521
+ if (!this.idleSince.has(session.id)) {
522
+ this.idleSince.set(session.id, Date.now());
523
+ }
524
+ }
525
+ else {
526
+ this.idleSince.delete(session.id);
527
+ // Reset idle notification guard when genuinely not idle
528
+ if (result.status === 'working' || result.status === 'unknown') {
529
+ this.idleNotified.delete(session.id);
530
+ }
531
+ }
532
+ // Detect and broadcast status changes (debounced)
533
+ if (result.status !== prevStatus) {
534
+ // Issue #89 L4: Debounce rapid status changes per session.
535
+ // If multiple transitions happen within STATUS_CHANGE_DEBOUNCE_MS,
536
+ // only the last one triggers a broadcast.
537
+ const existing = this.statusChangeDebounce.get(session.id);
538
+ if (existing)
539
+ clearTimeout(existing);
540
+ const latestStatus = result.status;
541
+ const latestPrevStatus = prevStatus;
542
+ const latestResult = { statusText: result.statusText, interactiveContent: result.interactiveContent };
543
+ this.statusChangeDebounce.set(session.id, setTimeout(() => {
544
+ this.statusChangeDebounce.delete(session.id);
545
+ // #511: Skip broadcast if session was killed while debounce was pending
546
+ if (!this.lastStatus.has(session.id))
547
+ return;
548
+ void this.broadcastStatusChange(session, latestStatus, latestPrevStatus, latestResult)
549
+ .catch(e => logger.error({
550
+ component: 'monitor',
551
+ operation: 'broadcast_status_change',
552
+ sessionId: session.id,
553
+ errorCode: 'BROADCAST_STATUS_CHANGE_FAILED',
554
+ attributes: { error: e instanceof Error ? e.message : String(e) },
555
+ }));
556
+ }, STATUS_CHANGE_DEBOUNCE_MS));
557
+ }
558
+ this.lastStatus.set(session.id, result.status);
559
+ }
560
+ async forwardMessage(session, msg) {
561
+ const eventMap = {
562
+ 'user:text': 'message.user',
563
+ 'assistant:text': 'message.assistant',
564
+ 'assistant:thinking': 'message.thinking',
565
+ 'assistant:tool_use': 'message.tool_use',
566
+ 'assistant:tool_result': 'message.tool_result',
567
+ };
568
+ const key = `${msg.role}:${msg.contentType}`;
569
+ // Issue #89 L33: System entries get a different SSE event type
570
+ if (msg.role === 'system') {
571
+ this.eventBus?.emitSystem(session.id, msg.text, msg.contentType);
572
+ return;
573
+ }
574
+ const event = eventMap[key];
575
+ if (!event)
576
+ return;
577
+ // Issue #32: Emit SSE message event (L11: include tool metadata)
578
+ this.eventBus?.emitMessage(session.id, msg.role, msg.text, msg.contentType, msg.toolName || msg.toolUseId ? { tool_name: msg.toolName, tool_id: msg.toolUseId } : undefined);
579
+ await maybeInjectFault('monitor.forwardMessage.channels.message');
580
+ await this.channels.message(this.makePayload(event, session, msg.text));
581
+ }
582
+ async broadcastStatusChange(session, status, prevStatus, result) {
583
+ await maybeInjectFault('monitor.broadcastStatusChange.start');
584
+ if (status === 'permission_prompt' || status === 'bash_approval') {
585
+ // Issue #32: Emit SSE approval event
586
+ this.eventBus?.emitApproval(session.id, result.interactiveContent || 'Permission requested');
587
+ // Auto-approve if session has a non-default permission mode
588
+ // that auto-approves permission prompts (bypassPermissions, dontAsk,
589
+ // acceptEdits, plan, auto all handle their own permissions).
590
+ const AUTO_APPROVE_MODES = new Set(['bypassPermissions', 'dontAsk', 'acceptEdits', 'plan', 'auto']);
591
+ if (session.permissionMode !== 'default' && AUTO_APPROVE_MODES.has(session.permissionMode)) {
592
+ logger.info({
593
+ component: 'monitor',
594
+ operation: 'auto_approve_permission',
595
+ sessionId: session.id,
596
+ attributes: { windowName: session.windowName, mode: session.permissionMode },
597
+ });
598
+ try {
599
+ await this.sessions.approve(session.id);
600
+ await this.channels.statusChange(this.makePayload('status.permission', session, `[AUTO-APPROVED] ${result.interactiveContent || 'Permission auto-approved'}`));
601
+ }
602
+ catch (e) {
603
+ const errMsg = e instanceof Error ? e.message : String(e);
604
+ logger.error({
605
+ component: 'monitor',
606
+ operation: 'auto_approve_permission',
607
+ sessionId: session.id,
608
+ errorCode: 'AUTO_APPROVE_FAILED',
609
+ attributes: { error: errMsg },
610
+ });
611
+ await this.channels.statusChange(this.makePayload('status.permission', session, `[AUTO-APPROVE FAILED] ${result.interactiveContent || 'Permission requested'}: ${errMsg}`));
612
+ }
613
+ }
614
+ else {
615
+ await this.channels.statusChange(this.makePayload('status.permission', session, result.interactiveContent || 'Permission requested'));
616
+ }
617
+ }
618
+ else if (status === 'plan_mode') {
619
+ this.eventBus?.emitStatus(session.id, 'plan_mode', result.interactiveContent || 'Plan review requested');
620
+ await this.channels.statusChange(this.makePayload('status.plan', session, result.interactiveContent || 'Plan review requested'));
621
+ }
622
+ else if (status === 'idle') {
623
+ const idleStart = this.idleSince.get(session.id) || Date.now();
624
+ const idleDuration = Date.now() - idleStart;
625
+ // Only notify after 3s of continuous idle, and only once (M23: reduced from 10s)
626
+ if (idleDuration >= 3_000 && !this.idleNotified.has(session.id)) {
627
+ this.idleNotified.add(session.id);
628
+ this.eventBus?.emitStatus(session.id, 'idle', result.statusText || 'Session finished working, awaiting input');
629
+ await this.channels.statusChange(this.makePayload('status.idle', session, result.statusText || 'Session finished working, awaiting input'));
630
+ }
631
+ }
632
+ else if (status === 'ask_question' && prevStatus !== 'ask_question') {
633
+ this.eventBus?.emitStatus(session.id, 'ask_question', result.interactiveContent || 'Session is asking a question');
634
+ await this.channels.statusChange(this.makePayload('status.question', session, result.interactiveContent || 'Session is asking a question'));
635
+ }
636
+ // Issue #32: Emit working status via SSE
637
+ if (status === 'working' && prevStatus !== 'working') {
638
+ this.eventBus?.emitStatus(session.id, 'working', 'Claude is working');
639
+ }
640
+ }
641
+ makePayload(event, session, detail) {
642
+ return {
643
+ event,
644
+ timestamp: new Date().toISOString(),
645
+ session: {
646
+ id: session.id,
647
+ name: session.windowName,
648
+ workDir: session.workDir,
649
+ },
650
+ detail: detail.slice(0, 2000),
651
+ };
652
+ }
653
+ /** Check for dead tmux windows and notify via channels. */
654
+ async checkDeadSessions() {
655
+ // Issue #397: While tmux server is down, defer dead-session cleanup.
656
+ // tmux commands can fail transiently and make healthy sessions look dead.
657
+ if (this.tmuxWasDown)
658
+ return;
659
+ const sessions = this.sessions.listSessions();
660
+ for (const session of sessions) {
661
+ if (this.deadNotified.has(session.id))
662
+ continue;
663
+ await maybeInjectFault('monitor.checkDeadSessions.isWindowAlive');
664
+ const alive = await this.sessions.isWindowAlive(session.id);
665
+ if (!alive) {
666
+ let windowExists = null;
667
+ let paneDead = null;
668
+ let paneCommand = null;
669
+ let exitCode = null;
670
+ try {
671
+ if (this.tmux) {
672
+ const health = await this.tmux.getWindowHealth(session.windowId);
673
+ windowExists = health.windowExists;
674
+ paneDead = health.paneDead;
675
+ paneCommand = health.paneCommand;
676
+ if (health.windowExists && health.paneDead) {
677
+ const paneText = await this.tmux.capturePane(session.windowId);
678
+ const statusMatch = paneText.match(/Pane is dead \(status\s+(\d+)\)/i);
679
+ if (statusMatch) {
680
+ const parsed = parseInt(statusMatch[1] ?? '', 10);
681
+ exitCode = Number.isFinite(parsed) ? parsed : null;
682
+ }
683
+ }
684
+ }
685
+ }
686
+ catch {
687
+ // best-effort diagnostics only
688
+ }
689
+ const cause = windowExists === false
690
+ ? 'window_missing'
691
+ : paneDead
692
+ ? 'pane_dead'
693
+ : 'process_not_alive_or_unknown';
694
+ logger.warn({
695
+ component: 'monitor',
696
+ operation: 'check_dead_sessions',
697
+ sessionId: session.id,
698
+ errorCode: 'SESSION_TERMINATED_UNEXPECTEDLY',
699
+ attributes: {
700
+ cause,
701
+ windowName: session.windowName,
702
+ windowId: session.windowId,
703
+ claudeSessionId: session.claudeSessionId,
704
+ ccPid: session.ccPid ?? null,
705
+ paneCommand,
706
+ windowExists,
707
+ paneDead,
708
+ paneAlive: paneDead === null ? null : !paneDead,
709
+ exitCode,
710
+ signal: signalFromExitCode(exitCode),
711
+ uptimeMs: Date.now() - session.createdAt,
712
+ lastActivityAt: new Date(session.lastActivity).toISOString(),
713
+ detectedAt: new Date().toISOString(),
714
+ },
715
+ });
716
+ this.deadNotified.add(session.id);
717
+ // Track when the session died so the zombie reaper can clean it up
718
+ session.lastDeadAt = Date.now();
719
+ const detail = `Session "${session.windowName}" died — tmux window no longer exists. ` +
720
+ `Last activity: ${new Date(session.lastActivity).toISOString()}`;
721
+ this.eventBus?.emitDead(session.id, detail);
722
+ await this.channels.statusChange(this.makePayload('status.dead', session, detail));
723
+ this.removeSession(session.id);
724
+ // #262: Also remove from SessionManager so dead sessions don't linger
725
+ try {
726
+ await this.sessions.killSession(session.id);
727
+ }
728
+ catch (e) {
729
+ suppressedCatch(e, 'monitor.checkDeadSessions.killSession');
730
+ }
731
+ }
732
+ }
733
+ }
734
+ /** Issue #397: Check tmux server health. Detect crashes and trigger reconciliation. */
735
+ async checkTmuxHealth() {
736
+ if (!this.tmux)
737
+ return;
738
+ let healthy = true;
739
+ let error = null;
740
+ try {
741
+ ({ healthy, error } = await this.tmux.isServerHealthy());
742
+ }
743
+ catch (e) {
744
+ healthy = false;
745
+ error = e instanceof Error ? e.message : String(e);
746
+ }
747
+ if (!healthy) {
748
+ // Only treat known server/socket failures as "tmux down".
749
+ // Other tmux errors can be transient command failures.
750
+ const serverDown = this.tmux.isTmuxServerError(new Error(error ?? 'tmux unavailable'));
751
+ if (!serverDown) {
752
+ logger.warn({
753
+ component: 'monitor',
754
+ operation: 'tmux_health_check',
755
+ errorCode: 'TMUX_HEALTH_CHECK_ERROR',
756
+ attributes: { error: error ?? 'unknown tmux health error' },
757
+ });
758
+ return;
759
+ }
760
+ if (!this.tmuxWasDown) {
761
+ logger.warn({
762
+ component: 'monitor',
763
+ operation: 'tmux_health_check',
764
+ errorCode: 'TMUX_UNREACHABLE',
765
+ attributes: { error: error ?? 'tmux server unavailable' },
766
+ });
767
+ this.tmuxWasDown = true;
768
+ }
769
+ return;
770
+ }
771
+ // Tmux is healthy now
772
+ if (this.tmuxWasDown) {
773
+ logger.info({
774
+ component: 'monitor',
775
+ operation: 'tmux_health_check',
776
+ errorCode: 'TMUX_RECOVERED',
777
+ });
778
+ this.tmuxWasDown = false;
779
+ // Trigger crash reconciliation to re-attach or mark orphaned sessions
780
+ const result = await this.sessions.reconcileTmuxCrash();
781
+ if (result.recovered > 0 || result.orphaned > 0) {
782
+ logger.info({
783
+ component: 'monitor',
784
+ operation: 'tmux_crash_reconciliation',
785
+ attributes: { recovered: result.recovered, orphaned: result.orphaned },
786
+ });
787
+ // Notify channels about recovery
788
+ for (const session of this.sessions.listSessions()) {
789
+ await this.channels.statusChange(this.makePayload('status.recovered', session, `tmux server recovered. Session ${session.windowName} re-attached.`));
790
+ }
791
+ }
792
+ }
793
+ }
794
+ /** Clean up tracking for a killed session. */
795
+ removeSession(sessionId) {
796
+ // Issue #84: Stop watching JSONL file for this session
797
+ this.jsonlWatcher?.unwatch(sessionId);
798
+ this.lastStatus.delete(sessionId);
799
+ this.lastBytesSeen.delete(sessionId);
800
+ this.deadNotified.delete(sessionId);
801
+ this.rateLimitedSessions.delete(sessionId);
802
+ // Issue #89 L4: Clear pending debounce timer
803
+ const pending = this.statusChangeDebounce.get(sessionId);
804
+ if (pending) {
805
+ clearTimeout(pending);
806
+ this.statusChangeDebounce.delete(sessionId);
807
+ }
808
+ // Clean all stall notifications for this session — O(1) with Map
809
+ this.stallDeleteAll(sessionId);
810
+ this.idleNotified.delete(sessionId);
811
+ this.idleSince.delete(sessionId);
812
+ this.stateSince.delete(sessionId);
813
+ this.prevStatusForStall.delete(sessionId);
814
+ // Note: processedStopSignals uses claudeSessionId:timestamp keys, not bridge sessionId.
815
+ // We don't clean them here — they're small and prevent re-processing.
816
+ }
817
+ }
818
/**
 * Wait for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}