instar 0.7.52 → 0.7.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ /**
2
+ * SessionWatchdog — Auto-remediation for stuck Claude sessions (Instar port).
3
+ *
4
+ * Detects when a Claude session has a long-running bash command and escalates
5
+ * from gentle (Ctrl+C) to forceful (SIGKILL + session kill). Adapted from
6
+ * Dawn Server's SessionWatchdog for Instar's self-contained architecture.
7
+ *
8
+ * Escalation pipeline:
9
+ * Level 0: Monitoring (default)
10
+ * Level 1: Ctrl+C via tmux send-keys
11
+ * Level 2: SIGTERM the stuck child PID
12
+ * Level 3: SIGKILL the stuck child PID
13
+ * Level 4: Kill tmux session
14
+ */
15
+ import { execSync } from 'node:child_process';
16
+ import { EventEmitter } from 'node:events';
17
/**
 * Escalation levels, in the order the watchdog walks through them.
 *
 * Compiled numeric enum: maps name -> number and number -> name.
 */
export var EscalationLevel;
(function (EscalationLevel) {
    EscalationLevel[EscalationLevel["Monitoring"] = 0] = "Monitoring";
    EscalationLevel[EscalationLevel["CtrlC"] = 1] = "CtrlC";
    EscalationLevel[EscalationLevel["SigTerm"] = 2] = "SigTerm";
    EscalationLevel[EscalationLevel["SigKill"] = 3] = "SigKill";
    EscalationLevel[EscalationLevel["KillSession"] = 4] = "KillSession";
})(EscalationLevel || (EscalationLevel = {}));

// Processes that are long-running by design; matched as substrings of the
// child's full command line.
const EXCLUDED_PATTERNS = [
    'playwright-mcp', 'playwright-persistent', '@playwright/mcp',
    'chrome-native-host', 'claude-in-chrome-mcp', 'payments-mcp',
    'mcp-remote', '/mcp/', '.mcp/', 'caffeinate', 'exa-mcp-server',
];

// Command-line prefixes that are never treated as stuck.
const EXCLUDED_PREFIXES = [
    '/bin/zsh -c -l source',
    '/bin/bash -c -l source',
];

// Escalation delays, keyed by the level that is about to be ENTERED: the
// value is the minimum time (ms) the session must have spent in the previous
// level before advancing. Escalation is only evaluated on poll ticks, so the
// effective delay is rounded up to the next poll.
const ESCALATION_DELAYS = {
    [EscalationLevel.Monitoring]: 0,
    [EscalationLevel.CtrlC]: 0,
    [EscalationLevel.SigTerm]: 15_000,
    [EscalationLevel.SigKill]: 10_000,
    [EscalationLevel.KillSession]: 5_000,
};

const DEFAULT_STUCK_THRESHOLD_MS = 180_000; // 3 minutes
const DEFAULT_POLL_INTERVAL_MS = 30_000;
// Delay before the first poll after start(), ahead of the regular interval.
const INITIAL_POLL_DELAY_MS = 5_000;
const MAX_RETRIES = 2;

/**
 * Watches running sessions for child processes that have been alive longer
 * than a threshold and escalates from Ctrl+C to SIGTERM, SIGKILL and finally
 * killing the tmux session.
 *
 * Events emitted:
 *  - 'intervention' (event)            every time an action is taken
 *  - 'recovery' (sessionName, level)   when the tracked stuck PID is gone
 */
export class SessionWatchdog extends EventEmitter {
    config;
    sessionManager;
    state;
    /** Handle of the recurring poll timer, or null when stopped. */
    interval = null;
    /** Handle of the one-shot first poll scheduled by start(), or null. */
    initialPollTimer = null;
    /** Per-session escalation state, keyed by tmux session name. */
    escalationState = new Map();
    /** Most recent interventions (capped at 50 entries). */
    interventionHistory = [];
    enabled = true;
    /** Re-entrancy guard for poll(). */
    running = false;
    stuckThresholdMs;
    pollIntervalMs;

    /**
     * @param config         Reads `config.monitoring.watchdog` (optional
     *                       `stuckCommandSec`, `pollIntervalMs`) and
     *                       `config.sessions.tmuxPath`.
     * @param sessionManager Must provide `listRunningSessions()` and
     *                       `sendKey(session, key)`.
     * @param state          Stored on the instance; not used by this class.
     */
    constructor(config, sessionManager, state) {
        super();
        this.config = config;
        this.sessionManager = sessionManager;
        this.state = state;
        const wdConfig = config.monitoring.watchdog;
        const thresholdSec = wdConfig?.stuckCommandSec;
        this.stuckThresholdMs = thresholdSec != null
            ? thresholdSec * 1000
            : DEFAULT_STUCK_THRESHOLD_MS;
        this.pollIntervalMs = wdConfig?.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
    }

    /** Begin polling. No-op if already started. */
    start() {
        if (this.interval)
            return;
        console.log(`[Watchdog] Starting (poll: ${this.pollIntervalMs / 1000}s, threshold: ${this.stuckThresholdMs / 1000}s)`);
        // poll() is async; without a handler a throw from the session manager
        // would surface as an unhandled promise rejection.
        const tick = () => {
            this.poll().catch((err) => console.error('[Watchdog] Poll failed:', err));
        };
        this.interval = setInterval(tick, this.pollIntervalMs);
        // Keep the handle so stop() can cancel a first poll that has not fired yet.
        this.initialPollTimer = setTimeout(() => {
            this.initialPollTimer = null;
            tick();
        }, INITIAL_POLL_DELAY_MS);
    }

    /** Stop polling, including a pending initial poll. */
    stop() {
        if (this.interval) {
            clearInterval(this.interval);
            this.interval = null;
        }
        if (this.initialPollTimer) {
            clearTimeout(this.initialPollTimer);
            this.initialPollTimer = null;
        }
    }

    /** Enable or disable the watchdog; disabling drops all escalation state. */
    setEnabled(enabled) {
        this.enabled = enabled;
        if (!enabled) {
            this.escalationState.clear();
        }
    }

    isEnabled() {
        return this.enabled;
    }

    /** True while the given session is at any level above Monitoring. */
    isManaging(sessionName) {
        const s = this.escalationState.get(sessionName);
        return s !== undefined && s.level > EscalationLevel.Monitoring;
    }

    /**
     * Snapshot for status reporting: enabled flag, the escalation state of
     * each running session (null when none), and the last 20 interventions.
     */
    getStatus() {
        const runningSessions = this.sessionManager.listRunningSessions();
        const sessions = runningSessions.map(s => ({
            name: s.tmuxSession,
            escalation: this.escalationState.get(s.tmuxSession) ?? null,
        }));
        return {
            enabled: this.enabled,
            sessions,
            interventionHistory: this.interventionHistory.slice(-20),
        };
    }

    // --- Core polling ---

    /**
     * Check every running session once. Skipped when disabled or when a
     * previous poll is still in progress. A failure in one session is logged
     * and does not prevent the remaining sessions from being checked.
     */
    async poll() {
        if (!this.enabled || this.running)
            return;
        this.running = true;
        try {
            const sessions = this.sessionManager.listRunningSessions();
            for (const session of sessions) {
                try {
                    this.checkSession(session.tmuxSession);
                }
                catch (err) {
                    console.error(`[Watchdog] Error checking "${session.tmuxSession}":`, err);
                }
            }
        }
        finally {
            this.running = false;
        }
    }

    /**
     * Continue an escalation already in progress for the session, or look
     * for a newly stuck child process and start one with Ctrl+C.
     */
    checkSession(tmuxSession) {
        const existing = this.escalationState.get(tmuxSession);
        if (existing && existing.level > EscalationLevel.Monitoring) {
            this.handleEscalation(tmuxSession, existing);
            return;
        }
        // Find Claude PID in the tmux session
        const claudePid = this.getClaudePid(tmuxSession);
        if (!claudePid)
            return;
        const children = this.getChildProcesses(claudePid);
        // First non-excluded child older than the threshold.
        const stuckChild = children.find(c => !this.isExcluded(c.command) && c.elapsedMs > this.stuckThresholdMs);
        if (stuckChild) {
            const state = {
                level: EscalationLevel.CtrlC,
                levelEnteredAt: Date.now(),
                stuckChildPid: stuckChild.pid,
                stuckCommand: stuckChild.command,
                retryCount: existing?.retryCount ?? 0,
            };
            this.escalationState.set(tmuxSession, state);
            console.log(`[Watchdog] "${tmuxSession}": stuck command (${Math.round(stuckChild.elapsedMs / 1000)}s): ` +
                `${stuckChild.command.slice(0, 80)} — sending Ctrl+C`);
            this.sessionManager.sendKey(tmuxSession, 'C-c');
            this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, 'Sent Ctrl+C', stuckChild);
        }
        else if (existing) {
            this.escalationState.delete(tmuxSession);
        }
    }

    /**
     * Advance an in-progress escalation by at most one level.
     *
     * If the tracked PID is gone the session is considered recovered.
     * Otherwise the next level is entered once the session has spent at
     * least ESCALATION_DELAYS[nextLevel] ms in the current one.
     */
    handleEscalation(tmuxSession, state) {
        const now = Date.now();
        if (!this.isProcessAlive(state.stuckChildPid)) {
            console.log(`[Watchdog] "${tmuxSession}": stuck process ${state.stuckChildPid} died — recovered`);
            this.emit('recovery', tmuxSession, state.level);
            this.escalationState.delete(tmuxSession);
            return;
        }
        const timeInLevel = now - state.levelEnteredAt;
        const nextLevel = state.level + 1;
        // NOTE(review): the KillSession case below deletes the state, so a
        // state at KillSession never seems to reach this method and this
        // retry branch looks unreachable — confirm intended behavior.
        if (nextLevel > EscalationLevel.KillSession) {
            if (state.retryCount >= MAX_RETRIES) {
                console.log(`[Watchdog] "${tmuxSession}": max retries reached — giving up`);
                this.escalationState.delete(tmuxSession);
                return;
            }
            state.level = EscalationLevel.CtrlC;
            state.levelEnteredAt = now;
            state.retryCount++;
            this.sessionManager.sendKey(tmuxSession, 'C-c');
            this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, `Retry ${state.retryCount}: Sent Ctrl+C`, {
                pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0,
            });
            return;
        }
        const delayForNext = ESCALATION_DELAYS[nextLevel] ?? 15_000;
        if (timeInLevel < delayForNext)
            return;
        state.level = nextLevel;
        state.levelEnteredAt = now;
        const child = { pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0 };
        switch (state.level) {
            case EscalationLevel.SigTerm:
                console.log(`[Watchdog] "${tmuxSession}": sending SIGTERM to ${state.stuckChildPid}`);
                this.sendSignal(state.stuckChildPid, 'SIGTERM');
                this.recordIntervention(tmuxSession, EscalationLevel.SigTerm, `SIGTERM ${state.stuckChildPid}`, child);
                break;
            case EscalationLevel.SigKill:
                console.log(`[Watchdog] "${tmuxSession}": sending SIGKILL to ${state.stuckChildPid}`);
                this.sendSignal(state.stuckChildPid, 'SIGKILL');
                this.recordIntervention(tmuxSession, EscalationLevel.SigKill, `SIGKILL ${state.stuckChildPid}`, child);
                break;
            case EscalationLevel.KillSession:
                console.log(`[Watchdog] "${tmuxSession}": killing tmux session`);
                this.killTmuxSession(tmuxSession);
                this.recordIntervention(tmuxSession, EscalationLevel.KillSession, 'Killed tmux session', child);
                this.escalationState.delete(tmuxSession);
                break;
        }
    }

    // --- Process utilities (self-contained, no shared module) ---

    /**
     * PID of the first child of the session's first pane whose command line
     * matches "claude", or null when it cannot be determined (including any
     * command failure or timeout).
     */
    getClaudePid(tmuxSession) {
        try {
            // Get pane PID
            const panePidStr = execSync(`${this.config.sessions.tmuxPath} list-panes -t "=${tmuxSession}" -F "#{pane_pid}" 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim().split('\n')[0];
            if (!panePidStr)
                return null;
            const panePid = parseInt(panePidStr, 10);
            if (isNaN(panePid))
                return null;
            // Find claude child
            const claudePidStr = execSync(`pgrep -P ${panePid} -f claude 2>/dev/null | head -1`, { encoding: 'utf-8', timeout: 5000 }).trim();
            if (!claudePidStr)
                return null;
            const pid = parseInt(claudePidStr, 10);
            return isNaN(pid) ? null : pid;
        }
        catch {
            return null;
        }
    }

    /**
     * Direct children of `pid` as `{ pid, command, elapsedMs }`. Returns an
     * empty list when there are none or when pgrep/ps fails.
     */
    getChildProcesses(pid) {
        try {
            const childPidsStr = execSync(`pgrep -P ${pid} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
            if (!childPidsStr)
                return [];
            const childPids = childPidsStr.split('\n').filter(Boolean).join(',');
            if (!childPids)
                return [];
            const output = execSync(`ps -o pid=,etime=,command= -p ${childPids} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
            if (!output)
                return [];
            const results = [];
            for (const line of output.split('\n')) {
                // "<pid> <etime> <command...>"
                const match = line.trim().match(/^(\d+)\s+([\d:.-]+)\s+(.+)$/);
                if (!match)
                    continue;
                const childPid = parseInt(match[1], 10);
                if (isNaN(childPid))
                    continue;
                results.push({
                    pid: childPid,
                    command: match[3],
                    elapsedMs: this.parseElapsed(match[2]),
                });
            }
            return results;
        }
        catch {
            return [];
        }
    }

    /** True when the command matches an excluded substring or prefix. */
    isExcluded(command) {
        for (const pattern of EXCLUDED_PATTERNS) {
            if (command.includes(pattern))
                return true;
        }
        for (const prefix of EXCLUDED_PREFIXES) {
            if (command.startsWith(prefix))
                return true;
        }
        return false;
    }

    /**
     * Convert a ps `etime` value ("SS", "MM:SS", "HH:MM:SS" or
     * "D-HH:MM:SS") to milliseconds.
     */
    parseElapsed(elapsed) {
        let days = 0;
        let timePart = elapsed;
        if (elapsed.includes('-')) {
            const [d, t] = elapsed.split('-');
            days = parseInt(d, 10);
            timePart = t;
        }
        const parts = timePart.split(':').map(Number);
        let seconds = 0;
        if (parts.length === 3)
            seconds = parts[0] * 3600 + parts[1] * 60 + parts[2];
        else if (parts.length === 2)
            seconds = parts[0] * 60 + parts[1];
        else
            seconds = parts[0];
        return (days * 86400 + seconds) * 1000;
    }

    /** Send a signal to a PID; a missing process (ESRCH) is not logged. */
    sendSignal(pid, signal) {
        try {
            process.kill(pid, signal);
        }
        catch (err) {
            if (err.code !== 'ESRCH') {
                console.error(`[Watchdog] Failed to send ${signal} to ${pid}:`, err);
            }
        }
    }

    /**
     * Probe a PID with signal 0.
     *
     * EPERM means the process exists but this process may not signal it, so
     * it is still alive; every other failure is treated as "not running".
     */
    isProcessAlive(pid) {
        try {
            process.kill(pid, 0);
            return true;
        }
        catch (err) {
            return err?.code === 'EPERM';
        }
    }

    /** Best-effort kill of the tmux session; failures are ignored. */
    killTmuxSession(tmuxSession) {
        try {
            execSync(`${this.config.sessions.tmuxPath} kill-session -t "=${tmuxSession}" 2>/dev/null`, { timeout: 5000, stdio: 'ignore' });
        }
        catch { }
    }

    /**
     * Append an intervention to the capped history (last 50 kept) and emit
     * it as an 'intervention' event. The command is truncated to 200 chars.
     */
    recordIntervention(sessionName, level, action, child) {
        const event = {
            sessionName,
            level,
            action,
            stuckCommand: child.command.slice(0, 200),
            stuckPid: child.pid,
            timestamp: Date.now(),
        };
        this.interventionHistory.push(event);
        if (this.interventionHistory.length > 50) {
            this.interventionHistory = this.interventionHistory.slice(-50);
        }
        this.emit('intervention', event);
    }
}
326
+ //# sourceMappingURL=SessionWatchdog.js.map
@@ -21,6 +21,7 @@ import type { TelegraphService } from '../publishing/TelegraphService.js';
21
21
  import type { PrivateViewer } from '../publishing/PrivateViewer.js';
22
22
  import type { TunnelManager } from '../tunnel/TunnelManager.js';
23
23
  import type { EvolutionManager } from '../core/EvolutionManager.js';
24
+ import type { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
24
25
  export declare class AgentServer {
25
26
  private app;
26
27
  private server;
@@ -43,6 +44,7 @@ export declare class AgentServer {
43
44
  viewer?: PrivateViewer;
44
45
  tunnel?: TunnelManager;
45
46
  evolution?: EvolutionManager;
47
+ watchdog?: SessionWatchdog;
46
48
  });
47
49
  /**
48
50
  * Start the HTTP server.
@@ -39,6 +39,7 @@ export class AgentServer {
39
39
  viewer: options.viewer ?? null,
40
40
  tunnel: options.tunnel ?? null,
41
41
  evolution: options.evolution ?? null,
42
+ watchdog: options.watchdog ?? null,
42
43
  startTime: this.startTime,
43
44
  });
44
45
  this.app.use(routes);
@@ -0,0 +1,62 @@
1
+ /**
2
+ * WebSocket Manager — real-time terminal streaming for the dashboard.
3
+ *
4
+ * Handles client subscriptions to tmux sessions, streams terminal output
5
+ * via diff-based updates, and forwards input to sessions.
6
+ *
7
+ * Protocol (JSON messages):
8
+ *
9
+ * Client → Server:
10
+ * { type: 'subscribe', session: 'session-name' }
11
+ * { type: 'unsubscribe', session: 'session-name' }
12
+ * { type: 'input', session: 'session-name', text: 'some input' }
13
+ * { type: 'key', session: 'session-name', key: 'C-c' }
14
+ * { type: 'ping' }
15
+ *
16
+ * Server → Client:
17
+ * { type: 'output', session: 'session-name', data: '...terminal output...' }
18
+ * { type: 'sessions', sessions: [...] }
19
+ * { type: 'session_ended', session: 'session-name' }
20
+ * { type: 'subscribed', session: 'session-name' }
21
+ * { type: 'unsubscribed', session: 'session-name' }
22
+ * { type: 'input_ack', session: 'session-name', success: true }
23
+ * { type: 'pong' }
24
+ * { type: 'error', message: '...' }
25
+ */
26
+ import type { Server as HttpServer } from 'node:http';
27
+ import type { SessionManager } from '../core/SessionManager.js';
28
+ import type { StateManager } from '../core/StateManager.js';
29
/**
 * Real-time terminal streaming endpoint for the dashboard.
 *
 * Clients exchange JSON messages with the server: they subscribe to or
 * unsubscribe from a tmux session, send text input or a key, and ping;
 * the server replies with output, session lists, acknowledgements,
 * `session_ended`, `pong` and `error` messages.
 */
export declare class WebSocketManager {
    private wss;
    private clients;
    private sessionOutputCache;
    private streamInterval;
    private heartbeatInterval;
    private sessionBroadcastInterval;
    private sessionManager;
    private state;
    private authToken?;

    /**
     * @param options.server         HTTP server supplied by the caller.
     * @param options.sessionManager Session manager supplied by the caller.
     * @param options.state          State manager supplied by the caller.
     * @param options.authToken      Optional auth token.
     */
    constructor(options: {
        server: HttpServer;
        sessionManager: SessionManager;
        state: StateManager;
        authToken?: string;
    });

    private authenticate;
    private verifyToken;
    private handleMessage;

    /**
     * Streams terminal output to subscribed clients using a diff-based
     * approach: only content that is new since the last capture is sent.
     */
    private startStreaming;
    private sendSessionList;
    private broadcastSessionList;
    private clientId;
    private send;

    /**
     * Graceful shutdown: closes all connections and stops the intervals.
     */
    shutdown(): void;
}
62
+ //# sourceMappingURL=WebSocketManager.d.ts.map