instar 0.7.50 → 0.7.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ import { QuotaTracker } from '../monitoring/QuotaTracker.js';
36
36
  import { AccountSwitcher } from '../monitoring/AccountSwitcher.js';
37
37
  import { QuotaNotifier } from '../monitoring/QuotaNotifier.js';
38
38
  import { classifySessionDeath } from '../monitoring/QuotaExhaustionDetector.js';
39
+ import { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
39
40
  import { installAutoStart } from './setup.js';
40
41
  /**
41
42
  * Check if autostart is installed for this project.
@@ -646,6 +647,31 @@ export async function startServer(options) {
646
647
  scheduler.notifyJobComplete(session.id, session.tmuxSession);
647
648
  });
648
649
  }
650
+ // Session Watchdog — auto-remediation for stuck commands
651
+ let watchdog;
652
+ if (config.monitoring.watchdog?.enabled) {
653
+ watchdog = new SessionWatchdog(config, sessionManager, state);
654
+ watchdog.on('intervention', (event) => {
655
+ if (telegram) {
656
+ const topicId = telegram.getTopicForSession(event.sessionName);
657
+ if (topicId) {
658
+ const levelNames = ['Monitoring', 'Ctrl+C', 'SIGTERM', 'SIGKILL', 'Kill Session'];
659
+ const levelName = levelNames[event.level] || `Level ${event.level}`;
660
+ telegram.sendToTopic(topicId, `🔧 Watchdog [${levelName}]: ${event.action}\nStuck: \`${event.stuckCommand.slice(0, 60)}\``).catch(() => { });
661
+ }
662
+ }
663
+ });
664
+ watchdog.on('recovery', (sessionName, fromLevel) => {
665
+ if (telegram) {
666
+ const topicId = telegram.getTopicForSession(sessionName);
667
+ if (topicId) {
668
+ telegram.sendToTopic(topicId, `✅ Watchdog: session recovered (was at escalation level ${fromLevel})`).catch(() => { });
669
+ }
670
+ }
671
+ });
672
+ watchdog.start();
673
+ console.log(pc.green(' Session Watchdog enabled'));
674
+ }
649
675
  // Set up feedback and update checking
650
676
  let feedback;
651
677
  if (config.feedback) {
@@ -800,7 +826,7 @@ export async function startServer(options) {
800
826
  }
801
827
  });
802
828
  sleepWakeDetector.start();
803
- const server = new AgentServer({ config, sessionManager, state, scheduler, telegram, relationships, feedback, dispatches, updateChecker, autoUpdater, autoDispatcher, quotaTracker, publisher, viewer, tunnel, evolution });
829
+ const server = new AgentServer({ config, sessionManager, state, scheduler, telegram, relationships, feedback, dispatches, updateChecker, autoUpdater, autoDispatcher, quotaTracker, publisher, viewer, tunnel, evolution, watchdog });
804
830
  await server.start();
805
831
  // Start tunnel AFTER server is listening
806
832
  if (tunnel) {
@@ -68,6 +68,12 @@ export declare class SessionManager extends EventEmitter {
68
68
  * Send input to a running tmux session.
69
69
  */
70
70
  sendInput(tmuxSession: string, input: string): boolean;
71
+ /**
72
+ * Send a tmux key sequence (without -l literal flag).
73
+ * Use for special keys like 'C-c' (Ctrl+C), 'Enter', 'Escape'.
74
+ * Unlike sendInput() which uses -l (literal), this sends key names directly.
75
+ */
76
+ sendKey(tmuxSession: string, key: string): boolean;
71
77
  /**
72
78
  * List all sessions that are currently running.
73
79
  * Pure filter — does not mutate state. The monitor tick handles lifecycle transitions.
@@ -297,6 +297,20 @@ export class SessionManager extends EventEmitter {
297
297
  return false;
298
298
  }
299
299
  }
300
+ /**
301
+ * Send a tmux key sequence (without -l literal flag).
302
+ * Use for special keys like 'C-c' (Ctrl+C), 'Enter', 'Escape'.
303
+ * Unlike sendInput() which uses -l (literal), this sends key names directly.
304
+ */
305
+ sendKey(tmuxSession, key) {
306
+ try {
307
+ execFileSync(this.config.tmuxPath, ['send-keys', '-t', `=${tmuxSession}:`, key], { encoding: 'utf-8', timeout: 5000 });
308
+ return true;
309
+ }
310
+ catch {
311
+ return false;
312
+ }
313
+ }
300
314
  /**
301
315
  * List all sessions that are currently running.
302
316
  * Pure filter — does not mutate state. The monitor tick handles lifecycle transitions.
@@ -464,35 +478,25 @@ export class SessionManager extends EventEmitter {
464
478
  const exactTarget = `=${tmuxSession}:`;
465
479
  try {
466
480
  if (text.includes('\n')) {
467
- // Multi-line: write to temp file, load into tmux buffer, paste into pane.
481
+ // Multi-line: pipe into tmux load-buffer via stdin, then paste into pane.
468
482
  // This avoids newlines being treated as Enter keypresses which would
469
483
  // fragment the message into multiple Claude prompts.
470
- const tmpDir = path.join('/tmp', 'instar-inject');
471
- fs.mkdirSync(tmpDir, { recursive: true });
472
- const tmpPath = path.join(tmpDir, `msg-${Date.now()}-${process.pid}.txt`);
473
- fs.writeFileSync(tmpPath, text);
474
- try {
475
- execFileSync(this.config.tmuxPath, ['load-buffer', tmpPath], {
476
- encoding: 'utf-8', timeout: 5000,
477
- });
478
- execFileSync(this.config.tmuxPath, ['paste-buffer', '-t', exactTarget, '-p'], {
479
- encoding: 'utf-8', timeout: 5000,
480
- });
481
- // Brief delay to let the terminal process the paste before sending Enter.
482
- // Without this, the Enter arrives before paste processing completes and
483
- // the message sits in the input buffer without being submitted.
484
- execFileSync('/bin/sleep', ['0.3'], { timeout: 2000 });
485
- // Send Enter to submit
486
- execFileSync(this.config.tmuxPath, ['send-keys', '-t', exactTarget, 'Enter'], {
487
- encoding: 'utf-8', timeout: 5000,
488
- });
489
- }
490
- finally {
491
- try {
492
- fs.unlinkSync(tmpPath);
493
- }
494
- catch { /* ignore */ }
495
- }
484
+ // Uses stdin pipe (load-buffer -) instead of temp files to avoid
485
+ // macOS TCC "access data from other apps" permission prompts.
486
+ execFileSync(this.config.tmuxPath, ['load-buffer', '-'], {
487
+ encoding: 'utf-8', timeout: 5000, input: text,
488
+ });
489
+ execFileSync(this.config.tmuxPath, ['paste-buffer', '-t', exactTarget, '-p'], {
490
+ encoding: 'utf-8', timeout: 5000,
491
+ });
492
+ // Brief delay to let the terminal process the paste before sending Enter.
493
+ // Without this, the Enter arrives before paste processing completes and
494
+ // the message sits in the input buffer without being submitted.
495
+ execFileSync('/bin/sleep', ['0.3'], { timeout: 2000 });
496
+ // Send Enter to submit
497
+ execFileSync(this.config.tmuxPath, ['send-keys', '-t', exactTarget, 'Enter'], {
498
+ encoding: 'utf-8', timeout: 5000,
499
+ });
496
500
  }
497
501
  else {
498
502
  // Single-line: simple send-keys
@@ -712,6 +712,14 @@ export interface MonitoringConfig {
712
712
  memoryMonitoring: boolean;
713
713
  /** Health check interval in ms */
714
714
  healthCheckIntervalMs: number;
715
+ /** Session watchdog — auto-remediation for stuck commands */
716
+ watchdog?: {
717
+ enabled: boolean;
718
+ /** Seconds before a command is considered stuck (default: 180) */
719
+ stuckCommandSec?: number;
720
+ /** Poll interval in ms (default: 30000) */
721
+ pollIntervalMs?: number;
722
+ };
715
723
  }
716
724
  /** @deprecated Use InstarConfig instead */
717
725
  export type AgentKitConfig = InstarConfig;
@@ -241,8 +241,23 @@ export class TelegramLifeline {
241
241
  // Forward to server if healthy
242
242
  if (this.supervisor.healthy) {
243
243
  const forwarded = await this.forwardToServer(topicId, text, msg);
244
- if (forwarded)
244
+ if (forwarded) {
245
+ // Delivery confirmation — user knows message reached the server
246
+ await this.sendToTopic(topicId, '✓ Delivered');
245
247
  return;
248
+ }
249
+ // Server appears healthy but forward failed — queue with accurate message
250
+ this.queue.enqueue({
251
+ id: `tg-${msg.message_id}`,
252
+ topicId,
253
+ text,
254
+ fromUserId: msg.from.id,
255
+ fromUsername: msg.from.username,
256
+ fromFirstName: msg.from.first_name,
257
+ timestamp: new Date(msg.date * 1000).toISOString(),
258
+ });
259
+ await this.sendToTopic(topicId, `Server is restarting. Your message has been queued (${this.queue.length} in queue). It will be delivered when the server recovers.`);
260
+ return;
246
261
  }
247
262
  // Server is down — queue the message
248
263
  this.queue.enqueue({
@@ -7,13 +7,15 @@
7
7
  import type { SessionManager } from '../core/SessionManager.js';
8
8
  import type { JobScheduler } from '../scheduler/JobScheduler.js';
9
9
  import type { HealthStatus, InstarConfig } from '../core/types.js';
10
+ import type { SessionWatchdog } from './SessionWatchdog.js';
10
11
  export declare class HealthChecker {
11
12
  private config;
12
13
  private sessionManager;
13
14
  private scheduler;
15
+ private watchdog;
14
16
  private checkInterval;
15
17
  private lastStatus;
16
- constructor(config: InstarConfig, sessionManager: SessionManager, scheduler?: JobScheduler | null);
18
+ constructor(config: InstarConfig, sessionManager: SessionManager, scheduler?: JobScheduler | null, watchdog?: SessionWatchdog | null);
17
19
  /**
18
20
  * Run all health checks and return aggregated status.
19
21
  */
@@ -11,12 +11,14 @@ export class HealthChecker {
11
11
  config;
12
12
  sessionManager;
13
13
  scheduler;
14
+ watchdog;
14
15
  checkInterval = null;
15
16
  lastStatus = null;
16
- constructor(config, sessionManager, scheduler = null) {
17
+ constructor(config, sessionManager, scheduler = null, watchdog = null) {
17
18
  this.config = config;
18
19
  this.sessionManager = sessionManager;
19
20
  this.scheduler = scheduler;
21
+ this.watchdog = watchdog;
20
22
  }
21
23
  /**
22
24
  * Run all health checks and return aggregated status.
@@ -30,6 +32,17 @@ export class HealthChecker {
30
32
  if (this.scheduler) {
31
33
  components.scheduler = this.checkScheduler();
32
34
  }
35
+ if (this.watchdog) {
36
+ const wdStatus = this.watchdog.getStatus();
37
+ const intervening = wdStatus.sessions.filter(s => s.escalation && s.escalation.level > 0);
38
+ components.watchdog = {
39
+ status: intervening.length > 0 ? 'degraded' : 'healthy',
40
+ message: intervening.length > 0
41
+ ? `Intervening on ${intervening.length} session(s)`
42
+ : `Monitoring${wdStatus.enabled ? '' : ' (disabled)'}`,
43
+ lastCheck: new Date().toISOString(),
44
+ };
45
+ }
33
46
  // Aggregate: worst component status becomes overall status
34
47
  const statuses = Object.values(components).map(c => c.status);
35
48
  let overall = 'healthy';
@@ -0,0 +1,83 @@
1
+ /**
2
+ * SessionWatchdog — Auto-remediation for stuck Claude sessions (Instar port).
3
+ *
4
+ * Detects when a Claude session has a long-running bash command and escalates
5
+ * from gentle (Ctrl+C) to forceful (SIGKILL + session kill). Adapted from
6
+ * Dawn Server's SessionWatchdog for Instar's self-contained architecture.
7
+ *
8
+ * Escalation pipeline:
9
+ * Level 0: Monitoring (default)
10
+ * Level 1: Ctrl+C via tmux send-keys
11
+ * Level 2: SIGTERM the stuck child PID
12
+ * Level 3: SIGKILL the stuck child PID
13
+ * Level 4: Kill tmux session
14
+ */
15
+ import { EventEmitter } from 'node:events';
16
+ import type { SessionManager } from '../core/SessionManager.js';
17
+ import type { StateManager } from '../core/StateManager.js';
18
+ import type { InstarConfig } from '../core/types.js';
19
+ export declare enum EscalationLevel { // Remediation steps, ordered from least to most forceful.
20
+ Monitoring = 0, // No intervention in progress (default).
21
+ CtrlC = 1, // Ctrl+C sent to the session via tmux send-keys.
22
+ SigTerm = 2, // SIGTERM sent to the stuck child PID.
23
+ SigKill = 3, // SIGKILL sent to the stuck child PID.
24
+ KillSession = 4 // The tmux session itself is killed.
25
+ }
26
+ interface EscalationState { // Per-session escalation bookkeeping kept by the watchdog.
27
+ level: EscalationLevel; // Escalation step currently applied to the session.
28
+ levelEnteredAt: number; // Date.now() value (ms) recorded when `level` was entered.
29
+ stuckChildPid: number; // PID of the child process judged to be stuck.
30
+ stuckCommand: string; // Command line of that child as reported by ps.
31
+ retryCount: number; // Number of times the escalation pipeline has been restarted.
32
+ }
33
+ export interface InterventionEvent { // Payload of the 'intervention' event; also stored in the history.
34
+ sessionName: string; // tmux session the watchdog acted on.
35
+ level: EscalationLevel; // Escalation step that was just applied.
36
+ action: string; // Human-readable description of the action, e.g. 'Sent Ctrl+C'.
37
+ stuckCommand: string; // Stuck command line, truncated to 200 characters.
38
+ stuckPid: number; // PID of the stuck child process.
39
+ timestamp: number; // Date.now() value (ms) when the intervention was recorded.
40
+ }
41
+ export interface WatchdogEvents { // Event name -> listener argument tuple for SessionWatchdog.
42
+ intervention: [event: InterventionEvent]; // Emitted each time an escalation action is recorded.
43
+ recovery: [sessionName: string, fromLevel: EscalationLevel]; // Emitted when the stuck process is found dead; fromLevel is the level it was at.
44
+ }
45
+ export declare class SessionWatchdog extends EventEmitter { // Polls running sessions and escalates against stuck child commands.
46
+ private config;
47
+ private sessionManager;
48
+ private state;
49
+ private interval; // setInterval handle for the periodic poll, or null when stopped.
50
+ private escalationState; // Map of tmux session name -> EscalationState.
51
+ private interventionHistory; // Recorded InterventionEvents, capped at the 50 most recent.
52
+ private enabled; // When false, poll() returns immediately.
53
+ private running; // Guard that prevents overlapping poll() runs.
54
+ private stuckThresholdMs; // From monitoring.watchdog.stuckCommandSec (default 180 s), in ms.
55
+ private pollIntervalMs; // From monitoring.watchdog.pollIntervalMs (default 30000).
56
+ constructor(config: InstarConfig, sessionManager: SessionManager, state: StateManager);
57
+ start(): void; // Starts periodic polling plus one early poll after 5 s; no-op if already started.
58
+ stop(): void; // Clears the periodic polling interval.
59
+ setEnabled(enabled: boolean): void; // Disabling also discards all in-progress escalation state.
60
+ isEnabled(): boolean;
61
+ isManaging(sessionName: string): boolean; // True when the session has an escalation level above Monitoring.
62
+ getStatus(): { // Snapshot consumed by the health check and the /watchdog/status route.
63
+ enabled: boolean;
64
+ sessions: Array<{ // One entry per session returned by sessionManager.listRunningSessions().
65
+ name: string; // tmux session name.
66
+ escalation: EscalationState | null; // null when no escalation is tracked for the session.
67
+ }>;
68
+ interventionHistory: InterventionEvent[]; // The 20 most recent interventions.
69
+ };
70
+ private poll;
71
+ private checkSession;
72
+ private handleEscalation;
73
+ private getClaudePid;
74
+ private getChildProcesses;
75
+ private isExcluded;
76
+ private parseElapsed;
77
+ private sendSignal;
78
+ private isProcessAlive;
79
+ private killTmuxSession;
80
+ private recordIntervention;
81
+ }
82
+ export {};
83
+ //# sourceMappingURL=SessionWatchdog.d.ts.map
@@ -0,0 +1,326 @@
1
+ /**
2
+ * SessionWatchdog — Auto-remediation for stuck Claude sessions (Instar port).
3
+ *
4
+ * Detects when a Claude session has a long-running bash command and escalates
5
+ * from gentle (Ctrl+C) to forceful (SIGKILL + session kill). Adapted from
6
+ * Dawn Server's SessionWatchdog for Instar's self-contained architecture.
7
+ *
8
+ * Escalation pipeline:
9
+ * Level 0: Monitoring (default)
10
+ * Level 1: Ctrl+C via tmux send-keys
11
+ * Level 2: SIGTERM the stuck child PID
12
+ * Level 3: SIGKILL the stuck child PID
13
+ * Level 4: Kill tmux session
14
+ */
15
+ import { execSync } from 'node:child_process';
16
+ import { EventEmitter } from 'node:events';
17
+ export var EscalationLevel;
18
+ (function (EscalationLevel) {
19
+ EscalationLevel[EscalationLevel["Monitoring"] = 0] = "Monitoring";
20
+ EscalationLevel[EscalationLevel["CtrlC"] = 1] = "CtrlC";
21
+ EscalationLevel[EscalationLevel["SigTerm"] = 2] = "SigTerm";
22
+ EscalationLevel[EscalationLevel["SigKill"] = 3] = "SigKill";
23
+ EscalationLevel[EscalationLevel["KillSession"] = 4] = "KillSession";
24
+ })(EscalationLevel || (EscalationLevel = {}));
25
+ // Processes that are long-running by design
26
+ const EXCLUDED_PATTERNS = [
27
+ 'playwright-mcp', 'playwright-persistent', '@playwright/mcp',
28
+ 'chrome-native-host', 'claude-in-chrome-mcp', 'payments-mcp',
29
+ 'mcp-remote', '/mcp/', '.mcp/', 'caffeinate', 'exa-mcp-server',
30
+ ];
31
+ const EXCLUDED_PREFIXES = [
32
+ '/bin/zsh -c -l source',
33
+ '/bin/bash -c -l source',
34
+ ];
35
+ // Escalation delays (ms to wait before advancing to next level)
36
+ const ESCALATION_DELAYS = {
37
+ [EscalationLevel.Monitoring]: 0,
38
+ [EscalationLevel.CtrlC]: 0,
39
+ [EscalationLevel.SigTerm]: 15_000,
40
+ [EscalationLevel.SigKill]: 10_000,
41
+ [EscalationLevel.KillSession]: 5_000,
42
+ };
43
+ const DEFAULT_STUCK_THRESHOLD_MS = 180_000; // 3 minutes
44
+ const DEFAULT_POLL_INTERVAL_MS = 30_000;
45
+ const MAX_RETRIES = 2;
46
+ export class SessionWatchdog extends EventEmitter {
47
+ config;
48
+ sessionManager;
49
+ state;
50
+ interval = null;
51
+ escalationState = new Map();
52
+ interventionHistory = [];
53
+ enabled = true;
54
+ running = false;
55
+ stuckThresholdMs;
56
+ pollIntervalMs;
57
+ constructor(config, sessionManager, state) {
58
+ super();
59
+ this.config = config;
60
+ this.sessionManager = sessionManager;
61
+ this.state = state;
62
+ const wdConfig = config.monitoring.watchdog;
63
+ this.stuckThresholdMs = (wdConfig?.stuckCommandSec ?? 180) * 1000;
64
+ this.pollIntervalMs = wdConfig?.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
65
+ }
66
+ start() {
67
+ if (this.interval)
68
+ return;
69
+ console.log(`[Watchdog] Starting (poll: ${this.pollIntervalMs / 1000}s, threshold: ${this.stuckThresholdMs / 1000}s)`);
70
+ this.interval = setInterval(() => this.poll(), this.pollIntervalMs);
71
+ setTimeout(() => this.poll(), 5000);
72
+ }
73
+ stop() {
74
+ if (this.interval) {
75
+ clearInterval(this.interval);
76
+ this.interval = null;
77
+ }
78
+ }
79
+ setEnabled(enabled) {
80
+ this.enabled = enabled;
81
+ if (!enabled) {
82
+ this.escalationState.clear();
83
+ }
84
+ }
85
+ isEnabled() {
86
+ return this.enabled;
87
+ }
88
+ isManaging(sessionName) {
89
+ const s = this.escalationState.get(sessionName);
90
+ return s !== undefined && s.level > EscalationLevel.Monitoring;
91
+ }
92
+ getStatus() {
93
+ const runningSessions = this.sessionManager.listRunningSessions();
94
+ const sessions = runningSessions.map(s => ({
95
+ name: s.tmuxSession,
96
+ escalation: this.escalationState.get(s.tmuxSession) ?? null,
97
+ }));
98
+ return {
99
+ enabled: this.enabled,
100
+ sessions,
101
+ interventionHistory: this.interventionHistory.slice(-20),
102
+ };
103
+ }
104
+ // --- Core polling ---
105
+ async poll() {
106
+ if (!this.enabled || this.running)
107
+ return;
108
+ this.running = true;
109
+ try {
110
+ const sessions = this.sessionManager.listRunningSessions();
111
+ for (const session of sessions) {
112
+ try {
113
+ this.checkSession(session.tmuxSession);
114
+ }
115
+ catch (err) {
116
+ console.error(`[Watchdog] Error checking "${session.tmuxSession}":`, err);
117
+ }
118
+ }
119
+ }
120
+ finally {
121
+ this.running = false;
122
+ }
123
+ }
124
+ checkSession(tmuxSession) {
125
+ const existing = this.escalationState.get(tmuxSession);
126
+ if (existing && existing.level > EscalationLevel.Monitoring) {
127
+ this.handleEscalation(tmuxSession, existing);
128
+ return;
129
+ }
130
+ // Find Claude PID in the tmux session
131
+ const claudePid = this.getClaudePid(tmuxSession);
132
+ if (!claudePid)
133
+ return;
134
+ const children = this.getChildProcesses(claudePid);
135
+ const stuckChild = children.find(c => !this.isExcluded(c.command) && c.elapsedMs > this.stuckThresholdMs);
136
+ if (stuckChild) {
137
+ const state = {
138
+ level: EscalationLevel.CtrlC,
139
+ levelEnteredAt: Date.now(),
140
+ stuckChildPid: stuckChild.pid,
141
+ stuckCommand: stuckChild.command,
142
+ retryCount: existing?.retryCount ?? 0,
143
+ };
144
+ this.escalationState.set(tmuxSession, state);
145
+ console.log(`[Watchdog] "${tmuxSession}": stuck command (${Math.round(stuckChild.elapsedMs / 1000)}s): ` +
146
+ `${stuckChild.command.slice(0, 80)} — sending Ctrl+C`);
147
+ this.sessionManager.sendKey(tmuxSession, 'C-c');
148
+ this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, 'Sent Ctrl+C', stuckChild);
149
+ }
150
+ else if (existing) {
151
+ this.escalationState.delete(tmuxSession);
152
+ }
153
+ }
154
+ handleEscalation(tmuxSession, state) {
155
+ const now = Date.now();
156
+ if (!this.isProcessAlive(state.stuckChildPid)) {
157
+ console.log(`[Watchdog] "${tmuxSession}": stuck process ${state.stuckChildPid} died — recovered`);
158
+ this.emit('recovery', tmuxSession, state.level);
159
+ this.escalationState.delete(tmuxSession);
160
+ return;
161
+ }
162
+ const timeInLevel = now - state.levelEnteredAt;
163
+ const nextLevel = state.level + 1;
164
+ if (nextLevel > EscalationLevel.KillSession) {
165
+ if (state.retryCount >= MAX_RETRIES) {
166
+ console.log(`[Watchdog] "${tmuxSession}": max retries reached — giving up`);
167
+ this.escalationState.delete(tmuxSession);
168
+ return;
169
+ }
170
+ state.level = EscalationLevel.CtrlC;
171
+ state.levelEnteredAt = now;
172
+ state.retryCount++;
173
+ this.sessionManager.sendKey(tmuxSession, 'C-c');
174
+ this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, `Retry ${state.retryCount}: Sent Ctrl+C`, {
175
+ pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0,
176
+ });
177
+ return;
178
+ }
179
+ const delayForNext = ESCALATION_DELAYS[nextLevel] ?? 15_000;
180
+ if (timeInLevel < delayForNext)
181
+ return;
182
+ state.level = nextLevel;
183
+ state.levelEnteredAt = now;
184
+ const child = { pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0 };
185
+ switch (state.level) {
186
+ case EscalationLevel.SigTerm:
187
+ console.log(`[Watchdog] "${tmuxSession}": sending SIGTERM to ${state.stuckChildPid}`);
188
+ this.sendSignal(state.stuckChildPid, 'SIGTERM');
189
+ this.recordIntervention(tmuxSession, EscalationLevel.SigTerm, `SIGTERM ${state.stuckChildPid}`, child);
190
+ break;
191
+ case EscalationLevel.SigKill:
192
+ console.log(`[Watchdog] "${tmuxSession}": sending SIGKILL to ${state.stuckChildPid}`);
193
+ this.sendSignal(state.stuckChildPid, 'SIGKILL');
194
+ this.recordIntervention(tmuxSession, EscalationLevel.SigKill, `SIGKILL ${state.stuckChildPid}`, child);
195
+ break;
196
+ case EscalationLevel.KillSession:
197
+ console.log(`[Watchdog] "${tmuxSession}": killing tmux session`);
198
+ this.killTmuxSession(tmuxSession);
199
+ this.recordIntervention(tmuxSession, EscalationLevel.KillSession, 'Killed tmux session', child);
200
+ this.escalationState.delete(tmuxSession);
201
+ break;
202
+ }
203
+ }
204
+ // --- Process utilities (self-contained, no shared module) ---
205
+ getClaudePid(tmuxSession) {
206
+ try {
207
+ // Get pane PID
208
+ const panePidStr = execSync(`${this.config.sessions.tmuxPath} list-panes -t "=${tmuxSession}" -F "#{pane_pid}" 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim().split('\n')[0];
209
+ if (!panePidStr)
210
+ return null;
211
+ const panePid = parseInt(panePidStr, 10);
212
+ if (isNaN(panePid))
213
+ return null;
214
+ // Find claude child
215
+ const claudePidStr = execSync(`pgrep -P ${panePid} -f claude 2>/dev/null | head -1`, { encoding: 'utf-8', timeout: 5000 }).trim();
216
+ if (!claudePidStr)
217
+ return null;
218
+ const pid = parseInt(claudePidStr, 10);
219
+ return isNaN(pid) ? null : pid;
220
+ }
221
+ catch {
222
+ return null;
223
+ }
224
+ }
225
+ getChildProcesses(pid) {
226
+ try {
227
+ const childPidsStr = execSync(`pgrep -P ${pid} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
228
+ if (!childPidsStr)
229
+ return [];
230
+ const childPids = childPidsStr.split('\n').filter(Boolean).join(',');
231
+ if (!childPids)
232
+ return [];
233
+ const output = execSync(`ps -o pid=,etime=,command= -p ${childPids} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
234
+ if (!output)
235
+ return [];
236
+ const results = [];
237
+ for (const line of output.split('\n')) {
238
+ const match = line.trim().match(/^(\d+)\s+([\d:.-]+)\s+(.+)$/);
239
+ if (!match)
240
+ continue;
241
+ const childPid = parseInt(match[1], 10);
242
+ if (isNaN(childPid))
243
+ continue;
244
+ results.push({
245
+ pid: childPid,
246
+ command: match[3],
247
+ elapsedMs: this.parseElapsed(match[2]),
248
+ });
249
+ }
250
+ return results;
251
+ }
252
+ catch {
253
+ return [];
254
+ }
255
+ }
256
+ isExcluded(command) {
257
+ for (const pattern of EXCLUDED_PATTERNS) {
258
+ if (command.includes(pattern))
259
+ return true;
260
+ }
261
+ for (const prefix of EXCLUDED_PREFIXES) {
262
+ if (command.startsWith(prefix))
263
+ return true;
264
+ }
265
+ return false;
266
+ }
267
+ parseElapsed(elapsed) {
268
+ let days = 0;
269
+ let timePart = elapsed;
270
+ if (elapsed.includes('-')) {
271
+ const [d, t] = elapsed.split('-');
272
+ days = parseInt(d, 10);
273
+ timePart = t;
274
+ }
275
+ const parts = timePart.split(':').map(Number);
276
+ let seconds = 0;
277
+ if (parts.length === 3)
278
+ seconds = parts[0] * 3600 + parts[1] * 60 + parts[2];
279
+ else if (parts.length === 2)
280
+ seconds = parts[0] * 60 + parts[1];
281
+ else
282
+ seconds = parts[0];
283
+ return (days * 86400 + seconds) * 1000;
284
+ }
285
+ sendSignal(pid, signal) {
286
+ try {
287
+ process.kill(pid, signal);
288
+ }
289
+ catch (err) {
290
+ if (err.code !== 'ESRCH') {
291
+ console.error(`[Watchdog] Failed to send ${signal} to ${pid}:`, err);
292
+ }
293
+ }
294
+ }
295
+ isProcessAlive(pid) {
296
+ try {
297
+ process.kill(pid, 0);
298
+ return true;
299
+ }
300
+ catch {
301
+ return false;
302
+ }
303
+ }
304
+ killTmuxSession(tmuxSession) {
305
+ try {
306
+ execSync(`${this.config.sessions.tmuxPath} kill-session -t "=${tmuxSession}" 2>/dev/null`, { timeout: 5000, stdio: 'ignore' });
307
+ }
308
+ catch { }
309
+ }
310
+ recordIntervention(sessionName, level, action, child) {
311
+ const event = {
312
+ sessionName,
313
+ level,
314
+ action,
315
+ stuckCommand: child.command.slice(0, 200),
316
+ stuckPid: child.pid,
317
+ timestamp: Date.now(),
318
+ };
319
+ this.interventionHistory.push(event);
320
+ if (this.interventionHistory.length > 50) {
321
+ this.interventionHistory = this.interventionHistory.slice(-50);
322
+ }
323
+ this.emit('intervention', event);
324
+ }
325
+ }
326
+ //# sourceMappingURL=SessionWatchdog.js.map
@@ -21,6 +21,7 @@ import type { TelegraphService } from '../publishing/TelegraphService.js';
21
21
  import type { PrivateViewer } from '../publishing/PrivateViewer.js';
22
22
  import type { TunnelManager } from '../tunnel/TunnelManager.js';
23
23
  import type { EvolutionManager } from '../core/EvolutionManager.js';
24
+ import type { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
24
25
  export declare class AgentServer {
25
26
  private app;
26
27
  private server;
@@ -43,6 +44,7 @@ export declare class AgentServer {
43
44
  viewer?: PrivateViewer;
44
45
  tunnel?: TunnelManager;
45
46
  evolution?: EvolutionManager;
47
+ watchdog?: SessionWatchdog;
46
48
  });
47
49
  /**
48
50
  * Start the HTTP server.
@@ -39,6 +39,7 @@ export class AgentServer {
39
39
  viewer: options.viewer ?? null,
40
40
  tunnel: options.tunnel ?? null,
41
41
  evolution: options.evolution ?? null,
42
+ watchdog: options.watchdog ?? null,
42
43
  startTime: this.startTime,
43
44
  });
44
45
  this.app.use(routes);
@@ -21,6 +21,7 @@ import type { TelegraphService } from '../publishing/TelegraphService.js';
21
21
  import type { PrivateViewer } from '../publishing/PrivateViewer.js';
22
22
  import type { TunnelManager } from '../tunnel/TunnelManager.js';
23
23
  import type { EvolutionManager } from '../core/EvolutionManager.js';
24
+ import type { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
24
25
  export interface RouteContext {
25
26
  config: InstarConfig;
26
27
  sessionManager: SessionManager;
@@ -38,6 +39,7 @@ export interface RouteContext {
38
39
  viewer: PrivateViewer | null;
39
40
  tunnel: TunnelManager | null;
40
41
  evolution: EvolutionManager | null;
42
+ watchdog: SessionWatchdog | null;
41
43
  startTime: Date;
42
44
  }
43
45
  export declare function createRoutes(ctx: RouteContext): Router;
@@ -1695,6 +1695,27 @@ export function createRoutes(ctx) {
1695
1695
  }
1696
1696
  res.json({ ok: true, id: req.params.id, status });
1697
1697
  });
1698
+ // ── Watchdog ──────────────────────────────────────────────────
1699
+ router.get('/watchdog/status', (req, res) => {
1700
+ if (!ctx.watchdog) {
1701
+ res.json({ enabled: false, sessions: [], interventionHistory: [] });
1702
+ return;
1703
+ }
1704
+ res.json(ctx.watchdog.getStatus());
1705
+ });
1706
+ router.post('/watchdog/toggle', (req, res) => {
1707
+ if (!ctx.watchdog) {
1708
+ res.status(404).json({ error: 'Watchdog not configured' });
1709
+ return;
1710
+ }
1711
+ const { enabled } = req.body;
1712
+ if (typeof enabled !== 'boolean') {
1713
+ res.status(400).json({ error: 'enabled (boolean) required' });
1714
+ return;
1715
+ }
1716
+ ctx.watchdog.setEnabled(enabled);
1717
+ res.json({ enabled: ctx.watchdog.isEnabled() });
1718
+ });
1698
1719
  return router;
1699
1720
  }
1700
1721
  export function formatUptime(ms) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "instar",
3
- "version": "0.7.50",
3
+ "version": "0.7.51",
4
4
  "description": "Persistent autonomy infrastructure for AI agents",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",