instar 0.7.50 → 0.7.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/server.js +27 -1
- package/dist/core/SessionManager.d.ts +6 -0
- package/dist/core/SessionManager.js +31 -27
- package/dist/core/types.d.ts +8 -0
- package/dist/lifeline/TelegramLifeline.js +16 -1
- package/dist/monitoring/HealthChecker.d.ts +3 -1
- package/dist/monitoring/HealthChecker.js +14 -1
- package/dist/monitoring/SessionWatchdog.d.ts +83 -0
- package/dist/monitoring/SessionWatchdog.js +326 -0
- package/dist/server/AgentServer.d.ts +2 -0
- package/dist/server/AgentServer.js +1 -0
- package/dist/server/routes.d.ts +2 -0
- package/dist/server/routes.js +21 -0
- package/package.json +1 -1
package/dist/commands/server.js
CHANGED
|
@@ -36,6 +36,7 @@ import { QuotaTracker } from '../monitoring/QuotaTracker.js';
|
|
|
36
36
|
import { AccountSwitcher } from '../monitoring/AccountSwitcher.js';
|
|
37
37
|
import { QuotaNotifier } from '../monitoring/QuotaNotifier.js';
|
|
38
38
|
import { classifySessionDeath } from '../monitoring/QuotaExhaustionDetector.js';
|
|
39
|
+
import { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
|
|
39
40
|
import { installAutoStart } from './setup.js';
|
|
40
41
|
/**
|
|
41
42
|
* Check if autostart is installed for this project.
|
|
@@ -646,6 +647,31 @@ export async function startServer(options) {
|
|
|
646
647
|
scheduler.notifyJobComplete(session.id, session.tmuxSession);
|
|
647
648
|
});
|
|
648
649
|
}
|
|
650
|
+
// Session Watchdog — auto-remediation for stuck commands
|
|
651
|
+
let watchdog;
|
|
652
|
+
if (config.monitoring.watchdog?.enabled) {
|
|
653
|
+
watchdog = new SessionWatchdog(config, sessionManager, state);
|
|
654
|
+
watchdog.on('intervention', (event) => {
|
|
655
|
+
if (telegram) {
|
|
656
|
+
const topicId = telegram.getTopicForSession(event.sessionName);
|
|
657
|
+
if (topicId) {
|
|
658
|
+
const levelNames = ['Monitoring', 'Ctrl+C', 'SIGTERM', 'SIGKILL', 'Kill Session'];
|
|
659
|
+
const levelName = levelNames[event.level] || `Level ${event.level}`;
|
|
660
|
+
telegram.sendToTopic(topicId, `🔧 Watchdog [${levelName}]: ${event.action}\nStuck: \`${event.stuckCommand.slice(0, 60)}\``).catch(() => { });
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
});
|
|
664
|
+
watchdog.on('recovery', (sessionName, fromLevel) => {
|
|
665
|
+
if (telegram) {
|
|
666
|
+
const topicId = telegram.getTopicForSession(sessionName);
|
|
667
|
+
if (topicId) {
|
|
668
|
+
telegram.sendToTopic(topicId, `✅ Watchdog: session recovered (was at escalation level ${fromLevel})`).catch(() => { });
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
});
|
|
672
|
+
watchdog.start();
|
|
673
|
+
console.log(pc.green(' Session Watchdog enabled'));
|
|
674
|
+
}
|
|
649
675
|
// Set up feedback and update checking
|
|
650
676
|
let feedback;
|
|
651
677
|
if (config.feedback) {
|
|
@@ -800,7 +826,7 @@ export async function startServer(options) {
|
|
|
800
826
|
}
|
|
801
827
|
});
|
|
802
828
|
sleepWakeDetector.start();
|
|
803
|
-
const server = new AgentServer({ config, sessionManager, state, scheduler, telegram, relationships, feedback, dispatches, updateChecker, autoUpdater, autoDispatcher, quotaTracker, publisher, viewer, tunnel, evolution });
|
|
829
|
+
const server = new AgentServer({ config, sessionManager, state, scheduler, telegram, relationships, feedback, dispatches, updateChecker, autoUpdater, autoDispatcher, quotaTracker, publisher, viewer, tunnel, evolution, watchdog });
|
|
804
830
|
await server.start();
|
|
805
831
|
// Start tunnel AFTER server is listening
|
|
806
832
|
if (tunnel) {
|
package/dist/core/SessionManager.d.ts
CHANGED
|
@@ -68,6 +68,12 @@ export declare class SessionManager extends EventEmitter {
|
|
|
68
68
|
* Send input to a running tmux session.
|
|
69
69
|
*/
|
|
70
70
|
sendInput(tmuxSession: string, input: string): boolean;
|
|
71
|
+
/**
|
|
72
|
+
* Send a tmux key sequence (without -l literal flag).
|
|
73
|
+
* Use for special keys like 'C-c' (Ctrl+C), 'Enter', 'Escape'.
|
|
74
|
+
* Unlike sendInput() which uses -l (literal), this sends key names directly.
|
|
75
|
+
*/
|
|
76
|
+
sendKey(tmuxSession: string, key: string): boolean;
|
|
71
77
|
/**
|
|
72
78
|
* List all sessions that are currently running.
|
|
73
79
|
* Pure filter — does not mutate state. The monitor tick handles lifecycle transitions.
|
package/dist/core/SessionManager.js
CHANGED
|
@@ -297,6 +297,20 @@ export class SessionManager extends EventEmitter {
|
|
|
297
297
|
return false;
|
|
298
298
|
}
|
|
299
299
|
}
|
|
300
|
+
/**
|
|
301
|
+
* Send a tmux key sequence (without -l literal flag).
|
|
302
|
+
* Use for special keys like 'C-c' (Ctrl+C), 'Enter', 'Escape'.
|
|
303
|
+
* Unlike sendInput() which uses -l (literal), this sends key names directly.
|
|
304
|
+
*/
|
|
305
|
+
sendKey(tmuxSession, key) {
|
|
306
|
+
try {
|
|
307
|
+
execFileSync(this.config.tmuxPath, ['send-keys', '-t', `=${tmuxSession}:`, key], { encoding: 'utf-8', timeout: 5000 });
|
|
308
|
+
return true;
|
|
309
|
+
}
|
|
310
|
+
catch {
|
|
311
|
+
return false;
|
|
312
|
+
}
|
|
313
|
+
}
|
|
300
314
|
/**
|
|
301
315
|
* List all sessions that are currently running.
|
|
302
316
|
* Pure filter — does not mutate state. The monitor tick handles lifecycle transitions.
|
|
@@ -464,35 +478,25 @@ export class SessionManager extends EventEmitter {
|
|
|
464
478
|
const exactTarget = `=${tmuxSession}:`;
|
|
465
479
|
try {
|
|
466
480
|
if (text.includes('\n')) {
|
|
467
|
-
// Multi-line:
|
|
481
|
+
// Multi-line: pipe into tmux load-buffer via stdin, then paste into pane.
|
|
468
482
|
// This avoids newlines being treated as Enter keypresses which would
|
|
469
483
|
// fragment the message into multiple Claude prompts.
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
execFileSync(this.config.tmuxPath, ['send-keys', '-t', exactTarget, 'Enter'], {
|
|
487
|
-
encoding: 'utf-8', timeout: 5000,
|
|
488
|
-
});
|
|
489
|
-
}
|
|
490
|
-
finally {
|
|
491
|
-
try {
|
|
492
|
-
fs.unlinkSync(tmpPath);
|
|
493
|
-
}
|
|
494
|
-
catch { /* ignore */ }
|
|
495
|
-
}
|
|
484
|
+
// Uses stdin pipe (load-buffer -) instead of temp files to avoid
|
|
485
|
+
// macOS TCC "access data from other apps" permission prompts.
|
|
486
|
+
execFileSync(this.config.tmuxPath, ['load-buffer', '-'], {
|
|
487
|
+
encoding: 'utf-8', timeout: 5000, input: text,
|
|
488
|
+
});
|
|
489
|
+
execFileSync(this.config.tmuxPath, ['paste-buffer', '-t', exactTarget, '-p'], {
|
|
490
|
+
encoding: 'utf-8', timeout: 5000,
|
|
491
|
+
});
|
|
492
|
+
// Brief delay to let the terminal process the paste before sending Enter.
|
|
493
|
+
// Without this, the Enter arrives before paste processing completes and
|
|
494
|
+
// the message sits in the input buffer without being submitted.
|
|
495
|
+
execFileSync('/bin/sleep', ['0.3'], { timeout: 2000 });
|
|
496
|
+
// Send Enter to submit
|
|
497
|
+
execFileSync(this.config.tmuxPath, ['send-keys', '-t', exactTarget, 'Enter'], {
|
|
498
|
+
encoding: 'utf-8', timeout: 5000,
|
|
499
|
+
});
|
|
496
500
|
}
|
|
497
501
|
else {
|
|
498
502
|
// Single-line: simple send-keys
|
package/dist/core/types.d.ts
CHANGED
|
@@ -712,6 +712,14 @@ export interface MonitoringConfig {
|
|
|
712
712
|
memoryMonitoring: boolean;
|
|
713
713
|
/** Health check interval in ms */
|
|
714
714
|
healthCheckIntervalMs: number;
|
|
715
|
+
/** Session watchdog — auto-remediation for stuck commands */
|
|
716
|
+
watchdog?: {
|
|
717
|
+
enabled: boolean;
|
|
718
|
+
/** Seconds before a command is considered stuck (default: 180) */
|
|
719
|
+
stuckCommandSec?: number;
|
|
720
|
+
/** Poll interval in ms (default: 30000) */
|
|
721
|
+
pollIntervalMs?: number;
|
|
722
|
+
};
|
|
715
723
|
}
|
|
716
724
|
/** @deprecated Use InstarConfig instead */
|
|
717
725
|
export type AgentKitConfig = InstarConfig;
|
package/dist/lifeline/TelegramLifeline.js
CHANGED
|
@@ -241,8 +241,23 @@ export class TelegramLifeline {
|
|
|
241
241
|
// Forward to server if healthy
|
|
242
242
|
if (this.supervisor.healthy) {
|
|
243
243
|
const forwarded = await this.forwardToServer(topicId, text, msg);
|
|
244
|
-
if (forwarded)
|
|
244
|
+
if (forwarded) {
|
|
245
|
+
// Delivery confirmation — user knows message reached the server
|
|
246
|
+
await this.sendToTopic(topicId, '✓ Delivered');
|
|
245
247
|
return;
|
|
248
|
+
}
|
|
249
|
+
// Server appears healthy but forward failed — queue with accurate message
|
|
250
|
+
this.queue.enqueue({
|
|
251
|
+
id: `tg-${msg.message_id}`,
|
|
252
|
+
topicId,
|
|
253
|
+
text,
|
|
254
|
+
fromUserId: msg.from.id,
|
|
255
|
+
fromUsername: msg.from.username,
|
|
256
|
+
fromFirstName: msg.from.first_name,
|
|
257
|
+
timestamp: new Date(msg.date * 1000).toISOString(),
|
|
258
|
+
});
|
|
259
|
+
await this.sendToTopic(topicId, `Server is restarting. Your message has been queued (${this.queue.length} in queue). It will be delivered when the server recovers.`);
|
|
260
|
+
return;
|
|
246
261
|
}
|
|
247
262
|
// Server is down — queue the message
|
|
248
263
|
this.queue.enqueue({
|
package/dist/monitoring/HealthChecker.d.ts
CHANGED
|
@@ -7,13 +7,15 @@
|
|
|
7
7
|
import type { SessionManager } from '../core/SessionManager.js';
|
|
8
8
|
import type { JobScheduler } from '../scheduler/JobScheduler.js';
|
|
9
9
|
import type { HealthStatus, InstarConfig } from '../core/types.js';
|
|
10
|
+
import type { SessionWatchdog } from './SessionWatchdog.js';
|
|
10
11
|
export declare class HealthChecker {
|
|
11
12
|
private config;
|
|
12
13
|
private sessionManager;
|
|
13
14
|
private scheduler;
|
|
15
|
+
private watchdog;
|
|
14
16
|
private checkInterval;
|
|
15
17
|
private lastStatus;
|
|
16
|
-
constructor(config: InstarConfig, sessionManager: SessionManager, scheduler?: JobScheduler | null);
|
|
18
|
+
constructor(config: InstarConfig, sessionManager: SessionManager, scheduler?: JobScheduler | null, watchdog?: SessionWatchdog | null);
|
|
17
19
|
/**
|
|
18
20
|
* Run all health checks and return aggregated status.
|
|
19
21
|
*/
|
package/dist/monitoring/HealthChecker.js
CHANGED
|
@@ -11,12 +11,14 @@ export class HealthChecker {
|
|
|
11
11
|
config;
|
|
12
12
|
sessionManager;
|
|
13
13
|
scheduler;
|
|
14
|
+
watchdog;
|
|
14
15
|
checkInterval = null;
|
|
15
16
|
lastStatus = null;
|
|
16
|
-
constructor(config, sessionManager, scheduler = null) {
|
|
17
|
+
constructor(config, sessionManager, scheduler = null, watchdog = null) {
|
|
17
18
|
this.config = config;
|
|
18
19
|
this.sessionManager = sessionManager;
|
|
19
20
|
this.scheduler = scheduler;
|
|
21
|
+
this.watchdog = watchdog;
|
|
20
22
|
}
|
|
21
23
|
/**
|
|
22
24
|
* Run all health checks and return aggregated status.
|
|
@@ -30,6 +32,17 @@ export class HealthChecker {
|
|
|
30
32
|
if (this.scheduler) {
|
|
31
33
|
components.scheduler = this.checkScheduler();
|
|
32
34
|
}
|
|
35
|
+
if (this.watchdog) {
|
|
36
|
+
const wdStatus = this.watchdog.getStatus();
|
|
37
|
+
const intervening = wdStatus.sessions.filter(s => s.escalation && s.escalation.level > 0);
|
|
38
|
+
components.watchdog = {
|
|
39
|
+
status: intervening.length > 0 ? 'degraded' : 'healthy',
|
|
40
|
+
message: intervening.length > 0
|
|
41
|
+
? `Intervening on ${intervening.length} session(s)`
|
|
42
|
+
: `Monitoring${wdStatus.enabled ? '' : ' (disabled)'}`,
|
|
43
|
+
lastCheck: new Date().toISOString(),
|
|
44
|
+
};
|
|
45
|
+
}
|
|
33
46
|
// Aggregate: worst component status becomes overall status
|
|
34
47
|
const statuses = Object.values(components).map(c => c.status);
|
|
35
48
|
let overall = 'healthy';
|
package/dist/monitoring/SessionWatchdog.d.ts
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SessionWatchdog — Auto-remediation for stuck Claude sessions (Instar port).
|
|
3
|
+
*
|
|
4
|
+
* Detects when a Claude session has a long-running bash command and escalates
|
|
5
|
+
* from gentle (Ctrl+C) to forceful (SIGKILL + session kill). Adapted from
|
|
6
|
+
* Dawn Server's SessionWatchdog for Instar's self-contained architecture.
|
|
7
|
+
*
|
|
8
|
+
* Escalation pipeline:
|
|
9
|
+
* Level 0: Monitoring (default)
|
|
10
|
+
* Level 1: Ctrl+C via tmux send-keys
|
|
11
|
+
* Level 2: SIGTERM the stuck child PID
|
|
12
|
+
* Level 3: SIGKILL the stuck child PID
|
|
13
|
+
* Level 4: Kill tmux session
|
|
14
|
+
*/
|
|
15
|
+
import { EventEmitter } from 'node:events';
|
|
16
|
+
import type { SessionManager } from '../core/SessionManager.js';
|
|
17
|
+
import type { StateManager } from '../core/StateManager.js';
|
|
18
|
+
import type { InstarConfig } from '../core/types.js';
|
|
19
|
+
export declare enum EscalationLevel {
|
|
20
|
+
Monitoring = 0,
|
|
21
|
+
CtrlC = 1,
|
|
22
|
+
SigTerm = 2,
|
|
23
|
+
SigKill = 3,
|
|
24
|
+
KillSession = 4
|
|
25
|
+
}
|
|
26
|
+
interface EscalationState {
|
|
27
|
+
level: EscalationLevel;
|
|
28
|
+
levelEnteredAt: number;
|
|
29
|
+
stuckChildPid: number;
|
|
30
|
+
stuckCommand: string;
|
|
31
|
+
retryCount: number;
|
|
32
|
+
}
|
|
33
|
+
export interface InterventionEvent {
|
|
34
|
+
sessionName: string;
|
|
35
|
+
level: EscalationLevel;
|
|
36
|
+
action: string;
|
|
37
|
+
stuckCommand: string;
|
|
38
|
+
stuckPid: number;
|
|
39
|
+
timestamp: number;
|
|
40
|
+
}
|
|
41
|
+
export interface WatchdogEvents {
|
|
42
|
+
intervention: [event: InterventionEvent];
|
|
43
|
+
recovery: [sessionName: string, fromLevel: EscalationLevel];
|
|
44
|
+
}
|
|
45
|
+
export declare class SessionWatchdog extends EventEmitter {
|
|
46
|
+
private config;
|
|
47
|
+
private sessionManager;
|
|
48
|
+
private state;
|
|
49
|
+
private interval;
|
|
50
|
+
private escalationState;
|
|
51
|
+
private interventionHistory;
|
|
52
|
+
private enabled;
|
|
53
|
+
private running;
|
|
54
|
+
private stuckThresholdMs;
|
|
55
|
+
private pollIntervalMs;
|
|
56
|
+
constructor(config: InstarConfig, sessionManager: SessionManager, state: StateManager);
|
|
57
|
+
start(): void;
|
|
58
|
+
stop(): void;
|
|
59
|
+
setEnabled(enabled: boolean): void;
|
|
60
|
+
isEnabled(): boolean;
|
|
61
|
+
isManaging(sessionName: string): boolean;
|
|
62
|
+
getStatus(): {
|
|
63
|
+
enabled: boolean;
|
|
64
|
+
sessions: Array<{
|
|
65
|
+
name: string;
|
|
66
|
+
escalation: EscalationState | null;
|
|
67
|
+
}>;
|
|
68
|
+
interventionHistory: InterventionEvent[];
|
|
69
|
+
};
|
|
70
|
+
private poll;
|
|
71
|
+
private checkSession;
|
|
72
|
+
private handleEscalation;
|
|
73
|
+
private getClaudePid;
|
|
74
|
+
private getChildProcesses;
|
|
75
|
+
private isExcluded;
|
|
76
|
+
private parseElapsed;
|
|
77
|
+
private sendSignal;
|
|
78
|
+
private isProcessAlive;
|
|
79
|
+
private killTmuxSession;
|
|
80
|
+
private recordIntervention;
|
|
81
|
+
}
|
|
82
|
+
export {};
|
|
83
|
+
//# sourceMappingURL=SessionWatchdog.d.ts.map
|
package/dist/monitoring/SessionWatchdog.js
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SessionWatchdog — Auto-remediation for stuck Claude sessions (Instar port).
|
|
3
|
+
*
|
|
4
|
+
* Detects when a Claude session has a long-running bash command and escalates
|
|
5
|
+
* from gentle (Ctrl+C) to forceful (SIGKILL + session kill). Adapted from
|
|
6
|
+
* Dawn Server's SessionWatchdog for Instar's self-contained architecture.
|
|
7
|
+
*
|
|
8
|
+
* Escalation pipeline:
|
|
9
|
+
* Level 0: Monitoring (default)
|
|
10
|
+
* Level 1: Ctrl+C via tmux send-keys
|
|
11
|
+
* Level 2: SIGTERM the stuck child PID
|
|
12
|
+
* Level 3: SIGKILL the stuck child PID
|
|
13
|
+
* Level 4: Kill tmux session
|
|
14
|
+
*/
|
|
15
|
+
import { execSync } from 'node:child_process';
|
|
16
|
+
import { EventEmitter } from 'node:events';
|
|
17
|
+
export var EscalationLevel;
|
|
18
|
+
(function (EscalationLevel) {
|
|
19
|
+
EscalationLevel[EscalationLevel["Monitoring"] = 0] = "Monitoring";
|
|
20
|
+
EscalationLevel[EscalationLevel["CtrlC"] = 1] = "CtrlC";
|
|
21
|
+
EscalationLevel[EscalationLevel["SigTerm"] = 2] = "SigTerm";
|
|
22
|
+
EscalationLevel[EscalationLevel["SigKill"] = 3] = "SigKill";
|
|
23
|
+
EscalationLevel[EscalationLevel["KillSession"] = 4] = "KillSession";
|
|
24
|
+
})(EscalationLevel || (EscalationLevel = {}));
|
|
25
|
+
// Processes that are long-running by design
|
|
26
|
+
const EXCLUDED_PATTERNS = [
|
|
27
|
+
'playwright-mcp', 'playwright-persistent', '@playwright/mcp',
|
|
28
|
+
'chrome-native-host', 'claude-in-chrome-mcp', 'payments-mcp',
|
|
29
|
+
'mcp-remote', '/mcp/', '.mcp/', 'caffeinate', 'exa-mcp-server',
|
|
30
|
+
];
|
|
31
|
+
const EXCLUDED_PREFIXES = [
|
|
32
|
+
'/bin/zsh -c -l source',
|
|
33
|
+
'/bin/bash -c -l source',
|
|
34
|
+
];
|
|
35
|
+
// Escalation delays (ms to wait before advancing to next level)
|
|
36
|
+
const ESCALATION_DELAYS = {
|
|
37
|
+
[EscalationLevel.Monitoring]: 0,
|
|
38
|
+
[EscalationLevel.CtrlC]: 0,
|
|
39
|
+
[EscalationLevel.SigTerm]: 15_000,
|
|
40
|
+
[EscalationLevel.SigKill]: 10_000,
|
|
41
|
+
[EscalationLevel.KillSession]: 5_000,
|
|
42
|
+
};
|
|
43
|
+
const DEFAULT_STUCK_THRESHOLD_MS = 180_000; // 3 minutes
|
|
44
|
+
const DEFAULT_POLL_INTERVAL_MS = 30_000;
|
|
45
|
+
const MAX_RETRIES = 2;
|
|
46
|
+
export class SessionWatchdog extends EventEmitter {
|
|
47
|
+
config;
|
|
48
|
+
sessionManager;
|
|
49
|
+
state;
|
|
50
|
+
interval = null;
|
|
51
|
+
escalationState = new Map();
|
|
52
|
+
interventionHistory = [];
|
|
53
|
+
enabled = true;
|
|
54
|
+
running = false;
|
|
55
|
+
stuckThresholdMs;
|
|
56
|
+
pollIntervalMs;
|
|
57
|
+
constructor(config, sessionManager, state) {
|
|
58
|
+
super();
|
|
59
|
+
this.config = config;
|
|
60
|
+
this.sessionManager = sessionManager;
|
|
61
|
+
this.state = state;
|
|
62
|
+
const wdConfig = config.monitoring.watchdog;
|
|
63
|
+
this.stuckThresholdMs = (wdConfig?.stuckCommandSec ?? 180) * 1000;
|
|
64
|
+
this.pollIntervalMs = wdConfig?.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
|
|
65
|
+
}
|
|
66
|
+
start() {
|
|
67
|
+
if (this.interval)
|
|
68
|
+
return;
|
|
69
|
+
console.log(`[Watchdog] Starting (poll: ${this.pollIntervalMs / 1000}s, threshold: ${this.stuckThresholdMs / 1000}s)`);
|
|
70
|
+
this.interval = setInterval(() => this.poll(), this.pollIntervalMs);
|
|
71
|
+
setTimeout(() => this.poll(), 5000);
|
|
72
|
+
}
|
|
73
|
+
stop() {
|
|
74
|
+
if (this.interval) {
|
|
75
|
+
clearInterval(this.interval);
|
|
76
|
+
this.interval = null;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
setEnabled(enabled) {
|
|
80
|
+
this.enabled = enabled;
|
|
81
|
+
if (!enabled) {
|
|
82
|
+
this.escalationState.clear();
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
isEnabled() {
|
|
86
|
+
return this.enabled;
|
|
87
|
+
}
|
|
88
|
+
isManaging(sessionName) {
|
|
89
|
+
const s = this.escalationState.get(sessionName);
|
|
90
|
+
return s !== undefined && s.level > EscalationLevel.Monitoring;
|
|
91
|
+
}
|
|
92
|
+
getStatus() {
|
|
93
|
+
const runningSessions = this.sessionManager.listRunningSessions();
|
|
94
|
+
const sessions = runningSessions.map(s => ({
|
|
95
|
+
name: s.tmuxSession,
|
|
96
|
+
escalation: this.escalationState.get(s.tmuxSession) ?? null,
|
|
97
|
+
}));
|
|
98
|
+
return {
|
|
99
|
+
enabled: this.enabled,
|
|
100
|
+
sessions,
|
|
101
|
+
interventionHistory: this.interventionHistory.slice(-20),
|
|
102
|
+
};
|
|
103
|
+
}
|
|
104
|
+
// --- Core polling ---
|
|
105
|
+
async poll() {
|
|
106
|
+
if (!this.enabled || this.running)
|
|
107
|
+
return;
|
|
108
|
+
this.running = true;
|
|
109
|
+
try {
|
|
110
|
+
const sessions = this.sessionManager.listRunningSessions();
|
|
111
|
+
for (const session of sessions) {
|
|
112
|
+
try {
|
|
113
|
+
this.checkSession(session.tmuxSession);
|
|
114
|
+
}
|
|
115
|
+
catch (err) {
|
|
116
|
+
console.error(`[Watchdog] Error checking "${session.tmuxSession}":`, err);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
finally {
|
|
121
|
+
this.running = false;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
checkSession(tmuxSession) {
|
|
125
|
+
const existing = this.escalationState.get(tmuxSession);
|
|
126
|
+
if (existing && existing.level > EscalationLevel.Monitoring) {
|
|
127
|
+
this.handleEscalation(tmuxSession, existing);
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
// Find Claude PID in the tmux session
|
|
131
|
+
const claudePid = this.getClaudePid(tmuxSession);
|
|
132
|
+
if (!claudePid)
|
|
133
|
+
return;
|
|
134
|
+
const children = this.getChildProcesses(claudePid);
|
|
135
|
+
const stuckChild = children.find(c => !this.isExcluded(c.command) && c.elapsedMs > this.stuckThresholdMs);
|
|
136
|
+
if (stuckChild) {
|
|
137
|
+
const state = {
|
|
138
|
+
level: EscalationLevel.CtrlC,
|
|
139
|
+
levelEnteredAt: Date.now(),
|
|
140
|
+
stuckChildPid: stuckChild.pid,
|
|
141
|
+
stuckCommand: stuckChild.command,
|
|
142
|
+
retryCount: existing?.retryCount ?? 0,
|
|
143
|
+
};
|
|
144
|
+
this.escalationState.set(tmuxSession, state);
|
|
145
|
+
console.log(`[Watchdog] "${tmuxSession}": stuck command (${Math.round(stuckChild.elapsedMs / 1000)}s): ` +
|
|
146
|
+
`${stuckChild.command.slice(0, 80)} — sending Ctrl+C`);
|
|
147
|
+
this.sessionManager.sendKey(tmuxSession, 'C-c');
|
|
148
|
+
this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, 'Sent Ctrl+C', stuckChild);
|
|
149
|
+
}
|
|
150
|
+
else if (existing) {
|
|
151
|
+
this.escalationState.delete(tmuxSession);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
handleEscalation(tmuxSession, state) {
|
|
155
|
+
const now = Date.now();
|
|
156
|
+
if (!this.isProcessAlive(state.stuckChildPid)) {
|
|
157
|
+
console.log(`[Watchdog] "${tmuxSession}": stuck process ${state.stuckChildPid} died — recovered`);
|
|
158
|
+
this.emit('recovery', tmuxSession, state.level);
|
|
159
|
+
this.escalationState.delete(tmuxSession);
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
const timeInLevel = now - state.levelEnteredAt;
|
|
163
|
+
const nextLevel = state.level + 1;
|
|
164
|
+
if (nextLevel > EscalationLevel.KillSession) {
|
|
165
|
+
if (state.retryCount >= MAX_RETRIES) {
|
|
166
|
+
console.log(`[Watchdog] "${tmuxSession}": max retries reached — giving up`);
|
|
167
|
+
this.escalationState.delete(tmuxSession);
|
|
168
|
+
return;
|
|
169
|
+
}
|
|
170
|
+
state.level = EscalationLevel.CtrlC;
|
|
171
|
+
state.levelEnteredAt = now;
|
|
172
|
+
state.retryCount++;
|
|
173
|
+
this.sessionManager.sendKey(tmuxSession, 'C-c');
|
|
174
|
+
this.recordIntervention(tmuxSession, EscalationLevel.CtrlC, `Retry ${state.retryCount}: Sent Ctrl+C`, {
|
|
175
|
+
pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0,
|
|
176
|
+
});
|
|
177
|
+
return;
|
|
178
|
+
}
|
|
179
|
+
const delayForNext = ESCALATION_DELAYS[nextLevel] ?? 15_000;
|
|
180
|
+
if (timeInLevel < delayForNext)
|
|
181
|
+
return;
|
|
182
|
+
state.level = nextLevel;
|
|
183
|
+
state.levelEnteredAt = now;
|
|
184
|
+
const child = { pid: state.stuckChildPid, command: state.stuckCommand, elapsedMs: 0 };
|
|
185
|
+
switch (state.level) {
|
|
186
|
+
case EscalationLevel.SigTerm:
|
|
187
|
+
console.log(`[Watchdog] "${tmuxSession}": sending SIGTERM to ${state.stuckChildPid}`);
|
|
188
|
+
this.sendSignal(state.stuckChildPid, 'SIGTERM');
|
|
189
|
+
this.recordIntervention(tmuxSession, EscalationLevel.SigTerm, `SIGTERM ${state.stuckChildPid}`, child);
|
|
190
|
+
break;
|
|
191
|
+
case EscalationLevel.SigKill:
|
|
192
|
+
console.log(`[Watchdog] "${tmuxSession}": sending SIGKILL to ${state.stuckChildPid}`);
|
|
193
|
+
this.sendSignal(state.stuckChildPid, 'SIGKILL');
|
|
194
|
+
this.recordIntervention(tmuxSession, EscalationLevel.SigKill, `SIGKILL ${state.stuckChildPid}`, child);
|
|
195
|
+
break;
|
|
196
|
+
case EscalationLevel.KillSession:
|
|
197
|
+
console.log(`[Watchdog] "${tmuxSession}": killing tmux session`);
|
|
198
|
+
this.killTmuxSession(tmuxSession);
|
|
199
|
+
this.recordIntervention(tmuxSession, EscalationLevel.KillSession, 'Killed tmux session', child);
|
|
200
|
+
this.escalationState.delete(tmuxSession);
|
|
201
|
+
break;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
// --- Process utilities (self-contained, no shared module) ---
|
|
205
|
+
getClaudePid(tmuxSession) {
|
|
206
|
+
try {
|
|
207
|
+
// Get pane PID
|
|
208
|
+
const panePidStr = execSync(`${this.config.sessions.tmuxPath} list-panes -t "=${tmuxSession}" -F "#{pane_pid}" 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim().split('\n')[0];
|
|
209
|
+
if (!panePidStr)
|
|
210
|
+
return null;
|
|
211
|
+
const panePid = parseInt(panePidStr, 10);
|
|
212
|
+
if (isNaN(panePid))
|
|
213
|
+
return null;
|
|
214
|
+
// Find claude child
|
|
215
|
+
const claudePidStr = execSync(`pgrep -P ${panePid} -f claude 2>/dev/null | head -1`, { encoding: 'utf-8', timeout: 5000 }).trim();
|
|
216
|
+
if (!claudePidStr)
|
|
217
|
+
return null;
|
|
218
|
+
const pid = parseInt(claudePidStr, 10);
|
|
219
|
+
return isNaN(pid) ? null : pid;
|
|
220
|
+
}
|
|
221
|
+
catch {
|
|
222
|
+
return null;
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
getChildProcesses(pid) {
|
|
226
|
+
try {
|
|
227
|
+
const childPidsStr = execSync(`pgrep -P ${pid} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
|
|
228
|
+
if (!childPidsStr)
|
|
229
|
+
return [];
|
|
230
|
+
const childPids = childPidsStr.split('\n').filter(Boolean).join(',');
|
|
231
|
+
if (!childPids)
|
|
232
|
+
return [];
|
|
233
|
+
const output = execSync(`ps -o pid=,etime=,command= -p ${childPids} 2>/dev/null`, { encoding: 'utf-8', timeout: 5000 }).trim();
|
|
234
|
+
if (!output)
|
|
235
|
+
return [];
|
|
236
|
+
const results = [];
|
|
237
|
+
for (const line of output.split('\n')) {
|
|
238
|
+
const match = line.trim().match(/^(\d+)\s+([\d:.-]+)\s+(.+)$/);
|
|
239
|
+
if (!match)
|
|
240
|
+
continue;
|
|
241
|
+
const childPid = parseInt(match[1], 10);
|
|
242
|
+
if (isNaN(childPid))
|
|
243
|
+
continue;
|
|
244
|
+
results.push({
|
|
245
|
+
pid: childPid,
|
|
246
|
+
command: match[3],
|
|
247
|
+
elapsedMs: this.parseElapsed(match[2]),
|
|
248
|
+
});
|
|
249
|
+
}
|
|
250
|
+
return results;
|
|
251
|
+
}
|
|
252
|
+
catch {
|
|
253
|
+
return [];
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
isExcluded(command) {
|
|
257
|
+
for (const pattern of EXCLUDED_PATTERNS) {
|
|
258
|
+
if (command.includes(pattern))
|
|
259
|
+
return true;
|
|
260
|
+
}
|
|
261
|
+
for (const prefix of EXCLUDED_PREFIXES) {
|
|
262
|
+
if (command.startsWith(prefix))
|
|
263
|
+
return true;
|
|
264
|
+
}
|
|
265
|
+
return false;
|
|
266
|
+
}
|
|
267
|
+
parseElapsed(elapsed) {
|
|
268
|
+
let days = 0;
|
|
269
|
+
let timePart = elapsed;
|
|
270
|
+
if (elapsed.includes('-')) {
|
|
271
|
+
const [d, t] = elapsed.split('-');
|
|
272
|
+
days = parseInt(d, 10);
|
|
273
|
+
timePart = t;
|
|
274
|
+
}
|
|
275
|
+
const parts = timePart.split(':').map(Number);
|
|
276
|
+
let seconds = 0;
|
|
277
|
+
if (parts.length === 3)
|
|
278
|
+
seconds = parts[0] * 3600 + parts[1] * 60 + parts[2];
|
|
279
|
+
else if (parts.length === 2)
|
|
280
|
+
seconds = parts[0] * 60 + parts[1];
|
|
281
|
+
else
|
|
282
|
+
seconds = parts[0];
|
|
283
|
+
return (days * 86400 + seconds) * 1000;
|
|
284
|
+
}
|
|
285
|
+
sendSignal(pid, signal) {
|
|
286
|
+
try {
|
|
287
|
+
process.kill(pid, signal);
|
|
288
|
+
}
|
|
289
|
+
catch (err) {
|
|
290
|
+
if (err.code !== 'ESRCH') {
|
|
291
|
+
console.error(`[Watchdog] Failed to send ${signal} to ${pid}:`, err);
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
isProcessAlive(pid) {
|
|
296
|
+
try {
|
|
297
|
+
process.kill(pid, 0);
|
|
298
|
+
return true;
|
|
299
|
+
}
|
|
300
|
+
catch {
|
|
301
|
+
return false;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
killTmuxSession(tmuxSession) {
|
|
305
|
+
try {
|
|
306
|
+
execSync(`${this.config.sessions.tmuxPath} kill-session -t "=${tmuxSession}" 2>/dev/null`, { timeout: 5000, stdio: 'ignore' });
|
|
307
|
+
}
|
|
308
|
+
catch { }
|
|
309
|
+
}
|
|
310
|
+
recordIntervention(sessionName, level, action, child) {
|
|
311
|
+
const event = {
|
|
312
|
+
sessionName,
|
|
313
|
+
level,
|
|
314
|
+
action,
|
|
315
|
+
stuckCommand: child.command.slice(0, 200),
|
|
316
|
+
stuckPid: child.pid,
|
|
317
|
+
timestamp: Date.now(),
|
|
318
|
+
};
|
|
319
|
+
this.interventionHistory.push(event);
|
|
320
|
+
if (this.interventionHistory.length > 50) {
|
|
321
|
+
this.interventionHistory = this.interventionHistory.slice(-50);
|
|
322
|
+
}
|
|
323
|
+
this.emit('intervention', event);
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
//# sourceMappingURL=SessionWatchdog.js.map
|
package/dist/server/AgentServer.d.ts
CHANGED
|
@@ -21,6 +21,7 @@ import type { TelegraphService } from '../publishing/TelegraphService.js';
|
|
|
21
21
|
import type { PrivateViewer } from '../publishing/PrivateViewer.js';
|
|
22
22
|
import type { TunnelManager } from '../tunnel/TunnelManager.js';
|
|
23
23
|
import type { EvolutionManager } from '../core/EvolutionManager.js';
|
|
24
|
+
import type { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
|
|
24
25
|
export declare class AgentServer {
|
|
25
26
|
private app;
|
|
26
27
|
private server;
|
|
@@ -43,6 +44,7 @@ export declare class AgentServer {
|
|
|
43
44
|
viewer?: PrivateViewer;
|
|
44
45
|
tunnel?: TunnelManager;
|
|
45
46
|
evolution?: EvolutionManager;
|
|
47
|
+
watchdog?: SessionWatchdog;
|
|
46
48
|
});
|
|
47
49
|
/**
|
|
48
50
|
* Start the HTTP server.
|
package/dist/server/routes.d.ts
CHANGED
|
@@ -21,6 +21,7 @@ import type { TelegraphService } from '../publishing/TelegraphService.js';
|
|
|
21
21
|
import type { PrivateViewer } from '../publishing/PrivateViewer.js';
|
|
22
22
|
import type { TunnelManager } from '../tunnel/TunnelManager.js';
|
|
23
23
|
import type { EvolutionManager } from '../core/EvolutionManager.js';
|
|
24
|
+
import type { SessionWatchdog } from '../monitoring/SessionWatchdog.js';
|
|
24
25
|
export interface RouteContext {
|
|
25
26
|
config: InstarConfig;
|
|
26
27
|
sessionManager: SessionManager;
|
|
@@ -38,6 +39,7 @@ export interface RouteContext {
|
|
|
38
39
|
viewer: PrivateViewer | null;
|
|
39
40
|
tunnel: TunnelManager | null;
|
|
40
41
|
evolution: EvolutionManager | null;
|
|
42
|
+
watchdog: SessionWatchdog | null;
|
|
41
43
|
startTime: Date;
|
|
42
44
|
}
|
|
43
45
|
export declare function createRoutes(ctx: RouteContext): Router;
|
package/dist/server/routes.js
CHANGED
|
@@ -1695,6 +1695,27 @@ export function createRoutes(ctx) {
|
|
|
1695
1695
|
}
|
|
1696
1696
|
res.json({ ok: true, id: req.params.id, status });
|
|
1697
1697
|
});
|
|
1698
|
+
// ── Watchdog ──────────────────────────────────────────────────
|
|
1699
|
+
router.get('/watchdog/status', (req, res) => {
|
|
1700
|
+
if (!ctx.watchdog) {
|
|
1701
|
+
res.json({ enabled: false, sessions: [], interventionHistory: [] });
|
|
1702
|
+
return;
|
|
1703
|
+
}
|
|
1704
|
+
res.json(ctx.watchdog.getStatus());
|
|
1705
|
+
});
|
|
1706
|
+
router.post('/watchdog/toggle', (req, res) => {
|
|
1707
|
+
if (!ctx.watchdog) {
|
|
1708
|
+
res.status(404).json({ error: 'Watchdog not configured' });
|
|
1709
|
+
return;
|
|
1710
|
+
}
|
|
1711
|
+
const { enabled } = req.body;
|
|
1712
|
+
if (typeof enabled !== 'boolean') {
|
|
1713
|
+
res.status(400).json({ error: 'enabled (boolean) required' });
|
|
1714
|
+
return;
|
|
1715
|
+
}
|
|
1716
|
+
ctx.watchdog.setEnabled(enabled);
|
|
1717
|
+
res.json({ enabled: ctx.watchdog.isEnabled() });
|
|
1718
|
+
});
|
|
1698
1719
|
return router;
|
|
1699
1720
|
}
|
|
1700
1721
|
export function formatUptime(ms) {
|