@proletariat/cli 0.3.96 → 0.3.97
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/gc.d.ts +1 -0
- package/dist/commands/gc.js +31 -1
- package/dist/commands/gc.js.map +1 -1
- package/dist/commands/linear/connect.d.ts +5 -0
- package/dist/commands/linear/connect.js +84 -0
- package/dist/commands/linear/connect.js.map +1 -1
- package/dist/commands/session/watch.d.ts +1 -0
- package/dist/commands/session/watch.js +46 -2
- package/dist/commands/session/watch.js.map +1 -1
- package/dist/commands/work/complete.d.ts +1 -0
- package/dist/commands/work/complete.js +27 -25
- package/dist/commands/work/complete.js.map +1 -1
- package/dist/commands/work/drop.d.ts +14 -0
- package/dist/commands/work/drop.js +215 -0
- package/dist/commands/work/drop.js.map +1 -0
- package/dist/commands/work/ready.d.ts +1 -0
- package/dist/commands/work/ready.js +26 -25
- package/dist/commands/work/ready.js.map +1 -1
- package/dist/commands/work/ship.d.ts +1 -0
- package/dist/commands/work/ship.js +33 -32
- package/dist/commands/work/ship.js.map +1 -1
- package/dist/commands/work/start.d.ts +2 -0
- package/dist/commands/work/start.js +160 -42
- package/dist/commands/work/start.js.map +1 -1
- package/dist/commands/work/stop.d.ts +1 -0
- package/dist/commands/work/stop.js +40 -0
- package/dist/commands/work/stop.js.map +1 -1
- package/dist/lib/agents/commands.js +7 -5
- package/dist/lib/agents/commands.js.map +1 -1
- package/dist/lib/database/drizzle-schema.d.ts +17 -0
- package/dist/lib/database/drizzle-schema.js +1 -0
- package/dist/lib/database/drizzle-schema.js.map +1 -1
- package/dist/lib/database/migrations/0019_gc_artifact_cleanup.d.ts +9 -0
- package/dist/lib/database/migrations/0019_gc_artifact_cleanup.js +23 -0
- package/dist/lib/database/migrations/0019_gc_artifact_cleanup.js.map +1 -0
- package/dist/lib/database/migrations/0020_transition_map.d.ts +2 -0
- package/dist/lib/database/migrations/0020_transition_map.js +27 -0
- package/dist/lib/database/migrations/0020_transition_map.js.map +1 -0
- package/dist/lib/database/migrations/index.js +4 -0
- package/dist/lib/database/migrations/index.js.map +1 -1
- package/dist/lib/execution/config.d.ts +10 -0
- package/dist/lib/execution/config.js +24 -0
- package/dist/lib/execution/config.js.map +1 -1
- package/dist/lib/execution/preflight.d.ts +51 -0
- package/dist/lib/execution/preflight.js +278 -0
- package/dist/lib/execution/preflight.js.map +1 -0
- package/dist/lib/execution/runners/prompt-builder.d.ts +6 -0
- package/dist/lib/execution/runners/prompt-builder.js +38 -7
- package/dist/lib/execution/runners/prompt-builder.js.map +1 -1
- package/dist/lib/execution/session-utils.d.ts +23 -0
- package/dist/lib/execution/session-utils.js +69 -0
- package/dist/lib/execution/session-utils.js.map +1 -1
- package/dist/lib/execution/spawner.d.ts +11 -1
- package/dist/lib/execution/spawner.js +44 -16
- package/dist/lib/execution/spawner.js.map +1 -1
- package/dist/lib/execution/ticket-refs.d.ts +71 -0
- package/dist/lib/execution/ticket-refs.js +125 -0
- package/dist/lib/execution/ticket-refs.js.map +1 -0
- package/dist/lib/execution/types.d.ts +7 -2
- package/dist/lib/execution/types.js +5 -3
- package/dist/lib/execution/types.js.map +1 -1
- package/dist/lib/gc/index.d.ts +59 -6
- package/dist/lib/gc/index.js +258 -13
- package/dist/lib/gc/index.js.map +1 -1
- package/dist/lib/prompt-json.d.ts +31 -0
- package/dist/lib/prompt-json.js.map +1 -1
- package/dist/lib/providers/auto-mapper.d.ts +45 -0
- package/dist/lib/providers/auto-mapper.js +115 -0
- package/dist/lib/providers/auto-mapper.js.map +1 -0
- package/dist/lib/providers/state-intents.d.ts +20 -0
- package/dist/lib/providers/state-intents.js +61 -7
- package/dist/lib/providers/state-intents.js.map +1 -1
- package/dist/lib/providers/state-resolution.d.ts +15 -11
- package/dist/lib/providers/state-resolution.js +54 -48
- package/dist/lib/providers/state-resolution.js.map +1 -1
- package/dist/lib/providers/transition-map.d.ts +59 -0
- package/dist/lib/providers/transition-map.js +113 -0
- package/dist/lib/providers/transition-map.js.map +1 -0
- package/dist/lib/session/index.d.ts +3 -1
- package/dist/lib/session/index.js +3 -1
- package/dist/lib/session/index.js.map +1 -1
- package/dist/lib/session/tmux-watchdog.d.ts +157 -0
- package/dist/lib/session/tmux-watchdog.js +424 -0
- package/dist/lib/session/tmux-watchdog.js.map +1 -0
- package/dist/lib/session/watcher.d.ts +22 -4
- package/dist/lib/session/watcher.js +66 -8
- package/dist/lib/session/watcher.js.map +1 -1
- package/dist/lib/work-lifecycle/transition.d.ts +73 -0
- package/dist/lib/work-lifecycle/transition.js +124 -0
- package/dist/lib/work-lifecycle/transition.js.map +1 -0
- package/oclif.manifest.json +438 -332
- package/package.json +1 -1
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tmux Watchdog
|
|
3
|
+
*
|
|
4
|
+
* Detects tmux server crashes and attempts automatic recovery of agent sessions.
|
|
5
|
+
*
|
|
6
|
+
* Problem: When the tmux server crashes (macOS sleep, resource pressure, OOM),
|
|
7
|
+
* ALL agent sessions are silently lost. The existing heartbeat system treats
|
|
8
|
+
* each agent as individually stale, missing the root cause. There is no
|
|
9
|
+
* recovery or user notification.
|
|
10
|
+
*
|
|
11
|
+
* Solution: This watchdog monitors the tmux server process itself. When a
|
|
12
|
+
* crash is detected (server transitions from alive→dead while agents were
|
|
13
|
+
* active), it:
|
|
14
|
+
*
|
|
15
|
+
* 1. Marks all affected executions with a crash-specific error
|
|
16
|
+
* 2. Attempts to re-create tmux sessions and restart agents
|
|
17
|
+
* 3. Logs crash events for user notification
|
|
18
|
+
*
|
|
19
|
+
* Architecture:
|
|
20
|
+
* - Runs as Phase 0 in SessionWatcher.runCycle(), before heartbeat checks
|
|
21
|
+
* - Maintains last-known server state per environment (host + each container)
|
|
22
|
+
* - Uses session-utils.ts for server status checks
|
|
23
|
+
* - Uses session restart logic to re-launch agents in recovered sessions
|
|
24
|
+
*/
|
|
25
|
+
import { execSync, execFileSync } from 'node:child_process';
|
|
26
|
+
import { getHostTmuxServerStatus, getContainerTmuxServerStatus, } from '../execution/session-utils.js';
|
|
27
|
+
// =============================================================================
|
|
28
|
+
// Tmux Watchdog
|
|
29
|
+
// =============================================================================
|
|
30
|
+
/**
 * Watches tmux servers (the host one plus one per container) and reacts when a
 * server that was previously seen running stops running while executions were
 * active on it.
 *
 * On a detected crash every affected execution is marked `failed` / `died`,
 * recovery is optionally attempted by starting a fresh tmux session per
 * execution, and the optional `onCrashDetected` callback is invoked.
 */
export class TmuxWatchdog {
    /** Execution store; must provide listExecutions, getExecution, updateStatus, updateLifecycleState and updateHeartbeat. */
    storage;
    /** When true (the default) a detected crash triggers attemptRecovery(). */
    autoRecover;
    /** Logging sink; defaults to a no-op. */
    log;
    /** Optional callback invoked (and awaited) with each crash event. */
    onCrashDetected;
    /**
     * Tracks last known tmux server state.
     * Key: 'host' for host server, containerId for container servers.
     * Value: true = server was alive, false = server was down.
     */
    lastServerState = new Map();
    /**
     * Tracks which executions were active when we last saw the server alive.
     * Key: 'host' or containerId.
     * Value: array of execution IDs that were active in that environment.
     */
    activeExecutionsSnapshot = new Map();
    /**
     * Tracks crash events we've already handled to avoid duplicate recovery.
     * Key: 'host' or containerId.
     * Value: timestamp of when we detected the crash.
     */
    handledCrashes = new Map();
    /**
     * @param {object} options
     * @param {object} options.storage - Execution store (see the `storage` field).
     * @param {boolean} [options.autoRecover=true] - Attempt recovery after a crash.
     * @param {(message: string) => void} [options.log] - Logging sink.
     * @param {(event: object) => unknown} [options.onCrashDetected] - Crash callback.
     */
    constructor(options) {
        this.storage = options.storage;
        this.autoRecover = options.autoRecover ?? true;
        this.log = options.log ?? (() => { });
        this.onCrashDetected = options.onCrashDetected;
    }
    /**
     * Run a single watchdog check cycle.
     *
     * Checks tmux server health for the host (when it has active executions or
     * is already tracked) and for every container with active executions.
     * A crash is an alive→dead transition between two consecutive checks.
     *
     * @returns {Promise<object>} Summary with `hostServerAlive`,
     *   `containerServersAlive` (Map), `crashEvents`, `totalAffected` and
     *   `totalRecovered`.
     */
    async checkAndRecover() {
        const result = {
            hostServerAlive: true,
            containerServersAlive: new Map(),
            crashEvents: [],
            totalAffected: 0,
            totalRecovered: 0,
        };
        // "Active" means running or still starting.
        const activeExecs = [
            ...this.storage.listExecutions({ status: 'running' }),
            ...this.storage.listExecutions({ status: 'starting' }),
        ];
        const { hostExecs, containerGroups } = this.#groupByEnvironment(activeExecs);
        // Keep checking the host once it is tracked, even with no active executions.
        if (hostExecs.length > 0 || this.lastServerState.has('host')) {
            await this.#checkServer('host', undefined, hostExecs, result);
        }
        for (const [containerId, execs] of containerGroups) {
            await this.#checkServer('container', containerId, execs, result);
        }
        // Clean up tracking for containers that no longer have active executions
        for (const key of this.lastServerState.keys()) {
            if (key !== 'host' && !containerGroups.has(key)) {
                this.lastServerState.delete(key);
                this.activeExecutionsSnapshot.delete(key);
                this.handledCrashes.delete(key);
            }
        }
        return result;
    }
    /**
     * Split executions into host executions ('host' / 'sandbox') and container
     * executions ('devcontainer' / 'docker' with a containerId), the latter
     * grouped by container ID. Executions matching neither rule are ignored.
     */
    #groupByEnvironment(activeExecs) {
        const hostExecs = activeExecs.filter(e => e.environment === 'host' || e.environment === 'sandbox');
        const containerGroups = new Map();
        for (const exec of activeExecs) {
            const inContainer = (exec.environment === 'devcontainer' || exec.environment === 'docker') && exec.containerId;
            if (!inContainer) {
                continue;
            }
            const group = containerGroups.get(exec.containerId) || [];
            group.push(exec);
            containerGroups.set(exec.containerId, group);
        }
        return { hostExecs, containerGroups };
    }
    /**
     * Check one tmux server, record its state in `result`, and handle a crash
     * if the server went from alive to dead since the previous check.
     */
    async #checkServer(environment, containerId, execs, result) {
        const isHost = environment === 'host';
        const key = isHost ? 'host' : containerId;
        const status = isHost ? getHostTmuxServerStatus() : getContainerTmuxServerStatus(containerId);
        const alive = status === 'running';
        if (isHost) {
            result.hostServerAlive = alive;
        }
        else {
            result.containerServersAlive.set(containerId, alive);
        }
        const wasAlive = this.lastServerState.get(key);
        if (alive) {
            // Refresh the snapshot only while the server is up, so that after a
            // crash it still names the executions that were running on it.
            this.activeExecutionsSnapshot.set(key, execs.map(e => e.id));
            // A healthy server re-arms crash handling for this environment.
            this.handledCrashes.delete(key);
        }
        if (wasAlive === true && !alive && !this.handledCrashes.has(key)) {
            const snapshotIds = this.activeExecutionsSnapshot.get(key) || [];
            // Re-read each execution; skip ones that vanished (null or undefined)
            // or that are no longer active.
            const affected = snapshotIds
                .map(id => this.storage.getExecution(id))
                .filter(e => e != null && (e.status === 'running' || e.status === 'starting'));
            if (affected.length > 0) {
                const event = await this.handleCrash(environment, containerId, affected);
                result.crashEvents.push(event);
                result.totalAffected += event.affectedExecutions.length;
                result.totalRecovered += event.recoveredSessions.length;
                this.handledCrashes.set(key, Date.now());
            }
        }
        this.lastServerState.set(key, alive);
    }
    /**
     * Handle a detected tmux server crash.
     *
     * Marks every affected execution as `failed` / `died`, attempts recovery
     * when `autoRecover` is set, then notifies `onCrashDetected`.
     *
     * @param {'host'|'container'} environment
     * @param {string|undefined} containerId - Set for container crashes.
     * @param {object[]} affected - Executions that were active on the server.
     * @returns {Promise<object>} The crash event, including recovery outcome.
     */
    async handleCrash(environment, containerId, affected) {
        const envLabel = environment === 'host' ? 'host' : `container ${containerId?.slice(0, 12)}`;
        this.log(`[watchdog] TMUX SERVER CRASH detected on ${envLabel} — ${affected.length} agent(s) affected`);
        const event = {
            timestamp: new Date(),
            environment,
            containerId,
            affectedExecutions: affected,
            recoveryAttempted: false,
            recoveredSessions: [],
            failedRecoveries: [],
        };
        // Mark affected executions with crash-specific error
        for (const exec of affected) {
            this.log(`[watchdog] Affected: ${exec.agentName} (${exec.ticketId}) — session: ${exec.sessionId || 'unknown'}`);
            this.storage.updateStatus(exec.id, 'failed', undefined, `tmux server crash on ${envLabel} — all sessions lost. ` +
                `${this.autoRecover ? 'Auto-recovery attempted.' : 'Manual restart required.'}`);
            this.storage.updateLifecycleState(exec.id, 'died');
        }
        // Attempt auto-recovery if enabled
        if (this.autoRecover) {
            event.recoveryAttempted = true;
            await this.attemptRecovery(event);
        }
        // Fire callback. It is a notification hook: a failure here must not
        // abort the cycle, otherwise the caller never records this crash as
        // handled and the same crash is processed again on the next cycle.
        if (this.onCrashDetected) {
            try {
                await this.onCrashDetected(event);
            }
            catch (error) {
                this.log(`[watchdog] onCrashDetected callback failed: ${error instanceof Error ? error.message : error}`);
            }
        }
        // Log recovery summary
        if (event.recoveryAttempted) {
            if (event.recoveredSessions.length > 0) {
                this.log(`[watchdog] Recovery: ${event.recoveredSessions.length}/${affected.length} sessions restored`);
            }
            if (event.failedRecoveries.length > 0) {
                this.log(`[watchdog] Recovery failed for: ${event.failedRecoveries.join(', ')}`);
            }
        }
        return event;
    }
    /**
     * Attempt to recover agent sessions after a tmux server crash.
     *
     * Steps:
     *   1. Wait up to 5 seconds for the tmux server to report running again.
     *   2. For each affected execution with a recorded session ID, start a new
     *      tmux session under that name running `claude --resume`.
     *   3. On success, mark the execution running/healthy and refresh its heartbeat.
     *
     * Each agent name ends up in exactly one of `event.recoveredSessions` or
     * `event.failedRecoveries`.
     *
     * NOTE(review): no working directory is passed to tmux here, so the new
     * session starts wherever tmux defaults to — confirm that is intended.
     *
     * @param {object} event - Crash event; its recovery lists are filled in place.
     */
    async attemptRecovery(event) {
        const { environment, containerId, affectedExecutions } = event;
        // Wait briefly for tmux server to recover (macOS wake scenario)
        const serverBack = await this.waitForServerRecovery(environment, containerId, 5000);
        if (!serverBack) {
            // Not fatal: recovery proceeds anyway and relies on `tmux new-session`
            // to bring a server up.
            this.log('[watchdog] Tmux server did not auto-recover, will try to start new sessions');
        }
        for (const exec of affectedExecutions) {
            if (!exec.sessionId) {
                event.failedRecoveries.push(exec.agentName);
                this.log(`[watchdog] Skip ${exec.agentName}: no session ID recorded`);
                continue;
            }
            try {
                const recovered = this.restartSession(exec, environment, containerId);
                if (recovered) {
                    // Update execution back to running
                    this.storage.updateStatus(exec.id, 'running');
                    this.storage.updateLifecycleState(exec.id, 'healthy');
                    this.storage.updateHeartbeat(exec.id);
                    // Record success only after storage accepted the updates, so a
                    // storage failure cannot list the agent as both recovered and failed.
                    event.recoveredSessions.push(exec.agentName);
                    this.log(`[watchdog] Recovered: ${exec.agentName} (${exec.ticketId})`);
                }
                else {
                    event.failedRecoveries.push(exec.agentName);
                    this.log(`[watchdog] Failed to recover: ${exec.agentName} (${exec.ticketId})`);
                }
            }
            catch (error) {
                event.failedRecoveries.push(exec.agentName);
                this.log(`[watchdog] Error recovering ${exec.agentName}: ${error instanceof Error ? error.message : error}`);
            }
        }
    }
    /**
     * Wait for the tmux server to come back online, polling once per second.
     *
     * @param {'host'|'container'} environment
     * @param {string|undefined} containerId
     * @param {number} timeoutMs - Maximum time to keep polling.
     * @returns {Promise<boolean>} true if the server reported running in time.
     */
    async waitForServerRecovery(environment, containerId, timeoutMs) {
        const start = Date.now();
        const checkInterval = 1000;
        while (Date.now() - start < timeoutMs) {
            const status = environment === 'host'
                ? getHostTmuxServerStatus()
                : getContainerTmuxServerStatus(containerId);
            if (status === 'running') {
                this.log('[watchdog] Tmux server recovered');
                return true;
            }
            await new Promise(resolve => setTimeout(resolve, checkInterval));
        }
        return false;
    }
    /**
     * Restart an agent session by creating a new tmux session (named after
     * `exec.sessionId`) that runs `claude --resume`, adding
     * `--dangerously-skip-permissions` when the execution's permission mode is
     * 'danger'.
     *
     * @returns {boolean} true if the tmux session was created.
     */
    restartSession(exec, environment, containerId) {
        const sessionName = exec.sessionId;
        // Build the claude command for re-launch
        let claudeCmd = 'claude --resume';
        if (exec.permissionMode === 'danger') {
            claudeCmd += ' --dangerously-skip-permissions';
        }
        if (environment === 'container' && containerId) {
            return this.restartContainerSession(containerId, sessionName, claudeCmd);
        }
        return this.restartHostSession(sessionName, claudeCmd);
    }
    /**
     * Restart a host tmux session.
     *
     * @returns {boolean} false if `tmux new-session` fails or times out.
     */
    restartHostSession(sessionName, command) {
        try {
            // Create new tmux session — this also starts the server if it's not running
            execFileSync('tmux', [
                'new-session', '-d',
                '-s', sessionName,
                '-n', sessionName,
                command,
            ], {
                stdio: ['pipe', 'pipe', 'pipe'],
                timeout: 10000,
            });
            // Enable mouse support
            try {
                execFileSync('tmux', ['set-option', '-g', 'mouse', 'on'], {
                    stdio: ['pipe', 'pipe', 'pipe'],
                    timeout: 5000,
                });
            }
            catch {
                // Non-critical: the session exists even if the option could not be set.
            }
            return true;
        }
        catch {
            return false;
        }
    }
    /**
     * Restart a tmux session inside a Docker container via `docker exec`.
     *
     * @returns {boolean} false if the command fails or times out.
     */
    restartContainerSession(containerId, sessionName, command) {
        try {
            // Create new tmux session inside container
            execFileSync('docker', [
                'exec', containerId,
                'tmux', 'new-session', '-d',
                '-s', sessionName,
                '-n', sessionName,
                command,
            ], {
                stdio: ['pipe', 'pipe', 'pipe'],
                timeout: 15000,
            });
            return true;
        }
        catch {
            return false;
        }
    }
    /**
     * Get a copy of the current known server states.
     * Useful for diagnostics and testing.
     *
     * @returns {Map<string, boolean>}
     */
    getServerStates() {
        return new Map(this.lastServerState);
    }
    /**
     * Reset internal state. Useful for testing.
     */
    reset() {
        this.lastServerState.clear();
        this.activeExecutionsSnapshot.clear();
        this.handledCrashes.clear();
    }
    /**
     * Seed the initial server state without triggering crash detection.
     * Call this once during watcher startup to establish baseline.
     *
     * The host is only seeded when it has active executions; each container
     * with active executions is seeded individually.
     *
     * @param {object[]} activeExecs - Currently active executions.
     */
    seedServerState(activeExecs) {
        const { hostExecs, containerGroups } = this.#groupByEnvironment(activeExecs);
        if (hostExecs.length > 0) {
            const hostStatus = getHostTmuxServerStatus();
            this.lastServerState.set('host', hostStatus === 'running');
            this.activeExecutionsSnapshot.set('host', hostExecs.map(e => e.id));
        }
        for (const [cId, execs] of containerGroups) {
            const status = getContainerTmuxServerStatus(cId);
            this.lastServerState.set(cId, status === 'running');
            this.activeExecutionsSnapshot.set(cId, execs.map(e => e.id));
        }
    }
}
|
|
370
|
+
// =============================================================================
|
|
371
|
+
// Notification Helpers
|
|
372
|
+
// =============================================================================
|
|
373
|
+
/**
 * Render a crash event as a multi-line, human-readable notification.
 *
 * The text contains a headline naming the crashed server, the event time, the
 * number of affected agents, one bullet per affected agent, and a closing
 * section describing the recovery outcome (or that auto-recovery is disabled).
 *
 * @param {object} event - Crash event as produced by TmuxWatchdog.handleCrash.
 * @returns {string} Lines joined with '\n'.
 */
export function formatCrashNotification(event) {
    const agents = event.affectedExecutions;
    let serverLabel;
    if (event.environment === 'host') {
        serverLabel = 'host tmux server';
    }
    else {
        serverLabel = `container ${event.containerId?.slice(0, 12)} tmux server`;
    }
    const output = [];
    output.push(`⚠️ TMUX SERVER CRASH — ${serverLabel}`);
    output.push(` Time: ${event.timestamp.toLocaleString()}`);
    output.push(` Affected agents: ${agents.length}`);
    output.push(...agents.map(agent => ` • ${agent.agentName} (${agent.ticketId})`));
    // Without a recovery attempt there is nothing to report beyond the hint.
    if (!event.recoveryAttempted) {
        output.push(' Auto-recovery: disabled', ' Run: prlt session health');
        return output.join('\n');
    }
    const restoredCount = event.recoveredSessions.length;
    if (restoredCount === agents.length) {
        output.push(` Recovery: ALL ${restoredCount} sessions restored`);
    }
    else if (restoredCount > 0) {
        output.push(` Recovery: ${restoredCount}/${agents.length} restored`);
        output.push(` Failed: ${event.failedRecoveries.join(', ')}`);
    }
    else {
        output.push(` Recovery: FAILED — all ${agents.length} sessions need manual restart`);
        output.push(' Run: prlt session health');
    }
    return output.join('\n');
}
|
|
407
|
+
/**
|
|
408
|
+
* Send a desktop notification about a tmux crash (best effort).
|
|
409
|
+
* Uses osascript on macOS, notify-send on Linux.
|
|
410
|
+
*/
|
|
411
|
+
export function sendDesktopNotification(title, message) {
|
|
412
|
+
try {
|
|
413
|
+
if (process.platform === 'darwin') {
|
|
414
|
+
execSync(`osascript -e 'display notification "${message.replace(/"/g, '\\"')}" with title "${title.replace(/"/g, '\\"')}"'`, { stdio: 'pipe', timeout: 5000 });
|
|
415
|
+
}
|
|
416
|
+
else if (process.platform === 'linux') {
|
|
417
|
+
execSync(`notify-send "${title.replace(/"/g, '\\"')}" "${message.replace(/"/g, '\\"')}" 2>/dev/null`, { stdio: 'pipe', timeout: 5000 });
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
catch {
|
|
421
|
+
// Best effort — notification systems may not be available
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
//# sourceMappingURL=tmux-watchdog.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tmux-watchdog.js","sourceRoot":"","sources":["../../../src/lib/session/tmux-watchdog.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AAG3D,OAAO,EACL,uBAAuB,EACvB,4BAA4B,GAE7B,MAAM,+BAA+B,CAAA;AAqDtC,gFAAgF;AAChF,gBAAgB;AAChB,gFAAgF;AAEhF,MAAM,OAAO,YAAY;IACf,OAAO,CAAkB;IACzB,WAAW,CAAS;IACpB,GAAG,CAAuB;IAC1B,eAAe,CAAkD;IAEzE;;;;OAIG;IACK,eAAe,GAAG,IAAI,GAAG,EAAmB,CAAA;IAEpD;;;;OAIG;IACK,wBAAwB,GAAG,IAAI,GAAG,EAAoB,CAAA;IAE9D;;;;OAIG;IACK,cAAc,GAAG,IAAI,GAAG,EAAkB,CAAA;IAElD,YAAY,OAA4B;QACtC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAA;QAC9B,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,IAAI,CAAA;QAC9C,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAA;QACpC,IAAI,CAAC,eAAe,GAAG,OAAO,CAAC,eAAe,CAAA;IAChD,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,eAAe;QACnB,MAAM,MAAM,GAAwB;YAClC,eAAe,EAAE,IAAI;YACrB,qBAAqB,EAAE,IAAI,GAAG,EAAE;YAChC,WAAW,EAAE,EAAE;YACf,aAAa,EAAE,CAAC;YAChB,cAAc,EAAE,CAAC;SAClB,CAAA;QAED,4BAA4B;QAC5B,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,CAAA;QACvE,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAA;QACzE,MAAM,WAAW,GAAG,CAAC,GAAG,YAAY,EAAE,GAAG,aAAa,CAAC,CAAA;QAEvD,uBAAuB;QACvB,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,CAClC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,CAAC,WAAW,KAAK,SAAS,CAC7D,CAAA;QACD,MAAM,cAAc,GAAG,WAAW,CAAC,MAAM,CACvC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,KAAK,cAAc,IAAI,CAAC,CAAC,WAAW,KAAK,QAAQ,CAAC,IAAI,CAAC,CAAC,WAAW,CACvF,CAAA;QAED,6CAA6C;QAC7C,MAAM,eAAe,GAAG,IAAI,GAAG,EAAuB,CAAA;QACtD,KAAK,MAAM,IAAI,IAAI,cAAc,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,IAAI,CAAC,WAAY,CAAA;YAC7B,MAAM,KAAK,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAA;YAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YAChB,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAA;QACjC,CAAC;QAED,iCAAiC;QACjC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAC7D,MAAM,UAAU,GAAG,uBAAuB,EAAE,CAAA;YAC5C,MAAM,SAA
S,GAAG,UAAU,KAAK,SAAS,CAAA;YAC1C,MAAM,CAAC,eAAe,GAAG,SAAS,CAAA;YAClC,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;YAEjD,iEAAiE;YACjE,IAAI,SAAS,EAAE,CAAC;gBACd,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;gBACnE,6CAA6C;gBAC7C,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YACpC,CAAC;YAED,gFAAgF;YAChF,IAAI,QAAQ,KAAK,IAAI,IAAI,CAAC,SAAS,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;gBACxE,MAAM,WAAW,GAAG,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,CAAA;gBACnE,yDAAyD;gBACzD,MAAM,QAAQ,GAAG,WAAW;qBACzB,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC;qBACxC,MAAM,CAAC,CAAC,CAAC,EAAkB,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,KAAK,SAAS,IAAI,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAA;gBAEnG,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACxB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAA;oBACjE,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;oBAC9B,MAAM,CAAC,aAAa,IAAI,KAAK,CAAC,kBAAkB,CAAC,MAAM,CAAA;oBACvD,MAAM,CAAC,cAAc,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,CAAA;oBACvD,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;gBAC7C,CAAC;YACH,CAAC;YAED,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAA;QAC7C,CAAC;QAED,uCAAuC;QACvC,KAAK,MAAM,CAAC,WAAW,EAAE,KAAK,CAAC,IAAI,eAAe,EAAE,CAAC;YACnD,MAAM,eAAe,GAAG,4BAA4B,CAAC,WAAW,CAAC,CAAA;YACjE,MAAM,cAAc,GAAG,eAAe,KAAK,SAAS,CAAA;YACpD,MAAM,CAAC,qBAAqB,CAAC,GAAG,CAAC,WAAW,EAAE,cAAc,CAAC,CAAA;YAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,WAAW,CAAC,CAAA;YAEtD,IAAI,cAAc,EAAE,CAAC;gBACnB,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;gBACpE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,WAAW,CAAC,CAAA;YACzC,CAAC;YAED,IAAI,QAAQ,KAAK,IAAI,IAAI,CAAC,cAAc,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC;gBAClF,MAAM,WAAW,GAAG,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAA;gBACxE,MAAM,QAAQ,GAAG,WAAW;qBACzB,GAA
G,CAAC,EAAE,CAAC,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC;qBACxC,MAAM,CAAC,CAAC,CAAC,EAAkB,EAAE,CAAC,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,KAAK,SAAS,IAAI,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAA;gBAEnG,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACxB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,WAAW,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAA;oBACxE,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;oBAC9B,MAAM,CAAC,aAAa,IAAI,KAAK,CAAC,kBAAkB,CAAC,MAAM,CAAA;oBACvD,MAAM,CAAC,cAAc,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,CAAA;oBACvD,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;gBAClD,CAAC;YACH,CAAC;YAED,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,WAAW,EAAE,cAAc,CAAC,CAAA;QACvD,CAAC;QAED,yEAAyE;QACzE,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;YAC9C,IAAI,GAAG,KAAK,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBAChD,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;gBAChC,IAAI,CAAC,wBAAwB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;gBACzC,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;YACjC,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,WAAW,CACvB,WAAiC,EACjC,WAA+B,EAC/B,QAAqB;QAErB,MAAM,QAAQ,GAAG,WAAW,KAAK,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,aAAa,WAAW,EAAE,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAA;QAC3F,IAAI,CAAC,GAAG,CACN,4CAA4C,QAAQ,MAAM,QAAQ,CAAC,MAAM,oBAAoB,CAC9F,CAAA;QAED,MAAM,KAAK,GAAmB;YAC5B,SAAS,EAAE,IAAI,IAAI,EAAE;YACrB,WAAW;YACX,WAAW;YACX,kBAAkB,EAAE,QAAQ;YAC5B,iBAAiB,EAAE,KAAK;YACxB,iBAAiB,EAAE,EAAE;YACrB,gBAAgB,EAAE,EAAE;SACrB,CAAA;QAED,qDAAqD;QACrD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;YAC5B,IAAI,CAAC,GAAG,CACN,0BAA0B,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,QAAQ,gBAAgB,IAAI,CAAC,SAAS,IAAI,SAAS,EAAE,CACxG,CAAA;YACD,IAAI,CAAC,OAAO,CAAC,YAAY,CACvB,IAAI,CAAC,EAAE,EACP,QAAQ,EACR,SAAS,EACT,wBAAwB,QAAQ,wBAAwB;gBACxD,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,0BAA0B,CAAC,CAAC,CAAC,0BAA0B,EAAE,CAChF,CAAA;YACD,IAAI,CAAC,OAAO,CAAC,oBAAoB,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,CAAA;QACpD,CAAC;QAED,mCAAmC;QACnC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,KAAK,CAAC,iBAAiB,GAAG,IAA
I,CAAA;YAC9B,MAAM,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,CAAA;QACnC,CAAC;QAED,gBAAgB;QAChB,IAAI,IAAI,CAAC,eAAe,EAAE,CAAC;YACzB,MAAM,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,CAAA;QACnC,CAAC;QAED,uBAAuB;QACvB,IAAI,KAAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvC,IAAI,CAAC,GAAG,CACN,wBAAwB,KAAK,CAAC,iBAAiB,CAAC,MAAM,IAAI,QAAQ,CAAC,MAAM,oBAAoB,CAC9F,CAAA;YACH,CAAC;YACD,IAAI,KAAK,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,CACN,mCAAmC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACvE,CAAA;YACH,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAA;IACd,CAAC;IAED;;;;;;;;OAQG;IACK,KAAK,CAAC,eAAe,CAAC,KAAqB;QACjD,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,kBAAkB,EAAE,GAAG,KAAK,CAAA;QAE9D,gEAAgE;QAChE,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,qBAAqB,CAAC,WAAW,EAAE,WAAW,EAAE,IAAI,CAAC,CAAA;QAEnF,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,2EAA2E;YAC3E,2DAA2D;YAC3D,IAAI,CAAC,GAAG,CAAC,6EAA6E,CAAC,CAAA;QACzF,CAAC;QAED,KAAK,MAAM,IAAI,IAAI,kBAAkB,EAAE,CAAC;YACtC,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAA;YAClC,IAAI,CAAC,WAAW,EAAE,CAAC;gBACjB,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBAC3C,IAAI,CAAC,GAAG,CAAC,qBAAqB,IAAI,CAAC,SAAS,0BAA0B,CAAC,CAAA;gBACvE,SAAQ;YACV,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,WAAW,EAAE,WAAW,CAAC,CAAA;gBACrE,IAAI,SAAS,EAAE,CAAC;oBACd,KAAK,CAAC,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;oBAE5C,mCAAmC;oBACnC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,EAAE,EAAE,SAAS,CAAC,CAAA;oBAC7C,IAAI,CAAC,OAAO,CAAC,oBAAoB,CAAC,IAAI,CAAC,EAAE,EAAE,SAAS,CAAC,CAAA;oBACrD,IAAI,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;oBAErC,IAAI,CAAC,GAAG,CAAC,2BAA2B,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAA;gBAC1E,CAAC;qBAAM,CAAC;oBACN,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;oBAC3C,IAAI,CAAC,GAAG,CAAC,mCAAmC,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAA;gBAClF,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBAC3C,IAAI,CAAC,GAAG,CACN,iCAAiC,IAAI,CAAC,SAAS,KAAK,KAAK,YAAY,
KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CACrG,CAAA;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,qBAAqB,CACjC,WAAiC,EACjC,WAA+B,EAC/B,SAAiB;QAEjB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;QACxB,MAAM,aAAa,GAAG,IAAI,CAAA;QAE1B,OAAO,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,GAAG,SAAS,EAAE,CAAC;YACtC,MAAM,MAAM,GACV,WAAW,KAAK,MAAM;gBACpB,CAAC,CAAC,uBAAuB,EAAE;gBAC3B,CAAC,CAAC,4BAA4B,CAAC,WAAY,CAAC,CAAA;YAEhD,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;gBACzB,IAAI,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAA;gBAC5C,OAAO,IAAI,CAAA;YACb,CAAC;YAED,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC,CAAA;QAClE,CAAC;QAED,OAAO,KAAK,CAAA;IACd,CAAC;IAED;;OAEG;IACK,cAAc,CACpB,IAAe,EACf,WAAiC,EACjC,WAA+B;QAE/B,MAAM,WAAW,GAAG,IAAI,CAAC,SAAU,CAAA;QAEnC,yCAAyC;QACzC,IAAI,SAAS,GAAG,iBAAiB,CAAA;QACjC,IAAI,IAAI,CAAC,cAAc,KAAK,QAAQ,EAAE,CAAC;YACrC,SAAS,IAAI,iCAAiC,CAAA;QAChD,CAAC;QAED,IAAI,WAAW,KAAK,WAAW,IAAI,WAAW,EAAE,CAAC;YAC/C,OAAO,IAAI,CAAC,uBAAuB,CAAC,WAAW,EAAE,WAAW,EAAE,SAAS,CAAC,CAAA;QAC1E,CAAC;QAED,OAAO,IAAI,CAAC,kBAAkB,CAAC,WAAW,EAAE,SAAS,CAAC,CAAA;IACxD,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,WAAmB,EAAE,OAAe;QAC7D,IAAI,CAAC;YACH,4EAA4E;YAC5E,YAAY,CAAC,MAAM,EAAE;gBACnB,aAAa,EAAE,IAAI;gBACnB,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,WAAW;gBACjB,OAAO;aACR,EAAE;gBACD,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;gBAC/B,OAAO,EAAE,KAAK;aACf,CAAC,CAAA;YAEF,uBAAuB;YACvB,IAAI,CAAC;gBACH,YAAY,CAAC,MAAM,EAAE,CAAC,YAAY,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE;oBACxD,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;oBAC/B,OAAO,EAAE,IAAI;iBACd,CAAC,CAAA;YACJ,CAAC;YAAC,MAAM,CAAC;gBACP,eAAe;YACjB,CAAC;YAED,OAAO,IAAI,CAAA;QACb,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAA;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,uBAAuB,CAC7B,WAAmB,EACnB,WAAmB,EACnB,OAAe;QAEf,IAAI,CAAC;YACH,2CAA2C;YAC3C,YAAY,CAAC,QAAQ,EAAE;gBACrB,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,aAAa,EAAE,IAAI;gBAC3B,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,WAAW;gBACjB,OAAO;aACR,EAAE;gBACD,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;gBAC/B,OAAO,EAAE,KAAK;aACf,CAAC,CAAA;YACF,O
AAO,IAAI,CAAA;QACb,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAA;QACd,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,eAAe;QACb,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,CAAA;IACtC,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAA;QAC5B,IAAI,CAAC,wBAAwB,CAAC,KAAK,EAAE,CAAA;QACrC,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,CAAA;IAC7B,CAAC;IAED;;;OAGG;IACH,eAAe,CAAC,WAAwB;QACtC,uBAAuB;QACvB,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,CAClC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,KAAK,MAAM,IAAI,CAAC,CAAC,WAAW,KAAK,SAAS,CAC7D,CAAA;QACD,MAAM,cAAc,GAAG,WAAW,CAAC,MAAM,CACvC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,KAAK,cAAc,IAAI,CAAC,CAAC,WAAW,KAAK,QAAQ,CAAC,IAAI,CAAC,CAAC,WAAW,CACvF,CAAA;QAED,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,UAAU,GAAG,uBAAuB,EAAE,CAAA;YAC5C,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,MAAM,EAAE,UAAU,KAAK,SAAS,CAAC,CAAA;YAC1D,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QACrE,CAAC;QAED,MAAM,eAAe,GAAG,IAAI,GAAG,EAAuB,CAAA;QACtD,KAAK,MAAM,IAAI,IAAI,cAAc,EAAE,CAAC;YAClC,MAAM,GAAG,GAAG,IAAI,CAAC,WAAY,CAAA;YAC7B,MAAM,KAAK,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAA;YAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YAChB,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAA;QACjC,CAAC;QAED,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,eAAe,EAAE,CAAC;YAC3C,MAAM,MAAM,GAAG,4BAA4B,CAAC,GAAG,CAAC,CAAA;YAChD,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,SAAS,CAAC,CAAA;YACnD,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QAC9D,CAAC;IACH,CAAC;CACF;AAED,gFAAgF;AAChF,uBAAuB;AACvB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAqB;IAC3D,MAAM,QAAQ,GACZ,KAAK,CAAC,WAAW,KAAK,MAAM;QAC1B,CAAC,CAAC,kBAAkB;QACpB,CAAC,CAAC,aAAa,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAA;IAEhE,MAAM,KAAK,GAAa;QACtB,2BAA2B,QAAQ,EAAE;QACrC,YAAY,KAAK,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE;QAC9C,uBAAuB,KAAK,CAAC,kBAAkB,CAAC,MAAM,EAAE;KACzD,CAAA;IAED,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,kBAAkB,EAAE,CAAC;QAC5C,KAAK,CAAC,IAAI,C
AAC,UAAU,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAA;IAC3D,CAAC;IAED,IAAI,KAAK,CAAC,iBAAiB,EAAE,CAAC;QAC5B,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,KAAK,KAAK,CAAC,kBAAkB,CAAC,MAAM,EAAE,CAAC;YACvE,KAAK,CAAC,IAAI,CAAC,oBAAoB,KAAK,CAAC,iBAAiB,CAAC,MAAM,oBAAoB,CAAC,CAAA;QACpF,CAAC;aAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9C,KAAK,CAAC,IAAI,CACR,gBAAgB,KAAK,CAAC,iBAAiB,CAAC,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,MAAM,WAAW,CAC7F,CAAA;YACD,KAAK,CAAC,IAAI,CAAC,cAAc,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QAC/D,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC,6BAA6B,KAAK,CAAC,kBAAkB,CAAC,MAAM,+BAA+B,CAAC,CAAA;YACvG,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAA;QAC3C,CAAC;IACH,CAAC;SAAM,CAAC;QACN,KAAK,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAA;QACxC,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAA;IAC3C,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACzB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAa,EAAE,OAAe;IACpE,IAAI,CAAC;QACH,IAAI,OAAO,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;YAClC,QAAQ,CACN,uCAAuC,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,iBAAiB,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,EAClH,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,CACjC,CAAA;QACH,CAAC;aAAM,IAAI,OAAO,CAAC,QAAQ,KAAK,OAAO,EAAE,CAAC;YACxC,QAAQ,CACN,gBAAgB,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,MAAM,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,eAAe,EAC3F,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,CACjC,CAAA;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,0DAA0D;IAC5D,CAAC;AACH,CAAC"}
|
|
@@ -6,14 +6,15 @@
|
|
|
6
6
|
* that can't self-report (OOM kills, zombie processes, container crashes).
|
|
7
7
|
*
|
|
8
8
|
* Architecture:
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
9
|
+
* Phase 0: Tmux server health check — detect full server crashes (PRLT-1115)
|
|
10
|
+
* Phase 1: Record heartbeats for alive agents (tmux pane inspection)
|
|
11
|
+
* Phase 2: Detect stale executions that exceeded the timeout
|
|
12
|
+
* Phase 3: Act on stale executions — mark failed, kill containers, fire events
|
|
13
13
|
*/
|
|
14
14
|
import type Database from 'better-sqlite3';
|
|
15
15
|
import type { AgentWork } from '../execution/types.js';
|
|
16
16
|
import { type StaleExecution } from './heartbeat.js';
|
|
17
|
+
import { TmuxWatchdog, type TmuxCrashEvent } from './tmux-watchdog.js';
|
|
17
18
|
export interface WatcherOptions {
|
|
18
19
|
/** Database connection */
|
|
19
20
|
db: Database.Database;
|
|
@@ -23,10 +24,14 @@ export interface WatcherOptions {
|
|
|
23
24
|
timeoutMinutes?: number;
|
|
24
25
|
/** Whether to kill containers on timeout (default: true) */
|
|
25
26
|
autoKill?: boolean;
|
|
27
|
+
/** Whether to auto-recover from tmux server crashes (default: true) */
|
|
28
|
+
autoRecover?: boolean;
|
|
26
29
|
/** Logger function */
|
|
27
30
|
log?: (msg: string) => void;
|
|
28
31
|
/** Callback when a stale execution is detected and handled */
|
|
29
32
|
onStaleDetected?: (execution: AgentWork, reason: string) => void | Promise<void>;
|
|
33
|
+
/** Callback when a tmux server crash is detected */
|
|
34
|
+
onCrashDetected?: (event: TmuxCrashEvent) => void | Promise<void>;
|
|
30
35
|
}
|
|
31
36
|
export interface WatchCycleResult {
|
|
32
37
|
/** Number of active executions checked */
|
|
@@ -37,6 +42,10 @@ export interface WatchCycleResult {
|
|
|
37
42
|
staleExecutions: StaleExecution[];
|
|
38
43
|
/** Number of containers killed */
|
|
39
44
|
containersKilled: number;
|
|
45
|
+
/** Tmux crash events detected in this cycle */
|
|
46
|
+
crashEvents: TmuxCrashEvent[];
|
|
47
|
+
/** Total sessions recovered from crashes */
|
|
48
|
+
crashRecoveries: number;
|
|
40
49
|
}
|
|
41
50
|
export declare class SessionWatcher {
|
|
42
51
|
private storage;
|
|
@@ -44,10 +53,14 @@ export declare class SessionWatcher {
|
|
|
44
53
|
private intervalMinutes;
|
|
45
54
|
private timeoutMinutes;
|
|
46
55
|
private autoKill;
|
|
56
|
+
private autoRecover;
|
|
47
57
|
private log;
|
|
48
58
|
private onStaleDetected?;
|
|
59
|
+
private onCrashDetected?;
|
|
49
60
|
private timer;
|
|
50
61
|
private running;
|
|
62
|
+
private watchdog;
|
|
63
|
+
private isFirstCycle;
|
|
51
64
|
constructor(options: WatcherOptions);
|
|
52
65
|
/**
|
|
53
66
|
* Run a single watch cycle.
|
|
@@ -74,6 +87,11 @@ export declare class SessionWatcher {
|
|
|
74
87
|
intervalMinutes: number;
|
|
75
88
|
timeoutMinutes: number;
|
|
76
89
|
autoKill: boolean;
|
|
90
|
+
autoRecover: boolean;
|
|
77
91
|
};
|
|
92
|
+
/**
|
|
93
|
+
* Get the tmux watchdog instance (for testing/diagnostics).
|
|
94
|
+
*/
|
|
95
|
+
getWatchdog(): TmuxWatchdog;
|
|
78
96
|
private runCycleWithErrorHandling;
|
|
79
97
|
}
|
|
@@ -6,13 +6,14 @@
|
|
|
6
6
|
* that can't self-report (OOM kills, zombie processes, container crashes).
|
|
7
7
|
*
|
|
8
8
|
* Architecture:
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
9
|
+
* Phase 0: Tmux server health check — detect full server crashes (PRLT-1115)
|
|
10
|
+
* Phase 1: Record heartbeats for alive agents (tmux pane inspection)
|
|
11
|
+
* Phase 2: Detect stale executions that exceeded the timeout
|
|
12
|
+
* Phase 3: Act on stale executions — mark failed, kill containers, fire events
|
|
13
13
|
*/
|
|
14
14
|
import { ExecutionStorage } from '../execution/storage.js';
|
|
15
15
|
import { recordAllHeartbeats, detectStaleExecutions, killContainer, } from './heartbeat.js';
|
|
16
|
+
import { TmuxWatchdog, formatCrashNotification, sendDesktopNotification, } from './tmux-watchdog.js';
|
|
16
17
|
// =============================================================================
|
|
17
18
|
// Session Watcher
|
|
18
19
|
// =============================================================================
|
|
@@ -22,18 +23,40 @@ export class SessionWatcher {
|
|
|
22
23
|
intervalMinutes;
|
|
23
24
|
timeoutMinutes;
|
|
24
25
|
autoKill;
|
|
26
|
+
autoRecover;
|
|
25
27
|
log;
|
|
26
28
|
onStaleDetected;
|
|
29
|
+
onCrashDetected;
|
|
27
30
|
timer = null;
|
|
28
31
|
running = false;
|
|
32
|
+
watchdog;
|
|
33
|
+
isFirstCycle = true;
|
|
29
34
|
constructor(options) {
|
|
30
35
|
this.db = options.db;
|
|
31
36
|
this.storage = new ExecutionStorage(options.db);
|
|
32
37
|
this.intervalMinutes = options.intervalMinutes ?? 5;
|
|
33
38
|
this.timeoutMinutes = options.timeoutMinutes ?? 15;
|
|
34
39
|
this.autoKill = options.autoKill ?? true;
|
|
40
|
+
this.autoRecover = options.autoRecover ?? true;
|
|
35
41
|
this.log = options.log ?? (() => { });
|
|
36
42
|
this.onStaleDetected = options.onStaleDetected;
|
|
43
|
+
this.onCrashDetected = options.onCrashDetected;
|
|
44
|
+
this.watchdog = new TmuxWatchdog({
|
|
45
|
+
storage: this.storage,
|
|
46
|
+
autoRecover: this.autoRecover,
|
|
47
|
+
log: this.log,
|
|
48
|
+
onCrashDetected: async (event) => {
|
|
49
|
+
// Send desktop notification
|
|
50
|
+
const notification = formatCrashNotification(event);
|
|
51
|
+
this.log(notification);
|
|
52
|
+
sendDesktopNotification('prlt: tmux server crash', `${event.affectedExecutions.length} agent(s) affected. ` +
|
|
53
|
+
`${event.recoveredSessions.length} recovered.`);
|
|
54
|
+
// Fire user callback
|
|
55
|
+
if (this.onCrashDetected) {
|
|
56
|
+
await this.onCrashDetected(event);
|
|
57
|
+
}
|
|
58
|
+
},
|
|
59
|
+
});
|
|
37
60
|
}
|
|
38
61
|
/**
|
|
39
62
|
* Run a single watch cycle.
|
|
@@ -45,7 +68,25 @@ export class SessionWatcher {
|
|
|
45
68
|
heartbeatsUpdated: 0,
|
|
46
69
|
staleExecutions: [],
|
|
47
70
|
containersKilled: 0,
|
|
71
|
+
crashEvents: [],
|
|
72
|
+
crashRecoveries: 0,
|
|
48
73
|
};
|
|
74
|
+
// Phase 0: Tmux server health check (PRLT-1115)
|
|
75
|
+
// Detect full server crashes before doing individual heartbeat checks.
|
|
76
|
+
// On the first cycle, seed the watchdog state without triggering crash detection.
|
|
77
|
+
if (this.isFirstCycle) {
|
|
78
|
+
const activeExecs = [
|
|
79
|
+
...this.storage.listExecutions({ status: 'running' }),
|
|
80
|
+
...this.storage.listExecutions({ status: 'starting' }),
|
|
81
|
+
];
|
|
82
|
+
this.watchdog.seedServerState(activeExecs);
|
|
83
|
+
this.isFirstCycle = false;
|
|
84
|
+
}
|
|
85
|
+
else {
|
|
86
|
+
const watchdogResult = await this.watchdog.checkAndRecover();
|
|
87
|
+
result.crashEvents = watchdogResult.crashEvents;
|
|
88
|
+
result.crashRecoveries = watchdogResult.totalRecovered;
|
|
89
|
+
}
|
|
49
90
|
// Phase 1: Record heartbeats for all active executions
|
|
50
91
|
// This is the "push" side — we check tmux panes and update heartbeats
|
|
51
92
|
const heartbeats = recordAllHeartbeats(this.storage);
|
|
@@ -88,7 +129,8 @@ export class SessionWatcher {
|
|
|
88
129
|
if (this.running)
|
|
89
130
|
return;
|
|
90
131
|
this.running = true;
|
|
91
|
-
this.log(`[watcher] Starting session watcher (interval: ${this.intervalMinutes}m, timeout: ${this.timeoutMinutes}m,
|
|
132
|
+
this.log(`[watcher] Starting session watcher (interval: ${this.intervalMinutes}m, timeout: ${this.timeoutMinutes}m, ` +
|
|
133
|
+
`auto-kill: ${this.autoKill}, auto-recover: ${this.autoRecover})`);
|
|
92
134
|
// Run initial cycle
|
|
93
135
|
void this.runCycleWithErrorHandling();
|
|
94
136
|
// Start polling
|
|
@@ -124,14 +166,30 @@ export class SessionWatcher {
|
|
|
124
166
|
intervalMinutes: this.intervalMinutes,
|
|
125
167
|
timeoutMinutes: this.timeoutMinutes,
|
|
126
168
|
autoKill: this.autoKill,
|
|
169
|
+
autoRecover: this.autoRecover,
|
|
127
170
|
};
|
|
128
171
|
}
|
|
172
|
+
/**
|
|
173
|
+
* Get the tmux watchdog instance (for testing/diagnostics).
|
|
174
|
+
*/
|
|
175
|
+
getWatchdog() {
|
|
176
|
+
return this.watchdog;
|
|
177
|
+
}
|
|
129
178
|
async runCycleWithErrorHandling() {
|
|
130
179
|
try {
|
|
131
180
|
const result = await this.runCycle();
|
|
132
|
-
if (result.staleExecutions.length > 0) {
|
|
133
|
-
|
|
134
|
-
`${result.
|
|
181
|
+
if (result.staleExecutions.length > 0 || result.crashEvents.length > 0) {
|
|
182
|
+
const parts = [
|
|
183
|
+
`${result.checked} checked`,
|
|
184
|
+
`${result.heartbeatsUpdated} heartbeats`,
|
|
185
|
+
`${result.staleExecutions.length} stale`,
|
|
186
|
+
`${result.containersKilled} killed`,
|
|
187
|
+
];
|
|
188
|
+
if (result.crashEvents.length > 0) {
|
|
189
|
+
parts.push(`${result.crashEvents.length} crash(es)`);
|
|
190
|
+
parts.push(`${result.crashRecoveries} recovered`);
|
|
191
|
+
}
|
|
192
|
+
this.log(`[watcher] Cycle complete: ${parts.join(', ')}`);
|
|
135
193
|
}
|
|
136
194
|
}
|
|
137
195
|
catch (error) {
|