@coralai/sps-cli 0.15.12 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/commands/monitorTick.d.ts.map +1 -1
- package/dist/commands/monitorTick.js +3 -1
- package/dist/commands/monitorTick.js.map +1 -1
- package/dist/commands/pipelineTick.d.ts.map +1 -1
- package/dist/commands/pipelineTick.js +12 -3
- package/dist/commands/pipelineTick.js.map +1 -1
- package/dist/commands/tick.d.ts +1 -0
- package/dist/commands/tick.d.ts.map +1 -1
- package/dist/commands/tick.js +64 -8
- package/dist/commands/tick.js.map +1 -1
- package/dist/commands/workerLaunch.d.ts.map +1 -1
- package/dist/commands/workerLaunch.js +12 -3
- package/dist/commands/workerLaunch.js.map +1 -1
- package/dist/engines/ExecutionEngine.d.ts +29 -32
- package/dist/engines/ExecutionEngine.d.ts.map +1 -1
- package/dist/engines/ExecutionEngine.js +224 -527
- package/dist/engines/ExecutionEngine.js.map +1 -1
- package/dist/engines/MonitorEngine.d.ts +14 -27
- package/dist/engines/MonitorEngine.d.ts.map +1 -1
- package/dist/engines/MonitorEngine.js +91 -313
- package/dist/engines/MonitorEngine.js.map +1 -1
- package/dist/main.js +0 -0
- package/dist/manager/completion-judge.d.ts +27 -0
- package/dist/manager/completion-judge.d.ts.map +1 -0
- package/dist/manager/completion-judge.js +81 -0
- package/dist/manager/completion-judge.js.map +1 -0
- package/dist/manager/pm-client.d.ts +10 -0
- package/dist/manager/pm-client.d.ts.map +1 -0
- package/dist/manager/pm-client.js +245 -0
- package/dist/manager/pm-client.js.map +1 -0
- package/dist/manager/post-actions.d.ts +60 -0
- package/dist/manager/post-actions.d.ts.map +1 -0
- package/dist/manager/post-actions.js +326 -0
- package/dist/manager/post-actions.js.map +1 -0
- package/dist/manager/recovery.d.ts +39 -0
- package/dist/manager/recovery.d.ts.map +1 -0
- package/dist/manager/recovery.js +133 -0
- package/dist/manager/recovery.js.map +1 -0
- package/dist/manager/resource-limiter.d.ts +44 -0
- package/dist/manager/resource-limiter.d.ts.map +1 -0
- package/dist/manager/resource-limiter.js +79 -0
- package/dist/manager/resource-limiter.js.map +1 -0
- package/dist/manager/supervisor.d.ts +70 -0
- package/dist/manager/supervisor.d.ts.map +1 -0
- package/dist/manager/supervisor.js +216 -0
- package/dist/manager/supervisor.js.map +1 -0
- package/package.json +1 -1
|
@@ -4,13 +4,15 @@ import { Logger } from '../core/logger.js';
|
|
|
4
4
|
/**
|
|
5
5
|
* MonitorEngine performs anomaly detection and health checks.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
7
|
+
* With ProcessSupervisor, completion detection and post-actions are handled
|
|
8
|
+
* by exit callbacks (CompletionJudge + PostActions). MonitorEngine focuses on:
|
|
9
|
+
* 1. Orphan slot cleanup (stale entries not tracked by Supervisor)
|
|
10
|
+
* 2. Stale runtime detection (Inprogress cards with no worker)
|
|
11
|
+
* 3. Timeout detection (INPROGRESS_TIMEOUT_HOURS)
|
|
12
|
+
* 4. Waiting confirmation detection (interactive mode only)
|
|
13
|
+
* 5. BLOCKED condition check
|
|
14
|
+
* 6. State alignment (Supervisor vs state.json sync)
|
|
15
|
+
* 7. Worker health (launch/idle timeouts for Supervisor-tracked workers)
|
|
14
16
|
*/
|
|
15
17
|
export class MonitorEngine {
|
|
16
18
|
ctx;
|
|
@@ -18,13 +20,15 @@ export class MonitorEngine {
|
|
|
18
20
|
workerProvider;
|
|
19
21
|
repoBackend;
|
|
20
22
|
notifier;
|
|
23
|
+
supervisor;
|
|
21
24
|
log;
|
|
22
|
-
constructor(ctx, taskBackend, workerProvider, repoBackend, notifier) {
|
|
25
|
+
constructor(ctx, taskBackend, workerProvider, repoBackend, notifier, supervisor) {
|
|
23
26
|
this.ctx = ctx;
|
|
24
27
|
this.taskBackend = taskBackend;
|
|
25
28
|
this.workerProvider = workerProvider;
|
|
26
29
|
this.repoBackend = repoBackend;
|
|
27
30
|
this.notifier = notifier;
|
|
31
|
+
this.supervisor = supervisor;
|
|
28
32
|
this.log = new Logger('monitor', ctx.projectName, ctx.paths.logsDir);
|
|
29
33
|
}
|
|
30
34
|
async tick() {
|
|
@@ -41,19 +45,12 @@ export class MonitorEngine {
|
|
|
41
45
|
details: { checks },
|
|
42
46
|
};
|
|
43
47
|
try {
|
|
44
|
-
// 0. Worker health check (print-mode: launch timeout, idle timeout, auto-retry)
|
|
45
48
|
await this.checkWorkerHealth(checks, actions);
|
|
46
|
-
// 1. Orphan slot cleanup
|
|
47
49
|
await this.checkOrphanSlots(checks, actions);
|
|
48
|
-
// 2. Stale runtime detection
|
|
49
50
|
await this.checkStaleRuntimes(checks, actions, recommendedActions);
|
|
50
|
-
// 3. Timeout detection
|
|
51
51
|
await this.checkTimeouts(checks, actions, recommendedActions);
|
|
52
|
-
// 4. Waiting confirmation detection
|
|
53
52
|
await this.checkWaitingConfirmation(checks, actions);
|
|
54
|
-
// 5. BLOCKED condition check
|
|
55
53
|
await this.checkBlockedCards(checks);
|
|
56
|
-
// 6. State alignment
|
|
57
54
|
await this.checkStateAlignment(checks, recommendedActions);
|
|
58
55
|
}
|
|
59
56
|
catch (err) {
|
|
@@ -63,59 +60,36 @@ export class MonitorEngine {
|
|
|
63
60
|
result.exitCode = 1;
|
|
64
61
|
result.details = { error: msg, checks };
|
|
65
62
|
}
|
|
66
|
-
// Set degraded if any checks failed
|
|
67
63
|
if (checks.some((c) => c.status === 'fail') && result.status === 'ok') {
|
|
68
64
|
result.status = 'degraded';
|
|
69
65
|
}
|
|
70
66
|
return result;
|
|
71
67
|
}
|
|
72
|
-
// ─── Check 1: Orphan Slot Cleanup
|
|
68
|
+
// ─── Check 1: Orphan Slot Cleanup ─────────────────────────────
|
|
73
69
|
async checkOrphanSlots(checks, actions) {
|
|
74
70
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
75
71
|
let orphansFound = 0;
|
|
76
72
|
for (const [slotName, slotState] of Object.entries(state.workers)) {
|
|
77
|
-
if (slotState.status !== 'active'
|
|
73
|
+
if (slotState.status !== 'active')
|
|
78
74
|
continue;
|
|
79
|
-
//
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
}
|
|
85
|
-
catch {
|
|
86
|
-
// PID dead — but did the worker actually complete its task?
|
|
87
|
-
const completionStatus = await this.checkPrintWorkerCompletion(slotState);
|
|
88
|
-
if (completionStatus === 'COMPLETED') {
|
|
89
|
-
this.log.ok(`Orphan slot ${slotName}: print-mode pid ${slotState.pid} is dead but task COMPLETED, handling as completion`);
|
|
90
|
-
const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
|
|
91
|
-
if (handled)
|
|
92
|
-
orphansFound++; // only count if slot was actually released
|
|
93
|
-
continue;
|
|
94
|
-
}
|
|
95
|
-
// Not completed → fall through to orphan cleanup
|
|
96
|
-
this.log.warn(`Orphan slot ${slotName}: print-mode pid ${slotState.pid} is dead (status=${completionStatus}), releasing`);
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
else if (slotState.mode === 'print' && !slotState.pid) {
|
|
100
|
-
// Print mode but no PID recorded — skip, launch may still be writing state
|
|
101
|
-
this.log.debug(`Skipping orphan check for ${slotName}: print mode, no PID yet`);
|
|
75
|
+
// Build the workerId that Supervisor would track
|
|
76
|
+
const seq = slotState.seq != null ? String(slotState.seq) : '';
|
|
77
|
+
const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
|
|
78
|
+
// If Supervisor is tracking this worker, it handles lifecycle — skip
|
|
79
|
+
if (this.supervisor.get(workerId))
|
|
102
80
|
continue;
|
|
81
|
+
// Supervisor doesn't know about this worker.
|
|
82
|
+
// PostActions exit callback may have already cleaned up state.
|
|
83
|
+
// Re-read state to check for race with exit callback.
|
|
84
|
+
const freshState = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
85
|
+
const freshSlot = freshState.workers[slotName];
|
|
86
|
+
if (!freshSlot || freshSlot.status !== 'active')
|
|
87
|
+
continue;
|
|
88
|
+
// Still active with no Supervisor handle — truly orphaned / stale
|
|
89
|
+
this.log.warn(`Orphan slot ${slotName}: not tracked by Supervisor, marking STALE-RUNTIME and releasing`);
|
|
90
|
+
if (slotState.seq != null) {
|
|
91
|
+
await this.addLabelSafe(String(slotState.seq), 'STALE-RUNTIME');
|
|
103
92
|
}
|
|
104
|
-
else {
|
|
105
|
-
// Tmux mode: use inspect()
|
|
106
|
-
try {
|
|
107
|
-
const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
|
|
108
|
-
if (inspection.alive)
|
|
109
|
-
continue; // alive → not orphan
|
|
110
|
-
}
|
|
111
|
-
catch (err) {
|
|
112
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
113
|
-
this.log.warn(`Failed to inspect session ${slotState.tmuxSession}: ${msg}`);
|
|
114
|
-
continue; // can't determine → skip
|
|
115
|
-
}
|
|
116
|
-
this.log.warn(`Orphan slot ${slotName}: session ${slotState.tmuxSession} is dead, releasing`);
|
|
117
|
-
}
|
|
118
|
-
// Orphan cleanup (shared for both modes)
|
|
119
93
|
state.workers[slotName] = {
|
|
120
94
|
status: 'idle',
|
|
121
95
|
seq: null,
|
|
@@ -130,7 +104,6 @@ export class MonitorEngine {
|
|
|
130
104
|
outputFile: null,
|
|
131
105
|
exitCode: null,
|
|
132
106
|
};
|
|
133
|
-
// Remove from active cards if present
|
|
134
107
|
if (slotState.seq != null) {
|
|
135
108
|
delete state.activeCards[String(slotState.seq)];
|
|
136
109
|
}
|
|
@@ -139,10 +112,9 @@ export class MonitorEngine {
|
|
|
139
112
|
action: 'orphan-cleanup',
|
|
140
113
|
entity: `slot:${slotName}`,
|
|
141
114
|
result: 'ok',
|
|
142
|
-
message: `Released orphan slot (
|
|
115
|
+
message: `Released orphan slot (not tracked by Supervisor)`,
|
|
143
116
|
});
|
|
144
117
|
this.logEvent('orphan-cleanup', slotName, 'ok', {
|
|
145
|
-
session: slotState.tmuxSession,
|
|
146
118
|
seq: slotState.seq,
|
|
147
119
|
});
|
|
148
120
|
}
|
|
@@ -163,12 +135,11 @@ export class MonitorEngine {
|
|
|
163
135
|
: 'No orphan slots detected',
|
|
164
136
|
});
|
|
165
137
|
}
|
|
166
|
-
// ─── Check 2: Stale Runtime Detection
|
|
138
|
+
// ─── Check 2: Stale Runtime Detection ─────────────────────────
|
|
167
139
|
async checkStaleRuntimes(checks, actions, recommendedActions) {
|
|
168
140
|
const inprogressCards = await this.taskBackend.listByState('Inprogress');
|
|
169
141
|
let staleCount = 0;
|
|
170
142
|
for (const card of inprogressCards) {
|
|
171
|
-
// Skip cards already marked
|
|
172
143
|
if (card.labels.includes('STALE-RUNTIME'))
|
|
173
144
|
continue;
|
|
174
145
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
@@ -180,81 +151,43 @@ export class MonitorEngine {
|
|
|
180
151
|
staleCount++;
|
|
181
152
|
continue;
|
|
182
153
|
}
|
|
183
|
-
const [
|
|
184
|
-
if
|
|
154
|
+
const [, slotState] = slotEntry;
|
|
155
|
+
// Check if Supervisor is tracking this worker
|
|
156
|
+
const seq = slotState.seq != null ? String(slotState.seq) : '';
|
|
157
|
+
const workerId = `${this.ctx.projectName}:${slotEntry[0]}:${seq}`;
|
|
158
|
+
const handle = this.supervisor.get(workerId);
|
|
159
|
+
if (handle && handle.exitCode === null) {
|
|
160
|
+
// Supervisor tracking and worker still running — not stale
|
|
185
161
|
continue;
|
|
186
|
-
// Determine if worker is alive
|
|
187
|
-
let workerAlive;
|
|
188
|
-
if (slotState.mode === 'print') {
|
|
189
|
-
// Print mode: check PID liveness directly (no tmux session)
|
|
190
|
-
if (slotState.pid) {
|
|
191
|
-
try {
|
|
192
|
-
process.kill(slotState.pid, 0);
|
|
193
|
-
workerAlive = true;
|
|
194
|
-
}
|
|
195
|
-
catch {
|
|
196
|
-
workerAlive = false;
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
else {
|
|
200
|
-
continue; // No PID yet, skip
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
else {
|
|
204
|
-
// Interactive mode: use inspect()
|
|
205
|
-
try {
|
|
206
|
-
const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
|
|
207
|
-
workerAlive = inspection.alive;
|
|
208
|
-
}
|
|
209
|
-
catch {
|
|
210
|
-
continue; // Can't determine, skip
|
|
211
|
-
}
|
|
212
162
|
}
|
|
213
|
-
if (!
|
|
214
|
-
//
|
|
215
|
-
//
|
|
216
|
-
if (slotState.mode === 'print') {
|
|
217
|
-
const completionStatus = await this.checkPrintWorkerCompletion(slotState);
|
|
218
|
-
if (completionStatus === 'COMPLETED') {
|
|
219
|
-
this.log.ok(`seq ${card.seq}: Worker dead but detectCompleted → COMPLETED`);
|
|
220
|
-
const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
|
|
221
|
-
if (handled) {
|
|
222
|
-
try {
|
|
223
|
-
writeState(this.ctx.paths.stateFile, state, 'monitor-stale-completed');
|
|
224
|
-
}
|
|
225
|
-
catch { /* logged */ }
|
|
226
|
-
}
|
|
227
|
-
staleCount++;
|
|
228
|
-
continue;
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
// Fall back to MR-only check
|
|
163
|
+
if (!handle) {
|
|
164
|
+
// Not tracked by Supervisor and no slot → stale
|
|
165
|
+
// Check for MR as a last resort
|
|
232
166
|
const branchName = slotState.branch || this.buildBranchName(card);
|
|
233
167
|
const mrStatus = await this.repoBackend.getMrStatus(branchName);
|
|
234
168
|
if (mrStatus.exists) {
|
|
235
|
-
this.log.warn(`seq ${card.seq}: Worker
|
|
169
|
+
this.log.warn(`seq ${card.seq}: Worker not tracked, but MR exists — stale runtime`);
|
|
236
170
|
await this.handleStaleRuntime(card, actions, recommendedActions);
|
|
237
|
-
staleCount++;
|
|
238
171
|
}
|
|
239
172
|
else {
|
|
240
|
-
this.log.warn(`seq ${card.seq}: Worker
|
|
173
|
+
this.log.warn(`seq ${card.seq}: Worker not tracked, no MR found`);
|
|
241
174
|
await this.addLabelSafe(card.seq, 'STALE-RUNTIME');
|
|
242
175
|
actions.push({
|
|
243
176
|
action: 'mark-stale',
|
|
244
177
|
entity: `seq:${card.seq}`,
|
|
245
178
|
result: 'ok',
|
|
246
|
-
message: 'Worker
|
|
179
|
+
message: 'Worker not tracked by Supervisor, no MR — needs manual review',
|
|
247
180
|
});
|
|
248
181
|
recommendedActions.push({
|
|
249
|
-
action: `Review seq:${card.seq} — worker
|
|
250
|
-
reason: 'Worker
|
|
182
|
+
action: `Review seq:${card.seq} — worker not tracked, no MR`,
|
|
183
|
+
reason: 'Worker not tracked by Supervisor with no MR',
|
|
251
184
|
severity: 'warning',
|
|
252
185
|
autoExecutable: false,
|
|
253
186
|
requiresConfirmation: true,
|
|
254
187
|
safeToRetry: false,
|
|
255
188
|
});
|
|
256
|
-
staleCount++;
|
|
257
189
|
}
|
|
190
|
+
staleCount++;
|
|
258
191
|
}
|
|
259
192
|
}
|
|
260
193
|
checks.push({
|
|
@@ -269,7 +202,6 @@ export class MonitorEngine {
|
|
|
269
202
|
const seq = card.seq;
|
|
270
203
|
await this.addLabelSafe(seq, 'STALE-RUNTIME');
|
|
271
204
|
if (this.ctx.config.MONITOR_AUTO_QA) {
|
|
272
|
-
// Auto-move to QA
|
|
273
205
|
try {
|
|
274
206
|
await this.taskBackend.move(seq, 'QA');
|
|
275
207
|
this.log.ok(`seq ${seq}: Auto-moved to QA (MONITOR_AUTO_QA=true)`);
|
|
@@ -294,7 +226,6 @@ export class MonitorEngine {
|
|
|
294
226
|
}
|
|
295
227
|
}
|
|
296
228
|
else {
|
|
297
|
-
// Notify, wait for human
|
|
298
229
|
await this.notifySafe(`seq:${seq} has a stale runtime — worker session dead but MR may exist`, 'warning');
|
|
299
230
|
recommendedActions.push({
|
|
300
231
|
action: `Move seq:${seq} to QA or investigate stale runtime`,
|
|
@@ -312,7 +243,7 @@ export class MonitorEngine {
|
|
|
312
243
|
});
|
|
313
244
|
}
|
|
314
245
|
}
|
|
315
|
-
// ─── Check 3: Timeout Detection
|
|
246
|
+
// ─── Check 3: Timeout Detection ───────────────────────────────
|
|
316
247
|
async checkTimeouts(checks, actions, recommendedActions) {
|
|
317
248
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
318
249
|
const timeoutHours = this.ctx.config.INPROGRESS_TIMEOUT_HOURS;
|
|
@@ -332,12 +263,10 @@ export class MonitorEngine {
|
|
|
332
263
|
if (slotState.lastHeartbeat) {
|
|
333
264
|
const hbTime = new Date(slotState.lastHeartbeat).getTime();
|
|
334
265
|
const hbAge = (now - hbTime) / (1000 * 60 * 60);
|
|
335
|
-
// If heartbeat is recent (within timeout window), skip
|
|
336
266
|
if (hbAge < timeoutHours)
|
|
337
267
|
continue;
|
|
338
268
|
}
|
|
339
269
|
}
|
|
340
|
-
// Timed out
|
|
341
270
|
this.log.warn(`seq ${seq}: Timed out (${elapsedHours.toFixed(1)}h > ${timeoutHours}h threshold)`);
|
|
342
271
|
await this.addLabelSafe(seq, 'STALE-RUNTIME');
|
|
343
272
|
await this.notifySafe(`seq:${seq} has exceeded timeout (${elapsedHours.toFixed(1)}h)`, 'warning');
|
|
@@ -369,7 +298,7 @@ export class MonitorEngine {
|
|
|
369
298
|
: 'No timeouts detected',
|
|
370
299
|
});
|
|
371
300
|
}
|
|
372
|
-
// ─── Check 4: Waiting Confirmation Detection (
|
|
301
|
+
// ─── Check 4: Waiting Confirmation Detection (interactive mode) ─
|
|
373
302
|
async checkWaitingConfirmation(checks, actions) {
|
|
374
303
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
375
304
|
let waitingCount = 0;
|
|
@@ -385,10 +314,8 @@ export class MonitorEngine {
|
|
|
385
314
|
continue;
|
|
386
315
|
const seq = slotState.seq != null ? String(slotState.seq) : slotName;
|
|
387
316
|
if (!waitResult.destructive) {
|
|
388
|
-
// Non-destructive prompt → auto-confirm
|
|
389
317
|
this.log.info(`seq ${seq}: Worker waiting for non-destructive confirmation, auto-confirming`);
|
|
390
318
|
try {
|
|
391
|
-
// Send Enter/y to confirm
|
|
392
319
|
await this.workerProvider.sendFix(slotState.tmuxSession, 'y');
|
|
393
320
|
actions.push({
|
|
394
321
|
action: 'auto-confirm',
|
|
@@ -410,7 +337,6 @@ export class MonitorEngine {
|
|
|
410
337
|
}
|
|
411
338
|
}
|
|
412
339
|
else {
|
|
413
|
-
// Destructive prompt → label + notify
|
|
414
340
|
this.log.warn(`seq ${seq}: Worker waiting for destructive confirmation: ${waitResult.prompt}`);
|
|
415
341
|
if (slotState.seq != null) {
|
|
416
342
|
await this.addLabelSafe(String(slotState.seq), 'WAITING-CONFIRMATION');
|
|
@@ -442,9 +368,8 @@ export class MonitorEngine {
|
|
|
442
368
|
: 'No workers waiting for confirmation',
|
|
443
369
|
});
|
|
444
370
|
}
|
|
445
|
-
// ─── Check 5: BLOCKED Condition Check
|
|
371
|
+
// ─── Check 5: BLOCKED Condition Check ─────────────────────────
|
|
446
372
|
async checkBlockedCards(checks) {
|
|
447
|
-
// Collect cards from all active states that might have BLOCKED label
|
|
448
373
|
const states = ['Backlog', 'Todo', 'Inprogress', 'QA'];
|
|
449
374
|
let blockedCount = 0;
|
|
450
375
|
for (const cardState of states) {
|
|
@@ -469,42 +394,21 @@ export class MonitorEngine {
|
|
|
469
394
|
: 'No blocked cards',
|
|
470
395
|
});
|
|
471
396
|
}
|
|
472
|
-
// ─── Check 6: State Alignment (
|
|
397
|
+
// ─── Check 6: State Alignment (Supervisor vs state.json) ──────
|
|
473
398
|
async checkStateAlignment(checks, recommendedActions) {
|
|
474
399
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
475
400
|
const discrepancies = [];
|
|
476
401
|
for (const [slotName, slotState] of Object.entries(state.workers)) {
|
|
477
|
-
if (slotState.status !== 'active'
|
|
402
|
+
if (slotState.status !== 'active')
|
|
478
403
|
continue;
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
try {
|
|
485
|
-
process.kill(slotState.pid, 0);
|
|
486
|
-
alive = true;
|
|
487
|
-
}
|
|
488
|
-
catch {
|
|
489
|
-
alive = false;
|
|
490
|
-
}
|
|
404
|
+
const seq = slotState.seq != null ? String(slotState.seq) : '';
|
|
405
|
+
const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
|
|
406
|
+
const handle = this.supervisor.get(workerId);
|
|
407
|
+
if (!handle) {
|
|
408
|
+
discrepancies.push(`${slotName}: state says active but Supervisor has no handle`);
|
|
491
409
|
}
|
|
492
|
-
else {
|
|
493
|
-
|
|
494
|
-
try {
|
|
495
|
-
const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
|
|
496
|
-
alive = inspection.alive;
|
|
497
|
-
}
|
|
498
|
-
catch {
|
|
499
|
-
discrepancies.push(`${slotName}: could not inspect session ${slotState.tmuxSession}`);
|
|
500
|
-
continue;
|
|
501
|
-
}
|
|
502
|
-
}
|
|
503
|
-
if (!alive) {
|
|
504
|
-
const desc = slotState.mode === 'print'
|
|
505
|
-
? `pid ${slotState.pid} is dead`
|
|
506
|
-
: `session ${slotState.tmuxSession} is dead`;
|
|
507
|
-
discrepancies.push(`${slotName}: state says active but ${desc}`);
|
|
410
|
+
else if (handle.exitCode !== null) {
|
|
411
|
+
discrepancies.push(`${slotName}: state says active but Supervisor reports exited (code=${handle.exitCode})`);
|
|
508
412
|
}
|
|
509
413
|
}
|
|
510
414
|
if (discrepancies.length > 0) {
|
|
@@ -525,53 +429,28 @@ export class MonitorEngine {
|
|
|
525
429
|
status: discrepancies.length > 0 ? 'warn' : 'pass',
|
|
526
430
|
message: discrepancies.length > 0
|
|
527
431
|
? `${discrepancies.length} state alignment discrepancy(ies)`
|
|
528
|
-
: 'State aligned with
|
|
432
|
+
: 'State aligned with Supervisor',
|
|
529
433
|
});
|
|
530
434
|
}
|
|
531
|
-
// ─── Check
|
|
435
|
+
// ─── Check 7: Worker Health (launch/idle timeouts) ────────────
|
|
532
436
|
async checkWorkerHealth(checks, actions) {
|
|
533
437
|
const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
|
|
534
438
|
let issues = 0;
|
|
535
439
|
for (const [slotName, slotState] of Object.entries(state.workers)) {
|
|
536
440
|
if (slotState.status !== 'active' || slotState.mode !== 'print')
|
|
537
441
|
continue;
|
|
538
|
-
if (!slotState.
|
|
442
|
+
if (!slotState.outputFile || !slotState.claimedAt)
|
|
539
443
|
continue;
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
}
|
|
546
|
-
catch {
|
|
547
|
-
pidAlive = false;
|
|
548
|
-
}
|
|
549
|
-
if (!pidAlive) {
|
|
550
|
-
// PID dead — but check if the worker actually completed its task first
|
|
551
|
-
const completionStatus = await this.checkPrintWorkerCompletion(slotState);
|
|
552
|
-
if (completionStatus === 'COMPLETED') {
|
|
553
|
-
this.log.ok(`Worker health: ${slotName} pid ${slotState.pid} is dead but task COMPLETED`);
|
|
554
|
-
const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
|
|
555
|
-
if (handled) {
|
|
556
|
-
try {
|
|
557
|
-
writeState(this.ctx.paths.stateFile, state, 'monitor-health-completed');
|
|
558
|
-
}
|
|
559
|
-
catch { /* logged */ }
|
|
560
|
-
continue; // worker finished successfully
|
|
561
|
-
}
|
|
562
|
-
// Move-to-Done failed — fall through to auto-retry
|
|
563
|
-
this.log.warn(`Worker health: ${slotName} completed but move-to-Done failed, will auto-retry`);
|
|
564
|
-
}
|
|
565
|
-
// Not completed → auto-retry
|
|
566
|
-
const seq = slotState.seq != null ? String(slotState.seq) : null;
|
|
567
|
-
if (seq) {
|
|
568
|
-
this.log.warn(`Worker health: ${slotName} pid ${slotState.pid} is dead (status=${completionStatus}), auto-retrying seq ${seq}`);
|
|
569
|
-
await this.autoRetry(seq, slotName, state, actions, 'Worker process died');
|
|
570
|
-
issues++;
|
|
571
|
-
}
|
|
444
|
+
const seq = slotState.seq != null ? String(slotState.seq) : '';
|
|
445
|
+
const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
|
|
446
|
+
const handle = this.supervisor.get(workerId);
|
|
447
|
+
// If Supervisor doesn't track it, orphan check handles it
|
|
448
|
+
if (!handle)
|
|
572
449
|
continue;
|
|
573
|
-
|
|
574
|
-
|
|
450
|
+
// If worker already exited, Supervisor exit callback handles it
|
|
451
|
+
if (handle.exitCode !== null)
|
|
452
|
+
continue;
|
|
453
|
+
// Worker is alive — check output health
|
|
575
454
|
const nowMs = Date.now();
|
|
576
455
|
const claimedMs = new Date(slotState.claimedAt).getTime();
|
|
577
456
|
const elapsedS = (nowMs - claimedMs) / 1000;
|
|
@@ -588,28 +467,22 @@ export class MonitorEngine {
|
|
|
588
467
|
const launchTimeout = this.ctx.config.WORKER_LAUNCH_TIMEOUT_S;
|
|
589
468
|
const idleTimeout = this.ctx.config.WORKER_IDLE_TIMEOUT_M * 60 * 1000;
|
|
590
469
|
// Launch timeout: process alive but no output after N seconds
|
|
591
|
-
if (outputSize === 0 && elapsedS > launchTimeout) {
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
await this.autoRetry(seq, slotName, state, actions, `No output after ${Math.round(elapsedS)}s`);
|
|
597
|
-
issues++;
|
|
598
|
-
}
|
|
470
|
+
if (outputSize === 0 && elapsedS > launchTimeout && seq) {
|
|
471
|
+
this.log.warn(`Worker health: ${slotName} has no output after ${Math.round(elapsedS)}s, killing and retrying seq ${seq}`);
|
|
472
|
+
await this.supervisor.kill(workerId);
|
|
473
|
+
await this.autoRetry(seq, slotName, state, actions, `No output after ${Math.round(elapsedS)}s`);
|
|
474
|
+
issues++;
|
|
599
475
|
continue;
|
|
600
476
|
}
|
|
601
477
|
// Idle timeout: process alive but no new output for N minutes
|
|
602
|
-
if (outputSize > 0 && outputMtimeMs > 0) {
|
|
478
|
+
if (outputSize > 0 && outputMtimeMs > 0 && seq) {
|
|
603
479
|
const idleSinceMs = nowMs - outputMtimeMs;
|
|
604
480
|
if (idleSinceMs > idleTimeout) {
|
|
605
|
-
const
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
await this.autoRetry(seq, slotName, state, actions, `No output for ${idleMin}min`);
|
|
611
|
-
issues++;
|
|
612
|
-
}
|
|
481
|
+
const idleMin = Math.round(idleSinceMs / 60000);
|
|
482
|
+
this.log.warn(`Worker health: ${slotName} no output for ${idleMin}min, killing and retrying seq ${seq}`);
|
|
483
|
+
await this.supervisor.kill(workerId);
|
|
484
|
+
await this.autoRetry(seq, slotName, state, actions, `No output for ${idleMin}min`);
|
|
485
|
+
issues++;
|
|
613
486
|
}
|
|
614
487
|
}
|
|
615
488
|
}
|
|
@@ -621,21 +494,9 @@ export class MonitorEngine {
|
|
|
621
494
|
: 'All active workers healthy',
|
|
622
495
|
});
|
|
623
496
|
}
|
|
624
|
-
|
|
625
|
-
try {
|
|
626
|
-
process.kill(pid, 'SIGTERM');
|
|
627
|
-
}
|
|
628
|
-
catch { /* already dead */ }
|
|
629
|
-
// Give it a moment, then force kill
|
|
630
|
-
try {
|
|
631
|
-
process.kill(pid, 0); // check alive
|
|
632
|
-
process.kill(pid, 'SIGKILL');
|
|
633
|
-
}
|
|
634
|
-
catch { /* dead */ }
|
|
635
|
-
}
|
|
497
|
+
// ─── Auto-retry ───────────────────────────────────────────────
|
|
636
498
|
async autoRetry(seq, slotName, state, actions, reason) {
|
|
637
499
|
const restartLimit = this.ctx.config.WORKER_RESTART_LIMIT;
|
|
638
|
-
// Get retry count from activeCards
|
|
639
500
|
const activeCard = state.activeCards[seq];
|
|
640
501
|
const retryCount = activeCard?.retryCount ?? 0;
|
|
641
502
|
// Release the slot
|
|
@@ -655,12 +516,10 @@ export class MonitorEngine {
|
|
|
655
516
|
};
|
|
656
517
|
delete state.activeCards[seq];
|
|
657
518
|
if (retryCount < restartLimit) {
|
|
658
|
-
// Move back to Todo for re-launch on next tick
|
|
659
519
|
try {
|
|
660
520
|
await this.taskBackend.move(seq, 'Todo');
|
|
661
521
|
await this.removeLabelSafe(seq, 'CLAIMED');
|
|
662
522
|
await this.removeLabelSafe(seq, 'STALE-RUNTIME');
|
|
663
|
-
// Track retry count — store in a fresh activeCards entry
|
|
664
523
|
state.activeCards[seq] = {
|
|
665
524
|
seq: parseInt(seq, 10),
|
|
666
525
|
state: 'Todo',
|
|
@@ -714,94 +573,7 @@ export class MonitorEngine {
|
|
|
714
573
|
this.logEvent('retry-exhausted', seq, 'ok', { retryCount, reason });
|
|
715
574
|
}
|
|
716
575
|
}
|
|
717
|
-
|
|
718
|
-
try {
|
|
719
|
-
await this.taskBackend.removeLabel(seq, label);
|
|
720
|
-
}
|
|
721
|
-
catch { /* best effort */ }
|
|
722
|
-
}
|
|
723
|
-
// ─── Completion-aware helpers for dead print-mode workers ─────
|
|
724
|
-
/**
|
|
725
|
-
* Check if a dead print-mode worker actually completed its task.
|
|
726
|
-
* Uses workerProvider.detectCompleted() with the slot's session/branch info.
|
|
727
|
-
*/
|
|
728
|
-
async checkPrintWorkerCompletion(slotState) {
|
|
729
|
-
if (!slotState.tmuxSession)
|
|
730
|
-
return 'DEAD';
|
|
731
|
-
const branch = slotState.branch || '';
|
|
732
|
-
const logDir = this.ctx.paths.logsDir;
|
|
733
|
-
try {
|
|
734
|
-
return await this.workerProvider.detectCompleted(slotState.tmuxSession, logDir, branch);
|
|
735
|
-
}
|
|
736
|
-
catch (err) {
|
|
737
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
738
|
-
this.log.debug(`detectCompleted failed for ${slotState.tmuxSession}: ${msg}`);
|
|
739
|
-
return 'DEAD';
|
|
740
|
-
}
|
|
741
|
-
}
|
|
742
|
-
/**
|
|
743
|
-
* Handle a dead print-mode worker that has been confirmed as COMPLETED.
|
|
744
|
-
* Mutates state in-place (releases slot, updates activeCards).
|
|
745
|
-
* Does NOT call writeState — caller is responsible for flushing state.
|
|
746
|
-
*/
|
|
747
|
-
/**
|
|
748
|
-
* Handle a dead print-mode worker confirmed as COMPLETED.
|
|
749
|
-
* Returns true if card moved to Done + slot released (state mutated).
|
|
750
|
-
* Returns false if move failed (state NOT mutated, caller should not count as handled).
|
|
751
|
-
*/
|
|
752
|
-
async handleCompletedWorker(slotName, slotState, state, actions) {
|
|
753
|
-
const seq = slotState.seq != null ? String(slotState.seq) : null;
|
|
754
|
-
if (!seq)
|
|
755
|
-
return false;
|
|
756
|
-
// 1. Move card to Done FIRST — if this fails, don't touch state
|
|
757
|
-
const targetState = 'Done';
|
|
758
|
-
try {
|
|
759
|
-
await this.taskBackend.move(seq, targetState);
|
|
760
|
-
}
|
|
761
|
-
catch (err) {
|
|
762
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
763
|
-
this.log.error(`seq ${seq}: Failed to move to ${targetState}: ${msg}. Slot NOT released.`);
|
|
764
|
-
actions.push({
|
|
765
|
-
action: 'complete',
|
|
766
|
-
entity: `seq:${seq}`,
|
|
767
|
-
result: 'fail',
|
|
768
|
-
message: `Move to ${targetState} failed: ${msg}`,
|
|
769
|
-
});
|
|
770
|
-
return false;
|
|
771
|
-
}
|
|
772
|
-
// 2. Done confirmed — now release slot + cleanup (mutate state)
|
|
773
|
-
state.workers[slotName] = {
|
|
774
|
-
status: 'idle', seq: null, branch: null, worktree: null,
|
|
775
|
-
tmuxSession: null, claimedAt: null, lastHeartbeat: null,
|
|
776
|
-
mode: null, sessionId: null, pid: null, outputFile: null, exitCode: null,
|
|
777
|
-
};
|
|
778
|
-
delete state.activeCards[seq];
|
|
779
|
-
try {
|
|
780
|
-
await this.taskBackend.releaseClaim(seq);
|
|
781
|
-
}
|
|
782
|
-
catch { /* best effort */ }
|
|
783
|
-
// 3. Mark worktree for cleanup
|
|
784
|
-
const branch = slotState.branch || '';
|
|
785
|
-
const worktreePath = slotState.worktree || '';
|
|
786
|
-
if (branch && worktreePath) {
|
|
787
|
-
const cleanup = state.worktreeCleanup ?? [];
|
|
788
|
-
if (!cleanup.some((e) => e.branch === branch)) {
|
|
789
|
-
cleanup.push({ branch, worktreePath, markedAt: new Date().toISOString() });
|
|
790
|
-
state.worktreeCleanup = cleanup;
|
|
791
|
-
}
|
|
792
|
-
}
|
|
793
|
-
this.log.ok(`seq ${seq}: Worker completed (detected by monitor), moved to ${targetState}`);
|
|
794
|
-
await this.notifySafe(`seq:${seq} worker completed, moved to ${targetState}`, 'success');
|
|
795
|
-
actions.push({
|
|
796
|
-
action: 'complete',
|
|
797
|
-
entity: `seq:${seq}`,
|
|
798
|
-
result: 'ok',
|
|
799
|
-
message: `Worker completed (PID dead, artifacts verified) → ${targetState}`,
|
|
800
|
-
});
|
|
801
|
-
this.logEvent('complete', seq, 'ok', { detectedBy: 'monitor' });
|
|
802
|
-
return true;
|
|
803
|
-
}
|
|
804
|
-
// ─── Helpers ───────────────────────────────────────────────────
|
|
576
|
+
// ─── Helpers ──────────────────────────────────────────────────
|
|
805
577
|
buildBranchName(card) {
|
|
806
578
|
const slug = card.name
|
|
807
579
|
.toLowerCase()
|
|
@@ -819,6 +591,12 @@ export class MonitorEngine {
|
|
|
819
591
|
this.log.error(`Failed to add label ${label} to seq ${seq}: ${msg}`);
|
|
820
592
|
}
|
|
821
593
|
}
|
|
594
|
+
async removeLabelSafe(seq, label) {
|
|
595
|
+
try {
|
|
596
|
+
await this.taskBackend.removeLabel(seq, label);
|
|
597
|
+
}
|
|
598
|
+
catch { /* best effort */ }
|
|
599
|
+
}
|
|
822
600
|
async notifySafe(message, level) {
|
|
823
601
|
if (!this.notifier)
|
|
824
602
|
return;
|