@coralai/sps-cli 0.15.12 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +11 -0
  2. package/dist/commands/monitorTick.d.ts.map +1 -1
  3. package/dist/commands/monitorTick.js +3 -1
  4. package/dist/commands/monitorTick.js.map +1 -1
  5. package/dist/commands/pipelineTick.d.ts.map +1 -1
  6. package/dist/commands/pipelineTick.js +12 -3
  7. package/dist/commands/pipelineTick.js.map +1 -1
  8. package/dist/commands/tick.d.ts +1 -0
  9. package/dist/commands/tick.d.ts.map +1 -1
  10. package/dist/commands/tick.js +64 -8
  11. package/dist/commands/tick.js.map +1 -1
  12. package/dist/commands/workerLaunch.d.ts.map +1 -1
  13. package/dist/commands/workerLaunch.js +12 -3
  14. package/dist/commands/workerLaunch.js.map +1 -1
  15. package/dist/engines/ExecutionEngine.d.ts +29 -32
  16. package/dist/engines/ExecutionEngine.d.ts.map +1 -1
  17. package/dist/engines/ExecutionEngine.js +224 -527
  18. package/dist/engines/ExecutionEngine.js.map +1 -1
  19. package/dist/engines/MonitorEngine.d.ts +14 -27
  20. package/dist/engines/MonitorEngine.d.ts.map +1 -1
  21. package/dist/engines/MonitorEngine.js +91 -313
  22. package/dist/engines/MonitorEngine.js.map +1 -1
  23. package/dist/main.js +0 -0
  24. package/dist/manager/completion-judge.d.ts +27 -0
  25. package/dist/manager/completion-judge.d.ts.map +1 -0
  26. package/dist/manager/completion-judge.js +81 -0
  27. package/dist/manager/completion-judge.js.map +1 -0
  28. package/dist/manager/pm-client.d.ts +10 -0
  29. package/dist/manager/pm-client.d.ts.map +1 -0
  30. package/dist/manager/pm-client.js +245 -0
  31. package/dist/manager/pm-client.js.map +1 -0
  32. package/dist/manager/post-actions.d.ts +60 -0
  33. package/dist/manager/post-actions.d.ts.map +1 -0
  34. package/dist/manager/post-actions.js +326 -0
  35. package/dist/manager/post-actions.js.map +1 -0
  36. package/dist/manager/recovery.d.ts +39 -0
  37. package/dist/manager/recovery.d.ts.map +1 -0
  38. package/dist/manager/recovery.js +133 -0
  39. package/dist/manager/recovery.js.map +1 -0
  40. package/dist/manager/resource-limiter.d.ts +44 -0
  41. package/dist/manager/resource-limiter.d.ts.map +1 -0
  42. package/dist/manager/resource-limiter.js +79 -0
  43. package/dist/manager/resource-limiter.js.map +1 -0
  44. package/dist/manager/supervisor.d.ts +70 -0
  45. package/dist/manager/supervisor.d.ts.map +1 -0
  46. package/dist/manager/supervisor.js +216 -0
  47. package/dist/manager/supervisor.js.map +1 -0
  48. package/package.json +1 -1
@@ -4,13 +4,15 @@ import { Logger } from '../core/logger.js';
4
4
  /**
5
5
  * MonitorEngine performs anomaly detection and health checks.
6
6
  *
7
- * Checks (in order):
8
- * 1. Orphan slot cleanup (doc 12 §6.1)
9
- * 2. Stale runtime detection (01 §10.2.3)
10
- * 3. Timeout detection (01 §10.2.4)
11
- * 4. Waiting confirmation detection (doc 12 §3)
12
- * 5. BLOCKED condition check (01 §3.6.1)
13
- * 6. State alignment (07 §2.1)
7
+ * With ProcessSupervisor, completion detection and post-actions are handled
8
+ * by exit callbacks (CompletionJudge + PostActions). MonitorEngine focuses on:
9
+ * 1. Orphan slot cleanup (stale entries not tracked by Supervisor)
10
+ * 2. Stale runtime detection (Inprogress cards with no worker)
11
+ * 3. Timeout detection (INPROGRESS_TIMEOUT_HOURS)
12
+ * 4. Waiting confirmation detection (interactive mode only)
13
+ * 5. BLOCKED condition check
14
+ * 6. State alignment (Supervisor vs state.json sync)
15
+ * 7. Worker health (launch/idle timeouts for Supervisor-tracked workers)
14
16
  */
15
17
  export class MonitorEngine {
16
18
  ctx;
@@ -18,13 +20,15 @@ export class MonitorEngine {
18
20
  workerProvider;
19
21
  repoBackend;
20
22
  notifier;
23
+ supervisor;
21
24
  log;
22
- constructor(ctx, taskBackend, workerProvider, repoBackend, notifier) {
25
+ constructor(ctx, taskBackend, workerProvider, repoBackend, notifier, supervisor) {
23
26
  this.ctx = ctx;
24
27
  this.taskBackend = taskBackend;
25
28
  this.workerProvider = workerProvider;
26
29
  this.repoBackend = repoBackend;
27
30
  this.notifier = notifier;
31
+ this.supervisor = supervisor;
28
32
  this.log = new Logger('monitor', ctx.projectName, ctx.paths.logsDir);
29
33
  }
30
34
  async tick() {
@@ -41,19 +45,12 @@ export class MonitorEngine {
41
45
  details: { checks },
42
46
  };
43
47
  try {
44
- // 0. Worker health check (print-mode: launch timeout, idle timeout, auto-retry)
45
48
  await this.checkWorkerHealth(checks, actions);
46
- // 1. Orphan slot cleanup
47
49
  await this.checkOrphanSlots(checks, actions);
48
- // 2. Stale runtime detection
49
50
  await this.checkStaleRuntimes(checks, actions, recommendedActions);
50
- // 3. Timeout detection
51
51
  await this.checkTimeouts(checks, actions, recommendedActions);
52
- // 4. Waiting confirmation detection
53
52
  await this.checkWaitingConfirmation(checks, actions);
54
- // 5. BLOCKED condition check
55
53
  await this.checkBlockedCards(checks);
56
- // 6. State alignment
57
54
  await this.checkStateAlignment(checks, recommendedActions);
58
55
  }
59
56
  catch (err) {
@@ -63,59 +60,36 @@ export class MonitorEngine {
63
60
  result.exitCode = 1;
64
61
  result.details = { error: msg, checks };
65
62
  }
66
- // Set degraded if any checks failed
67
63
  if (checks.some((c) => c.status === 'fail') && result.status === 'ok') {
68
64
  result.status = 'degraded';
69
65
  }
70
66
  return result;
71
67
  }
72
- // ─── Check 1: Orphan Slot Cleanup (doc 12 §6.1) ───────────────
68
+ // ─── Check 1: Orphan Slot Cleanup ─────────────────────────────
73
69
  async checkOrphanSlots(checks, actions) {
74
70
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
75
71
  let orphansFound = 0;
76
72
  for (const [slotName, slotState] of Object.entries(state.workers)) {
77
- if (slotState.status !== 'active' || !slotState.tmuxSession)
73
+ if (slotState.status !== 'active')
78
74
  continue;
79
- // Print mode: check PID liveness directly instead of inspect()
80
- if (slotState.mode === 'print' && slotState.pid) {
81
- try {
82
- process.kill(slotState.pid, 0); // signal 0 = check alive
83
- continue; // PID alive → not orphan
84
- }
85
- catch {
86
- // PID dead — but did the worker actually complete its task?
87
- const completionStatus = await this.checkPrintWorkerCompletion(slotState);
88
- if (completionStatus === 'COMPLETED') {
89
- this.log.ok(`Orphan slot ${slotName}: print-mode pid ${slotState.pid} is dead but task COMPLETED, handling as completion`);
90
- const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
91
- if (handled)
92
- orphansFound++; // only count if slot was actually released
93
- continue;
94
- }
95
- // Not completed → fall through to orphan cleanup
96
- this.log.warn(`Orphan slot ${slotName}: print-mode pid ${slotState.pid} is dead (status=${completionStatus}), releasing`);
97
- }
98
- }
99
- else if (slotState.mode === 'print' && !slotState.pid) {
100
- // Print mode but no PID recorded — skip, launch may still be writing state
101
- this.log.debug(`Skipping orphan check for ${slotName}: print mode, no PID yet`);
75
+ // Build the workerId that Supervisor would track
76
+ const seq = slotState.seq != null ? String(slotState.seq) : '';
77
+ const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
78
+ // If Supervisor is tracking this worker, it handles lifecycle — skip
79
+ if (this.supervisor.get(workerId))
102
80
  continue;
81
+ // Supervisor doesn't know about this worker.
82
+ // PostActions exit callback may have already cleaned up state.
83
+ // Re-read state to check for race with exit callback.
84
+ const freshState = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
85
+ const freshSlot = freshState.workers[slotName];
86
+ if (!freshSlot || freshSlot.status !== 'active')
87
+ continue;
88
+ // Still active with no Supervisor handle — truly orphaned / stale
89
+ this.log.warn(`Orphan slot ${slotName}: not tracked by Supervisor, marking STALE-RUNTIME and releasing`);
90
+ if (slotState.seq != null) {
91
+ await this.addLabelSafe(String(slotState.seq), 'STALE-RUNTIME');
103
92
  }
104
- else {
105
- // Tmux mode: use inspect()
106
- try {
107
- const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
108
- if (inspection.alive)
109
- continue; // alive → not orphan
110
- }
111
- catch (err) {
112
- const msg = err instanceof Error ? err.message : String(err);
113
- this.log.warn(`Failed to inspect session ${slotState.tmuxSession}: ${msg}`);
114
- continue; // can't determine → skip
115
- }
116
- this.log.warn(`Orphan slot ${slotName}: session ${slotState.tmuxSession} is dead, releasing`);
117
- }
118
- // Orphan cleanup (shared for both modes)
119
93
  state.workers[slotName] = {
120
94
  status: 'idle',
121
95
  seq: null,
@@ -130,7 +104,6 @@ export class MonitorEngine {
130
104
  outputFile: null,
131
105
  exitCode: null,
132
106
  };
133
- // Remove from active cards if present
134
107
  if (slotState.seq != null) {
135
108
  delete state.activeCards[String(slotState.seq)];
136
109
  }
@@ -139,10 +112,9 @@ export class MonitorEngine {
139
112
  action: 'orphan-cleanup',
140
113
  entity: `slot:${slotName}`,
141
114
  result: 'ok',
142
- message: `Released orphan slot (${slotState.mode === 'print' ? `pid ${slotState.pid}` : `session ${slotState.tmuxSession}`} dead)`,
115
+ message: `Released orphan slot (not tracked by Supervisor)`,
143
116
  });
144
117
  this.logEvent('orphan-cleanup', slotName, 'ok', {
145
- session: slotState.tmuxSession,
146
118
  seq: slotState.seq,
147
119
  });
148
120
  }
@@ -163,12 +135,11 @@ export class MonitorEngine {
163
135
  : 'No orphan slots detected',
164
136
  });
165
137
  }
166
- // ─── Check 2: Stale Runtime Detection (01 §10.2.3) ────────────
138
+ // ─── Check 2: Stale Runtime Detection ─────────────────────────
167
139
  async checkStaleRuntimes(checks, actions, recommendedActions) {
168
140
  const inprogressCards = await this.taskBackend.listByState('Inprogress');
169
141
  let staleCount = 0;
170
142
  for (const card of inprogressCards) {
171
- // Skip cards already marked
172
143
  if (card.labels.includes('STALE-RUNTIME'))
173
144
  continue;
174
145
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
@@ -180,81 +151,43 @@ export class MonitorEngine {
180
151
  staleCount++;
181
152
  continue;
182
153
  }
183
- const [slotName, slotState] = slotEntry;
184
- if (!slotState.tmuxSession)
154
+ const [, slotState] = slotEntry;
155
+ // Check if Supervisor is tracking this worker
156
+ const seq = slotState.seq != null ? String(slotState.seq) : '';
157
+ const workerId = `${this.ctx.projectName}:${slotEntry[0]}:${seq}`;
158
+ const handle = this.supervisor.get(workerId);
159
+ if (handle && handle.exitCode === null) {
160
+ // Supervisor tracking and worker still running — not stale
185
161
  continue;
186
- // Determine if worker is alive
187
- let workerAlive;
188
- if (slotState.mode === 'print') {
189
- // Print mode: check PID liveness directly (no tmux session)
190
- if (slotState.pid) {
191
- try {
192
- process.kill(slotState.pid, 0);
193
- workerAlive = true;
194
- }
195
- catch {
196
- workerAlive = false;
197
- }
198
- }
199
- else {
200
- continue; // No PID yet, skip
201
- }
202
- }
203
- else {
204
- // Interactive mode: use inspect()
205
- try {
206
- const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
207
- workerAlive = inspection.alive;
208
- }
209
- catch {
210
- continue; // Can't determine, skip
211
- }
212
162
  }
213
- if (!workerAlive) {
214
- // Worker dead + card still Inprogress
215
- // For print-mode workers, use full completion detection first
216
- if (slotState.mode === 'print') {
217
- const completionStatus = await this.checkPrintWorkerCompletion(slotState);
218
- if (completionStatus === 'COMPLETED') {
219
- this.log.ok(`seq ${card.seq}: Worker dead but detectCompleted → COMPLETED`);
220
- const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
221
- if (handled) {
222
- try {
223
- writeState(this.ctx.paths.stateFile, state, 'monitor-stale-completed');
224
- }
225
- catch { /* logged */ }
226
- }
227
- staleCount++;
228
- continue;
229
- }
230
- }
231
- // Fall back to MR-only check
163
+ if (!handle) {
164
+ // Not tracked by Supervisor and no slot → stale
165
+ // Check for MR as a last resort
232
166
  const branchName = slotState.branch || this.buildBranchName(card);
233
167
  const mrStatus = await this.repoBackend.getMrStatus(branchName);
234
168
  if (mrStatus.exists) {
235
- this.log.warn(`seq ${card.seq}: Worker dead but MR exists — stale runtime`);
169
+ this.log.warn(`seq ${card.seq}: Worker not tracked, but MR exists — stale runtime`);
236
170
  await this.handleStaleRuntime(card, actions, recommendedActions);
237
- staleCount++;
238
171
  }
239
172
  else {
240
- this.log.warn(`seq ${card.seq}: Worker dead, no MR found`);
173
+ this.log.warn(`seq ${card.seq}: Worker not tracked, no MR found`);
241
174
  await this.addLabelSafe(card.seq, 'STALE-RUNTIME');
242
175
  actions.push({
243
176
  action: 'mark-stale',
244
177
  entity: `seq:${card.seq}`,
245
178
  result: 'ok',
246
- message: 'Worker dead, no MR — needs manual review',
179
+ message: 'Worker not tracked by Supervisor, no MR — needs manual review',
247
180
  });
248
181
  recommendedActions.push({
249
- action: `Review seq:${card.seq} — worker died without creating MR`,
250
- reason: 'Worker dead with no MR',
182
+ action: `Review seq:${card.seq} — worker not tracked, no MR`,
183
+ reason: 'Worker not tracked by Supervisor with no MR',
251
184
  severity: 'warning',
252
185
  autoExecutable: false,
253
186
  requiresConfirmation: true,
254
187
  safeToRetry: false,
255
188
  });
256
- staleCount++;
257
189
  }
190
+ staleCount++;
258
191
  }
259
192
  }
260
193
  checks.push({
@@ -269,7 +202,6 @@ export class MonitorEngine {
269
202
  const seq = card.seq;
270
203
  await this.addLabelSafe(seq, 'STALE-RUNTIME');
271
204
  if (this.ctx.config.MONITOR_AUTO_QA) {
272
- // Auto-move to QA
273
205
  try {
274
206
  await this.taskBackend.move(seq, 'QA');
275
207
  this.log.ok(`seq ${seq}: Auto-moved to QA (MONITOR_AUTO_QA=true)`);
@@ -294,7 +226,6 @@ export class MonitorEngine {
294
226
  }
295
227
  }
296
228
  else {
297
- // Notify, wait for human
298
229
  await this.notifySafe(`seq:${seq} has a stale runtime — worker session dead but MR may exist`, 'warning');
299
230
  recommendedActions.push({
300
231
  action: `Move seq:${seq} to QA or investigate stale runtime`,
@@ -312,7 +243,7 @@ export class MonitorEngine {
312
243
  });
313
244
  }
314
245
  }
315
- // ─── Check 3: Timeout Detection (01 §10.2.4) ──────────────────
246
+ // ─── Check 3: Timeout Detection ───────────────────────────────
316
247
  async checkTimeouts(checks, actions, recommendedActions) {
317
248
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
318
249
  const timeoutHours = this.ctx.config.INPROGRESS_TIMEOUT_HOURS;
@@ -332,12 +263,10 @@ export class MonitorEngine {
332
263
  if (slotState.lastHeartbeat) {
333
264
  const hbTime = new Date(slotState.lastHeartbeat).getTime();
334
265
  const hbAge = (now - hbTime) / (1000 * 60 * 60);
335
- // If heartbeat is recent (within timeout window), skip
336
266
  if (hbAge < timeoutHours)
337
267
  continue;
338
268
  }
339
269
  }
340
- // Timed out
341
270
  this.log.warn(`seq ${seq}: Timed out (${elapsedHours.toFixed(1)}h > ${timeoutHours}h threshold)`);
342
271
  await this.addLabelSafe(seq, 'STALE-RUNTIME');
343
272
  await this.notifySafe(`seq:${seq} has exceeded timeout (${elapsedHours.toFixed(1)}h)`, 'warning');
@@ -369,7 +298,7 @@ export class MonitorEngine {
369
298
  : 'No timeouts detected',
370
299
  });
371
300
  }
372
- // ─── Check 4: Waiting Confirmation Detection (doc 12 §3) ──────
301
+ // ─── Check 4: Waiting Confirmation Detection (interactive mode)
373
302
  async checkWaitingConfirmation(checks, actions) {
374
303
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
375
304
  let waitingCount = 0;
@@ -385,10 +314,8 @@ export class MonitorEngine {
385
314
  continue;
386
315
  const seq = slotState.seq != null ? String(slotState.seq) : slotName;
387
316
  if (!waitResult.destructive) {
388
- // Non-destructive prompt → auto-confirm
389
317
  this.log.info(`seq ${seq}: Worker waiting for non-destructive confirmation, auto-confirming`);
390
318
  try {
391
- // Send Enter/y to confirm
392
319
  await this.workerProvider.sendFix(slotState.tmuxSession, 'y');
393
320
  actions.push({
394
321
  action: 'auto-confirm',
@@ -410,7 +337,6 @@ export class MonitorEngine {
410
337
  }
411
338
  }
412
339
  else {
413
- // Destructive prompt → label + notify
414
340
  this.log.warn(`seq ${seq}: Worker waiting for destructive confirmation: ${waitResult.prompt}`);
415
341
  if (slotState.seq != null) {
416
342
  await this.addLabelSafe(String(slotState.seq), 'WAITING-CONFIRMATION');
@@ -442,9 +368,8 @@ export class MonitorEngine {
442
368
  : 'No workers waiting for confirmation',
443
369
  });
444
370
  }
445
- // ─── Check 5: BLOCKED Condition Check (01 §3.6.1) ─────────────
371
+ // ─── Check 5: BLOCKED Condition Check ─────────────────────────
446
372
  async checkBlockedCards(checks) {
447
- // Collect cards from all active states that might have BLOCKED label
448
373
  const states = ['Backlog', 'Todo', 'Inprogress', 'QA'];
449
374
  let blockedCount = 0;
450
375
  for (const cardState of states) {
@@ -469,42 +394,21 @@ export class MonitorEngine {
469
394
  : 'No blocked cards',
470
395
  });
471
396
  }
472
- // ─── Check 6: State Alignment (07 §2.1) ───────────────────────
397
+ // ─── Check 6: State Alignment (Supervisor vs state.json) ──────
473
398
  async checkStateAlignment(checks, recommendedActions) {
474
399
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
475
400
  const discrepancies = [];
476
401
  for (const [slotName, slotState] of Object.entries(state.workers)) {
477
- if (slotState.status !== 'active' || !slotState.tmuxSession)
402
+ if (slotState.status !== 'active')
478
403
  continue;
479
- let alive;
480
- if (slotState.mode === 'print') {
481
- // Print mode: check PID directly
482
- if (!slotState.pid)
483
- continue; // No PID yet, skip
484
- try {
485
- process.kill(slotState.pid, 0);
486
- alive = true;
487
- }
488
- catch {
489
- alive = false;
490
- }
404
+ const seq = slotState.seq != null ? String(slotState.seq) : '';
405
+ const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
406
+ const handle = this.supervisor.get(workerId);
407
+ if (!handle) {
408
+ discrepancies.push(`${slotName}: state says active but Supervisor has no handle`);
491
409
  }
492
- else {
493
- // Interactive mode: check tmux session
494
- try {
495
- const inspection = await this.workerProvider.inspect(slotState.tmuxSession);
496
- alive = inspection.alive;
497
- }
498
- catch {
499
- discrepancies.push(`${slotName}: could not inspect session ${slotState.tmuxSession}`);
500
- continue;
501
- }
502
- }
503
- if (!alive) {
504
- const desc = slotState.mode === 'print'
505
- ? `pid ${slotState.pid} is dead`
506
- : `session ${slotState.tmuxSession} is dead`;
507
- discrepancies.push(`${slotName}: state says active but ${desc}`);
410
+ else if (handle.exitCode !== null) {
411
+ discrepancies.push(`${slotName}: state says active but Supervisor reports exited (code=${handle.exitCode})`);
508
412
  }
509
413
  }
510
414
  if (discrepancies.length > 0) {
@@ -525,53 +429,28 @@ export class MonitorEngine {
525
429
  status: discrepancies.length > 0 ? 'warn' : 'pass',
526
430
  message: discrepancies.length > 0
527
431
  ? `${discrepancies.length} state alignment discrepancy(ies)`
528
- : 'State aligned with runtime',
432
+ : 'State aligned with Supervisor',
529
433
  });
530
434
  }
531
- // ─── Check 0: Worker Health (print-mode auto-recovery) ────────
435
+ // ─── Check 7: Worker Health (launch/idle timeouts) ────────────
532
436
  async checkWorkerHealth(checks, actions) {
533
437
  const state = readState(this.ctx.paths.stateFile, this.ctx.maxWorkers);
534
438
  let issues = 0;
535
439
  for (const [slotName, slotState] of Object.entries(state.workers)) {
536
440
  if (slotState.status !== 'active' || slotState.mode !== 'print')
537
441
  continue;
538
- if (!slotState.pid || !slotState.outputFile || !slotState.claimedAt)
442
+ if (!slotState.outputFile || !slotState.claimedAt)
539
443
  continue;
540
- // Check if PID is alive
541
- let pidAlive;
542
- try {
543
- process.kill(slotState.pid, 0);
544
- pidAlive = true;
545
- }
546
- catch {
547
- pidAlive = false;
548
- }
549
- if (!pidAlive) {
550
- // PID dead — but check if the worker actually completed its task first
551
- const completionStatus = await this.checkPrintWorkerCompletion(slotState);
552
- if (completionStatus === 'COMPLETED') {
553
- this.log.ok(`Worker health: ${slotName} pid ${slotState.pid} is dead but task COMPLETED`);
554
- const handled = await this.handleCompletedWorker(slotName, slotState, state, actions);
555
- if (handled) {
556
- try {
557
- writeState(this.ctx.paths.stateFile, state, 'monitor-health-completed');
558
- }
559
- catch { /* logged */ }
560
- continue; // worker finished successfully
561
- }
562
- // Move-to-Done failed — fall through to auto-retry
563
- this.log.warn(`Worker health: ${slotName} completed but move-to-Done failed, will auto-retry`);
564
- }
565
- // Not completed → auto-retry
566
- const seq = slotState.seq != null ? String(slotState.seq) : null;
567
- if (seq) {
568
- this.log.warn(`Worker health: ${slotName} pid ${slotState.pid} is dead (status=${completionStatus}), auto-retrying seq ${seq}`);
569
- await this.autoRetry(seq, slotName, state, actions, 'Worker process died');
570
- issues++;
571
- }
444
+ const seq = slotState.seq != null ? String(slotState.seq) : '';
445
+ const workerId = `${this.ctx.projectName}:${slotName}:${seq}`;
446
+ const handle = this.supervisor.get(workerId);
447
+ // If Supervisor doesn't track it, orphan check handles it
448
+ if (!handle)
572
449
  continue;
573
- }
574
- // PID alive — check output health
450
+ // If worker already exited, Supervisor exit callback handles it
451
+ if (handle.exitCode !== null)
452
+ continue;
453
+ // Worker is alive — check output health
575
454
  const nowMs = Date.now();
576
455
  const claimedMs = new Date(slotState.claimedAt).getTime();
577
456
  const elapsedS = (nowMs - claimedMs) / 1000;
@@ -588,28 +467,22 @@ export class MonitorEngine {
588
467
  const launchTimeout = this.ctx.config.WORKER_LAUNCH_TIMEOUT_S;
589
468
  const idleTimeout = this.ctx.config.WORKER_IDLE_TIMEOUT_M * 60 * 1000;
590
469
  // Launch timeout: process alive but no output after N seconds
591
- if (outputSize === 0 && elapsedS > launchTimeout) {
592
- const seq = slotState.seq != null ? String(slotState.seq) : null;
593
- if (seq) {
594
- this.log.warn(`Worker health: ${slotName} has no output after ${Math.round(elapsedS)}s, killing and retrying seq ${seq}`);
595
- this.killWorker(slotState.pid);
596
- await this.autoRetry(seq, slotName, state, actions, `No output after ${Math.round(elapsedS)}s`);
597
- issues++;
598
- }
470
+ if (outputSize === 0 && elapsedS > launchTimeout && seq) {
471
+ this.log.warn(`Worker health: ${slotName} has no output after ${Math.round(elapsedS)}s, killing and retrying seq ${seq}`);
472
+ await this.supervisor.kill(workerId);
473
+ await this.autoRetry(seq, slotName, state, actions, `No output after ${Math.round(elapsedS)}s`);
474
+ issues++;
599
475
  continue;
600
476
  }
601
477
  // Idle timeout: process alive but no new output for N minutes
602
- if (outputSize > 0 && outputMtimeMs > 0) {
478
+ if (outputSize > 0 && outputMtimeMs > 0 && seq) {
603
479
  const idleSinceMs = nowMs - outputMtimeMs;
604
480
  if (idleSinceMs > idleTimeout) {
605
- const seq = slotState.seq != null ? String(slotState.seq) : null;
606
- if (seq) {
607
- const idleMin = Math.round(idleSinceMs / 60000);
608
- this.log.warn(`Worker health: ${slotName} no output for ${idleMin}min, killing and retrying seq ${seq}`);
609
- this.killWorker(slotState.pid);
610
- await this.autoRetry(seq, slotName, state, actions, `No output for ${idleMin}min`);
611
- issues++;
612
- }
481
+ const idleMin = Math.round(idleSinceMs / 60000);
482
+ this.log.warn(`Worker health: ${slotName} no output for ${idleMin}min, killing and retrying seq ${seq}`);
483
+ await this.supervisor.kill(workerId);
484
+ await this.autoRetry(seq, slotName, state, actions, `No output for ${idleMin}min`);
485
+ issues++;
613
486
  }
614
487
  }
615
488
  }
@@ -621,21 +494,9 @@ export class MonitorEngine {
621
494
  : 'All active workers healthy',
622
495
  });
623
496
  }
624
- killWorker(pid) {
625
- try {
626
- process.kill(pid, 'SIGTERM');
627
- }
628
- catch { /* already dead */ }
629
- // Give it a moment, then force kill
630
- try {
631
- process.kill(pid, 0); // check alive
632
- process.kill(pid, 'SIGKILL');
633
- }
634
- catch { /* dead */ }
635
- }
497
+ // ─── Auto-retry ───────────────────────────────────────────────
636
498
  async autoRetry(seq, slotName, state, actions, reason) {
637
499
  const restartLimit = this.ctx.config.WORKER_RESTART_LIMIT;
638
- // Get retry count from activeCards
639
500
  const activeCard = state.activeCards[seq];
640
501
  const retryCount = activeCard?.retryCount ?? 0;
641
502
  // Release the slot
@@ -655,12 +516,10 @@ export class MonitorEngine {
655
516
  };
656
517
  delete state.activeCards[seq];
657
518
  if (retryCount < restartLimit) {
658
- // Move back to Todo for re-launch on next tick
659
519
  try {
660
520
  await this.taskBackend.move(seq, 'Todo');
661
521
  await this.removeLabelSafe(seq, 'CLAIMED');
662
522
  await this.removeLabelSafe(seq, 'STALE-RUNTIME');
663
- // Track retry count — store in a fresh activeCards entry
664
523
  state.activeCards[seq] = {
665
524
  seq: parseInt(seq, 10),
666
525
  state: 'Todo',
@@ -714,94 +573,7 @@ export class MonitorEngine {
714
573
  this.logEvent('retry-exhausted', seq, 'ok', { retryCount, reason });
715
574
  }
716
575
  }
717
- async removeLabelSafe(seq, label) {
718
- try {
719
- await this.taskBackend.removeLabel(seq, label);
720
- }
721
- catch { /* best effort */ }
722
- }
723
- // ─── Completion-aware helpers for dead print-mode workers ─────
724
- /**
725
- * Check if a dead print-mode worker actually completed its task.
726
- * Uses workerProvider.detectCompleted() with the slot's session/branch info.
727
- */
728
- async checkPrintWorkerCompletion(slotState) {
729
- if (!slotState.tmuxSession)
730
- return 'DEAD';
731
- const branch = slotState.branch || '';
732
- const logDir = this.ctx.paths.logsDir;
733
- try {
734
- return await this.workerProvider.detectCompleted(slotState.tmuxSession, logDir, branch);
735
- }
736
- catch (err) {
737
- const msg = err instanceof Error ? err.message : String(err);
738
- this.log.debug(`detectCompleted failed for ${slotState.tmuxSession}: ${msg}`);
739
- return 'DEAD';
740
- }
741
- }
742
- /**
743
- * Handle a dead print-mode worker that has been confirmed as COMPLETED.
744
- * Mutates state in-place (releases slot, updates activeCards).
745
- * Does NOT call writeState — caller is responsible for flushing state.
746
- */
747
- /**
748
- * Handle a dead print-mode worker confirmed as COMPLETED.
749
- * Returns true if card moved to Done + slot released (state mutated).
750
- * Returns false if move failed (state NOT mutated, caller should not count as handled).
751
- */
752
- async handleCompletedWorker(slotName, slotState, state, actions) {
753
- const seq = slotState.seq != null ? String(slotState.seq) : null;
754
- if (!seq)
755
- return false;
756
- // 1. Move card to Done FIRST — if this fails, don't touch state
757
- const targetState = 'Done';
758
- try {
759
- await this.taskBackend.move(seq, targetState);
760
- }
761
- catch (err) {
762
- const msg = err instanceof Error ? err.message : String(err);
763
- this.log.error(`seq ${seq}: Failed to move to ${targetState}: ${msg}. Slot NOT released.`);
764
- actions.push({
765
- action: 'complete',
766
- entity: `seq:${seq}`,
767
- result: 'fail',
768
- message: `Move to ${targetState} failed: ${msg}`,
769
- });
770
- return false;
771
- }
772
- // 2. Done confirmed — now release slot + cleanup (mutate state)
773
- state.workers[slotName] = {
774
- status: 'idle', seq: null, branch: null, worktree: null,
775
- tmuxSession: null, claimedAt: null, lastHeartbeat: null,
776
- mode: null, sessionId: null, pid: null, outputFile: null, exitCode: null,
777
- };
778
- delete state.activeCards[seq];
779
- try {
780
- await this.taskBackend.releaseClaim(seq);
781
- }
782
- catch { /* best effort */ }
783
- // 3. Mark worktree for cleanup
784
- const branch = slotState.branch || '';
785
- const worktreePath = slotState.worktree || '';
786
- if (branch && worktreePath) {
787
- const cleanup = state.worktreeCleanup ?? [];
788
- if (!cleanup.some((e) => e.branch === branch)) {
789
- cleanup.push({ branch, worktreePath, markedAt: new Date().toISOString() });
790
- state.worktreeCleanup = cleanup;
791
- }
792
- }
793
- this.log.ok(`seq ${seq}: Worker completed (detected by monitor), moved to ${targetState}`);
794
- await this.notifySafe(`seq:${seq} worker completed, moved to ${targetState}`, 'success');
795
- actions.push({
796
- action: 'complete',
797
- entity: `seq:${seq}`,
798
- result: 'ok',
799
- message: `Worker completed (PID dead, artifacts verified) → ${targetState}`,
800
- });
801
- this.logEvent('complete', seq, 'ok', { detectedBy: 'monitor' });
802
- return true;
803
- }
804
- // ─── Helpers ───────────────────────────────────────────────────
576
+ // ─── Helpers ──────────────────────────────────────────────────
805
577
  buildBranchName(card) {
806
578
  const slug = card.name
807
579
  .toLowerCase()
@@ -819,6 +591,12 @@ export class MonitorEngine {
819
591
  this.log.error(`Failed to add label ${label} to seq ${seq}: ${msg}`);
820
592
  }
821
593
  }
594
+ async removeLabelSafe(seq, label) {
595
+ try {
596
+ await this.taskBackend.removeLabel(seq, label);
597
+ }
598
+ catch { /* best effort */ }
599
+ }
822
600
  async notifySafe(message, level) {
823
601
  if (!this.notifier)
824
602
  return;