@coralai/sps-cli 0.23.21 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +16 -7
  2. package/dist/commands/cardDashboard.js +3 -3
  3. package/dist/commands/cardDashboard.js.map +1 -1
  4. package/dist/commands/pipelineTick.d.ts.map +1 -1
  5. package/dist/commands/pipelineTick.js +19 -6
  6. package/dist/commands/pipelineTick.js.map +1 -1
  7. package/dist/commands/qaTick.d.ts.map +1 -1
  8. package/dist/commands/qaTick.js +33 -4
  9. package/dist/commands/qaTick.js.map +1 -1
  10. package/dist/commands/status.d.ts.map +1 -1
  11. package/dist/commands/status.js +2 -5
  12. package/dist/commands/status.js.map +1 -1
  13. package/dist/commands/tick.d.ts.map +1 -1
  14. package/dist/commands/tick.js +56 -35
  15. package/dist/commands/tick.js.map +1 -1
  16. package/dist/commands/workerDashboard.d.ts.map +1 -1
  17. package/dist/commands/workerDashboard.js +9 -9
  18. package/dist/commands/workerDashboard.js.map +1 -1
  19. package/dist/commands/workerLaunch.d.ts.map +1 -1
  20. package/dist/commands/workerLaunch.js +19 -6
  21. package/dist/commands/workerLaunch.js.map +1 -1
  22. package/dist/core/acpState.js +1 -1
  23. package/dist/core/acpState.js.map +1 -1
  24. package/dist/core/config.d.ts +9 -0
  25. package/dist/core/config.d.ts.map +1 -1
  26. package/dist/core/config.js +13 -0
  27. package/dist/core/config.js.map +1 -1
  28. package/dist/core/runtimeSnapshot.d.ts +1 -0
  29. package/dist/core/runtimeSnapshot.d.ts.map +1 -1
  30. package/dist/core/runtimeSnapshot.js +6 -6
  31. package/dist/core/runtimeSnapshot.js.map +1 -1
  32. package/dist/core/runtimeStore.d.ts +23 -1
  33. package/dist/core/runtimeStore.d.ts.map +1 -1
  34. package/dist/core/runtimeStore.js +71 -32
  35. package/dist/core/runtimeStore.js.map +1 -1
  36. package/dist/core/state.d.ts +33 -0
  37. package/dist/core/state.d.ts.map +1 -1
  38. package/dist/core/state.js +6 -0
  39. package/dist/core/state.js.map +1 -1
  40. package/dist/core/taskPrompts.d.ts.map +1 -1
  41. package/dist/core/taskPrompts.js +13 -9
  42. package/dist/core/taskPrompts.js.map +1 -1
  43. package/dist/core/workerRuntimeSummary.d.ts +1 -2
  44. package/dist/core/workerRuntimeSummary.d.ts.map +1 -1
  45. package/dist/core/workerRuntimeSummary.js +2 -2
  46. package/dist/core/workerRuntimeSummary.js.map +1 -1
  47. package/dist/engines/CloseoutEngine.d.ts +3 -6
  48. package/dist/engines/CloseoutEngine.d.ts.map +1 -1
  49. package/dist/engines/CloseoutEngine.js +113 -285
  50. package/dist/engines/CloseoutEngine.js.map +1 -1
  51. package/dist/engines/EventHandler.d.ts +57 -0
  52. package/dist/engines/EventHandler.d.ts.map +1 -0
  53. package/dist/engines/EventHandler.js +210 -0
  54. package/dist/engines/EventHandler.js.map +1 -0
  55. package/dist/engines/ExecutionEngine.d.ts +5 -17
  56. package/dist/engines/ExecutionEngine.d.ts.map +1 -1
  57. package/dist/engines/ExecutionEngine.js +110 -368
  58. package/dist/engines/ExecutionEngine.js.map +1 -1
  59. package/dist/engines/MonitorEngine.d.ts.map +1 -1
  60. package/dist/engines/MonitorEngine.js +8 -9
  61. package/dist/engines/MonitorEngine.js.map +1 -1
  62. package/dist/manager/integration-queue.d.ts +65 -0
  63. package/dist/manager/integration-queue.d.ts.map +1 -0
  64. package/dist/manager/integration-queue.js +123 -0
  65. package/dist/manager/integration-queue.js.map +1 -0
  66. package/dist/manager/recovery.d.ts.map +1 -1
  67. package/dist/manager/recovery.js +10 -9
  68. package/dist/manager/recovery.js.map +1 -1
  69. package/dist/manager/runtime-coordinator.d.ts +1 -3
  70. package/dist/manager/runtime-coordinator.d.ts.map +1 -1
  71. package/dist/manager/runtime-coordinator.js +13 -15
  72. package/dist/manager/runtime-coordinator.js.map +1 -1
  73. package/dist/manager/worker-manager-impl.d.ts +81 -0
  74. package/dist/manager/worker-manager-impl.d.ts.map +1 -0
  75. package/dist/manager/worker-manager-impl.js +648 -0
  76. package/dist/manager/worker-manager-impl.js.map +1 -0
  77. package/dist/manager/worker-manager.d.ts +176 -0
  78. package/dist/manager/worker-manager.d.ts.map +1 -0
  79. package/dist/manager/worker-manager.js +12 -0
  80. package/dist/manager/worker-manager.js.map +1 -0
  81. package/dist/models/acp.d.ts +4 -0
  82. package/dist/models/acp.d.ts.map +1 -1
  83. package/package.json +1 -1
@@ -0,0 +1,648 @@
1
+ /**
2
+ * WorkerManagerImpl — concrete implementation of the WorkerManager interface.
3
+ *
4
+ * Wraps ProcessSupervisor, CompletionJudge, ResourceLimiter into the
5
+ * unified ACP interface. PM operations are delegated to SPSEventHandler
6
+ * via the event system (Phase 3 refactor).
7
+ *
8
+ * Phase 4: recover() fully implemented with decision matrix from doc-09 §11.3.
9
+ */
10
+ import { execFileSync } from 'node:child_process';
11
+ import { readState, writeState, createIdleWorkerSlot } from '../core/state.js';
12
+ import { IntegrationQueue } from './integration-queue.js';
13
// ─── Timeout Defaults ──────────────────────────────────────────
// Base budgets per lifecycle stage; the hard-kill deadline is the
// base value scaled by `forceMultiplier`.
const DEFAULT_TIMEOUTS = {
    startupSec: 60, // worker must come up within a minute
    developmentSec: 4 * 3600, // four-hour budget for development work
    integrationSec: 3600, // one-hour budget for integration/merging
    inputWaitSec: 1800, // half an hour allowed to sit waiting on input
    forceMultiplier: 1.5, // hard kill once 1.5x the base timeout elapses
};
21
// ─── Implementation ────────────────────────────────────────────
export class WorkerManagerImpl {
    // Injected collaborators (see constructor `deps`).
    supervisor; // proc-transport spawn/kill/monitor (ProcessSupervisor)
    completionJudge; // judges exit results from git/output evidence
    resourceLimiter; // global concurrency gate (tryAcquire/release/enforceStagger)
    agentRuntime; // PTY transport runtime; required only for pty operations
    stateFile; // path to the shared persisted worker-state file
    maxWorkers; // number of worker slots tracked in the state file
    integrationQueue; // per (project, targetBranch) serialization of integration runs
    eventHandlers = []; // subscribers registered via onEvent()
    taskSlotMap = new Map(); // taskId -> slot for currently tracked tasks
    timeouts = new Map(); // taskId and `${taskId}:hard` -> active timer handles
    /**
     * @param {object} deps - collaborator bundle. `integrationQueue` is
     *   optional and defaults to a queue backed by the same state file
     *   and worker count.
     */
    constructor(deps) {
        this.supervisor = deps.supervisor;
        this.completionJudge = deps.completionJudge;
        this.resourceLimiter = deps.resourceLimiter;
        this.agentRuntime = deps.agentRuntime;
        this.stateFile = deps.stateFile;
        this.maxWorkers = deps.maxWorkers;
        this.integrationQueue = deps.integrationQueue ?? new IntegrationQueue(deps.stateFile, deps.maxWorkers);
    }
42
+ // ─── run / resume ────────────────────────────────────────────
43
+ async run(request) {
44
+ return this.acquireAndSpawn({
45
+ taskId: request.taskId, cardId: request.cardId, project: request.project,
46
+ phase: request.phase, prompt: request.prompt, cwd: request.cwd,
47
+ branch: request.branch, targetBranch: request.targetBranch,
48
+ tool: request.tool, transport: request.transport,
49
+ outputFile: request.outputFile, maxRetries: request.maxRetries ?? 0,
50
+ customTimeoutSec: request.timeoutSec,
51
+ }, 'wm-run');
52
+ }
53
+ async resume(request) {
54
+ return this.acquireAndSpawn({
55
+ taskId: request.taskId, cardId: request.cardId, project: request.project,
56
+ phase: request.phase, prompt: request.prompt, cwd: request.cwd,
57
+ branch: request.branch, targetBranch: request.targetBranch,
58
+ tool: request.tool, transport: request.transport,
59
+ outputFile: request.outputFile, maxRetries: 0,
60
+ resumeSessionId: request.sessionId,
61
+ }, 'wm-resume');
62
+ }
63
// ─── cancel ──────────────────────────────────────────────────
/**
 * Cancel a task: either drop it from the integration waiting list
 * (never spawned) or kill its running worker and release its slot.
 * Emits a `run.failed` event on both paths so SPS can react.
 * @param {{taskId: string, project: string, reason: string}} request
 */
async cancel(request) {
    const { taskId, project, reason } = request;
    // ── Check if the task is queued (not yet spawned) ───────────
    // NOTE(review): assumes getPosition() returns >0 for waiting entries
    // and 0 for the active entry — confirm against IntegrationQueue.
    const queuePos = this.integrationQueue.getPosition(taskId);
    if (queuePos > 0) {
        // Task is in waiting list — remove without killing any worker
        this.integrationQueue.remove(taskId);
        this.emitEvent({
            type: 'run.failed', taskId, cardId: taskId, project,
            phase: 'integration', slot: '', workerId: '',
            timestamp: new Date().toISOString(), state: 'failed',
            error: `Cancelled from queue (position=${queuePos}): ${reason}`,
        });
        this.log(`Removed queued task ${taskId} from integration queue (reason=${reason})`);
        return;
    }
    const slot = this.taskSlotMap.get(taskId);
    if (!slot) {
        // Neither queued nor tracked — nothing to kill.
        this.log(`Cancel: task ${taskId} not found`);
        return;
    }
    // Determine phase from lease to know if we need to advance queue
    const lease = this.rd().leases[taskId];
    const isIntegration = lease
        ? (lease.pmStateObserved === 'QA' || lease.phase === 'merging' || lease.phase === 'resolving_conflict')
        : queuePos === 0;
    // Stop both timer stages first so a timeout cannot re-enter cancel.
    this.clearTimeoutForTask(taskId);
    const workerId = `${project}:${slot}:${taskId}`;
    await this.supervisor.kill(workerId);
    this.releaseSlotInState(slot, taskId);
    this.resourceLimiter.release();
    this.taskSlotMap.delete(taskId);
    this.emitEvent({
        type: 'run.failed', taskId, cardId: taskId, project,
        phase: isIntegration ? 'integration' : 'development', slot, workerId,
        timestamp: new Date().toISOString(), state: 'failed',
        error: `Cancelled: ${reason}`,
    });
    this.log(`Cancelled worker ${workerId} (reason=${reason})`);
    // ── If active integration worker was cancelled, advance queue ─
    if (isIntegration) {
        // lease.branch is the task's own branch — only a fallback; the
        // queue scan below finds the real target branch when possible.
        const targetBranch = lease?.branch ?? 'main';
        // Find the actual targetBranch from the queue active entry
        const state = this.rd();
        let actualTarget = targetBranch;
        for (const [key, q] of Object.entries(state.integrationQueues)) {
            if (q.active?.taskId === taskId) {
                // Queue keys look like `<project>:<targetBranch>` — strip the project.
                actualTarget = key.split(':').slice(1).join(':');
                break;
            }
        }
        await this.advanceIntegrationQueue(project, actualTarget);
    }
}
118
+ // ─── inspect ─────────────────────────────────────────────────
119
+ inspect(query) {
120
+ const state = this.rd();
121
+ const snapshots = [];
122
+ for (const [slotName, worker] of Object.entries(state.workers)) {
123
+ if (query.slot && query.slot !== slotName)
124
+ continue;
125
+ if (query.taskId && worker.seq !== null && String(worker.seq) !== query.taskId)
126
+ continue;
127
+ const seq = worker.seq !== null ? String(worker.seq) : null;
128
+ const activeCard = seq ? state.activeCards[seq] ?? null : null;
129
+ const lease = seq ? state.leases[seq] ?? null : null;
130
+ snapshots.push({
131
+ slot: slotName, taskId: seq,
132
+ cardId: activeCard ? String(activeCard.seq) : seq,
133
+ project: query.project ?? '',
134
+ state: this.mapWorkerState(worker),
135
+ phase: lease ? (lease.phase === 'merging' || lease.phase === 'resolving_conflict' ? 'integration' : 'development') : null,
136
+ pid: worker.pid ?? null, sessionId: worker.sessionId ?? null,
137
+ cwd: worker.worktree, branch: worker.branch,
138
+ startedAt: worker.claimedAt,
139
+ updatedAt: worker.lastHeartbeat ?? worker.claimedAt ?? new Date().toISOString(),
140
+ outputTail: null, pendingInput: null,
141
+ });
142
+ }
143
+ return snapshots;
144
+ }
145
+ onEvent(handler) { this.eventHandlers.push(handler); }
146
+ // ─── sendInput / confirm ─────────────────────────────────────
147
+ async sendInput(request) {
148
+ const slot = this.requirePtySlot(request.taskId, 'sendInput');
149
+ await this.agentRuntime.resumeRun(slot, request.input);
150
+ this.log(`Sent input to task ${request.taskId} in ${slot}`);
151
+ }
152
+ async confirm(request) {
153
+ const slot = this.requirePtySlot(request.taskId, 'confirm');
154
+ const input = request.action === 'confirm' ? (request.message ?? 'yes') : (request.message ?? 'no');
155
+ await this.agentRuntime.resumeRun(slot, input);
156
+ this.log(`Confirmed (${request.action}) task ${request.taskId} in ${slot}`);
157
+ }
158
// ─── recover (Phase 4 — full decision matrix from doc-09 §11.3) ──
/**
 * Crash recovery: scan persisted leases per project, decide the fate of
 * each task (R1–R9 decision matrix), rebuild integration queues, then
 * emit the collected events so SPSEventHandler can process them.
 * @param {Array} contexts - one entry per project; each supplies at
 *   least `stateFile`, `project`, and `baseBranch`.
 * @returns recovery counters plus the emitted events.
 */
async recover(contexts) {
    const result = {
        scanned: 0, alive: 0, completed: 0, failed: 0,
        released: 0, rebuilt: 0, queueRebuilt: 0, events: [],
    };
    // Phase 1+2: Scan leases and apply per-task decision matrix
    for (const ctx of contexts) {
        const state = readState(ctx.stateFile, this.maxWorkers);
        for (const [seq, lease] of Object.entries(state.leases)) {
            // Terminal lease phases need no recovery action.
            if (lease.phase === 'released' || lease.phase === 'suspended')
                continue;
            result.scanned++;
            const slot = lease.slot;
            const worker = slot ? state.workers[slot] ?? null : null;
            const pid = worker?.pid ?? null;
            const isAlive = pid ? this.isPidAlive(pid) : false;
            const evidence = state.worktreeEvidence[seq] ?? null;
            const pmState = lease.pmStateObserved;
            // R8/R9: PM manually completed or reverted — release immediately
            // NOTE(review): releaseSlotInState() reads/writes this.stateFile,
            // while this loop reads ctx.stateFile — confirm they are always
            // the same file, otherwise the release lands in the wrong state.
            if (pmState === 'Done' || pmState === 'Backlog' || pmState === 'Todo') {
                if (slot)
                    this.releaseSlotInState(slot, seq);
                result.released++;
                this.log(`Recovery R8/R9: task ${seq} PM state=${pmState}, released`);
                continue;
            }
            // R1: Worker still running — re-attach orphan PID monitoring
            if (isAlive && pid && slot) {
                this.recoverAliveWorker(ctx, seq, lease, slot, pid);
                result.alive++;
                continue;
            }
            // Dead worker — check git evidence for decision
            const event = this.judgeDeadWorker(ctx, seq, lease, slot, evidence);
            if (event) {
                result.events.push(event);
                if (event.type === 'run.completed')
                    result.completed++;
                else
                    result.failed++;
            }
            else {
                // Released (R7: worktree missing or fallback)
                result.released++;
            }
        }
        // Phase 3: Rebuild integration queues from merging/resolving leases
        this.rebuildIntegrationQueue(ctx, state, result);
    }
    // Phase 4: Emit collected events so SPSEventHandler processes them.
    // Emission is deferred until all projects are scanned so handlers
    // observe a settled post-recovery state.
    for (const event of result.events) {
        this.emitEvent(event);
    }
    if (result.scanned > 0) {
        this.log(`Recovery complete: scanned=${result.scanned} alive=${result.alive} ` +
            `completed=${result.completed} failed=${result.failed} ` +
            `released=${result.released} rebuilt=${result.rebuilt} queueRebuilt=${result.queueRebuilt}`);
    }
    return result;
}
219
// ─── Private: Unified acquire + spawn flow ───────────────────
/**
 * Single entry point behind run()/resume()/queue-dequeue:
 *   1. reject duplicates,
 *   2. gate integration runs through the per-target-branch queue,
 *   3. acquire a resource token and an idle slot,
 *   4. persist the claim, then spawn via proc supervisor or PTY runtime.
 * Every failure path rolls back exactly what it acquired.
 * @returns {{accepted: boolean, queued?: boolean, ...}} accept/queued/reject response.
 */
async acquireAndSpawn(ctx, label) {
    const { taskId, cardId, project, phase, prompt, cwd, branch, targetBranch, tool, transport, outputFile, maxRetries, resumeSessionId } = ctx;
    if (this.taskSlotMap.has(taskId)) {
        this.log(`Duplicate task ${taskId}, already in slot ${this.taskSlotMap.get(taskId)}`);
        return this.reject('duplicate_task');
    }
    // ── Integration queue gate ──────────────────────────────────
    // Integration runs are serialized per (project, targetBranch).
    if (phase === 'integration') {
        const entry = {
            taskId, cardId, project, prompt, cwd, branch, targetBranch,
            tool, transport, outputFile, enqueuedAt: new Date().toISOString(),
        };
        const active = this.integrationQueue.getActive(project, targetBranch);
        if (active) {
            const { position } = this.integrationQueue.enqueue(entry);
            this.log(`Integration task ${taskId} queued at position ${position} (active=${active.taskId})`);
            return { accepted: true, queued: true, queuePosition: position, slot: null, workerId: null };
        }
        // No active — register as active before spawning
        this.integrationQueue.enqueue(entry);
    }
    if (!this.resourceLimiter.tryAcquire()) {
        this.log(`Resource exhausted for task ${taskId}`);
        // If we just registered as active in the queue, roll back
        // NOTE(review): assumes dequeueNext() pops the entry we just made
        // active — confirm it cannot instead promote a different waiting entry.
        if (phase === 'integration') {
            this.integrationQueue.dequeueNext(project, targetBranch);
        }
        return this.reject('resource_exhausted');
    }
    const state = this.rd();
    const slot = this.findIdleSlot(state);
    if (!slot) {
        // Roll back the limiter token (and queue registration) we hold.
        this.resourceLimiter.release();
        if (phase === 'integration') {
            this.integrationQueue.dequeueNext(project, targetBranch);
        }
        this.log(`No idle slot for task ${taskId}`);
        return this.reject('resource_exhausted');
    }
    // Stagger spawns so concurrent launches don't thundering-herd shared resources.
    await this.resourceLimiter.enforceStagger();
    const nowIso = new Date().toISOString();
    // Persist the claim BEFORE spawning so recovery can see it if we crash mid-spawn.
    this.claimSlot(state, slot, { seq: taskId, cardId, project, phase, branch, cwd, tool, transport, outputFile, nowIso, targetBranch });
    this.wr(state, label);
    this.taskSlotMap.set(taskId, slot);
    const workerId = `${project}:${slot}:${taskId}`;
    let pid = null;
    let sessionId;
    try {
        if (transport === 'proc') {
            const handle = this.supervisor.spawn({
                id: workerId, project, seq: taskId, slot, worktree: cwd, branch,
                prompt, outputFile, tool, resumeSessionId,
                // NOTE(review): handleExit is async; its returned promise is not
                // awaited or caught here — confirm the supervisor observes it.
                onExit: (exitCode) => this.handleExit({
                    workerId, taskId, cardId, project, phase, slot, branch, cwd,
                    targetBranch, outputFile, tool, transport, exitCode, maxRetries,
                }),
            });
            pid = handle.pid;
            sessionId = handle.sessionId ?? undefined;
        }
        else if (transport === 'pty') {
            if (!this.agentRuntime) {
                throw new Error('PTY transport requires agentRuntime');
            }
            const session = resumeSessionId
                ? await this.agentRuntime.resumeRun(slot, prompt)
                : await this.agentRuntime.startRun(slot, prompt, tool, cwd);
            sessionId = session.sessionId;
            pid = session.pid ?? null;
        }
    }
    catch (err) {
        // Full rollback: limiter token, persisted slot claim, task map, queue slot.
        this.log(`Spawn failed for ${taskId}: ${err instanceof Error ? err.message : String(err)}`);
        this.resourceLimiter.release();
        this.releaseSlotInState(slot, taskId);
        this.taskSlotMap.delete(taskId);
        if (phase === 'integration') {
            this.integrationQueue.remove(taskId);
            await this.advanceIntegrationQueue(project, targetBranch);
        }
        this.emitEvent({
            type: 'run.failed', taskId, cardId, project, phase, slot, workerId,
            timestamp: new Date().toISOString(), state: 'failed',
            error: `Spawn failed: ${err instanceof Error ? err.message : String(err)}`,
        });
        return this.reject('spawn_failed');
    }
    this.log(`Launched ${transport}/${tool} worker ${workerId} in ${slot} (pid=${pid})`);
    this.startTimeout(taskId, phase, project, slot, ctx.customTimeoutSec);
    return { accepted: true, slot, workerId, pid: pid ?? undefined, sessionId };
}
311
// ─── Private: Exit Handler ───────────────────────────────────
/**
 * Common exit path for workers: judge completion from git/output
 * evidence, emit the lifecycle event, release local resources, and
 * advance the integration queue when applicable.
 * NOTE: slot release and PM state updates happen in SPSEventHandler,
 * driven by the event emitted here — not in this method.
 */
async handleExit(ctx) {
    // maxRetries is destructured but unused here — presumably retries are
    // handled by the event consumer; TODO confirm.
    const { workerId, taskId, cardId, project, phase, slot, branch, cwd, targetBranch, outputFile, tool, transport, exitCode, maxRetries } = ctx;
    this.clearTimeoutForTask(taskId);
    this.log(`Worker ${workerId} exited with code ${exitCode}`);
    // Exit code alone is not trusted — the judge inspects worktree evidence too.
    const completion = this.completionJudge.judge({
        worktree: cwd, branch, baseBranch: targetBranch, outputFile, exitCode, phase,
    });
    this.log(`CompletionJudge for ${workerId}: ${completion.status} (${completion.reason})`);
    const isComplete = completion.status === 'completed';
    // Emit event — SPSEventHandler handles PM operations, slot release, notifications
    this.emitEvent({
        type: isComplete ? 'run.completed' : 'run.failed',
        taskId, cardId, project, phase, slot, workerId,
        timestamp: new Date().toISOString(),
        state: isComplete ? 'completed' : 'failed',
        exitCode, completionResult: completion,
    });
    // Release supervisor handle and resource limiter slot
    this.supervisor.remove(workerId);
    this.resourceLimiter.release();
    this.taskSlotMap.delete(taskId);
    // ── Auto-dequeue next integration task ──────────────────────
    if (phase === 'integration') {
        await this.advanceIntegrationQueue(project, targetBranch);
    }
}
338
/**
 * Try to spawn the next queued integration task for (project, targetBranch).
 * On spawn failure, skip and try the next entry — never deadlock.
 * Loop terminates when the queue empties or a task actually spawns.
 */
async advanceIntegrationQueue(project, targetBranch) {
    // eslint-disable-next-line no-constant-condition
    while (true) {
        const next = this.integrationQueue.dequeueNext(project, targetBranch);
        if (!next) {
            this.log(`Integration queue empty for ${project}:${targetBranch}`);
            return;
        }
        this.log(`Auto-dequeuing integration task ${next.taskId} for ${project}:${targetBranch}`);
        // Skip entries with empty prompt (recovery stubs — SPS must re-prepare)
        if (!next.prompt) {
            this.log(`Skipping ${next.taskId}: empty prompt (needs SPS re-preparation)`);
            this.emitEvent({
                type: 'run.failed', taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', slot: '', workerId: '',
                timestamp: new Date().toISOString(), state: 'failed',
                error: 'Empty prompt — needs SPS re-preparation after recovery',
            });
            continue;
        }
        try {
            // Re-enter the normal spawn path; it re-registers this entry as
            // the active integration task for the target branch.
            const resp = await this.acquireAndSpawn({
                taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', prompt: next.prompt, cwd: next.cwd,
                branch: next.branch, targetBranch: next.targetBranch,
                tool: next.tool, transport: next.transport,
                outputFile: next.outputFile, maxRetries: 0,
            }, 'wm-iq-dequeue');
            if (resp.accepted && !resp.queued) {
                this.log(`Integration task ${next.taskId} spawned after dequeue`);
                return;
            }
            // If accepted but re-queued, something odd — keep going
            this.log(`Integration task ${next.taskId} could not spawn (accepted=${resp.accepted}), trying next`);
        }
        catch (err) {
            this.log(`Failed to spawn dequeued task ${next.taskId}: ${err instanceof Error ? err.message : String(err)}`);
            // Emit failure event so SPS knows this task was dropped
            this.emitEvent({
                type: 'run.failed', taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', slot: '', workerId: '',
                timestamp: new Date().toISOString(), state: 'failed',
                error: `Dequeue spawn failed: ${err instanceof Error ? err.message : String(err)}`,
            });
            // Continue to next entry — never deadlock
        }
    }
}
390
+ // ─── Private: Recovery Helpers ────────────────────────────────
391
+ /**
392
+ * R1: Worker PID is still alive — re-attach orphan monitoring.
393
+ */
394
+ recoverAliveWorker(ctx, seq, lease, slot, pid) {
395
+ const workerId = `${ctx.project}:${slot}:${seq}`;
396
+ const phase = (lease.phase === 'merging' || lease.phase === 'resolving_conflict')
397
+ ? 'integration' : 'development';
398
+ this.resourceLimiter.tryAcquire();
399
+ this.supervisor.monitorOrphanPid(workerId, pid, {
400
+ id: workerId, transport: 'proc', pid,
401
+ outputFile: null, project: ctx.project, seq, slot,
402
+ branch: lease.branch ?? '', worktree: lease.worktree ?? '',
403
+ tool: 'claude', exitCode: null, sessionId: lease.sessionId ?? null,
404
+ runId: lease.runId ?? null, sessionState: null, remoteStatus: null,
405
+ lastEventAt: null,
406
+ startedAt: lease.claimedAt ?? new Date().toISOString(),
407
+ exitedAt: null,
408
+ }, (exitCode) => this.handleExit({
409
+ workerId, taskId: seq, cardId: seq, project: ctx.project,
410
+ phase, slot, branch: lease.branch ?? '',
411
+ cwd: lease.worktree ?? '', targetBranch: ctx.baseBranch,
412
+ outputFile: '', tool: 'claude', transport: 'proc',
413
+ exitCode, maxRetries: 0,
414
+ }));
415
+ this.taskSlotMap.set(seq, slot);
416
+ this.log(`Recovery R1: task ${seq} alive (pid=${pid}), re-attached monitor`);
417
+ }
418
/**
 * Decision matrix for dead workers (R2–R7).
 * Returns a WorkerEvent for completed/failed, or null if the slot was
 * released instead. Checks run in priority order: merged > pushed >
 * unpushed local commits > dirty git state > missing worktree > fallback.
 */
judgeDeadWorker(ctx, seq, lease, slot, evidence) {
    const phase = (lease.phase === 'merging' || lease.phase === 'resolving_conflict')
        ? 'integration' : 'development';
    const workerId = `${ctx.project}:${slot ?? 'unknown'}:${seq}`;
    // R2: Branch merged into base — task complete
    if (evidence?.mergedToBase) {
        this.log(`Recovery R2: task ${seq} branch merged to base`);
        return this.makeRecoveryEvent('run.completed', seq, ctx, slot ?? '', phase, workerId, 'already_merged');
    }
    // R3: Pushed with commits ahead — development complete
    if (evidence?.pushed && evidence.aheadOfBase > 0) {
        this.log(`Recovery R3: task ${seq} pushed with ${evidence.aheadOfBase} commits ahead`);
        return this.makeRecoveryEvent('run.completed', seq, ctx, slot ?? '', phase, workerId, 'branch_pushed');
    }
    // R4: Local commits unpushed — rescue push, then fail for restart
    if (!evidence?.pushed && evidence && evidence.aheadOfBase > 0) {
        if (lease.worktree && lease.branch) {
            // Best-effort: a failed rescue push still results in 'needs_restart'.
            const rescued = this.rescuePush(lease.worktree, lease.branch);
            if (rescued)
                this.log(`Recovery R4: rescued push for task ${seq}`);
        }
        return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'needs_restart');
    }
    // R5: Dirty state (rebase/merge/conflict) — rescue push, then fail
    if (evidence && ['rebase', 'merge', 'conflict'].includes(evidence.gitStatus)) {
        if (lease.worktree && lease.branch) {
            const rescued = this.rescuePush(lease.worktree, lease.branch);
            if (rescued)
                this.log(`Recovery R5: rescued push for task ${seq} (dirty=${evidence.gitStatus})`);
        }
        return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'needs_restart');
    }
    // R7: Worktree missing — release slot, SPS re-prepares
    if (evidence && !evidence.worktreeExists) {
        if (slot)
            this.releaseSlotInState(slot, seq);
        this.log(`Recovery R7: task ${seq} worktree missing, released`);
        // null tells the caller to count this lease as "released".
        return null;
    }
    // R6: No changes (fallback) — failed with no artifacts.
    // Also reached when evidence is entirely absent (null).
    this.log(`Recovery R6: task ${seq} no artifacts found`);
    return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'no_artifacts');
}
465
+ /**
466
+ * Rebuild integration queue from leases in merging/resolving_conflict phase.
467
+ */
468
+ rebuildIntegrationQueue(ctx, state, result) {
469
+ const qaLeases = Object.entries(state.leases)
470
+ .filter(([, l]) => l.phase === 'merging' || l.phase === 'resolving_conflict')
471
+ .sort(([, a], [, b]) => (a.lastTransitionAt ?? '').localeCompare(b.lastTransitionAt ?? ''));
472
+ for (const [seq, lease] of qaLeases) {
473
+ this.integrationQueue.enqueue({
474
+ taskId: seq, cardId: seq, project: ctx.project,
475
+ prompt: '', // Prompt will be regenerated by SPS
476
+ cwd: lease.worktree ?? '', branch: lease.branch ?? '',
477
+ targetBranch: ctx.baseBranch, tool: 'claude', transport: 'proc',
478
+ outputFile: '', enqueuedAt: lease.lastTransitionAt,
479
+ });
480
+ result.queueRebuilt++;
481
+ }
482
+ if (qaLeases.length > 0) {
483
+ this.log(`Recovery: rebuilt ${qaLeases.length} integration queue entries for ${ctx.project}`);
484
+ }
485
+ }
486
+ /**
487
+ * Create a WorkerEvent for recovery results.
488
+ */
489
+ makeRecoveryEvent(type, seq, ctx, slot, phase, workerId, reason) {
490
+ return {
491
+ type, taskId: seq, cardId: seq, project: ctx.project,
492
+ phase, slot, workerId,
493
+ timestamp: new Date().toISOString(),
494
+ state: type === 'run.completed' ? 'completed' : 'failed',
495
+ completionResult: {
496
+ status: type === 'run.completed' ? 'completed' : 'failed',
497
+ reason,
498
+ },
499
+ };
500
+ }
501
+ /**
502
+ * Try to push unpushed commits from a worktree as a rescue operation.
503
+ */
504
+ rescuePush(worktree, branch) {
505
+ try {
506
+ execFileSync('git', ['-C', worktree, 'push', 'origin', branch], {
507
+ timeout: 30_000,
508
+ stdio: ['ignore', 'pipe', 'pipe'],
509
+ });
510
+ return true;
511
+ }
512
+ catch {
513
+ this.log(`Rescue push failed for ${branch} in ${worktree}`);
514
+ return false;
515
+ }
516
+ }
517
+ /**
518
+ * Check if a process is still alive by sending signal 0.
519
+ */
520
+ isPidAlive(pid) {
521
+ try {
522
+ process.kill(pid, 0);
523
+ return true;
524
+ }
525
+ catch {
526
+ return false;
527
+ }
528
+ }
529
+ // ─── Private: State Helpers ──────────────────────────────────
530
+ rd() { return readState(this.stateFile, this.maxWorkers); }
531
+ wr(state, by) { writeState(this.stateFile, state, by); }
532
+ findIdleSlot(state) {
533
+ return Object.entries(state.workers).find(([, w]) => w.status === 'idle')?.[0] ?? null;
534
+ }
535
/**
 * Mark a slot active and create the activeCards/leases records for a
 * freshly claimed task. Mutates `state` in place; the caller persists.
 */
claimSlot(state, slot, ctx) {
    // seq keys are strings; the numeric form is stored inside the records.
    const seqNum = parseInt(ctx.seq, 10) || 0;
    state.workers[slot] = {
        ...createIdleWorkerSlot(), status: 'active', seq: seqNum,
        branch: ctx.branch, worktree: ctx.cwd, claimedAt: ctx.nowIso, lastHeartbeat: ctx.nowIso,
        mode: ctx.transport === 'pty' ? 'pty' : 'print', transport: ctx.transport, agent: ctx.tool,
        // Output files only apply to the proc transport.
        outputFile: ctx.transport === 'proc' ? ctx.outputFile : null,
    };
    state.activeCards[ctx.seq] = {
        seq: seqNum, state: 'Inprogress', worker: slot, mrUrl: null,
        conflictDomains: [], startedAt: ctx.nowIso, retryCount: 0,
    };
    // NOTE(review): lease.phase is always 'coding', even for integration
    // claims — integration-ness is carried by pmStateObserved === 'QA'
    // (which is what cancel() checks). Confirm no consumer relies on
    // phase === 'merging' for freshly claimed integration tasks.
    state.leases[ctx.seq] = {
        seq: seqNum, pmStateObserved: ctx.phase === 'integration' ? 'QA' : 'Inprogress',
        phase: 'coding', slot, branch: ctx.branch, worktree: ctx.cwd,
        sessionId: null, runId: null, claimedAt: ctx.nowIso, retryCount: 0,
        lastTransitionAt: ctx.nowIso,
    };
}
554
+ releaseSlotInState(slot, taskId) {
555
+ const state = this.rd();
556
+ state.workers[slot] = createIdleWorkerSlot();
557
+ delete state.activeCards[taskId];
558
+ delete state.leases[taskId];
559
+ this.wr(state, 'wm-release');
560
+ }
561
+ mapWorkerState(w) {
562
+ if (w.status === 'idle')
563
+ return 'idle';
564
+ if (w.status === 'active') {
565
+ if (w.remoteStatus === 'waiting_input')
566
+ return 'waiting_input';
567
+ if (w.remoteStatus === 'needs_confirmation' || w.sessionState === 'needs_confirmation')
568
+ return 'needs_confirmation';
569
+ if (w.remoteStatus === 'completed')
570
+ return 'completed';
571
+ if (w.remoteStatus === 'failed')
572
+ return 'failed';
573
+ return 'running';
574
+ }
575
+ if (w.status === 'releasing')
576
+ return 'completed';
577
+ return 'running';
578
+ }
579
+ requirePtySlot(taskId, op) {
580
+ const slot = this.taskSlotMap.get(taskId);
581
+ if (!slot)
582
+ throw new Error(`Task ${taskId} not found`);
583
+ if (!this.agentRuntime)
584
+ throw new Error(`${op} requires PTY transport (agentRuntime not available)`);
585
+ const state = this.rd();
586
+ const w = state.workers[slot];
587
+ if (!w || (w.transport !== 'pty' && w.mode !== 'pty')) {
588
+ throw new Error(`${op} unsupported for transport=${w?.transport ?? 'unknown'}`);
589
+ }
590
+ return slot;
591
+ }
592
// ─── Private: Timeout Management ─────────────────────────────
/**
 * Arm a two-stage timeout for a task: a soft timer emits a status
 * event at the base timeout and then arms a hard timer that force-
 * cancels at forceMultiplier × base. Both timers are unref'd so they
 * never keep the process alive; clearTimeoutForTask() clears both.
 */
startTimeout(taskId, phase, project, slot, customTimeoutSec) {
    const baseSec = customTimeoutSec ?? (phase === 'integration' ? DEFAULT_TIMEOUTS.integrationSec : DEFAULT_TIMEOUTS.developmentSec);
    const hardSec = Math.ceil(baseSec * DEFAULT_TIMEOUTS.forceMultiplier);
    const workerId = `${project}:${slot}:${taskId}`;
    const softTimer = setTimeout(() => {
        // Soft stage: report only — the worker keeps running.
        this.emitEvent({
            type: 'status.update', taskId, cardId: taskId, project, phase, slot,
            workerId, timestamp: new Date().toISOString(), state: 'running',
            error: `Timeout: exceeded ${baseSec}s`,
        });
        this.log(`Soft timeout for ${taskId} after ${baseSec}s`);
        // Hard timeout — force kill (wrapped in try-catch to prevent unhandled rejection)
        // Delay is the remainder, so the kill lands at hardSec total elapsed.
        const hardTimer = setTimeout(async () => {
            try {
                this.log(`Hard timeout for ${taskId} after ${hardSec}s — force killing`);
                await this.cancel({ taskId, project, reason: 'timeout' });
            }
            catch (err) {
                this.log(`Hard timeout cancel failed for ${taskId}: ${err instanceof Error ? err.message : err}`);
            }
        }, (hardSec - baseSec) * 1000);
        hardTimer.unref();
        // The hard timer is only registered once the soft stage fires.
        this.timeouts.set(`${taskId}:hard`, hardTimer);
    }, baseSec * 1000);
    softTimer.unref();
    this.timeouts.set(taskId, softTimer);
}
620
+ clearTimeoutForTask(taskId) {
621
+ const soft = this.timeouts.get(taskId);
622
+ const hard = this.timeouts.get(`${taskId}:hard`);
623
+ if (soft) {
624
+ clearTimeout(soft);
625
+ this.timeouts.delete(taskId);
626
+ }
627
+ if (hard) {
628
+ clearTimeout(hard);
629
+ this.timeouts.delete(`${taskId}:hard`);
630
+ }
631
+ }
632
+ // ─── Private: Event + Response Helpers ───────────────────────
633
+ emitEvent(event) {
634
+ for (const handler of this.eventHandlers) {
635
+ try {
636
+ handler(event);
637
+ }
638
+ catch (err) {
639
+ this.log(`Event handler error: ${err instanceof Error ? err.message : String(err)}`);
640
+ }
641
+ }
642
+ }
643
+ reject(reason) {
644
+ return { accepted: false, slot: null, workerId: null, rejectReason: reason };
645
+ }
646
+ log(msg) { process.stderr.write(`[worker-manager] ${msg}\n`); }
647
+ }
648
+ //# sourceMappingURL=worker-manager-impl.js.map