@coralai/sps-cli 0.23.21 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -7
- package/dist/commands/cardDashboard.js +3 -3
- package/dist/commands/cardDashboard.js.map +1 -1
- package/dist/commands/pipelineTick.d.ts.map +1 -1
- package/dist/commands/pipelineTick.js +19 -6
- package/dist/commands/pipelineTick.js.map +1 -1
- package/dist/commands/qaTick.d.ts.map +1 -1
- package/dist/commands/qaTick.js +33 -4
- package/dist/commands/qaTick.js.map +1 -1
- package/dist/commands/status.d.ts.map +1 -1
- package/dist/commands/status.js +2 -5
- package/dist/commands/status.js.map +1 -1
- package/dist/commands/tick.d.ts.map +1 -1
- package/dist/commands/tick.js +56 -35
- package/dist/commands/tick.js.map +1 -1
- package/dist/commands/workerDashboard.d.ts.map +1 -1
- package/dist/commands/workerDashboard.js +9 -9
- package/dist/commands/workerDashboard.js.map +1 -1
- package/dist/commands/workerLaunch.d.ts.map +1 -1
- package/dist/commands/workerLaunch.js +19 -6
- package/dist/commands/workerLaunch.js.map +1 -1
- package/dist/core/acpState.js +1 -1
- package/dist/core/acpState.js.map +1 -1
- package/dist/core/config.d.ts +9 -0
- package/dist/core/config.d.ts.map +1 -1
- package/dist/core/config.js +13 -0
- package/dist/core/config.js.map +1 -1
- package/dist/core/runtimeSnapshot.d.ts +1 -0
- package/dist/core/runtimeSnapshot.d.ts.map +1 -1
- package/dist/core/runtimeSnapshot.js +6 -6
- package/dist/core/runtimeSnapshot.js.map +1 -1
- package/dist/core/runtimeStore.d.ts +23 -1
- package/dist/core/runtimeStore.d.ts.map +1 -1
- package/dist/core/runtimeStore.js +71 -32
- package/dist/core/runtimeStore.js.map +1 -1
- package/dist/core/state.d.ts +33 -0
- package/dist/core/state.d.ts.map +1 -1
- package/dist/core/state.js +6 -0
- package/dist/core/state.js.map +1 -1
- package/dist/core/taskPrompts.d.ts.map +1 -1
- package/dist/core/taskPrompts.js +13 -9
- package/dist/core/taskPrompts.js.map +1 -1
- package/dist/core/workerRuntimeSummary.d.ts +1 -2
- package/dist/core/workerRuntimeSummary.d.ts.map +1 -1
- package/dist/core/workerRuntimeSummary.js +2 -2
- package/dist/core/workerRuntimeSummary.js.map +1 -1
- package/dist/engines/CloseoutEngine.d.ts +3 -6
- package/dist/engines/CloseoutEngine.d.ts.map +1 -1
- package/dist/engines/CloseoutEngine.js +113 -285
- package/dist/engines/CloseoutEngine.js.map +1 -1
- package/dist/engines/EventHandler.d.ts +57 -0
- package/dist/engines/EventHandler.d.ts.map +1 -0
- package/dist/engines/EventHandler.js +210 -0
- package/dist/engines/EventHandler.js.map +1 -0
- package/dist/engines/ExecutionEngine.d.ts +5 -17
- package/dist/engines/ExecutionEngine.d.ts.map +1 -1
- package/dist/engines/ExecutionEngine.js +110 -368
- package/dist/engines/ExecutionEngine.js.map +1 -1
- package/dist/engines/MonitorEngine.d.ts.map +1 -1
- package/dist/engines/MonitorEngine.js +8 -9
- package/dist/engines/MonitorEngine.js.map +1 -1
- package/dist/manager/integration-queue.d.ts +65 -0
- package/dist/manager/integration-queue.d.ts.map +1 -0
- package/dist/manager/integration-queue.js +123 -0
- package/dist/manager/integration-queue.js.map +1 -0
- package/dist/manager/recovery.d.ts.map +1 -1
- package/dist/manager/recovery.js +10 -9
- package/dist/manager/recovery.js.map +1 -1
- package/dist/manager/runtime-coordinator.d.ts +1 -3
- package/dist/manager/runtime-coordinator.d.ts.map +1 -1
- package/dist/manager/runtime-coordinator.js +13 -15
- package/dist/manager/runtime-coordinator.js.map +1 -1
- package/dist/manager/worker-manager-impl.d.ts +81 -0
- package/dist/manager/worker-manager-impl.d.ts.map +1 -0
- package/dist/manager/worker-manager-impl.js +648 -0
- package/dist/manager/worker-manager-impl.js.map +1 -0
- package/dist/manager/worker-manager.d.ts +176 -0
- package/dist/manager/worker-manager.d.ts.map +1 -0
- package/dist/manager/worker-manager.js +12 -0
- package/dist/manager/worker-manager.js.map +1 -0
- package/dist/models/acp.d.ts +4 -0
- package/dist/models/acp.d.ts.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WorkerManagerImpl — concrete implementation of the WorkerManager interface.
|
|
3
|
+
*
|
|
4
|
+
* Wraps ProcessSupervisor, CompletionJudge, ResourceLimiter into the
|
|
5
|
+
* unified ACP interface. PM operations are delegated to SPSEventHandler
|
|
6
|
+
* via the event system (Phase 3 refactor).
|
|
7
|
+
*
|
|
8
|
+
* Phase 4: recover() fully implemented with decision matrix from doc-09 §11.3.
|
|
9
|
+
*/
|
|
10
|
+
import { execFileSync } from 'node:child_process';
|
|
11
|
+
import { readState, writeState, createIdleWorkerSlot } from '../core/state.js';
|
|
12
|
+
import { IntegrationQueue } from './integration-queue.js';
|
|
13
|
+
// ─── Timeout Defaults ──────────────────────────────────────────
|
|
14
|
+
const DEFAULT_TIMEOUTS = {
|
|
15
|
+
startupSec: 60, // 60s for worker to start
|
|
16
|
+
developmentSec: 4 * 3600, // 4h for development
|
|
17
|
+
integrationSec: 3600, // 1h for integration
|
|
18
|
+
inputWaitSec: 1800, // 30min waiting for input
|
|
19
|
+
forceMultiplier: 1.5, // Hard kill at 1.5x timeout
|
|
20
|
+
};
|
|
21
|
+
// ─── Implementation ────────────────────────────────────────────
|
|
22
|
+
export class WorkerManagerImpl {
|
|
23
|
+
supervisor;
|
|
24
|
+
completionJudge;
|
|
25
|
+
resourceLimiter;
|
|
26
|
+
agentRuntime;
|
|
27
|
+
stateFile;
|
|
28
|
+
maxWorkers;
|
|
29
|
+
integrationQueue;
|
|
30
|
+
eventHandlers = [];
|
|
31
|
+
taskSlotMap = new Map();
|
|
32
|
+
timeouts = new Map();
|
|
33
|
+
constructor(deps) {
|
|
34
|
+
this.supervisor = deps.supervisor;
|
|
35
|
+
this.completionJudge = deps.completionJudge;
|
|
36
|
+
this.resourceLimiter = deps.resourceLimiter;
|
|
37
|
+
this.agentRuntime = deps.agentRuntime;
|
|
38
|
+
this.stateFile = deps.stateFile;
|
|
39
|
+
this.maxWorkers = deps.maxWorkers;
|
|
40
|
+
this.integrationQueue = deps.integrationQueue ?? new IntegrationQueue(deps.stateFile, deps.maxWorkers);
|
|
41
|
+
}
|
|
42
|
+
// ─── run / resume ────────────────────────────────────────────
|
|
43
|
+
async run(request) {
|
|
44
|
+
return this.acquireAndSpawn({
|
|
45
|
+
taskId: request.taskId, cardId: request.cardId, project: request.project,
|
|
46
|
+
phase: request.phase, prompt: request.prompt, cwd: request.cwd,
|
|
47
|
+
branch: request.branch, targetBranch: request.targetBranch,
|
|
48
|
+
tool: request.tool, transport: request.transport,
|
|
49
|
+
outputFile: request.outputFile, maxRetries: request.maxRetries ?? 0,
|
|
50
|
+
customTimeoutSec: request.timeoutSec,
|
|
51
|
+
}, 'wm-run');
|
|
52
|
+
}
|
|
53
|
+
async resume(request) {
|
|
54
|
+
return this.acquireAndSpawn({
|
|
55
|
+
taskId: request.taskId, cardId: request.cardId, project: request.project,
|
|
56
|
+
phase: request.phase, prompt: request.prompt, cwd: request.cwd,
|
|
57
|
+
branch: request.branch, targetBranch: request.targetBranch,
|
|
58
|
+
tool: request.tool, transport: request.transport,
|
|
59
|
+
outputFile: request.outputFile, maxRetries: 0,
|
|
60
|
+
resumeSessionId: request.sessionId,
|
|
61
|
+
}, 'wm-resume');
|
|
62
|
+
}
|
|
63
|
+
// ─── cancel ──────────────────────────────────────────────────
|
|
64
|
+
/**
 * Cancel a task, wherever it currently lives:
 *  - still waiting in the integration queue -> remove it, no worker to kill;
 *  - running in a slot -> kill the worker, release slot/limiter, and, if it
 *    was the active integration worker, advance the queue for its target
 *    branch so waiting tasks are not stranded.
 * Emits a 'run.failed' event in both cases so SPSEventHandler can react.
 */
async cancel(request) {
    const { taskId, project, reason } = request;
    // ── Check if the task is queued (not yet spawned) ───────────
    // getPosition > 0 means "waiting"; position 0 is used below as the
    // "currently active queue entry" fallback — TODO confirm against
    // IntegrationQueue.getPosition's contract.
    const queuePos = this.integrationQueue.getPosition(taskId);
    if (queuePos > 0) {
        // Task is in waiting list — remove without killing any worker
        this.integrationQueue.remove(taskId);
        this.emitEvent({
            type: 'run.failed', taskId, cardId: taskId, project,
            phase: 'integration', slot: '', workerId: '',
            timestamp: new Date().toISOString(), state: 'failed',
            error: `Cancelled from queue (position=${queuePos}): ${reason}`,
        });
        this.log(`Removed queued task ${taskId} from integration queue (reason=${reason})`);
        return;
    }
    const slot = this.taskSlotMap.get(taskId);
    if (!slot) {
        // Unknown task: nothing to kill, nothing to release.
        this.log(`Cancel: task ${taskId} not found`);
        return;
    }
    // Determine phase from lease to know if we need to advance queue
    const lease = this.rd().leases[taskId];
    const isIntegration = lease
        ? (lease.pmStateObserved === 'QA' || lease.phase === 'merging' || lease.phase === 'resolving_conflict')
        : queuePos === 0;
    // Stop soft/hard kill timers before tearing the worker down.
    this.clearTimeoutForTask(taskId);
    const workerId = `${project}:${slot}:${taskId}`;
    await this.supervisor.kill(workerId);
    this.releaseSlotInState(slot, taskId);
    this.resourceLimiter.release();
    this.taskSlotMap.delete(taskId);
    this.emitEvent({
        type: 'run.failed', taskId, cardId: taskId, project,
        phase: isIntegration ? 'integration' : 'development', slot, workerId,
        timestamp: new Date().toISOString(), state: 'failed',
        error: `Cancelled: ${reason}`,
    });
    this.log(`Cancelled worker ${workerId} (reason=${reason})`);
    // ── If active integration worker was cancelled, advance queue ─
    if (isIntegration) {
        // NOTE(review): lease.branch is the task branch, not the target; it
        // only serves as a fallback when the queue scan below finds nothing
        // — confirm 'main' default matches the queue's key scheme.
        const targetBranch = lease?.branch ?? 'main';
        // Find the actual targetBranch from the queue active entry
        const state = this.rd();
        let actualTarget = targetBranch;
        for (const [key, q] of Object.entries(state.integrationQueues)) {
            if (q.active?.taskId === taskId) {
                // Queue keys look like '<project>:<targetBranch>'; strip the
                // project prefix, keeping any ':' inside the branch name.
                actualTarget = key.split(':').slice(1).join(':');
                break;
            }
        }
        await this.advanceIntegrationQueue(project, actualTarget);
    }
}
|
|
118
|
+
// ─── inspect ─────────────────────────────────────────────────
|
|
119
|
+
/**
 * Build read-only snapshots of worker slots from the state file.
 * Optional query filters: slot (exact name), taskId (matched against the
 * slot's seq), project (only echoed into the snapshot, not used to filter).
 * NOTE(review): an idle slot (seq === null) passes the taskId filter because
 * the check only applies when seq is non-null — confirm that is intended.
 */
inspect(query) {
    const state = this.rd();
    const snapshots = [];
    for (const [slotName, worker] of Object.entries(state.workers)) {
        if (query.slot && query.slot !== slotName)
            continue;
        if (query.taskId && worker.seq !== null && String(worker.seq) !== query.taskId)
            continue;
        const seq = worker.seq !== null ? String(worker.seq) : null;
        const activeCard = seq ? state.activeCards[seq] ?? null : null;
        const lease = seq ? state.leases[seq] ?? null : null;
        snapshots.push({
            slot: slotName, taskId: seq,
            // Prefer the card's seq for cardId; fall back to the slot's seq.
            cardId: activeCard ? String(activeCard.seq) : seq,
            project: query.project ?? '',
            state: this.mapWorkerState(worker),
            // Lease phases 'merging'/'resolving_conflict' map to integration;
            // any other lease phase is reported as development.
            phase: lease ? (lease.phase === 'merging' || lease.phase === 'resolving_conflict' ? 'integration' : 'development') : null,
            pid: worker.pid ?? null, sessionId: worker.sessionId ?? null,
            cwd: worker.worktree, branch: worker.branch,
            startedAt: worker.claimedAt,
            // Last-activity timestamp: heartbeat, else claim time, else "now".
            updatedAt: worker.lastHeartbeat ?? worker.claimedAt ?? new Date().toISOString(),
            // Not populated from this state file; always null here.
            outputTail: null, pendingInput: null,
        });
    }
    return snapshots;
}
|
|
145
|
+
onEvent(handler) { this.eventHandlers.push(handler); }
|
|
146
|
+
// ─── sendInput / confirm ─────────────────────────────────────
|
|
147
|
+
async sendInput(request) {
|
|
148
|
+
const slot = this.requirePtySlot(request.taskId, 'sendInput');
|
|
149
|
+
await this.agentRuntime.resumeRun(slot, request.input);
|
|
150
|
+
this.log(`Sent input to task ${request.taskId} in ${slot}`);
|
|
151
|
+
}
|
|
152
|
+
async confirm(request) {
|
|
153
|
+
const slot = this.requirePtySlot(request.taskId, 'confirm');
|
|
154
|
+
const input = request.action === 'confirm' ? (request.message ?? 'yes') : (request.message ?? 'no');
|
|
155
|
+
await this.agentRuntime.resumeRun(slot, input);
|
|
156
|
+
this.log(`Confirmed (${request.action}) task ${request.taskId} in ${slot}`);
|
|
157
|
+
}
|
|
158
|
+
// ─── recover (Phase 4 — full decision matrix from doc-09 §11.3) ──
|
|
159
|
+
/**
 * Crash-recovery entry point (decision matrix R1-R9; see header comment).
 * For each context's state file: scans non-released/non-suspended leases,
 * re-attaches live workers, judges dead ones from git evidence, rebuilds
 * the integration queue, then emits all collected events.
 * @param {Array} contexts - per-project recovery contexts; each must carry
 *   at least stateFile, project and baseBranch (consumed by the helpers).
 * @returns {object} counters plus the emitted events.
 */
async recover(contexts) {
    const result = {
        scanned: 0, alive: 0, completed: 0, failed: 0,
        released: 0, rebuilt: 0, queueRebuilt: 0, events: [],
    };
    // Phase 1+2: Scan leases and apply per-task decision matrix
    for (const ctx of contexts) {
        // Each context reads its own state file (not this.stateFile).
        const state = readState(ctx.stateFile, this.maxWorkers);
        for (const [seq, lease] of Object.entries(state.leases)) {
            if (lease.phase === 'released' || lease.phase === 'suspended')
                continue;
            result.scanned++;
            const slot = lease.slot;
            const worker = slot ? state.workers[slot] ?? null : null;
            const pid = worker?.pid ?? null;
            const isAlive = pid ? this.isPidAlive(pid) : false;
            const evidence = state.worktreeEvidence[seq] ?? null;
            const pmState = lease.pmStateObserved;
            // R8/R9: PM manually completed or reverted — release immediately
            if (pmState === 'Done' || pmState === 'Backlog' || pmState === 'Todo') {
                if (slot)
                    this.releaseSlotInState(slot, seq);
                result.released++;
                this.log(`Recovery R8/R9: task ${seq} PM state=${pmState}, released`);
                continue;
            }
            // R1: Worker still running — re-attach orphan PID monitoring
            if (isAlive && pid && slot) {
                this.recoverAliveWorker(ctx, seq, lease, slot, pid);
                result.alive++;
                continue;
            }
            // Dead worker — check git evidence for decision
            const event = this.judgeDeadWorker(ctx, seq, lease, slot, evidence);
            if (event) {
                result.events.push(event);
                if (event.type === 'run.completed')
                    result.completed++;
                else
                    result.failed++;
            }
            else {
                // Released (R7: worktree missing or fallback)
                result.released++;
            }
        }
        // Phase 3: Rebuild integration queues from merging/resolving leases
        this.rebuildIntegrationQueue(ctx, state, result);
    }
    // Phase 4: Emit collected events so SPSEventHandler processes them
    // (deferred until all scanning is done so handlers see a settled state).
    for (const event of result.events) {
        this.emitEvent(event);
    }
    if (result.scanned > 0) {
        this.log(`Recovery complete: scanned=${result.scanned} alive=${result.alive} ` +
            `completed=${result.completed} failed=${result.failed} ` +
            `released=${result.released} rebuilt=${result.rebuilt} queueRebuilt=${result.queueRebuilt}`);
    }
    return result;
}
|
|
219
|
+
// ─── Private: Unified acquire + spawn flow ───────────────────
|
|
220
|
+
/**
 * Shared spawn pipeline behind run()/resume() and queue advancement.
 * Order of gates matters:
 *   1. duplicate-task check (taskSlotMap);
 *   2. integration-queue gate — if another integration run is active for the
 *      same project/targetBranch, the task is parked and we return queued;
 *      otherwise the entry is registered as the queue's active entry;
 *   3. resource limiter acquire (rolled back on any later failure);
 *   4. idle-slot lookup in the state file;
 *   5. claim slot + persist state, then spawn via proc (supervisor) or
 *      pty (agentRuntime).
 * On spawn failure everything acquired so far is unwound and a 'run.failed'
 * event is emitted.
 * @returns {object} accepted/queued/slot/workerId/pid/sessionId, or a
 *   reject(...) result.
 */
async acquireAndSpawn(ctx, label) {
    const { taskId, cardId, project, phase, prompt, cwd, branch, targetBranch, tool, transport, outputFile, maxRetries, resumeSessionId } = ctx;
    if (this.taskSlotMap.has(taskId)) {
        this.log(`Duplicate task ${taskId}, already in slot ${this.taskSlotMap.get(taskId)}`);
        return this.reject('duplicate_task');
    }
    // ── Integration queue gate ──────────────────────────────────
    if (phase === 'integration') {
        const entry = {
            taskId, cardId, project, prompt, cwd, branch, targetBranch,
            tool, transport, outputFile, enqueuedAt: new Date().toISOString(),
        };
        const active = this.integrationQueue.getActive(project, targetBranch);
        if (active) {
            const { position } = this.integrationQueue.enqueue(entry);
            this.log(`Integration task ${taskId} queued at position ${position} (active=${active.taskId})`);
            return { accepted: true, queued: true, queuePosition: position, slot: null, workerId: null };
        }
        // No active — register as active before spawning
        this.integrationQueue.enqueue(entry);
    }
    if (!this.resourceLimiter.tryAcquire()) {
        this.log(`Resource exhausted for task ${taskId}`);
        // If we just registered as active in the queue, roll back
        // NOTE(review): rollback uses dequeueNext, which presumably pops the
        // just-registered active entry — confirm it cannot promote a waiter.
        if (phase === 'integration') {
            this.integrationQueue.dequeueNext(project, targetBranch);
        }
        return this.reject('resource_exhausted');
    }
    const state = this.rd();
    const slot = this.findIdleSlot(state);
    if (!slot) {
        // Limiter said yes but no state-file slot is idle — undo both.
        this.resourceLimiter.release();
        if (phase === 'integration') {
            this.integrationQueue.dequeueNext(project, targetBranch);
        }
        this.log(`No idle slot for task ${taskId}`);
        return this.reject('resource_exhausted');
    }
    // Rate-limit spawn bursts before claiming the slot record.
    await this.resourceLimiter.enforceStagger();
    const nowIso = new Date().toISOString();
    this.claimSlot(state, slot, { seq: taskId, cardId, project, phase, branch, cwd, tool, transport, outputFile, nowIso, targetBranch });
    this.wr(state, label);
    this.taskSlotMap.set(taskId, slot);
    const workerId = `${project}:${slot}:${taskId}`;
    let pid = null;
    let sessionId;
    try {
        if (transport === 'proc') {
            // Detached child process; exit is routed back into handleExit.
            const handle = this.supervisor.spawn({
                id: workerId, project, seq: taskId, slot, worktree: cwd, branch,
                prompt, outputFile, tool, resumeSessionId,
                onExit: (exitCode) => this.handleExit({
                    workerId, taskId, cardId, project, phase, slot, branch, cwd,
                    targetBranch, outputFile, tool, transport, exitCode, maxRetries,
                }),
            });
            pid = handle.pid;
            sessionId = handle.sessionId ?? undefined;
        }
        else if (transport === 'pty') {
            if (!this.agentRuntime) {
                throw new Error('PTY transport requires agentRuntime');
            }
            // Resume an existing session when a sessionId was provided.
            const session = resumeSessionId
                ? await this.agentRuntime.resumeRun(slot, prompt)
                : await this.agentRuntime.startRun(slot, prompt, tool, cwd);
            sessionId = session.sessionId;
            pid = session.pid ?? null;
        }
    }
    catch (err) {
        // Unwind in reverse order of acquisition, then report failure.
        this.log(`Spawn failed for ${taskId}: ${err instanceof Error ? err.message : String(err)}`);
        this.resourceLimiter.release();
        this.releaseSlotInState(slot, taskId);
        this.taskSlotMap.delete(taskId);
        if (phase === 'integration') {
            this.integrationQueue.remove(taskId);
            await this.advanceIntegrationQueue(project, targetBranch);
        }
        this.emitEvent({
            type: 'run.failed', taskId, cardId, project, phase, slot, workerId,
            timestamp: new Date().toISOString(), state: 'failed',
            error: `Spawn failed: ${err instanceof Error ? err.message : String(err)}`,
        });
        return this.reject('spawn_failed');
    }
    this.log(`Launched ${transport}/${tool} worker ${workerId} in ${slot} (pid=${pid})`);
    this.startTimeout(taskId, phase, project, slot, ctx.customTimeoutSec);
    return { accepted: true, slot, workerId, pid: pid ?? undefined, sessionId };
}
|
|
311
|
+
// ─── Private: Exit Handler ───────────────────────────────────
|
|
312
|
+
/**
 * Worker-exit callback: judges completion from git/output evidence, emits
 * the corresponding run.completed/run.failed event, and releases local
 * bookkeeping. Slot release in the state file is intentionally NOT done
 * here — the event comment below says SPSEventHandler owns that.
 */
async handleExit(ctx) {
    const { workerId, taskId, cardId, project, phase, slot, branch, cwd, targetBranch, outputFile, tool, transport, exitCode, maxRetries } = ctx;
    // Exit ends the run; cancel any pending soft/hard kill timers first.
    this.clearTimeoutForTask(taskId);
    this.log(`Worker ${workerId} exited with code ${exitCode}`);
    const completion = this.completionJudge.judge({
        worktree: cwd, branch, baseBranch: targetBranch, outputFile, exitCode, phase,
    });
    this.log(`CompletionJudge for ${workerId}: ${completion.status} (${completion.reason})`);
    const isComplete = completion.status === 'completed';
    // Emit event — SPSEventHandler handles PM operations, slot release, notifications
    this.emitEvent({
        type: isComplete ? 'run.completed' : 'run.failed',
        taskId, cardId, project, phase, slot, workerId,
        timestamp: new Date().toISOString(),
        state: isComplete ? 'completed' : 'failed',
        exitCode, completionResult: completion,
    });
    // Release supervisor handle and resource limiter slot
    this.supervisor.remove(workerId);
    this.resourceLimiter.release();
    this.taskSlotMap.delete(taskId);
    // ── Auto-dequeue next integration task ──────────────────────
    if (phase === 'integration') {
        await this.advanceIntegrationQueue(project, targetBranch);
    }
}
|
|
338
|
+
/**
 * Try to spawn the next queued integration task for project:targetBranch.
 * Loops until one spawn succeeds or the queue is drained; entries with an
 * empty prompt (recovery stubs) and entries whose spawn throws are failed
 * via a run.failed event and skipped — never deadlock.
 */
async advanceIntegrationQueue(project, targetBranch) {
    // eslint-disable-next-line no-constant-condition
    while (true) {
        const next = this.integrationQueue.dequeueNext(project, targetBranch);
        if (!next) {
            this.log(`Integration queue empty for ${project}:${targetBranch}`);
            return;
        }
        this.log(`Auto-dequeuing integration task ${next.taskId} for ${project}:${targetBranch}`);
        // Skip entries with empty prompt (recovery stubs — SPS must re-prepare)
        if (!next.prompt) {
            this.log(`Skipping ${next.taskId}: empty prompt (needs SPS re-preparation)`);
            this.emitEvent({
                type: 'run.failed', taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', slot: '', workerId: '',
                timestamp: new Date().toISOString(), state: 'failed',
                error: 'Empty prompt — needs SPS re-preparation after recovery',
            });
            continue;
        }
        try {
            // Re-enter the normal spawn pipeline with this entry's payload.
            const resp = await this.acquireAndSpawn({
                taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', prompt: next.prompt, cwd: next.cwd,
                branch: next.branch, targetBranch: next.targetBranch,
                tool: next.tool, transport: next.transport,
                outputFile: next.outputFile, maxRetries: 0,
            }, 'wm-iq-dequeue');
            if (resp.accepted && !resp.queued) {
                this.log(`Integration task ${next.taskId} spawned after dequeue`);
                return;
            }
            // If accepted but re-queued, something odd — keep going
            this.log(`Integration task ${next.taskId} could not spawn (accepted=${resp.accepted}), trying next`);
        }
        catch (err) {
            this.log(`Failed to spawn dequeued task ${next.taskId}: ${err instanceof Error ? err.message : String(err)}`);
            // Emit failure event so SPS knows this task was dropped
            this.emitEvent({
                type: 'run.failed', taskId: next.taskId, cardId: next.cardId, project,
                phase: 'integration', slot: '', workerId: '',
                timestamp: new Date().toISOString(), state: 'failed',
                error: `Dequeue spawn failed: ${err instanceof Error ? err.message : String(err)}`,
            });
            // Continue to next entry — never deadlock
        }
    }
}
|
|
390
|
+
// ─── Private: Recovery Helpers ────────────────────────────────
|
|
391
|
+
/**
 * R1: Worker PID is still alive — re-attach orphan monitoring.
 * Rebuilds a proc-transport handle from the lease and wires its exit back
 * into handleExit, so the orphan is treated like a normally spawned worker.
 */
recoverAliveWorker(ctx, seq, lease, slot, pid) {
    const workerId = `${ctx.project}:${slot}:${seq}`;
    const phase = (lease.phase === 'merging' || lease.phase === 'resolving_conflict')
        ? 'integration' : 'development';
    // NOTE(review): tryAcquire's return value is ignored here — an orphan is
    // re-adopted even if the limiter is full. Confirm this over-commit is
    // intentional for recovery.
    this.resourceLimiter.tryAcquire();
    this.supervisor.monitorOrphanPid(workerId, pid, {
        id: workerId, transport: 'proc', pid,
        // outputFile is unknown after a crash; tool is assumed 'claude'
        // — TODO confirm these defaults match what the supervisor expects.
        outputFile: null, project: ctx.project, seq, slot,
        branch: lease.branch ?? '', worktree: lease.worktree ?? '',
        tool: 'claude', exitCode: null, sessionId: lease.sessionId ?? null,
        runId: lease.runId ?? null, sessionState: null, remoteStatus: null,
        lastEventAt: null,
        startedAt: lease.claimedAt ?? new Date().toISOString(),
        exitedAt: null,
    }, (exitCode) => this.handleExit({
        workerId, taskId: seq, cardId: seq, project: ctx.project,
        phase, slot, branch: lease.branch ?? '',
        cwd: lease.worktree ?? '', targetBranch: ctx.baseBranch,
        outputFile: '', tool: 'claude', transport: 'proc',
        exitCode, maxRetries: 0,
    }));
    // Re-register so cancel()/sendInput() can find the task again.
    this.taskSlotMap.set(seq, slot);
    this.log(`Recovery R1: task ${seq} alive (pid=${pid}), re-attached monitor`);
}
|
|
418
|
+
/**
 * Decision matrix for dead workers (R2-R7).
 * Returns a WorkerEvent for completed/failed, or null if slot was released.
 * Rules are checked in order; earlier evidence wins:
 *   R2 merged > R3 pushed-ahead > R4 unpushed commits > R5 dirty git state
 *   > R7 worktree missing > R6 fallback (no artifacts).
 */
judgeDeadWorker(ctx, seq, lease, slot, evidence) {
    const phase = (lease.phase === 'merging' || lease.phase === 'resolving_conflict')
        ? 'integration' : 'development';
    const workerId = `${ctx.project}:${slot ?? 'unknown'}:${seq}`;
    // R2: Branch merged into base — task complete
    if (evidence?.mergedToBase) {
        this.log(`Recovery R2: task ${seq} branch merged to base`);
        return this.makeRecoveryEvent('run.completed', seq, ctx, slot ?? '', phase, workerId, 'already_merged');
    }
    // R3: Pushed with commits ahead — development complete
    if (evidence?.pushed && evidence.aheadOfBase > 0) {
        this.log(`Recovery R3: task ${seq} pushed with ${evidence.aheadOfBase} commits ahead`);
        return this.makeRecoveryEvent('run.completed', seq, ctx, slot ?? '', phase, workerId, 'branch_pushed');
    }
    // R4: Local commits unpushed — rescue push, then fail for restart
    if (!evidence?.pushed && evidence && evidence.aheadOfBase > 0) {
        if (lease.worktree && lease.branch) {
            const rescued = this.rescuePush(lease.worktree, lease.branch);
            if (rescued)
                this.log(`Recovery R4: rescued push for task ${seq}`);
        }
        // Failed even if the rescue push succeeded: SPS must restart the task.
        return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'needs_restart');
    }
    // R5: Dirty state (rebase/merge/conflict) — rescue push, then fail
    if (evidence && ['rebase', 'merge', 'conflict'].includes(evidence.gitStatus)) {
        if (lease.worktree && lease.branch) {
            const rescued = this.rescuePush(lease.worktree, lease.branch);
            if (rescued)
                this.log(`Recovery R5: rescued push for task ${seq} (dirty=${evidence.gitStatus})`);
        }
        return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'needs_restart');
    }
    // R7: Worktree missing — release slot, SPS re-prepares
    // Returning null tells recover() to count this as "released", not failed.
    if (evidence && !evidence.worktreeExists) {
        if (slot)
            this.releaseSlotInState(slot, seq);
        this.log(`Recovery R7: task ${seq} worktree missing, released`);
        return null;
    }
    // R6: No changes (fallback) — failed with no artifacts
    // Also reached when evidence itself is null.
    this.log(`Recovery R6: task ${seq} no artifacts found`);
    return this.makeRecoveryEvent('run.failed', seq, ctx, slot ?? '', phase, workerId, 'no_artifacts');
}
|
|
465
|
+
/**
 * Rebuild integration queue from leases in merging/resolving_conflict phase.
 * Entries are re-enqueued oldest-first (by lastTransitionAt) with an empty
 * prompt, which marks them as stubs that SPS must re-prepare before they
 * can be spawned (advanceIntegrationQueue skips empty prompts).
 */
rebuildIntegrationQueue(ctx, state, result) {
    const qaLeases = Object.entries(state.leases)
        .filter(([, l]) => l.phase === 'merging' || l.phase === 'resolving_conflict')
        // Sort by transition time so queue order reflects original ordering.
        .sort(([, a], [, b]) => (a.lastTransitionAt ?? '').localeCompare(b.lastTransitionAt ?? ''));
    for (const [seq, lease] of qaLeases) {
        this.integrationQueue.enqueue({
            taskId: seq, cardId: seq, project: ctx.project,
            prompt: '', // Prompt will be regenerated by SPS
            cwd: lease.worktree ?? '', branch: lease.branch ?? '',
            // tool/transport defaults — TODO confirm they match the original run.
            targetBranch: ctx.baseBranch, tool: 'claude', transport: 'proc',
            outputFile: '', enqueuedAt: lease.lastTransitionAt,
        });
        result.queueRebuilt++;
    }
    if (qaLeases.length > 0) {
        this.log(`Recovery: rebuilt ${qaLeases.length} integration queue entries for ${ctx.project}`);
    }
}
|
|
486
|
+
/**
|
|
487
|
+
* Create a WorkerEvent for recovery results.
|
|
488
|
+
*/
|
|
489
|
+
makeRecoveryEvent(type, seq, ctx, slot, phase, workerId, reason) {
|
|
490
|
+
return {
|
|
491
|
+
type, taskId: seq, cardId: seq, project: ctx.project,
|
|
492
|
+
phase, slot, workerId,
|
|
493
|
+
timestamp: new Date().toISOString(),
|
|
494
|
+
state: type === 'run.completed' ? 'completed' : 'failed',
|
|
495
|
+
completionResult: {
|
|
496
|
+
status: type === 'run.completed' ? 'completed' : 'failed',
|
|
497
|
+
reason,
|
|
498
|
+
},
|
|
499
|
+
};
|
|
500
|
+
}
|
|
501
|
+
/**
|
|
502
|
+
* Try to push unpushed commits from a worktree as a rescue operation.
|
|
503
|
+
*/
|
|
504
|
+
rescuePush(worktree, branch) {
|
|
505
|
+
try {
|
|
506
|
+
execFileSync('git', ['-C', worktree, 'push', 'origin', branch], {
|
|
507
|
+
timeout: 30_000,
|
|
508
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
509
|
+
});
|
|
510
|
+
return true;
|
|
511
|
+
}
|
|
512
|
+
catch {
|
|
513
|
+
this.log(`Rescue push failed for ${branch} in ${worktree}`);
|
|
514
|
+
return false;
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
/**
|
|
518
|
+
* Check if a process is still alive by sending signal 0.
|
|
519
|
+
*/
|
|
520
|
+
isPidAlive(pid) {
|
|
521
|
+
try {
|
|
522
|
+
process.kill(pid, 0);
|
|
523
|
+
return true;
|
|
524
|
+
}
|
|
525
|
+
catch {
|
|
526
|
+
return false;
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
// ─── Private: State Helpers ──────────────────────────────────
|
|
530
|
+
rd() { return readState(this.stateFile, this.maxWorkers); }
|
|
531
|
+
wr(state, by) { writeState(this.stateFile, state, by); }
|
|
532
|
+
findIdleSlot(state) {
|
|
533
|
+
return Object.entries(state.workers).find(([, w]) => w.status === 'idle')?.[0] ?? null;
|
|
534
|
+
}
|
|
535
|
+
/**
 * Mutate `state` in place to mark `slot` as claimed by task ctx.seq:
 * writes the worker record, an activeCards entry, and a lease. The caller
 * is responsible for persisting via wr().
 */
claimSlot(state, slot, ctx) {
    // Numeric seq for record fields; the string form keys the maps below.
    const seqNum = parseInt(ctx.seq, 10) || 0;
    state.workers[slot] = {
        ...createIdleWorkerSlot(), status: 'active', seq: seqNum,
        branch: ctx.branch, worktree: ctx.cwd, claimedAt: ctx.nowIso, lastHeartbeat: ctx.nowIso,
        // 'print' is the display mode for proc transport.
        mode: ctx.transport === 'pty' ? 'pty' : 'print', transport: ctx.transport, agent: ctx.tool,
        // PTY workers stream output via the runtime, so no output file.
        outputFile: ctx.transport === 'proc' ? ctx.outputFile : null,
    };
    state.activeCards[ctx.seq] = {
        seq: seqNum, state: 'Inprogress', worker: slot, mrUrl: null,
        conflictDomains: [], startedAt: ctx.nowIso, retryCount: 0,
    };
    state.leases[ctx.seq] = {
        seq: seqNum, pmStateObserved: ctx.phase === 'integration' ? 'QA' : 'Inprogress',
        // NOTE(review): lease.phase is always 'coding', even for integration
        // runs; cancel()/inspect() classify integration via pmStateObserved
        // ('QA') or later phase transitions — confirm that is intended.
        phase: 'coding', slot, branch: ctx.branch, worktree: ctx.cwd,
        sessionId: null, runId: null, claimedAt: ctx.nowIso, retryCount: 0,
        lastTransitionAt: ctx.nowIso,
    };
}
|
|
554
|
+
releaseSlotInState(slot, taskId) {
|
|
555
|
+
const state = this.rd();
|
|
556
|
+
state.workers[slot] = createIdleWorkerSlot();
|
|
557
|
+
delete state.activeCards[taskId];
|
|
558
|
+
delete state.leases[taskId];
|
|
559
|
+
this.wr(state, 'wm-release');
|
|
560
|
+
}
|
|
561
|
+
mapWorkerState(w) {
|
|
562
|
+
if (w.status === 'idle')
|
|
563
|
+
return 'idle';
|
|
564
|
+
if (w.status === 'active') {
|
|
565
|
+
if (w.remoteStatus === 'waiting_input')
|
|
566
|
+
return 'waiting_input';
|
|
567
|
+
if (w.remoteStatus === 'needs_confirmation' || w.sessionState === 'needs_confirmation')
|
|
568
|
+
return 'needs_confirmation';
|
|
569
|
+
if (w.remoteStatus === 'completed')
|
|
570
|
+
return 'completed';
|
|
571
|
+
if (w.remoteStatus === 'failed')
|
|
572
|
+
return 'failed';
|
|
573
|
+
return 'running';
|
|
574
|
+
}
|
|
575
|
+
if (w.status === 'releasing')
|
|
576
|
+
return 'completed';
|
|
577
|
+
return 'running';
|
|
578
|
+
}
|
|
579
|
+
requirePtySlot(taskId, op) {
|
|
580
|
+
const slot = this.taskSlotMap.get(taskId);
|
|
581
|
+
if (!slot)
|
|
582
|
+
throw new Error(`Task ${taskId} not found`);
|
|
583
|
+
if (!this.agentRuntime)
|
|
584
|
+
throw new Error(`${op} requires PTY transport (agentRuntime not available)`);
|
|
585
|
+
const state = this.rd();
|
|
586
|
+
const w = state.workers[slot];
|
|
587
|
+
if (!w || (w.transport !== 'pty' && w.mode !== 'pty')) {
|
|
588
|
+
throw new Error(`${op} unsupported for transport=${w?.transport ?? 'unknown'}`);
|
|
589
|
+
}
|
|
590
|
+
return slot;
|
|
591
|
+
}
|
|
592
|
+
// ─── Private: Timeout Management ─────────────────────────────
|
|
593
|
+
/**
 * Arm a two-stage timeout for a running task.
 *
 * Soft stage: after `baseSec` (the custom override, or the phase default)
 * an advisory `status.update` event is emitted but the task keeps running.
 * Hard stage: armed only once the soft stage fires; after the remaining
 * `hardSec - baseSec` seconds the task is force-cancelled with
 * reason 'timeout'. Both timers are unref()'d so they never keep the
 * process alive, and are tracked in `this.timeouts` under the keys
 * `taskId` and `taskId:hard` so clearTimeoutForTask() can dispose them.
 */
startTimeout(taskId, phase, project, slot, customTimeoutSec) {
    const baseSec = customTimeoutSec ?? (phase === 'integration' ? DEFAULT_TIMEOUTS.integrationSec : DEFAULT_TIMEOUTS.developmentSec);
    const hardSec = Math.ceil(baseSec * DEFAULT_TIMEOUTS.forceMultiplier);
    const workerId = `${project}:${slot}:${taskId}`;
    const softTimer = setTimeout(() => {
        // Soft timeout: advisory only — report the overrun, do not kill.
        this.emitEvent({
            type: 'status.update', taskId, cardId: taskId, project, phase, slot,
            workerId, timestamp: new Date().toISOString(), state: 'running',
            error: `Timeout: exceeded ${baseSec}s`,
        });
        this.log(`Soft timeout for ${taskId} after ${baseSec}s`);
        // Hard timeout — force kill (wrapped in try-catch to prevent unhandled rejection)
        const hardTimer = setTimeout(async () => {
            try {
                this.log(`Hard timeout for ${taskId} after ${hardSec}s — force killing`);
                await this.cancel({ taskId, project, reason: 'timeout' });
            }
            catch (err) {
                this.log(`Hard timeout cancel failed for ${taskId}: ${err instanceof Error ? err.message : err}`);
            }
        }, (hardSec - baseSec) * 1000);
        hardTimer.unref();
        this.timeouts.set(`${taskId}:hard`, hardTimer);
    }, baseSec * 1000);
    softTimer.unref();
    this.timeouts.set(taskId, softTimer);
}
|
|
620
|
+
clearTimeoutForTask(taskId) {
|
|
621
|
+
const soft = this.timeouts.get(taskId);
|
|
622
|
+
const hard = this.timeouts.get(`${taskId}:hard`);
|
|
623
|
+
if (soft) {
|
|
624
|
+
clearTimeout(soft);
|
|
625
|
+
this.timeouts.delete(taskId);
|
|
626
|
+
}
|
|
627
|
+
if (hard) {
|
|
628
|
+
clearTimeout(hard);
|
|
629
|
+
this.timeouts.delete(`${taskId}:hard`);
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
// ─── Private: Event + Response Helpers ───────────────────────
|
|
633
|
+
emitEvent(event) {
|
|
634
|
+
for (const handler of this.eventHandlers) {
|
|
635
|
+
try {
|
|
636
|
+
handler(event);
|
|
637
|
+
}
|
|
638
|
+
catch (err) {
|
|
639
|
+
this.log(`Event handler error: ${err instanceof Error ? err.message : String(err)}`);
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
// Build a rejection response (no slot/worker assigned) with the given reason.
reject(reason) {
    return { accepted: false, slot: null, workerId: null, rejectReason: reason };
}
|
|
646
|
+
// Emit a namespaced diagnostic line on stderr.
log(msg) { process.stderr.write(`[worker-manager] ${msg}\n`); }
|
|
647
|
+
}
|
|
648
|
+
//# sourceMappingURL=worker-manager-impl.js.map
|