@agent-relay/sdk 3.1.19 → 3.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/agent-relay-broker-darwin-arm64 +0 -0
- package/bin/agent-relay-broker-darwin-x64 +0 -0
- package/bin/agent-relay-broker-linux-arm64 +0 -0
- package/bin/agent-relay-broker-linux-x64 +0 -0
- package/dist/__tests__/integration.test.js +35 -0
- package/dist/__tests__/integration.test.js.map +1 -1
- package/dist/client.d.ts +9 -0
- package/dist/client.d.ts.map +1 -1
- package/dist/client.js +33 -22
- package/dist/client.js.map +1 -1
- package/dist/protocol.d.ts +1 -0
- package/dist/protocol.d.ts.map +1 -1
- package/dist/relay.d.ts +8 -0
- package/dist/relay.d.ts.map +1 -1
- package/dist/relay.js +50 -5
- package/dist/relay.js.map +1 -1
- package/dist/workflows/cli.js +2 -0
- package/dist/workflows/cli.js.map +1 -1
- package/dist/workflows/runner.d.ts +11 -0
- package/dist/workflows/runner.d.ts.map +1 -1
- package/dist/workflows/runner.js +350 -167
- package/dist/workflows/runner.js.map +1 -1
- package/dist/workflows/trajectory.d.ts +6 -1
- package/dist/workflows/trajectory.d.ts.map +1 -1
- package/dist/workflows/trajectory.js +16 -2
- package/dist/workflows/trajectory.js.map +1 -1
- package/package.json +2 -2
package/dist/workflows/runner.js
CHANGED
|
@@ -17,6 +17,17 @@ import { WorkflowTrajectory } from './trajectory.js';
|
|
|
17
17
|
// Import from sub-paths to avoid pulling in the full @relaycast/sdk dependency.
|
|
18
18
|
import { AgentRelay } from '../relay.js';
|
|
19
19
|
import { RelayCast, RelayError } from '@relaycast/sdk';
|
|
20
|
+
/** Error carrying exit code/signal from a failed subprocess spawn. */
|
|
21
|
+
class SpawnExitError extends Error {
|
|
22
|
+
exitCode;
|
|
23
|
+
exitSignal;
|
|
24
|
+
constructor(message, exitCode, exitSignal) {
|
|
25
|
+
super(message);
|
|
26
|
+
this.name = 'SpawnExitError';
|
|
27
|
+
this.exitCode = exitCode;
|
|
28
|
+
this.exitSignal = exitSignal ?? undefined;
|
|
29
|
+
}
|
|
30
|
+
}
|
|
20
31
|
// ── CLI resolution ───────────────────────────────────────────────────────────
|
|
21
32
|
/**
|
|
22
33
|
* Resolve `cursor` to the concrete cursor agent binary available in PATH.
|
|
@@ -70,6 +81,8 @@ export class WorkflowRunner {
|
|
|
70
81
|
activeAgentHandles = new Map();
|
|
71
82
|
// PTY-based output capture: accumulate terminal output per-agent
|
|
72
83
|
ptyOutputBuffers = new Map();
|
|
84
|
+
/** Snapshot of PTY output from the most recent failed attempt, keyed by step name. */
|
|
85
|
+
lastFailedStepOutput = new Map();
|
|
73
86
|
ptyListeners = new Map();
|
|
74
87
|
ptyLogStreams = new Map();
|
|
75
88
|
/** Path to workers.json so `agents:kill` can find workflow-spawned agents */
|
|
@@ -865,7 +878,12 @@ export class WorkflowRunner {
|
|
|
865
878
|
// ── Execution ───────────────────────────────────────────────────────────
|
|
866
879
|
/** Execute a named workflow from a validated config. */
|
|
867
880
|
async execute(config, workflowName, vars) {
|
|
881
|
+
// Set up abort controller early so callers can abort() even during setup
|
|
882
|
+
this.abortController = new AbortController();
|
|
883
|
+
this.paused = false;
|
|
868
884
|
const resolved = vars ? this.resolveVariables(config, vars) : config;
|
|
885
|
+
// Validate config (catches cycles, missing deps, invalid steps, etc.)
|
|
886
|
+
this.validateConfig(resolved);
|
|
869
887
|
// Resolve and validate named paths from the top-level `paths` config
|
|
870
888
|
const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
|
|
871
889
|
if (pathResult.errors.length > 0) {
|
|
@@ -935,6 +953,9 @@ export class WorkflowRunner {
|
|
|
935
953
|
}
|
|
936
954
|
/** Resume a previously paused or partially completed run. */
|
|
937
955
|
async resume(runId, vars) {
|
|
956
|
+
// Set up abort controller early so callers can abort() even during setup
|
|
957
|
+
this.abortController = new AbortController();
|
|
958
|
+
this.paused = false;
|
|
938
959
|
const run = await this.db.getRun(runId);
|
|
939
960
|
if (!run) {
|
|
940
961
|
throw new Error(`Run "${runId}" not found`);
|
|
@@ -982,9 +1003,7 @@ export class WorkflowRunner {
|
|
|
982
1003
|
async runWorkflowCore(input) {
|
|
983
1004
|
const { run, workflow, config, stepStates, isResume } = input;
|
|
984
1005
|
const runId = run.id;
|
|
985
|
-
// Start execution
|
|
986
|
-
this.abortController = new AbortController();
|
|
987
|
-
this.paused = false;
|
|
1006
|
+
// Start execution (abortController already set by execute()/resume())
|
|
988
1007
|
this.currentConfig = config;
|
|
989
1008
|
this.currentRunId = runId;
|
|
990
1009
|
this.runStartTime = Date.now();
|
|
@@ -1012,15 +1031,20 @@ export class WorkflowRunner {
|
|
|
1012
1031
|
config.swarm.channel = channel;
|
|
1013
1032
|
await this.db.updateRun(runId, { config });
|
|
1014
1033
|
}
|
|
1034
|
+
const relaycastDisabled = this.relayOptions.env?.AGENT_RELAY_WORKFLOW_DISABLE_RELAYCAST === '1';
|
|
1035
|
+
const requiresBroker = !this.executor &&
|
|
1036
|
+
workflow.steps.some((step) => step.type !== 'deterministic' && step.type !== 'worktree');
|
|
1015
1037
|
// Skip broker/relay init when an external executor handles agent spawning
|
|
1016
|
-
if (
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
this.
|
|
1022
|
-
|
|
1023
|
-
|
|
1038
|
+
if (requiresBroker) {
|
|
1039
|
+
if (!relaycastDisabled) {
|
|
1040
|
+
this.log('Resolving Relaycast API key...');
|
|
1041
|
+
await this.ensureRelaycastApiKey(channel);
|
|
1042
|
+
this.log('API key resolved');
|
|
1043
|
+
if (this.relayApiKeyAutoCreated && this.relayApiKey) {
|
|
1044
|
+
this.log(`Workspace created — follow this run in Relaycast:`);
|
|
1045
|
+
this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
|
|
1046
|
+
this.log(` Channel: ${channel}`);
|
|
1047
|
+
}
|
|
1024
1048
|
}
|
|
1025
1049
|
this.log('Starting broker...');
|
|
1026
1050
|
// Include a short run ID suffix in the broker name so each workflow execution
|
|
@@ -1031,7 +1055,7 @@ export class WorkflowRunner {
|
|
|
1031
1055
|
this.relay = new AgentRelay({
|
|
1032
1056
|
...this.relayOptions,
|
|
1033
1057
|
brokerName,
|
|
1034
|
-
channels: [channel],
|
|
1058
|
+
channels: relaycastDisabled ? [] : [channel],
|
|
1035
1059
|
env: this.getRelayEnv(),
|
|
1036
1060
|
// Workflows spawn agents across multiple waves; each spawn requires a PTY +
|
|
1037
1061
|
// Relaycast registration. 60s is too tight when the broker is saturated with
|
|
@@ -1092,6 +1116,18 @@ export class WorkflowRunner {
|
|
|
1092
1116
|
};
|
|
1093
1117
|
// Wire relay event hooks for rich console logging
|
|
1094
1118
|
this.relay.onMessageReceived = (msg) => {
|
|
1119
|
+
this.emit({
|
|
1120
|
+
type: 'broker:event',
|
|
1121
|
+
runId,
|
|
1122
|
+
event: {
|
|
1123
|
+
kind: 'relay_inbound',
|
|
1124
|
+
event_id: msg.eventId,
|
|
1125
|
+
from: msg.from,
|
|
1126
|
+
target: msg.to,
|
|
1127
|
+
body: msg.text,
|
|
1128
|
+
thread_id: msg.threadId,
|
|
1129
|
+
},
|
|
1130
|
+
});
|
|
1095
1131
|
const body = msg.text.length > 120 ? msg.text.slice(0, 117) + '...' : msg.text;
|
|
1096
1132
|
const fromShort = msg.from.replace(/-[a-f0-9]{6,}$/, '');
|
|
1097
1133
|
const toShort = msg.to.replace(/-[a-f0-9]{6,}$/, '');
|
|
@@ -1102,19 +1138,60 @@ export class WorkflowRunner {
|
|
|
1102
1138
|
}
|
|
1103
1139
|
};
|
|
1104
1140
|
this.relay.onAgentSpawned = (agent) => {
|
|
1141
|
+
this.emit({
|
|
1142
|
+
type: 'broker:event',
|
|
1143
|
+
runId,
|
|
1144
|
+
event: {
|
|
1145
|
+
kind: 'agent_spawned',
|
|
1146
|
+
name: agent.name,
|
|
1147
|
+
runtime: agent.runtime,
|
|
1148
|
+
},
|
|
1149
|
+
});
|
|
1105
1150
|
// Skip agents already managed by step execution
|
|
1106
1151
|
if (!this.activeAgentHandles.has(agent.name)) {
|
|
1107
1152
|
this.log(`[spawned] ${agent.name} (${agent.runtime})`);
|
|
1108
1153
|
}
|
|
1109
1154
|
};
|
|
1155
|
+
this.relay.onAgentReleased = (agent) => {
|
|
1156
|
+
this.emit({
|
|
1157
|
+
type: 'broker:event',
|
|
1158
|
+
runId,
|
|
1159
|
+
event: {
|
|
1160
|
+
kind: 'agent_released',
|
|
1161
|
+
name: agent.name,
|
|
1162
|
+
},
|
|
1163
|
+
});
|
|
1164
|
+
};
|
|
1110
1165
|
this.relay.onAgentExited = (agent) => {
|
|
1166
|
+
this.emit({
|
|
1167
|
+
type: 'broker:event',
|
|
1168
|
+
runId,
|
|
1169
|
+
event: {
|
|
1170
|
+
kind: 'agent_exited',
|
|
1171
|
+
name: agent.name,
|
|
1172
|
+
code: agent.exitCode,
|
|
1173
|
+
signal: agent.exitSignal,
|
|
1174
|
+
},
|
|
1175
|
+
});
|
|
1111
1176
|
this.lastActivity.delete(agent.name);
|
|
1112
1177
|
this.lastIdleLog.delete(agent.name);
|
|
1113
1178
|
if (!this.activeAgentHandles.has(agent.name)) {
|
|
1114
1179
|
this.log(`[exited] ${agent.name} (code: ${agent.exitCode ?? '?'})`);
|
|
1115
1180
|
}
|
|
1116
1181
|
};
|
|
1182
|
+
this.relay.onDeliveryUpdate = (event) => {
|
|
1183
|
+
this.emit({ type: 'broker:event', runId, event });
|
|
1184
|
+
};
|
|
1117
1185
|
this.relay.onAgentIdle = ({ name, idleSecs }) => {
|
|
1186
|
+
this.emit({
|
|
1187
|
+
type: 'broker:event',
|
|
1188
|
+
runId,
|
|
1189
|
+
event: {
|
|
1190
|
+
kind: 'agent_idle',
|
|
1191
|
+
name,
|
|
1192
|
+
idle_secs: idleSecs,
|
|
1193
|
+
},
|
|
1194
|
+
});
|
|
1118
1195
|
// Only log at 30s multiples to avoid watchdog spam
|
|
1119
1196
|
const bucket = Math.floor(idleSecs / 30) * 30;
|
|
1120
1197
|
if (bucket >= 30 && this.lastIdleLog.get(name) !== bucket) {
|
|
@@ -1129,19 +1206,21 @@ export class WorkflowRunner {
|
|
|
1129
1206
|
this.unsubBrokerStderr = this.relay.onBrokerStderr((line) => {
|
|
1130
1207
|
console.log(`[broker] ${line}`);
|
|
1131
1208
|
});
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1209
|
+
if (!relaycastDisabled) {
|
|
1210
|
+
this.log(`Creating channel: ${channel}...`);
|
|
1211
|
+
if (isResume) {
|
|
1212
|
+
await this.createAndJoinRelaycastChannel(channel);
|
|
1213
|
+
}
|
|
1214
|
+
else {
|
|
1215
|
+
await this.createAndJoinRelaycastChannel(channel, workflow.description);
|
|
1216
|
+
}
|
|
1217
|
+
this.log('Channel ready');
|
|
1218
|
+
if (isResume) {
|
|
1219
|
+
this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
|
|
1220
|
+
}
|
|
1221
|
+
else {
|
|
1222
|
+
this.postToChannel(`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`);
|
|
1223
|
+
}
|
|
1145
1224
|
}
|
|
1146
1225
|
}
|
|
1147
1226
|
const agentMap = new Map();
|
|
@@ -1154,7 +1233,11 @@ export class WorkflowRunner {
|
|
|
1154
1233
|
}
|
|
1155
1234
|
this.log(`Executing ${workflow.steps.length} steps (pattern: ${config.swarm.pattern})`);
|
|
1156
1235
|
await this.executeSteps(workflow, stepStates, agentMap, config.errorHandling, runId);
|
|
1157
|
-
const
|
|
1236
|
+
const errorStrategy = config.errorHandling?.strategy ?? workflow.onError ?? 'fail-fast';
|
|
1237
|
+
const continueOnError = errorStrategy === 'continue' || errorStrategy === 'skip';
|
|
1238
|
+
const allCompleted = [...stepStates.values()].every((s) => s.row.status === 'completed' ||
|
|
1239
|
+
s.row.status === 'skipped' ||
|
|
1240
|
+
(continueOnError && s.row.status === 'failed'));
|
|
1158
1241
|
if (allCompleted) {
|
|
1159
1242
|
this.log('Workflow completed successfully');
|
|
1160
1243
|
await this.updateRunStatus(runId, 'completed');
|
|
@@ -1175,9 +1258,18 @@ export class WorkflowRunner {
|
|
|
1175
1258
|
await this.updateRunStatus(runId, 'failed', errorMsg);
|
|
1176
1259
|
this.emit({ type: 'run:failed', runId, error: errorMsg });
|
|
1177
1260
|
const outcomes = this.collectOutcomes(stepStates, workflow.steps);
|
|
1261
|
+
const summary = this.trajectory.buildRunSummary(outcomes);
|
|
1262
|
+
const confidence = this.trajectory.computeConfidence(outcomes);
|
|
1263
|
+
const learnings = this.trajectory.extractLearnings(outcomes);
|
|
1264
|
+
const challenges = this.trajectory.extractChallenges(outcomes);
|
|
1178
1265
|
this.postFailureReport(workflow.name, outcomes, errorMsg);
|
|
1179
1266
|
this.logRunSummary(workflow.name, outcomes, runId);
|
|
1180
|
-
await this.trajectory.abandon(errorMsg
|
|
1267
|
+
await this.trajectory.abandon(errorMsg, {
|
|
1268
|
+
summary,
|
|
1269
|
+
confidence,
|
|
1270
|
+
learnings,
|
|
1271
|
+
challenges,
|
|
1272
|
+
});
|
|
1181
1273
|
}
|
|
1182
1274
|
}
|
|
1183
1275
|
catch (err) {
|
|
@@ -1185,6 +1277,19 @@ export class WorkflowRunner {
|
|
|
1185
1277
|
const status = !isResume && this.abortController?.signal.aborted ? 'cancelled' : 'failed';
|
|
1186
1278
|
await this.updateRunStatus(runId, status, errorMsg);
|
|
1187
1279
|
if (status === 'cancelled') {
|
|
1280
|
+
// Mark any pending or in-progress steps as failed due to cancellation
|
|
1281
|
+
for (const [stepName, state] of stepStates) {
|
|
1282
|
+
if (state.row.status === 'pending' || state.row.status === 'running') {
|
|
1283
|
+
state.row.status = 'failed';
|
|
1284
|
+
state.row.error = 'Cancelled';
|
|
1285
|
+
await this.db.updateStep(state.row.id, {
|
|
1286
|
+
status: 'failed',
|
|
1287
|
+
error: 'Cancelled',
|
|
1288
|
+
updatedAt: new Date().toISOString(),
|
|
1289
|
+
});
|
|
1290
|
+
this.emit({ type: 'step:failed', runId, stepName, error: 'Cancelled' });
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1188
1293
|
this.emit({ type: 'run:cancelled', runId });
|
|
1189
1294
|
this.postToChannel(`Workflow **${workflow.name}** cancelled`);
|
|
1190
1295
|
await this.trajectory.abandon('Cancelled by user');
|
|
@@ -1192,10 +1297,17 @@ export class WorkflowRunner {
|
|
|
1192
1297
|
else {
|
|
1193
1298
|
this.emit({ type: 'run:failed', runId, error: errorMsg });
|
|
1194
1299
|
this.postToChannel(`Workflow failed: ${errorMsg}`);
|
|
1195
|
-
|
|
1300
|
+
const outcomes = this.collectOutcomes(stepStates, workflow.steps);
|
|
1301
|
+
await this.trajectory.abandon(errorMsg, {
|
|
1302
|
+
summary: this.trajectory.buildRunSummary(outcomes),
|
|
1303
|
+
confidence: this.trajectory.computeConfidence(outcomes),
|
|
1304
|
+
learnings: this.trajectory.extractLearnings(outcomes),
|
|
1305
|
+
challenges: this.trajectory.extractChallenges(outcomes),
|
|
1306
|
+
});
|
|
1196
1307
|
}
|
|
1197
1308
|
}
|
|
1198
1309
|
finally {
|
|
1310
|
+
this.lastFailedStepOutput.clear();
|
|
1199
1311
|
for (const stream of this.ptyLogStreams.values())
|
|
1200
1312
|
stream.end();
|
|
1201
1313
|
this.ptyLogStreams.clear();
|
|
@@ -1207,9 +1319,11 @@ export class WorkflowRunner {
|
|
|
1207
1319
|
if (this.relay) {
|
|
1208
1320
|
this.relay.onMessageReceived = null;
|
|
1209
1321
|
this.relay.onAgentSpawned = null;
|
|
1322
|
+
this.relay.onAgentReleased = null;
|
|
1210
1323
|
this.relay.onAgentExited = null;
|
|
1211
1324
|
this.relay.onAgentIdle = null;
|
|
1212
1325
|
this.relay.onWorkerOutput = null;
|
|
1326
|
+
this.relay.onDeliveryUpdate = null;
|
|
1213
1327
|
}
|
|
1214
1328
|
this.lastIdleLog.clear();
|
|
1215
1329
|
this.lastActivity.clear();
|
|
@@ -1461,7 +1575,7 @@ export class WorkflowRunner {
|
|
|
1461
1575
|
async executeStep(step, stepStates, agentMap, errorHandling, runId) {
|
|
1462
1576
|
// Branch: deterministic steps execute shell commands
|
|
1463
1577
|
if (this.isDeterministicStep(step)) {
|
|
1464
|
-
return this.executeDeterministicStep(step, stepStates, runId);
|
|
1578
|
+
return this.executeDeterministicStep(step, stepStates, runId, errorHandling);
|
|
1465
1579
|
}
|
|
1466
1580
|
// Branch: worktree steps set up git worktrees
|
|
1467
1581
|
if (this.isWorktreeStep(step)) {
|
|
@@ -1474,42 +1588,143 @@ export class WorkflowRunner {
|
|
|
1474
1588
|
* Execute a deterministic step (shell command).
|
|
1475
1589
|
* Fast, reliable, $0 LLM cost.
|
|
1476
1590
|
*/
|
|
1477
|
-
async executeDeterministicStep(step, stepStates, runId) {
|
|
1591
|
+
async executeDeterministicStep(step, stepStates, runId, errorHandling) {
|
|
1478
1592
|
const state = stepStates.get(step.name);
|
|
1479
1593
|
if (!state)
|
|
1480
1594
|
throw new Error(`Step state not found: ${step.name}`);
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1595
|
+
const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
|
|
1596
|
+
const retryDelay = errorHandling?.retryDelayMs ?? 1000;
|
|
1597
|
+
let lastError;
|
|
1598
|
+
for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
|
|
1599
|
+
this.checkAborted();
|
|
1600
|
+
if (attempt > 0) {
|
|
1601
|
+
this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
|
|
1602
|
+
this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
|
|
1603
|
+
state.row.retryCount = attempt;
|
|
1604
|
+
await this.db.updateStep(state.row.id, {
|
|
1605
|
+
retryCount: attempt,
|
|
1606
|
+
updatedAt: new Date().toISOString(),
|
|
1607
|
+
});
|
|
1608
|
+
await this.delay(retryDelay);
|
|
1609
|
+
}
|
|
1610
|
+
// Mark step as running
|
|
1611
|
+
state.row.status = 'running';
|
|
1612
|
+
state.row.startedAt = new Date().toISOString();
|
|
1613
|
+
await this.db.updateStep(state.row.id, {
|
|
1614
|
+
status: 'running',
|
|
1615
|
+
startedAt: state.row.startedAt,
|
|
1616
|
+
updatedAt: new Date().toISOString(),
|
|
1617
|
+
});
|
|
1618
|
+
this.emit({ type: 'step:started', runId, stepName: step.name });
|
|
1619
|
+
this.postToChannel(`**[${step.name}]** Started (deterministic)`);
|
|
1620
|
+
// Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
|
|
1621
|
+
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
1622
|
+
let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
|
|
1623
|
+
// Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
|
|
1624
|
+
resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key) => {
|
|
1625
|
+
if (key.startsWith('steps.'))
|
|
1626
|
+
return _match; // Already handled above
|
|
1627
|
+
const value = this.resolveDotPath(key, stepOutputContext);
|
|
1628
|
+
return value !== undefined ? String(value) : _match;
|
|
1629
|
+
});
|
|
1630
|
+
// Resolve step workdir (named path reference) for deterministic steps
|
|
1631
|
+
const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
|
|
1632
|
+
try {
|
|
1633
|
+
// Delegate to executor if present
|
|
1634
|
+
if (this.executor?.executeDeterministicStep) {
|
|
1635
|
+
const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
|
|
1636
|
+
const failOnError = step.failOnError !== false;
|
|
1637
|
+
if (failOnError && result.exitCode !== 0) {
|
|
1638
|
+
throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
|
|
1639
|
+
}
|
|
1640
|
+
const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
|
|
1641
|
+
if (step.verification) {
|
|
1642
|
+
this.runVerification(step.verification, output, step.name);
|
|
1643
|
+
}
|
|
1644
|
+
// Mark completed
|
|
1645
|
+
state.row.status = 'completed';
|
|
1646
|
+
state.row.output = output;
|
|
1647
|
+
state.row.completedAt = new Date().toISOString();
|
|
1648
|
+
await this.db.updateStep(state.row.id, {
|
|
1649
|
+
status: 'completed',
|
|
1650
|
+
output,
|
|
1651
|
+
completedAt: state.row.completedAt,
|
|
1652
|
+
updatedAt: new Date().toISOString(),
|
|
1653
|
+
});
|
|
1654
|
+
await this.persistStepOutput(runId, step.name, output);
|
|
1655
|
+
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
1656
|
+
return;
|
|
1657
|
+
}
|
|
1658
|
+
const output = await new Promise((resolve, reject) => {
|
|
1659
|
+
const child = cpSpawn('sh', ['-c', resolvedCommand], {
|
|
1660
|
+
stdio: 'pipe',
|
|
1661
|
+
cwd: stepCwd,
|
|
1662
|
+
env: { ...process.env },
|
|
1663
|
+
});
|
|
1664
|
+
const stdoutChunks = [];
|
|
1665
|
+
const stderrChunks = [];
|
|
1666
|
+
// Wire abort signal
|
|
1667
|
+
const abortSignal = this.abortController?.signal;
|
|
1668
|
+
let abortHandler;
|
|
1669
|
+
if (abortSignal && !abortSignal.aborted) {
|
|
1670
|
+
abortHandler = () => {
|
|
1671
|
+
child.kill('SIGTERM');
|
|
1672
|
+
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
1673
|
+
};
|
|
1674
|
+
abortSignal.addEventListener('abort', abortHandler, { once: true });
|
|
1675
|
+
}
|
|
1676
|
+
// Handle timeout
|
|
1677
|
+
let timedOut = false;
|
|
1678
|
+
let timer;
|
|
1679
|
+
if (step.timeoutMs) {
|
|
1680
|
+
timer = setTimeout(() => {
|
|
1681
|
+
timedOut = true;
|
|
1682
|
+
child.kill('SIGTERM');
|
|
1683
|
+
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
1684
|
+
}, step.timeoutMs);
|
|
1685
|
+
}
|
|
1686
|
+
child.stdout?.on('data', (chunk) => {
|
|
1687
|
+
stdoutChunks.push(chunk.toString());
|
|
1688
|
+
});
|
|
1689
|
+
child.stderr?.on('data', (chunk) => {
|
|
1690
|
+
stderrChunks.push(chunk.toString());
|
|
1691
|
+
});
|
|
1692
|
+
child.on('close', (code) => {
|
|
1693
|
+
if (timer)
|
|
1694
|
+
clearTimeout(timer);
|
|
1695
|
+
if (abortHandler && abortSignal) {
|
|
1696
|
+
abortSignal.removeEventListener('abort', abortHandler);
|
|
1697
|
+
}
|
|
1698
|
+
if (abortSignal?.aborted) {
|
|
1699
|
+
reject(new Error(`Step "${step.name}" aborted`));
|
|
1700
|
+
return;
|
|
1701
|
+
}
|
|
1702
|
+
if (timedOut) {
|
|
1703
|
+
reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
|
|
1704
|
+
return;
|
|
1705
|
+
}
|
|
1706
|
+
const stdout = stdoutChunks.join('');
|
|
1707
|
+
const stderr = stderrChunks.join('');
|
|
1708
|
+
// Check exit code unless failOnError is explicitly false
|
|
1709
|
+
const failOnError = step.failOnError !== false;
|
|
1710
|
+
if (failOnError && code !== 0 && code !== null) {
|
|
1711
|
+
reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
|
|
1712
|
+
return;
|
|
1713
|
+
}
|
|
1714
|
+
resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
|
|
1715
|
+
});
|
|
1716
|
+
child.on('error', (err) => {
|
|
1717
|
+
if (timer)
|
|
1718
|
+
clearTimeout(timer);
|
|
1719
|
+
if (abortHandler && abortSignal) {
|
|
1720
|
+
abortSignal.removeEventListener('abort', abortHandler);
|
|
1721
|
+
}
|
|
1722
|
+
reject(new Error(`Failed to execute command: ${err.message}`));
|
|
1723
|
+
});
|
|
1724
|
+
});
|
|
1725
|
+
if (step.verification) {
|
|
1726
|
+
this.runVerification(step.verification, output, step.name);
|
|
1511
1727
|
}
|
|
1512
|
-
const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
|
|
1513
1728
|
// Mark completed
|
|
1514
1729
|
state.row.status = 'completed';
|
|
1515
1730
|
state.row.output = output;
|
|
@@ -1520,97 +1735,19 @@ export class WorkflowRunner {
|
|
|
1520
1735
|
completedAt: state.row.completedAt,
|
|
1521
1736
|
updatedAt: new Date().toISOString(),
|
|
1522
1737
|
});
|
|
1738
|
+
// Persist step output
|
|
1523
1739
|
await this.persistStepOutput(runId, step.name, output);
|
|
1524
1740
|
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
1525
1741
|
return;
|
|
1526
1742
|
}
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
cwd: stepCwd,
|
|
1531
|
-
env: { ...process.env },
|
|
1532
|
-
});
|
|
1533
|
-
const stdoutChunks = [];
|
|
1534
|
-
const stderrChunks = [];
|
|
1535
|
-
// Wire abort signal
|
|
1536
|
-
const abortSignal = this.abortController?.signal;
|
|
1537
|
-
let abortHandler;
|
|
1538
|
-
if (abortSignal && !abortSignal.aborted) {
|
|
1539
|
-
abortHandler = () => {
|
|
1540
|
-
child.kill('SIGTERM');
|
|
1541
|
-
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
1542
|
-
};
|
|
1543
|
-
abortSignal.addEventListener('abort', abortHandler, { once: true });
|
|
1544
|
-
}
|
|
1545
|
-
// Handle timeout
|
|
1546
|
-
let timedOut = false;
|
|
1547
|
-
let timer;
|
|
1548
|
-
if (step.timeoutMs) {
|
|
1549
|
-
timer = setTimeout(() => {
|
|
1550
|
-
timedOut = true;
|
|
1551
|
-
child.kill('SIGTERM');
|
|
1552
|
-
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
1553
|
-
}, step.timeoutMs);
|
|
1554
|
-
}
|
|
1555
|
-
child.stdout?.on('data', (chunk) => {
|
|
1556
|
-
stdoutChunks.push(chunk.toString());
|
|
1557
|
-
});
|
|
1558
|
-
child.stderr?.on('data', (chunk) => {
|
|
1559
|
-
stderrChunks.push(chunk.toString());
|
|
1560
|
-
});
|
|
1561
|
-
child.on('close', (code) => {
|
|
1562
|
-
if (timer)
|
|
1563
|
-
clearTimeout(timer);
|
|
1564
|
-
if (abortHandler && abortSignal) {
|
|
1565
|
-
abortSignal.removeEventListener('abort', abortHandler);
|
|
1566
|
-
}
|
|
1567
|
-
if (abortSignal?.aborted) {
|
|
1568
|
-
reject(new Error(`Step "${step.name}" aborted`));
|
|
1569
|
-
return;
|
|
1570
|
-
}
|
|
1571
|
-
if (timedOut) {
|
|
1572
|
-
reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
|
|
1573
|
-
return;
|
|
1574
|
-
}
|
|
1575
|
-
const stdout = stdoutChunks.join('');
|
|
1576
|
-
const stderr = stderrChunks.join('');
|
|
1577
|
-
// Check exit code unless failOnError is explicitly false
|
|
1578
|
-
const failOnError = step.failOnError !== false;
|
|
1579
|
-
if (failOnError && code !== 0 && code !== null) {
|
|
1580
|
-
reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
|
|
1581
|
-
return;
|
|
1582
|
-
}
|
|
1583
|
-
resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
|
|
1584
|
-
});
|
|
1585
|
-
child.on('error', (err) => {
|
|
1586
|
-
if (timer)
|
|
1587
|
-
clearTimeout(timer);
|
|
1588
|
-
if (abortHandler && abortSignal) {
|
|
1589
|
-
abortSignal.removeEventListener('abort', abortHandler);
|
|
1590
|
-
}
|
|
1591
|
-
reject(new Error(`Failed to execute command: ${err.message}`));
|
|
1592
|
-
});
|
|
1593
|
-
});
|
|
1594
|
-
// Mark completed
|
|
1595
|
-
state.row.status = 'completed';
|
|
1596
|
-
state.row.output = output;
|
|
1597
|
-
state.row.completedAt = new Date().toISOString();
|
|
1598
|
-
await this.db.updateStep(state.row.id, {
|
|
1599
|
-
status: 'completed',
|
|
1600
|
-
output,
|
|
1601
|
-
completedAt: state.row.completedAt,
|
|
1602
|
-
updatedAt: new Date().toISOString(),
|
|
1603
|
-
});
|
|
1604
|
-
// Persist step output
|
|
1605
|
-
await this.persistStepOutput(runId, step.name, output);
|
|
1606
|
-
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
1607
|
-
}
|
|
1608
|
-
catch (err) {
|
|
1609
|
-
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
1610
|
-
this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
|
|
1611
|
-
await this.markStepFailed(state, errorMsg, runId);
|
|
1612
|
-
throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
|
|
1743
|
+
catch (err) {
|
|
1744
|
+
lastError = err instanceof Error ? err.message : String(err);
|
|
1745
|
+
}
|
|
1613
1746
|
}
|
|
1747
|
+
const errorMsg = lastError ?? 'Unknown error';
|
|
1748
|
+
this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
|
|
1749
|
+
await this.markStepFailed(state, errorMsg, runId);
|
|
1750
|
+
throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
|
|
1614
1751
|
}
|
|
1615
1752
|
/**
|
|
1616
1753
|
* Execute a worktree step (git worktree setup).
|
|
@@ -1807,8 +1944,13 @@ export class WorkflowRunner {
|
|
|
1807
1944
|
specialistDef.constraints?.timeoutMs ??
|
|
1808
1945
|
this.currentConfig?.swarm?.timeoutMs;
|
|
1809
1946
|
let lastError;
|
|
1947
|
+
let lastExitCode;
|
|
1948
|
+
let lastExitSignal;
|
|
1810
1949
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1811
1950
|
this.checkAborted();
|
|
1951
|
+
// Reset per-attempt exit info so stale values don't leak across retries
|
|
1952
|
+
lastExitCode = undefined;
|
|
1953
|
+
lastExitSignal = undefined;
|
|
1812
1954
|
if (attempt > 0) {
|
|
1813
1955
|
this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
|
|
1814
1956
|
this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
|
|
@@ -1850,6 +1992,15 @@ export class WorkflowRunner {
|
|
|
1850
1992
|
// Resolve step-output variables (e.g. {{steps.plan.output}}) at execution time
|
|
1851
1993
|
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
1852
1994
|
let resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
|
|
1995
|
+
// On retry attempts, prepend failure context so the agent knows what went wrong
|
|
1996
|
+
if (attempt > 0 && lastError) {
|
|
1997
|
+
const priorOutput = (this.lastFailedStepOutput.get(step.name) ?? '').slice(-2000);
|
|
1998
|
+
resolvedTask =
|
|
1999
|
+
`[RETRY — Attempt ${attempt + 1}/${maxRetries + 1}]\n` +
|
|
2000
|
+
`Previous attempt failed: ${lastError}\n` +
|
|
2001
|
+
(priorOutput ? `Previous output (last 2000 chars):\n${priorOutput}\n` : '') +
|
|
2002
|
+
`---\n${resolvedTask}`;
|
|
2003
|
+
}
|
|
1853
2004
|
// If this is an interactive agent, append awareness of non-interactive workers
|
|
1854
2005
|
// so the lead knows not to message them and to use step output chaining instead
|
|
1855
2006
|
if (specialistDef.interactive !== false || ownerDef.interactive !== false) {
|
|
@@ -1884,9 +2035,12 @@ export class WorkflowRunner {
|
|
|
1884
2035
|
this.log(`[${step.name}] Spawning owner "${effectiveOwner.name}" (cli: ${effectiveOwner.cli})${step.workdir ? ` [workdir: ${step.workdir}]` : ''}`);
|
|
1885
2036
|
const resolvedStep = { ...step, task: ownerTask };
|
|
1886
2037
|
const ownerStartTime = Date.now();
|
|
1887
|
-
const
|
|
2038
|
+
const spawnResult = this.executor
|
|
1888
2039
|
? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
|
|
1889
2040
|
: await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs);
|
|
2041
|
+
const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
|
|
2042
|
+
lastExitCode = typeof spawnResult === 'string' ? undefined : spawnResult.exitCode;
|
|
2043
|
+
lastExitSignal = typeof spawnResult === 'string' ? undefined : spawnResult.exitSignal;
|
|
1890
2044
|
ownerElapsed = Date.now() - ownerStartTime;
|
|
1891
2045
|
this.log(`[${step.name}] Owner "${effectiveOwner.name}" exited`);
|
|
1892
2046
|
if (usesOwnerFlow) {
|
|
@@ -1897,7 +2051,7 @@ export class WorkflowRunner {
|
|
|
1897
2051
|
}
|
|
1898
2052
|
// Run verification if configured
|
|
1899
2053
|
if (step.verification) {
|
|
1900
|
-
this.runVerification(step.verification, specialistOutput, step.name, resolvedTask);
|
|
2054
|
+
this.runVerification(step.verification, specialistOutput, step.name, effectiveOwner.interactive === false ? undefined : resolvedTask);
|
|
1901
2055
|
}
|
|
1902
2056
|
// Every interactive step gets a review pass; pick a dedicated reviewer when available.
|
|
1903
2057
|
let combinedOutput = specialistOutput;
|
|
@@ -1918,12 +2072,16 @@ export class WorkflowRunner {
|
|
|
1918
2072
|
});
|
|
1919
2073
|
// Persist step output to disk so it survives restarts and is inspectable
|
|
1920
2074
|
await this.persistStepOutput(runId, step.name, combinedOutput);
|
|
1921
|
-
this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput });
|
|
2075
|
+
this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput, exitCode: lastExitCode, exitSignal: lastExitSignal });
|
|
1922
2076
|
await this.trajectory?.stepCompleted(step, combinedOutput, attempt + 1);
|
|
1923
2077
|
return;
|
|
1924
2078
|
}
|
|
1925
2079
|
catch (err) {
|
|
1926
2080
|
lastError = err instanceof Error ? err.message : String(err);
|
|
2081
|
+
if (err instanceof SpawnExitError) {
|
|
2082
|
+
lastExitCode = err.exitCode;
|
|
2083
|
+
lastExitSignal = err.exitSignal;
|
|
2084
|
+
}
|
|
1927
2085
|
const ownerTimedOut = usesDedicatedOwner
|
|
1928
2086
|
? /\bowner timed out\b/i.test(lastError)
|
|
1929
2087
|
: /\btimed out\b/i.test(lastError) && !lastError.includes(`${step.name}-review`);
|
|
@@ -1943,7 +2101,10 @@ export class WorkflowRunner {
|
|
|
1943
2101
|
verificationValue,
|
|
1944
2102
|
});
|
|
1945
2103
|
this.postToChannel(`**[${step.name}]** Failed: ${lastError ?? 'Unknown error'}`);
|
|
1946
|
-
await this.markStepFailed(state, lastError ?? 'Unknown error', runId
|
|
2104
|
+
await this.markStepFailed(state, lastError ?? 'Unknown error', runId, {
|
|
2105
|
+
exitCode: lastExitCode,
|
|
2106
|
+
exitSignal: lastExitSignal,
|
|
2107
|
+
});
|
|
1947
2108
|
throw new Error(`Step "${step.name}" failed after ${maxRetries} retries: ${lastError ?? 'Unknown error'}`);
|
|
1948
2109
|
}
|
|
1949
2110
|
injectStepOwnerContract(step, resolvedTask, ownerDef, specialistDef) {
|
|
@@ -2058,10 +2219,10 @@ export class WorkflowRunner {
|
|
|
2058
2219
|
});
|
|
2059
2220
|
const workerSettled = workerPromise.catch(() => undefined);
|
|
2060
2221
|
workerPromise
|
|
2061
|
-
.then((
|
|
2222
|
+
.then((result) => {
|
|
2062
2223
|
workerReleased = true;
|
|
2063
2224
|
this.postToChannel(`**[${step.name}]** Worker \`${workerRuntimeName}\` exited`);
|
|
2064
|
-
if (step.verification?.type === 'output_contains' && output.includes(step.verification.value)) {
|
|
2225
|
+
if (step.verification?.type === 'output_contains' && result.output.includes(step.verification.value)) {
|
|
2065
2226
|
this.postToChannel(`**[${step.name}]** Verification gate observed: output contains ${JSON.stringify(step.verification.value)}`);
|
|
2066
2227
|
}
|
|
2067
2228
|
})
|
|
@@ -2080,7 +2241,7 @@ export class WorkflowRunner {
|
|
|
2080
2241
|
this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
|
|
2081
2242
|
const ownerStartTime = Date.now();
|
|
2082
2243
|
try {
|
|
2083
|
-
const
|
|
2244
|
+
const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
|
|
2084
2245
|
agentNameSuffix: 'owner',
|
|
2085
2246
|
onSpawned: ({ actualName }) => {
|
|
2086
2247
|
this.supervisedRuntimeAgents.set(actualName, {
|
|
@@ -2094,9 +2255,10 @@ export class WorkflowRunner {
|
|
|
2094
2255
|
},
|
|
2095
2256
|
});
|
|
2096
2257
|
const ownerElapsed = Date.now() - ownerStartTime;
|
|
2258
|
+
const ownerOutput = ownerResultObj.output;
|
|
2097
2259
|
this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
|
|
2098
2260
|
this.assertOwnerCompletionMarker(step, ownerOutput, supervisorTask);
|
|
2099
|
-
const specialistOutput = await workerPromise;
|
|
2261
|
+
const specialistOutput = (await workerPromise).output;
|
|
2100
2262
|
return { specialistOutput, ownerOutput, ownerElapsed };
|
|
2101
2263
|
}
|
|
2102
2264
|
catch (error) {
|
|
@@ -2307,7 +2469,7 @@ export class WorkflowRunner {
|
|
|
2307
2469
|
})();
|
|
2308
2470
|
};
|
|
2309
2471
|
try {
|
|
2310
|
-
|
|
2472
|
+
await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
|
|
2311
2473
|
onSpawned: ({ agent }) => {
|
|
2312
2474
|
reviewerHandle = agent;
|
|
2313
2475
|
},
|
|
@@ -2502,7 +2664,7 @@ export class WorkflowRunner {
|
|
|
2502
2664
|
const stdoutChunks = [];
|
|
2503
2665
|
const stderrChunks = [];
|
|
2504
2666
|
try {
|
|
2505
|
-
const output = await new Promise((resolve, reject) => {
|
|
2667
|
+
const { stdout: output, exitCode, exitSignal } = await new Promise((resolve, reject) => {
|
|
2506
2668
|
const child = cpSpawn(cmd, args, {
|
|
2507
2669
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
2508
2670
|
cwd: this.resolveAgentCwd(agentDef),
|
|
@@ -2560,7 +2722,7 @@ export class WorkflowRunner {
|
|
|
2560
2722
|
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
2561
2723
|
}, timeoutMs);
|
|
2562
2724
|
}
|
|
2563
|
-
child.on('close', (code) => {
|
|
2725
|
+
child.on('close', (code, signal) => {
|
|
2564
2726
|
clearInterval(heartbeat);
|
|
2565
2727
|
if (timer)
|
|
2566
2728
|
clearTimeout(timer);
|
|
@@ -2578,10 +2740,14 @@ export class WorkflowRunner {
|
|
|
2578
2740
|
}
|
|
2579
2741
|
if (code !== 0 && code !== null) {
|
|
2580
2742
|
const stderr = stderrChunks.join('');
|
|
2581
|
-
reject(new
|
|
2743
|
+
reject(new SpawnExitError(`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`, code, signal));
|
|
2582
2744
|
return;
|
|
2583
2745
|
}
|
|
2584
|
-
resolve(
|
|
2746
|
+
resolve({
|
|
2747
|
+
stdout,
|
|
2748
|
+
exitCode: code ?? undefined,
|
|
2749
|
+
exitSignal: signal ?? undefined,
|
|
2750
|
+
});
|
|
2585
2751
|
});
|
|
2586
2752
|
child.on('error', (err) => {
|
|
2587
2753
|
clearInterval(heartbeat);
|
|
@@ -2593,9 +2759,11 @@ export class WorkflowRunner {
|
|
|
2593
2759
|
reject(new Error(`Failed to spawn ${cmd}: ${err.message}`));
|
|
2594
2760
|
});
|
|
2595
2761
|
});
|
|
2596
|
-
return output;
|
|
2762
|
+
return { output, exitCode, exitSignal };
|
|
2597
2763
|
}
|
|
2598
2764
|
finally {
|
|
2765
|
+
const combinedOutput = stdoutChunks.join('') + stderrChunks.join('');
|
|
2766
|
+
this.lastFailedStepOutput.set(step.name, combinedOutput);
|
|
2599
2767
|
stopHeartbeat?.();
|
|
2600
2768
|
logStream.end();
|
|
2601
2769
|
this.unregisterWorker(agentName);
|
|
@@ -2755,10 +2923,14 @@ export class WorkflowRunner {
|
|
|
2755
2923
|
throw new Error(`Step "${step.name}" timed out after ${timeoutMs ?? 'unknown'}ms`);
|
|
2756
2924
|
}
|
|
2757
2925
|
}
|
|
2926
|
+
if (exitResult === 'force-released') {
|
|
2927
|
+
throw new Error(`Step "${step.name}" failed — agent was force-released after exhausting idle nudges without completing`);
|
|
2928
|
+
}
|
|
2758
2929
|
}
|
|
2759
2930
|
finally {
|
|
2760
2931
|
// Snapshot PTY chunks before cleanup — we need them for output reading below
|
|
2761
2932
|
ptyChunks = this.ptyOutputBuffers.get(agentName) ?? [];
|
|
2933
|
+
this.lastFailedStepOutput.set(step.name, ptyChunks.join(''));
|
|
2762
2934
|
// Always clean up PTY resources — prevents fd leaks if spawnPty or waitForExit throws
|
|
2763
2935
|
stopHeartbeat?.();
|
|
2764
2936
|
this.activeAgentHandles.delete(agentName);
|
|
@@ -2784,10 +2956,14 @@ export class WorkflowRunner {
|
|
|
2784
2956
|
: exitResult === 'timeout'
|
|
2785
2957
|
? 'Agent completed (released after idle timeout)'
|
|
2786
2958
|
: exitResult === 'released'
|
|
2787
|
-
? 'Agent completed (
|
|
2959
|
+
? 'Agent completed (idle — treated as done)'
|
|
2788
2960
|
: `Agent exited (${exitResult})`;
|
|
2789
2961
|
}
|
|
2790
|
-
return
|
|
2962
|
+
return {
|
|
2963
|
+
output,
|
|
2964
|
+
exitCode: agent?.exitCode,
|
|
2965
|
+
exitSignal: agent?.exitSignal,
|
|
2966
|
+
};
|
|
2791
2967
|
}
|
|
2792
2968
|
// ── Idle nudging ────────────────────────────────────────────────────────
|
|
2793
2969
|
/** Patterns where a hub agent coordinates spoke agents. */
|
|
@@ -2858,7 +3034,7 @@ export class WorkflowRunner {
|
|
|
2858
3034
|
return exitResult;
|
|
2859
3035
|
}
|
|
2860
3036
|
// Agent is still running after the window expired.
|
|
2861
|
-
if (
|
|
3037
|
+
if (timeoutMs !== undefined && Date.now() - startTime >= timeoutMs) {
|
|
2862
3038
|
return 'timeout';
|
|
2863
3039
|
}
|
|
2864
3040
|
// Nudge if we haven't exhausted the limit
|
|
@@ -2873,7 +3049,7 @@ export class WorkflowRunner {
|
|
|
2873
3049
|
this.postToChannel(`**[${step.name}]** Agent \`${agent.name}\` still idle after ${nudgeCount} nudge(s) — force-releasing`);
|
|
2874
3050
|
this.emit({ type: 'step:force-released', runId: this.currentRunId ?? '', stepName: step.name });
|
|
2875
3051
|
await agent.release();
|
|
2876
|
-
return 'released';
|
|
3052
|
+
return 'force-released';
|
|
2877
3053
|
}
|
|
2878
3054
|
}
|
|
2879
3055
|
/**
|
|
@@ -2988,7 +3164,7 @@ export class WorkflowRunner {
|
|
|
2988
3164
|
}
|
|
2989
3165
|
await this.db.updateRun(runId, patch);
|
|
2990
3166
|
}
|
|
2991
|
-
async markStepFailed(state, error, runId) {
|
|
3167
|
+
async markStepFailed(state, error, runId, exitInfo) {
|
|
2992
3168
|
state.row.status = 'failed';
|
|
2993
3169
|
state.row.error = error;
|
|
2994
3170
|
state.row.completedAt = new Date().toISOString();
|
|
@@ -2998,7 +3174,14 @@ export class WorkflowRunner {
|
|
|
2998
3174
|
completedAt: state.row.completedAt,
|
|
2999
3175
|
updatedAt: new Date().toISOString(),
|
|
3000
3176
|
});
|
|
3001
|
-
this.emit({
|
|
3177
|
+
this.emit({
|
|
3178
|
+
type: 'step:failed',
|
|
3179
|
+
runId,
|
|
3180
|
+
stepName: state.row.stepName,
|
|
3181
|
+
error,
|
|
3182
|
+
exitCode: exitInfo?.exitCode,
|
|
3183
|
+
exitSignal: exitInfo?.exitSignal,
|
|
3184
|
+
});
|
|
3002
3185
|
}
|
|
3003
3186
|
async markDownstreamSkipped(failedStepName, allSteps, stepStates, runId) {
|
|
3004
3187
|
const queue = [failedStepName];
|