@agent-relay/sdk 3.1.19 → 3.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,17 @@ import { WorkflowTrajectory } from './trajectory.js';
17
17
  // Import from sub-paths to avoid pulling in the full @relaycast/sdk dependency.
18
18
  import { AgentRelay } from '../relay.js';
19
19
  import { RelayCast, RelayError } from '@relaycast/sdk';
20
+ /** Error carrying exit code/signal from a failed subprocess spawn. */
21
+ class SpawnExitError extends Error {
22
+ exitCode;
23
+ exitSignal;
24
+ constructor(message, exitCode, exitSignal) {
25
+ super(message);
26
+ this.name = 'SpawnExitError';
27
+ this.exitCode = exitCode;
28
+ this.exitSignal = exitSignal ?? undefined;
29
+ }
30
+ }
20
31
  // ── CLI resolution ───────────────────────────────────────────────────────────
21
32
  /**
22
33
  * Resolve `cursor` to the concrete cursor agent binary available in PATH.
@@ -70,6 +81,8 @@ export class WorkflowRunner {
70
81
  activeAgentHandles = new Map();
71
82
  // PTY-based output capture: accumulate terminal output per-agent
72
83
  ptyOutputBuffers = new Map();
84
+ /** Snapshot of PTY output from the most recent failed attempt, keyed by step name. */
85
+ lastFailedStepOutput = new Map();
73
86
  ptyListeners = new Map();
74
87
  ptyLogStreams = new Map();
75
88
  /** Path to workers.json so `agents:kill` can find workflow-spawned agents */
@@ -865,7 +878,12 @@ export class WorkflowRunner {
865
878
  // ── Execution ───────────────────────────────────────────────────────────
866
879
  /** Execute a named workflow from a validated config. */
867
880
  async execute(config, workflowName, vars) {
881
+ // Set up abort controller early so callers can abort() even during setup
882
+ this.abortController = new AbortController();
883
+ this.paused = false;
868
884
  const resolved = vars ? this.resolveVariables(config, vars) : config;
885
+ // Validate config (catches cycles, missing deps, invalid steps, etc.)
886
+ this.validateConfig(resolved);
869
887
  // Resolve and validate named paths from the top-level `paths` config
870
888
  const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
871
889
  if (pathResult.errors.length > 0) {
@@ -935,6 +953,9 @@ export class WorkflowRunner {
935
953
  }
936
954
  /** Resume a previously paused or partially completed run. */
937
955
  async resume(runId, vars) {
956
+ // Set up abort controller early so callers can abort() even during setup
957
+ this.abortController = new AbortController();
958
+ this.paused = false;
938
959
  const run = await this.db.getRun(runId);
939
960
  if (!run) {
940
961
  throw new Error(`Run "${runId}" not found`);
@@ -982,9 +1003,7 @@ export class WorkflowRunner {
982
1003
  async runWorkflowCore(input) {
983
1004
  const { run, workflow, config, stepStates, isResume } = input;
984
1005
  const runId = run.id;
985
- // Start execution
986
- this.abortController = new AbortController();
987
- this.paused = false;
1006
+ // Start execution (abortController already set by execute()/resume())
988
1007
  this.currentConfig = config;
989
1008
  this.currentRunId = runId;
990
1009
  this.runStartTime = Date.now();
@@ -1012,15 +1031,20 @@ export class WorkflowRunner {
1012
1031
  config.swarm.channel = channel;
1013
1032
  await this.db.updateRun(runId, { config });
1014
1033
  }
1034
+ const relaycastDisabled = this.relayOptions.env?.AGENT_RELAY_WORKFLOW_DISABLE_RELAYCAST === '1';
1035
+ const requiresBroker = !this.executor &&
1036
+ workflow.steps.some((step) => step.type !== 'deterministic' && step.type !== 'worktree');
1015
1037
  // Skip broker/relay init when an external executor handles agent spawning
1016
- if (!this.executor) {
1017
- this.log('Resolving Relaycast API key...');
1018
- await this.ensureRelaycastApiKey(channel);
1019
- this.log('API key resolved');
1020
- if (this.relayApiKeyAutoCreated && this.relayApiKey) {
1021
- this.log(`Workspace created — follow this run in Relaycast:`);
1022
- this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
1023
- this.log(` Channel: ${channel}`);
1038
+ if (requiresBroker) {
1039
+ if (!relaycastDisabled) {
1040
+ this.log('Resolving Relaycast API key...');
1041
+ await this.ensureRelaycastApiKey(channel);
1042
+ this.log('API key resolved');
1043
+ if (this.relayApiKeyAutoCreated && this.relayApiKey) {
1044
+ this.log(`Workspace created — follow this run in Relaycast:`);
1045
+ this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
1046
+ this.log(` Channel: ${channel}`);
1047
+ }
1024
1048
  }
1025
1049
  this.log('Starting broker...');
1026
1050
  // Include a short run ID suffix in the broker name so each workflow execution
@@ -1031,7 +1055,7 @@ export class WorkflowRunner {
1031
1055
  this.relay = new AgentRelay({
1032
1056
  ...this.relayOptions,
1033
1057
  brokerName,
1034
- channels: [channel],
1058
+ channels: relaycastDisabled ? [] : [channel],
1035
1059
  env: this.getRelayEnv(),
1036
1060
  // Workflows spawn agents across multiple waves; each spawn requires a PTY +
1037
1061
  // Relaycast registration. 60s is too tight when the broker is saturated with
@@ -1092,6 +1116,18 @@ export class WorkflowRunner {
1092
1116
  };
1093
1117
  // Wire relay event hooks for rich console logging
1094
1118
  this.relay.onMessageReceived = (msg) => {
1119
+ this.emit({
1120
+ type: 'broker:event',
1121
+ runId,
1122
+ event: {
1123
+ kind: 'relay_inbound',
1124
+ event_id: msg.eventId,
1125
+ from: msg.from,
1126
+ target: msg.to,
1127
+ body: msg.text,
1128
+ thread_id: msg.threadId,
1129
+ },
1130
+ });
1095
1131
  const body = msg.text.length > 120 ? msg.text.slice(0, 117) + '...' : msg.text;
1096
1132
  const fromShort = msg.from.replace(/-[a-f0-9]{6,}$/, '');
1097
1133
  const toShort = msg.to.replace(/-[a-f0-9]{6,}$/, '');
@@ -1102,19 +1138,60 @@ export class WorkflowRunner {
1102
1138
  }
1103
1139
  };
1104
1140
  this.relay.onAgentSpawned = (agent) => {
1141
+ this.emit({
1142
+ type: 'broker:event',
1143
+ runId,
1144
+ event: {
1145
+ kind: 'agent_spawned',
1146
+ name: agent.name,
1147
+ runtime: agent.runtime,
1148
+ },
1149
+ });
1105
1150
  // Skip agents already managed by step execution
1106
1151
  if (!this.activeAgentHandles.has(agent.name)) {
1107
1152
  this.log(`[spawned] ${agent.name} (${agent.runtime})`);
1108
1153
  }
1109
1154
  };
1155
+ this.relay.onAgentReleased = (agent) => {
1156
+ this.emit({
1157
+ type: 'broker:event',
1158
+ runId,
1159
+ event: {
1160
+ kind: 'agent_released',
1161
+ name: agent.name,
1162
+ },
1163
+ });
1164
+ };
1110
1165
  this.relay.onAgentExited = (agent) => {
1166
+ this.emit({
1167
+ type: 'broker:event',
1168
+ runId,
1169
+ event: {
1170
+ kind: 'agent_exited',
1171
+ name: agent.name,
1172
+ code: agent.exitCode,
1173
+ signal: agent.exitSignal,
1174
+ },
1175
+ });
1111
1176
  this.lastActivity.delete(agent.name);
1112
1177
  this.lastIdleLog.delete(agent.name);
1113
1178
  if (!this.activeAgentHandles.has(agent.name)) {
1114
1179
  this.log(`[exited] ${agent.name} (code: ${agent.exitCode ?? '?'})`);
1115
1180
  }
1116
1181
  };
1182
+ this.relay.onDeliveryUpdate = (event) => {
1183
+ this.emit({ type: 'broker:event', runId, event });
1184
+ };
1117
1185
  this.relay.onAgentIdle = ({ name, idleSecs }) => {
1186
+ this.emit({
1187
+ type: 'broker:event',
1188
+ runId,
1189
+ event: {
1190
+ kind: 'agent_idle',
1191
+ name,
1192
+ idle_secs: idleSecs,
1193
+ },
1194
+ });
1118
1195
  // Only log at 30s multiples to avoid watchdog spam
1119
1196
  const bucket = Math.floor(idleSecs / 30) * 30;
1120
1197
  if (bucket >= 30 && this.lastIdleLog.get(name) !== bucket) {
@@ -1129,19 +1206,21 @@ export class WorkflowRunner {
1129
1206
  this.unsubBrokerStderr = this.relay.onBrokerStderr((line) => {
1130
1207
  console.log(`[broker] ${line}`);
1131
1208
  });
1132
- this.log(`Creating channel: ${channel}...`);
1133
- if (isResume) {
1134
- await this.createAndJoinRelaycastChannel(channel);
1135
- }
1136
- else {
1137
- await this.createAndJoinRelaycastChannel(channel, workflow.description);
1138
- }
1139
- this.log('Channel ready');
1140
- if (isResume) {
1141
- this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
1142
- }
1143
- else {
1144
- this.postToChannel(`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`);
1209
+ if (!relaycastDisabled) {
1210
+ this.log(`Creating channel: ${channel}...`);
1211
+ if (isResume) {
1212
+ await this.createAndJoinRelaycastChannel(channel);
1213
+ }
1214
+ else {
1215
+ await this.createAndJoinRelaycastChannel(channel, workflow.description);
1216
+ }
1217
+ this.log('Channel ready');
1218
+ if (isResume) {
1219
+ this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
1220
+ }
1221
+ else {
1222
+ this.postToChannel(`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`);
1223
+ }
1145
1224
  }
1146
1225
  }
1147
1226
  const agentMap = new Map();
@@ -1154,7 +1233,11 @@ export class WorkflowRunner {
1154
1233
  }
1155
1234
  this.log(`Executing ${workflow.steps.length} steps (pattern: ${config.swarm.pattern})`);
1156
1235
  await this.executeSteps(workflow, stepStates, agentMap, config.errorHandling, runId);
1157
- const allCompleted = [...stepStates.values()].every((s) => s.row.status === 'completed' || s.row.status === 'skipped');
1236
+ const errorStrategy = config.errorHandling?.strategy ?? workflow.onError ?? 'fail-fast';
1237
+ const continueOnError = errorStrategy === 'continue' || errorStrategy === 'skip';
1238
+ const allCompleted = [...stepStates.values()].every((s) => s.row.status === 'completed' ||
1239
+ s.row.status === 'skipped' ||
1240
+ (continueOnError && s.row.status === 'failed'));
1158
1241
  if (allCompleted) {
1159
1242
  this.log('Workflow completed successfully');
1160
1243
  await this.updateRunStatus(runId, 'completed');
@@ -1175,9 +1258,18 @@ export class WorkflowRunner {
1175
1258
  await this.updateRunStatus(runId, 'failed', errorMsg);
1176
1259
  this.emit({ type: 'run:failed', runId, error: errorMsg });
1177
1260
  const outcomes = this.collectOutcomes(stepStates, workflow.steps);
1261
+ const summary = this.trajectory.buildRunSummary(outcomes);
1262
+ const confidence = this.trajectory.computeConfidence(outcomes);
1263
+ const learnings = this.trajectory.extractLearnings(outcomes);
1264
+ const challenges = this.trajectory.extractChallenges(outcomes);
1178
1265
  this.postFailureReport(workflow.name, outcomes, errorMsg);
1179
1266
  this.logRunSummary(workflow.name, outcomes, runId);
1180
- await this.trajectory.abandon(errorMsg);
1267
+ await this.trajectory.abandon(errorMsg, {
1268
+ summary,
1269
+ confidence,
1270
+ learnings,
1271
+ challenges,
1272
+ });
1181
1273
  }
1182
1274
  }
1183
1275
  catch (err) {
@@ -1185,6 +1277,19 @@ export class WorkflowRunner {
1185
1277
  const status = !isResume && this.abortController?.signal.aborted ? 'cancelled' : 'failed';
1186
1278
  await this.updateRunStatus(runId, status, errorMsg);
1187
1279
  if (status === 'cancelled') {
1280
+ // Mark any pending or in-progress steps as failed due to cancellation
1281
+ for (const [stepName, state] of stepStates) {
1282
+ if (state.row.status === 'pending' || state.row.status === 'running') {
1283
+ state.row.status = 'failed';
1284
+ state.row.error = 'Cancelled';
1285
+ await this.db.updateStep(state.row.id, {
1286
+ status: 'failed',
1287
+ error: 'Cancelled',
1288
+ updatedAt: new Date().toISOString(),
1289
+ });
1290
+ this.emit({ type: 'step:failed', runId, stepName, error: 'Cancelled' });
1291
+ }
1292
+ }
1188
1293
  this.emit({ type: 'run:cancelled', runId });
1189
1294
  this.postToChannel(`Workflow **${workflow.name}** cancelled`);
1190
1295
  await this.trajectory.abandon('Cancelled by user');
@@ -1192,10 +1297,17 @@ export class WorkflowRunner {
1192
1297
  else {
1193
1298
  this.emit({ type: 'run:failed', runId, error: errorMsg });
1194
1299
  this.postToChannel(`Workflow failed: ${errorMsg}`);
1195
- await this.trajectory.abandon(errorMsg);
1300
+ const outcomes = this.collectOutcomes(stepStates, workflow.steps);
1301
+ await this.trajectory.abandon(errorMsg, {
1302
+ summary: this.trajectory.buildRunSummary(outcomes),
1303
+ confidence: this.trajectory.computeConfidence(outcomes),
1304
+ learnings: this.trajectory.extractLearnings(outcomes),
1305
+ challenges: this.trajectory.extractChallenges(outcomes),
1306
+ });
1196
1307
  }
1197
1308
  }
1198
1309
  finally {
1310
+ this.lastFailedStepOutput.clear();
1199
1311
  for (const stream of this.ptyLogStreams.values())
1200
1312
  stream.end();
1201
1313
  this.ptyLogStreams.clear();
@@ -1207,9 +1319,11 @@ export class WorkflowRunner {
1207
1319
  if (this.relay) {
1208
1320
  this.relay.onMessageReceived = null;
1209
1321
  this.relay.onAgentSpawned = null;
1322
+ this.relay.onAgentReleased = null;
1210
1323
  this.relay.onAgentExited = null;
1211
1324
  this.relay.onAgentIdle = null;
1212
1325
  this.relay.onWorkerOutput = null;
1326
+ this.relay.onDeliveryUpdate = null;
1213
1327
  }
1214
1328
  this.lastIdleLog.clear();
1215
1329
  this.lastActivity.clear();
@@ -1461,7 +1575,7 @@ export class WorkflowRunner {
1461
1575
  async executeStep(step, stepStates, agentMap, errorHandling, runId) {
1462
1576
  // Branch: deterministic steps execute shell commands
1463
1577
  if (this.isDeterministicStep(step)) {
1464
- return this.executeDeterministicStep(step, stepStates, runId);
1578
+ return this.executeDeterministicStep(step, stepStates, runId, errorHandling);
1465
1579
  }
1466
1580
  // Branch: worktree steps set up git worktrees
1467
1581
  if (this.isWorktreeStep(step)) {
@@ -1474,42 +1588,143 @@ export class WorkflowRunner {
1474
1588
  * Execute a deterministic step (shell command).
1475
1589
  * Fast, reliable, $0 LLM cost.
1476
1590
  */
1477
- async executeDeterministicStep(step, stepStates, runId) {
1591
+ async executeDeterministicStep(step, stepStates, runId, errorHandling) {
1478
1592
  const state = stepStates.get(step.name);
1479
1593
  if (!state)
1480
1594
  throw new Error(`Step state not found: ${step.name}`);
1481
- this.checkAborted();
1482
- // Mark step as running
1483
- state.row.status = 'running';
1484
- state.row.startedAt = new Date().toISOString();
1485
- await this.db.updateStep(state.row.id, {
1486
- status: 'running',
1487
- startedAt: state.row.startedAt,
1488
- updatedAt: new Date().toISOString(),
1489
- });
1490
- this.emit({ type: 'step:started', runId, stepName: step.name });
1491
- this.postToChannel(`**[${step.name}]** Started (deterministic)`);
1492
- // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
1493
- const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
1494
- let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
1495
- // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
1496
- resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key) => {
1497
- if (key.startsWith('steps.'))
1498
- return _match; // Already handled above
1499
- const value = this.resolveDotPath(key, stepOutputContext);
1500
- return value !== undefined ? String(value) : _match;
1501
- });
1502
- // Resolve step workdir (named path reference) for deterministic steps
1503
- const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
1504
- try {
1505
- // Delegate to executor if present
1506
- if (this.executor?.executeDeterministicStep) {
1507
- const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
1508
- const failOnError = step.failOnError !== false;
1509
- if (failOnError && result.exitCode !== 0) {
1510
- throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
1595
+ const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
1596
+ const retryDelay = errorHandling?.retryDelayMs ?? 1000;
1597
+ let lastError;
1598
+ for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
1599
+ this.checkAborted();
1600
+ if (attempt > 0) {
1601
+ this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
1602
+ this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
1603
+ state.row.retryCount = attempt;
1604
+ await this.db.updateStep(state.row.id, {
1605
+ retryCount: attempt,
1606
+ updatedAt: new Date().toISOString(),
1607
+ });
1608
+ await this.delay(retryDelay);
1609
+ }
1610
+ // Mark step as running
1611
+ state.row.status = 'running';
1612
+ state.row.startedAt = new Date().toISOString();
1613
+ await this.db.updateStep(state.row.id, {
1614
+ status: 'running',
1615
+ startedAt: state.row.startedAt,
1616
+ updatedAt: new Date().toISOString(),
1617
+ });
1618
+ this.emit({ type: 'step:started', runId, stepName: step.name });
1619
+ this.postToChannel(`**[${step.name}]** Started (deterministic)`);
1620
+ // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
1621
+ const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
1622
+ let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
1623
+ // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
1624
+ resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key) => {
1625
+ if (key.startsWith('steps.'))
1626
+ return _match; // Already handled above
1627
+ const value = this.resolveDotPath(key, stepOutputContext);
1628
+ return value !== undefined ? String(value) : _match;
1629
+ });
1630
+ // Resolve step workdir (named path reference) for deterministic steps
1631
+ const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
1632
+ try {
1633
+ // Delegate to executor if present
1634
+ if (this.executor?.executeDeterministicStep) {
1635
+ const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
1636
+ const failOnError = step.failOnError !== false;
1637
+ if (failOnError && result.exitCode !== 0) {
1638
+ throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
1639
+ }
1640
+ const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
1641
+ if (step.verification) {
1642
+ this.runVerification(step.verification, output, step.name);
1643
+ }
1644
+ // Mark completed
1645
+ state.row.status = 'completed';
1646
+ state.row.output = output;
1647
+ state.row.completedAt = new Date().toISOString();
1648
+ await this.db.updateStep(state.row.id, {
1649
+ status: 'completed',
1650
+ output,
1651
+ completedAt: state.row.completedAt,
1652
+ updatedAt: new Date().toISOString(),
1653
+ });
1654
+ await this.persistStepOutput(runId, step.name, output);
1655
+ this.emit({ type: 'step:completed', runId, stepName: step.name, output });
1656
+ return;
1657
+ }
1658
+ const output = await new Promise((resolve, reject) => {
1659
+ const child = cpSpawn('sh', ['-c', resolvedCommand], {
1660
+ stdio: 'pipe',
1661
+ cwd: stepCwd,
1662
+ env: { ...process.env },
1663
+ });
1664
+ const stdoutChunks = [];
1665
+ const stderrChunks = [];
1666
+ // Wire abort signal
1667
+ const abortSignal = this.abortController?.signal;
1668
+ let abortHandler;
1669
+ if (abortSignal && !abortSignal.aborted) {
1670
+ abortHandler = () => {
1671
+ child.kill('SIGTERM');
1672
+ setTimeout(() => child.kill('SIGKILL'), 5000);
1673
+ };
1674
+ abortSignal.addEventListener('abort', abortHandler, { once: true });
1675
+ }
1676
+ // Handle timeout
1677
+ let timedOut = false;
1678
+ let timer;
1679
+ if (step.timeoutMs) {
1680
+ timer = setTimeout(() => {
1681
+ timedOut = true;
1682
+ child.kill('SIGTERM');
1683
+ setTimeout(() => child.kill('SIGKILL'), 5000);
1684
+ }, step.timeoutMs);
1685
+ }
1686
+ child.stdout?.on('data', (chunk) => {
1687
+ stdoutChunks.push(chunk.toString());
1688
+ });
1689
+ child.stderr?.on('data', (chunk) => {
1690
+ stderrChunks.push(chunk.toString());
1691
+ });
1692
+ child.on('close', (code) => {
1693
+ if (timer)
1694
+ clearTimeout(timer);
1695
+ if (abortHandler && abortSignal) {
1696
+ abortSignal.removeEventListener('abort', abortHandler);
1697
+ }
1698
+ if (abortSignal?.aborted) {
1699
+ reject(new Error(`Step "${step.name}" aborted`));
1700
+ return;
1701
+ }
1702
+ if (timedOut) {
1703
+ reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
1704
+ return;
1705
+ }
1706
+ const stdout = stdoutChunks.join('');
1707
+ const stderr = stderrChunks.join('');
1708
+ // Check exit code unless failOnError is explicitly false
1709
+ const failOnError = step.failOnError !== false;
1710
+ if (failOnError && code !== 0 && code !== null) {
1711
+ reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
1712
+ return;
1713
+ }
1714
+ resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
1715
+ });
1716
+ child.on('error', (err) => {
1717
+ if (timer)
1718
+ clearTimeout(timer);
1719
+ if (abortHandler && abortSignal) {
1720
+ abortSignal.removeEventListener('abort', abortHandler);
1721
+ }
1722
+ reject(new Error(`Failed to execute command: ${err.message}`));
1723
+ });
1724
+ });
1725
+ if (step.verification) {
1726
+ this.runVerification(step.verification, output, step.name);
1511
1727
  }
1512
- const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
1513
1728
  // Mark completed
1514
1729
  state.row.status = 'completed';
1515
1730
  state.row.output = output;
@@ -1520,97 +1735,19 @@ export class WorkflowRunner {
1520
1735
  completedAt: state.row.completedAt,
1521
1736
  updatedAt: new Date().toISOString(),
1522
1737
  });
1738
+ // Persist step output
1523
1739
  await this.persistStepOutput(runId, step.name, output);
1524
1740
  this.emit({ type: 'step:completed', runId, stepName: step.name, output });
1525
1741
  return;
1526
1742
  }
1527
- const output = await new Promise((resolve, reject) => {
1528
- const child = cpSpawn('sh', ['-c', resolvedCommand], {
1529
- stdio: 'pipe',
1530
- cwd: stepCwd,
1531
- env: { ...process.env },
1532
- });
1533
- const stdoutChunks = [];
1534
- const stderrChunks = [];
1535
- // Wire abort signal
1536
- const abortSignal = this.abortController?.signal;
1537
- let abortHandler;
1538
- if (abortSignal && !abortSignal.aborted) {
1539
- abortHandler = () => {
1540
- child.kill('SIGTERM');
1541
- setTimeout(() => child.kill('SIGKILL'), 5000);
1542
- };
1543
- abortSignal.addEventListener('abort', abortHandler, { once: true });
1544
- }
1545
- // Handle timeout
1546
- let timedOut = false;
1547
- let timer;
1548
- if (step.timeoutMs) {
1549
- timer = setTimeout(() => {
1550
- timedOut = true;
1551
- child.kill('SIGTERM');
1552
- setTimeout(() => child.kill('SIGKILL'), 5000);
1553
- }, step.timeoutMs);
1554
- }
1555
- child.stdout?.on('data', (chunk) => {
1556
- stdoutChunks.push(chunk.toString());
1557
- });
1558
- child.stderr?.on('data', (chunk) => {
1559
- stderrChunks.push(chunk.toString());
1560
- });
1561
- child.on('close', (code) => {
1562
- if (timer)
1563
- clearTimeout(timer);
1564
- if (abortHandler && abortSignal) {
1565
- abortSignal.removeEventListener('abort', abortHandler);
1566
- }
1567
- if (abortSignal?.aborted) {
1568
- reject(new Error(`Step "${step.name}" aborted`));
1569
- return;
1570
- }
1571
- if (timedOut) {
1572
- reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
1573
- return;
1574
- }
1575
- const stdout = stdoutChunks.join('');
1576
- const stderr = stderrChunks.join('');
1577
- // Check exit code unless failOnError is explicitly false
1578
- const failOnError = step.failOnError !== false;
1579
- if (failOnError && code !== 0 && code !== null) {
1580
- reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
1581
- return;
1582
- }
1583
- resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
1584
- });
1585
- child.on('error', (err) => {
1586
- if (timer)
1587
- clearTimeout(timer);
1588
- if (abortHandler && abortSignal) {
1589
- abortSignal.removeEventListener('abort', abortHandler);
1590
- }
1591
- reject(new Error(`Failed to execute command: ${err.message}`));
1592
- });
1593
- });
1594
- // Mark completed
1595
- state.row.status = 'completed';
1596
- state.row.output = output;
1597
- state.row.completedAt = new Date().toISOString();
1598
- await this.db.updateStep(state.row.id, {
1599
- status: 'completed',
1600
- output,
1601
- completedAt: state.row.completedAt,
1602
- updatedAt: new Date().toISOString(),
1603
- });
1604
- // Persist step output
1605
- await this.persistStepOutput(runId, step.name, output);
1606
- this.emit({ type: 'step:completed', runId, stepName: step.name, output });
1607
- }
1608
- catch (err) {
1609
- const errorMsg = err instanceof Error ? err.message : String(err);
1610
- this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
1611
- await this.markStepFailed(state, errorMsg, runId);
1612
- throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
1743
+ catch (err) {
1744
+ lastError = err instanceof Error ? err.message : String(err);
1745
+ }
1613
1746
  }
1747
+ const errorMsg = lastError ?? 'Unknown error';
1748
+ this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
1749
+ await this.markStepFailed(state, errorMsg, runId);
1750
+ throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
1614
1751
  }
1615
1752
  /**
1616
1753
  * Execute a worktree step (git worktree setup).
@@ -1807,8 +1944,13 @@ export class WorkflowRunner {
1807
1944
  specialistDef.constraints?.timeoutMs ??
1808
1945
  this.currentConfig?.swarm?.timeoutMs;
1809
1946
  let lastError;
1947
+ let lastExitCode;
1948
+ let lastExitSignal;
1810
1949
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
1811
1950
  this.checkAborted();
1951
+ // Reset per-attempt exit info so stale values don't leak across retries
1952
+ lastExitCode = undefined;
1953
+ lastExitSignal = undefined;
1812
1954
  if (attempt > 0) {
1813
1955
  this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
1814
1956
  this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
@@ -1850,6 +1992,15 @@ export class WorkflowRunner {
1850
1992
  // Resolve step-output variables (e.g. {{steps.plan.output}}) at execution time
1851
1993
  const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
1852
1994
  let resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
1995
+ // On retry attempts, prepend failure context so the agent knows what went wrong
1996
+ if (attempt > 0 && lastError) {
1997
+ const priorOutput = (this.lastFailedStepOutput.get(step.name) ?? '').slice(-2000);
1998
+ resolvedTask =
1999
+ `[RETRY — Attempt ${attempt + 1}/${maxRetries + 1}]\n` +
2000
+ `Previous attempt failed: ${lastError}\n` +
2001
+ (priorOutput ? `Previous output (last 2000 chars):\n${priorOutput}\n` : '') +
2002
+ `---\n${resolvedTask}`;
2003
+ }
1853
2004
  // If this is an interactive agent, append awareness of non-interactive workers
1854
2005
  // so the lead knows not to message them and to use step output chaining instead
1855
2006
  if (specialistDef.interactive !== false || ownerDef.interactive !== false) {
@@ -1884,9 +2035,12 @@ export class WorkflowRunner {
1884
2035
  this.log(`[${step.name}] Spawning owner "${effectiveOwner.name}" (cli: ${effectiveOwner.cli})${step.workdir ? ` [workdir: ${step.workdir}]` : ''}`);
1885
2036
  const resolvedStep = { ...step, task: ownerTask };
1886
2037
  const ownerStartTime = Date.now();
1887
- const output = this.executor
2038
+ const spawnResult = this.executor
1888
2039
  ? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
1889
2040
  : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs);
2041
+ const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
2042
+ lastExitCode = typeof spawnResult === 'string' ? undefined : spawnResult.exitCode;
2043
+ lastExitSignal = typeof spawnResult === 'string' ? undefined : spawnResult.exitSignal;
1890
2044
  ownerElapsed = Date.now() - ownerStartTime;
1891
2045
  this.log(`[${step.name}] Owner "${effectiveOwner.name}" exited`);
1892
2046
  if (usesOwnerFlow) {
@@ -1897,7 +2051,7 @@ export class WorkflowRunner {
1897
2051
  }
1898
2052
  // Run verification if configured
1899
2053
  if (step.verification) {
1900
- this.runVerification(step.verification, specialistOutput, step.name, resolvedTask);
2054
+ this.runVerification(step.verification, specialistOutput, step.name, effectiveOwner.interactive === false ? undefined : resolvedTask);
1901
2055
  }
1902
2056
  // Every interactive step gets a review pass; pick a dedicated reviewer when available.
1903
2057
  let combinedOutput = specialistOutput;
@@ -1918,12 +2072,16 @@ export class WorkflowRunner {
1918
2072
  });
1919
2073
  // Persist step output to disk so it survives restarts and is inspectable
1920
2074
  await this.persistStepOutput(runId, step.name, combinedOutput);
1921
- this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput });
2075
+ this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput, exitCode: lastExitCode, exitSignal: lastExitSignal });
1922
2076
  await this.trajectory?.stepCompleted(step, combinedOutput, attempt + 1);
1923
2077
  return;
1924
2078
  }
1925
2079
  catch (err) {
1926
2080
  lastError = err instanceof Error ? err.message : String(err);
2081
+ if (err instanceof SpawnExitError) {
2082
+ lastExitCode = err.exitCode;
2083
+ lastExitSignal = err.exitSignal;
2084
+ }
1927
2085
  const ownerTimedOut = usesDedicatedOwner
1928
2086
  ? /\bowner timed out\b/i.test(lastError)
1929
2087
  : /\btimed out\b/i.test(lastError) && !lastError.includes(`${step.name}-review`);
@@ -1943,7 +2101,10 @@ export class WorkflowRunner {
1943
2101
  verificationValue,
1944
2102
  });
1945
2103
  this.postToChannel(`**[${step.name}]** Failed: ${lastError ?? 'Unknown error'}`);
1946
- await this.markStepFailed(state, lastError ?? 'Unknown error', runId);
2104
+ await this.markStepFailed(state, lastError ?? 'Unknown error', runId, {
2105
+ exitCode: lastExitCode,
2106
+ exitSignal: lastExitSignal,
2107
+ });
1947
2108
  throw new Error(`Step "${step.name}" failed after ${maxRetries} retries: ${lastError ?? 'Unknown error'}`);
1948
2109
  }
1949
2110
  injectStepOwnerContract(step, resolvedTask, ownerDef, specialistDef) {
@@ -2058,10 +2219,10 @@ export class WorkflowRunner {
2058
2219
  });
2059
2220
  const workerSettled = workerPromise.catch(() => undefined);
2060
2221
  workerPromise
2061
- .then((output) => {
2222
+ .then((result) => {
2062
2223
  workerReleased = true;
2063
2224
  this.postToChannel(`**[${step.name}]** Worker \`${workerRuntimeName}\` exited`);
2064
- if (step.verification?.type === 'output_contains' && output.includes(step.verification.value)) {
2225
+ if (step.verification?.type === 'output_contains' && result.output.includes(step.verification.value)) {
2065
2226
  this.postToChannel(`**[${step.name}]** Verification gate observed: output contains ${JSON.stringify(step.verification.value)}`);
2066
2227
  }
2067
2228
  })
@@ -2080,7 +2241,7 @@ export class WorkflowRunner {
2080
2241
  this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
2081
2242
  const ownerStartTime = Date.now();
2082
2243
  try {
2083
- const ownerOutput = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
2244
+ const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
2084
2245
  agentNameSuffix: 'owner',
2085
2246
  onSpawned: ({ actualName }) => {
2086
2247
  this.supervisedRuntimeAgents.set(actualName, {
@@ -2094,9 +2255,10 @@ export class WorkflowRunner {
2094
2255
  },
2095
2256
  });
2096
2257
  const ownerElapsed = Date.now() - ownerStartTime;
2258
+ const ownerOutput = ownerResultObj.output;
2097
2259
  this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
2098
2260
  this.assertOwnerCompletionMarker(step, ownerOutput, supervisorTask);
2099
- const specialistOutput = await workerPromise;
2261
+ const specialistOutput = (await workerPromise).output;
2100
2262
  return { specialistOutput, ownerOutput, ownerElapsed };
2101
2263
  }
2102
2264
  catch (error) {
@@ -2307,7 +2469,7 @@ export class WorkflowRunner {
2307
2469
  })();
2308
2470
  };
2309
2471
  try {
2310
- reviewOutput = await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
2472
+ await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
2311
2473
  onSpawned: ({ agent }) => {
2312
2474
  reviewerHandle = agent;
2313
2475
  },
@@ -2502,7 +2664,7 @@ export class WorkflowRunner {
2502
2664
  const stdoutChunks = [];
2503
2665
  const stderrChunks = [];
2504
2666
  try {
2505
- const output = await new Promise((resolve, reject) => {
2667
+ const { stdout: output, exitCode, exitSignal } = await new Promise((resolve, reject) => {
2506
2668
  const child = cpSpawn(cmd, args, {
2507
2669
  stdio: ['ignore', 'pipe', 'pipe'],
2508
2670
  cwd: this.resolveAgentCwd(agentDef),
@@ -2560,7 +2722,7 @@ export class WorkflowRunner {
2560
2722
  setTimeout(() => child.kill('SIGKILL'), 5000);
2561
2723
  }, timeoutMs);
2562
2724
  }
2563
- child.on('close', (code) => {
2725
+ child.on('close', (code, signal) => {
2564
2726
  clearInterval(heartbeat);
2565
2727
  if (timer)
2566
2728
  clearTimeout(timer);
@@ -2578,10 +2740,14 @@ export class WorkflowRunner {
2578
2740
  }
2579
2741
  if (code !== 0 && code !== null) {
2580
2742
  const stderr = stderrChunks.join('');
2581
- reject(new Error(`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
2743
+ reject(new SpawnExitError(`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`, code, signal));
2582
2744
  return;
2583
2745
  }
2584
- resolve(stdout);
2746
+ resolve({
2747
+ stdout,
2748
+ exitCode: code ?? undefined,
2749
+ exitSignal: signal ?? undefined,
2750
+ });
2585
2751
  });
2586
2752
  child.on('error', (err) => {
2587
2753
  clearInterval(heartbeat);
@@ -2593,9 +2759,11 @@ export class WorkflowRunner {
2593
2759
  reject(new Error(`Failed to spawn ${cmd}: ${err.message}`));
2594
2760
  });
2595
2761
  });
2596
- return output;
2762
+ return { output, exitCode, exitSignal };
2597
2763
  }
2598
2764
  finally {
2765
+ const combinedOutput = stdoutChunks.join('') + stderrChunks.join('');
2766
+ this.lastFailedStepOutput.set(step.name, combinedOutput);
2599
2767
  stopHeartbeat?.();
2600
2768
  logStream.end();
2601
2769
  this.unregisterWorker(agentName);
@@ -2755,10 +2923,14 @@ export class WorkflowRunner {
2755
2923
  throw new Error(`Step "${step.name}" timed out after ${timeoutMs ?? 'unknown'}ms`);
2756
2924
  }
2757
2925
  }
2926
+ if (exitResult === 'force-released') {
2927
+ throw new Error(`Step "${step.name}" failed — agent was force-released after exhausting idle nudges without completing`);
2928
+ }
2758
2929
  }
2759
2930
  finally {
2760
2931
  // Snapshot PTY chunks before cleanup — we need them for output reading below
2761
2932
  ptyChunks = this.ptyOutputBuffers.get(agentName) ?? [];
2933
+ this.lastFailedStepOutput.set(step.name, ptyChunks.join(''));
2762
2934
  // Always clean up PTY resources — prevents fd leaks if spawnPty or waitForExit throws
2763
2935
  stopHeartbeat?.();
2764
2936
  this.activeAgentHandles.delete(agentName);
@@ -2784,10 +2956,14 @@ export class WorkflowRunner {
2784
2956
  : exitResult === 'timeout'
2785
2957
  ? 'Agent completed (released after idle timeout)'
2786
2958
  : exitResult === 'released'
2787
- ? 'Agent completed (force-released after idle nudging)'
2959
+ ? 'Agent completed (idle treated as done)'
2788
2960
  : `Agent exited (${exitResult})`;
2789
2961
  }
2790
- return output;
2962
+ return {
2963
+ output,
2964
+ exitCode: agent?.exitCode,
2965
+ exitSignal: agent?.exitSignal,
2966
+ };
2791
2967
  }
2792
2968
  // ── Idle nudging ────────────────────────────────────────────────────────
2793
2969
  /** Patterns where a hub agent coordinates spoke agents. */
@@ -2858,7 +3034,7 @@ export class WorkflowRunner {
2858
3034
  return exitResult;
2859
3035
  }
2860
3036
  // Agent is still running after the window expired.
2861
- if (remaining !== undefined && Date.now() - startTime >= remaining) {
3037
+ if (timeoutMs !== undefined && Date.now() - startTime >= timeoutMs) {
2862
3038
  return 'timeout';
2863
3039
  }
2864
3040
  // Nudge if we haven't exhausted the limit
@@ -2873,7 +3049,7 @@ export class WorkflowRunner {
2873
3049
  this.postToChannel(`**[${step.name}]** Agent \`${agent.name}\` still idle after ${nudgeCount} nudge(s) — force-releasing`);
2874
3050
  this.emit({ type: 'step:force-released', runId: this.currentRunId ?? '', stepName: step.name });
2875
3051
  await agent.release();
2876
- return 'released';
3052
+ return 'force-released';
2877
3053
  }
2878
3054
  }
2879
3055
  /**
@@ -2988,7 +3164,7 @@ export class WorkflowRunner {
2988
3164
  }
2989
3165
  await this.db.updateRun(runId, patch);
2990
3166
  }
2991
- async markStepFailed(state, error, runId) {
3167
+ async markStepFailed(state, error, runId, exitInfo) {
2992
3168
  state.row.status = 'failed';
2993
3169
  state.row.error = error;
2994
3170
  state.row.completedAt = new Date().toISOString();
@@ -2998,7 +3174,14 @@ export class WorkflowRunner {
2998
3174
  completedAt: state.row.completedAt,
2999
3175
  updatedAt: new Date().toISOString(),
3000
3176
  });
3001
- this.emit({ type: 'step:failed', runId, stepName: state.row.stepName, error });
3177
+ this.emit({
3178
+ type: 'step:failed',
3179
+ runId,
3180
+ stepName: state.row.stepName,
3181
+ error,
3182
+ exitCode: exitInfo?.exitCode,
3183
+ exitSignal: exitInfo?.exitSignal,
3184
+ });
3002
3185
  }
3003
3186
  async markDownstreamSkipped(failedStepName, allSteps, stepStates, runId) {
3004
3187
  const queue = [failedStepName];