agent-relay 3.1.19 → 3.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +13 -1
  2. package/bin/agent-relay-broker-darwin-arm64 +0 -0
  3. package/bin/agent-relay-broker-darwin-x64 +0 -0
  4. package/bin/agent-relay-broker-linux-arm64 +0 -0
  5. package/bin/agent-relay-broker-linux-x64 +0 -0
  6. package/dist/index.cjs +435 -190
  7. package/dist/src/cli/bootstrap.js +0 -15
  8. package/dist/src/cli/bootstrap.js.map +1 -1
  9. package/dist/src/cli/commands/agent-management.d.ts +1 -0
  10. package/dist/src/cli/commands/agent-management.d.ts.map +1 -1
  11. package/dist/src/cli/commands/agent-management.js +235 -16
  12. package/dist/src/cli/commands/agent-management.js.map +1 -1
  13. package/dist/src/cli/commands/core.js +1 -1
  14. package/dist/src/cli/commands/core.js.map +1 -1
  15. package/dist/src/cli/index.d.ts.map +1 -1
  16. package/dist/src/cli/index.js +13 -1
  17. package/dist/src/cli/index.js.map +1 -1
  18. package/dist/src/cli/lib/broker-lifecycle.d.ts.map +1 -1
  19. package/dist/src/cli/lib/broker-lifecycle.js +3 -5
  20. package/dist/src/cli/lib/broker-lifecycle.js.map +1 -1
  21. package/dist/src/cli/lib/connect-daytona.js +2 -2
  22. package/dist/src/cli/lib/connect-daytona.js.map +1 -1
  23. package/install.sh +9 -3
  24. package/package.json +13 -13
  25. package/packages/acp-bridge/package.json +2 -2
  26. package/packages/config/package.json +1 -1
  27. package/packages/hooks/package.json +4 -4
  28. package/packages/memory/package.json +2 -2
  29. package/packages/openclaw/dist/cli.js +79 -2
  30. package/packages/openclaw/dist/cli.js.map +1 -1
  31. package/packages/openclaw/dist/config.d.ts +28 -1
  32. package/packages/openclaw/dist/config.d.ts.map +1 -1
  33. package/packages/openclaw/dist/config.js +145 -0
  34. package/packages/openclaw/dist/config.js.map +1 -1
  35. package/packages/openclaw/dist/index.d.ts +2 -2
  36. package/packages/openclaw/dist/index.d.ts.map +1 -1
  37. package/packages/openclaw/dist/index.js +1 -1
  38. package/packages/openclaw/dist/index.js.map +1 -1
  39. package/packages/openclaw/dist/setup.d.ts.map +1 -1
  40. package/packages/openclaw/dist/setup.js +24 -1
  41. package/packages/openclaw/dist/setup.js.map +1 -1
  42. package/packages/openclaw/dist/types.d.ts +23 -0
  43. package/packages/openclaw/dist/types.d.ts.map +1 -1
  44. package/packages/openclaw/package.json +2 -2
  45. package/packages/openclaw/skill/SKILL.md +46 -0
  46. package/packages/openclaw/src/cli.ts +90 -2
  47. package/packages/openclaw/src/config.ts +165 -1
  48. package/packages/openclaw/src/index.ts +7 -1
  49. package/packages/openclaw/src/setup.ts +26 -1
  50. package/packages/openclaw/src/types.ts +25 -0
  51. package/packages/policy/package.json +2 -2
  52. package/packages/sdk/dist/__tests__/integration.test.js +35 -0
  53. package/packages/sdk/dist/__tests__/integration.test.js.map +1 -1
  54. package/packages/sdk/dist/client.d.ts +9 -0
  55. package/packages/sdk/dist/client.d.ts.map +1 -1
  56. package/packages/sdk/dist/client.js +33 -22
  57. package/packages/sdk/dist/client.js.map +1 -1
  58. package/packages/sdk/dist/protocol.d.ts +1 -0
  59. package/packages/sdk/dist/protocol.d.ts.map +1 -1
  60. package/packages/sdk/dist/relay.d.ts +8 -0
  61. package/packages/sdk/dist/relay.d.ts.map +1 -1
  62. package/packages/sdk/dist/relay.js +50 -5
  63. package/packages/sdk/dist/relay.js.map +1 -1
  64. package/packages/sdk/dist/workflows/cli.js +2 -0
  65. package/packages/sdk/dist/workflows/cli.js.map +1 -1
  66. package/packages/sdk/dist/workflows/runner.d.ts +11 -0
  67. package/packages/sdk/dist/workflows/runner.d.ts.map +1 -1
  68. package/packages/sdk/dist/workflows/runner.js +350 -167
  69. package/packages/sdk/dist/workflows/runner.js.map +1 -1
  70. package/packages/sdk/dist/workflows/trajectory.d.ts +6 -1
  71. package/packages/sdk/dist/workflows/trajectory.d.ts.map +1 -1
  72. package/packages/sdk/dist/workflows/trajectory.js +16 -2
  73. package/packages/sdk/dist/workflows/trajectory.js.map +1 -1
  74. package/packages/sdk/package.json +2 -2
  75. package/packages/sdk/src/__tests__/integration.test.ts +49 -0
  76. package/packages/sdk/src/__tests__/orchestration-upgrades.test.ts +50 -1
  77. package/packages/sdk/src/client.ts +44 -21
  78. package/packages/sdk/src/protocol.ts +1 -1
  79. package/packages/sdk/src/relay.ts +70 -5
  80. package/packages/sdk/src/workflows/cli.ts +2 -0
  81. package/packages/sdk/src/workflows/runner.ts +414 -185
  82. package/packages/sdk/src/workflows/trajectory.ts +22 -2
  83. package/packages/sdk-py/pyproject.toml +1 -1
  84. package/packages/sdk-py/src/agent_relay/client.py +18 -1
  85. package/packages/sdk-py/src/agent_relay/relay.py +4 -0
  86. package/packages/sdk-py/src/agent_relay/types.py +4 -0
  87. package/packages/telemetry/package.json +1 -1
  88. package/packages/trajectory/package.json +2 -2
  89. package/packages/user-directory/package.json +2 -2
  90. package/packages/utils/package.json +2 -2
@@ -13,6 +13,7 @@ import path from 'node:path';
13
13
 
14
14
  import { parse as parseYaml } from 'yaml';
15
15
  import { stripAnsi as stripAnsiFn } from '../pty.js';
16
+ import type { BrokerEvent } from '../protocol.js';
16
17
 
17
18
  import {
18
19
  loadCustomSteps,
@@ -64,6 +65,25 @@ export interface WorkflowDb {
64
65
  getStepsByRunId(runId: string): Promise<WorkflowStepRow[]>;
65
66
  }
66
67
 
68
+ /** Result returned by spawnAndWait / execNonInteractive with optional process exit info. */
69
+ interface SpawnResult {
70
+ output: string;
71
+ exitCode?: number;
72
+ exitSignal?: string;
73
+ }
74
+
75
+ /** Error carrying exit code/signal from a failed subprocess spawn. */
76
+ class SpawnExitError extends Error {
77
+ exitCode?: number;
78
+ exitSignal?: string;
79
+ constructor(message: string, exitCode?: number, exitSignal?: string | null) {
80
+ super(message);
81
+ this.name = 'SpawnExitError';
82
+ this.exitCode = exitCode;
83
+ this.exitSignal = exitSignal ?? undefined;
84
+ }
85
+ }
86
+
67
87
  // ── Events ──────────────────────────────────────────────────────────────────
68
88
 
69
89
  export type WorkflowEvent =
@@ -71,6 +91,7 @@ export type WorkflowEvent =
71
91
  | { type: 'run:completed'; runId: string }
72
92
  | { type: 'run:failed'; runId: string; error: string }
73
93
  | { type: 'run:cancelled'; runId: string }
94
+ | { type: 'broker:event'; runId: string; event: BrokerEvent }
74
95
  | { type: 'step:started'; runId: string; stepName: string }
75
96
  | {
76
97
  type: 'step:owner-assigned';
@@ -79,7 +100,7 @@ export type WorkflowEvent =
79
100
  ownerName: string;
80
101
  specialistName: string;
81
102
  }
82
- | { type: 'step:completed'; runId: string; stepName: string; output?: string }
103
+ | { type: 'step:completed'; runId: string; stepName: string; output?: string; exitCode?: number; exitSignal?: string }
83
104
  | {
84
105
  type: 'step:review-completed';
85
106
  runId: string;
@@ -88,7 +109,7 @@ export type WorkflowEvent =
88
109
  decision: 'approved' | 'rejected';
89
110
  }
90
111
  | { type: 'step:owner-timeout'; runId: string; stepName: string; ownerName: string }
91
- | { type: 'step:failed'; runId: string; stepName: string; error: string }
112
+ | { type: 'step:failed'; runId: string; stepName: string; error: string; exitCode?: number; exitSignal?: string }
92
113
  | { type: 'step:skipped'; runId: string; stepName: string }
93
114
  | { type: 'step:retrying'; runId: string; stepName: string; attempt: number }
94
115
  | { type: 'step:nudged'; runId: string; stepName: string; nudgeCount: number }
@@ -223,6 +244,8 @@ export class WorkflowRunner {
223
244
 
224
245
  // PTY-based output capture: accumulate terminal output per-agent
225
246
  private readonly ptyOutputBuffers = new Map<string, string[]>();
247
+ /** Snapshot of PTY output from the most recent failed attempt, keyed by step name. */
248
+ private readonly lastFailedStepOutput = new Map<string, string>();
226
249
  private readonly ptyListeners = new Map<string, (chunk: string) => void>();
227
250
  private readonly ptyLogStreams = new Map<string, WriteStream>();
228
251
  /** Path to workers.json so `agents:kill` can find workflow-spawned agents */
@@ -1142,8 +1165,15 @@ export class WorkflowRunner {
1142
1165
  workflowName?: string,
1143
1166
  vars?: VariableContext
1144
1167
  ): Promise<WorkflowRunRow> {
1168
+ // Set up abort controller early so callers can abort() even during setup
1169
+ this.abortController = new AbortController();
1170
+ this.paused = false;
1171
+
1145
1172
  const resolved = vars ? this.resolveVariables(config, vars) : config;
1146
1173
 
1174
+ // Validate config (catches cycles, missing deps, invalid steps, etc.)
1175
+ this.validateConfig(resolved);
1176
+
1147
1177
  // Resolve and validate named paths from the top-level `paths` config
1148
1178
  const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
1149
1179
  if (pathResult.errors.length > 0) {
@@ -1227,6 +1257,10 @@ export class WorkflowRunner {
1227
1257
 
1228
1258
  /** Resume a previously paused or partially completed run. */
1229
1259
  async resume(runId: string, vars?: VariableContext): Promise<WorkflowRunRow> {
1260
+ // Set up abort controller early so callers can abort() even during setup
1261
+ this.abortController = new AbortController();
1262
+ this.paused = false;
1263
+
1230
1264
  const run = await this.db.getRun(runId);
1231
1265
  if (!run) {
1232
1266
  throw new Error(`Run "${runId}" not found`);
@@ -1289,9 +1323,7 @@ export class WorkflowRunner {
1289
1323
  const { run, workflow, config, stepStates, isResume } = input;
1290
1324
  const runId = run.id;
1291
1325
 
1292
- // Start execution
1293
- this.abortController = new AbortController();
1294
- this.paused = false;
1326
+ // Start execution (abortController already set by execute()/resume())
1295
1327
  this.currentConfig = config;
1296
1328
  this.currentRunId = runId;
1297
1329
  this.runStartTime = Date.now();
@@ -1336,15 +1368,22 @@ export class WorkflowRunner {
1336
1368
  config.swarm.channel = channel;
1337
1369
  await this.db.updateRun(runId, { config });
1338
1370
  }
1371
+ const relaycastDisabled =
1372
+ this.relayOptions.env?.AGENT_RELAY_WORKFLOW_DISABLE_RELAYCAST === '1';
1373
+ const requiresBroker =
1374
+ !this.executor &&
1375
+ workflow.steps.some((step) => step.type !== 'deterministic' && step.type !== 'worktree');
1339
1376
  // Skip broker/relay init when an external executor handles agent spawning
1340
- if (!this.executor) {
1341
- this.log('Resolving Relaycast API key...');
1342
- await this.ensureRelaycastApiKey(channel);
1343
- this.log('API key resolved');
1344
- if (this.relayApiKeyAutoCreated && this.relayApiKey) {
1345
- this.log(`Workspace created — follow this run in Relaycast:`);
1346
- this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
1347
- this.log(` Channel: ${channel}`);
1377
+ if (requiresBroker) {
1378
+ if (!relaycastDisabled) {
1379
+ this.log('Resolving Relaycast API key...');
1380
+ await this.ensureRelaycastApiKey(channel);
1381
+ this.log('API key resolved');
1382
+ if (this.relayApiKeyAutoCreated && this.relayApiKey) {
1383
+ this.log(`Workspace created — follow this run in Relaycast:`);
1384
+ this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
1385
+ this.log(` Channel: ${channel}`);
1386
+ }
1348
1387
  }
1349
1388
 
1350
1389
  this.log('Starting broker...');
@@ -1356,7 +1395,7 @@ export class WorkflowRunner {
1356
1395
  this.relay = new AgentRelay({
1357
1396
  ...this.relayOptions,
1358
1397
  brokerName,
1359
- channels: [channel],
1398
+ channels: relaycastDisabled ? [] : [channel],
1360
1399
  env: this.getRelayEnv(),
1361
1400
  // Workflows spawn agents across multiple waves; each spawn requires a PTY +
1362
1401
  // Relaycast registration. 60s is too tight when the broker is saturated with
@@ -1412,6 +1451,18 @@ export class WorkflowRunner {
1412
1451
 
1413
1452
  // Wire relay event hooks for rich console logging
1414
1453
  this.relay.onMessageReceived = (msg) => {
1454
+ this.emit({
1455
+ type: 'broker:event',
1456
+ runId,
1457
+ event: {
1458
+ kind: 'relay_inbound',
1459
+ event_id: msg.eventId,
1460
+ from: msg.from,
1461
+ target: msg.to,
1462
+ body: msg.text,
1463
+ thread_id: msg.threadId,
1464
+ } as BrokerEvent,
1465
+ });
1415
1466
  const body = msg.text.length > 120 ? msg.text.slice(0, 117) + '...' : msg.text;
1416
1467
  const fromShort = msg.from.replace(/-[a-f0-9]{6,}$/, '');
1417
1468
  const toShort = msg.to.replace(/-[a-f0-9]{6,}$/, '');
@@ -1429,13 +1480,43 @@ export class WorkflowRunner {
1429
1480
  };
1430
1481
 
1431
1482
  this.relay.onAgentSpawned = (agent) => {
1483
+ this.emit({
1484
+ type: 'broker:event',
1485
+ runId,
1486
+ event: {
1487
+ kind: 'agent_spawned',
1488
+ name: agent.name,
1489
+ runtime: agent.runtime,
1490
+ } as BrokerEvent,
1491
+ });
1432
1492
  // Skip agents already managed by step execution
1433
1493
  if (!this.activeAgentHandles.has(agent.name)) {
1434
1494
  this.log(`[spawned] ${agent.name} (${agent.runtime})`);
1435
1495
  }
1436
1496
  };
1437
1497
 
1498
+ this.relay.onAgentReleased = (agent) => {
1499
+ this.emit({
1500
+ type: 'broker:event',
1501
+ runId,
1502
+ event: {
1503
+ kind: 'agent_released',
1504
+ name: agent.name,
1505
+ } as BrokerEvent,
1506
+ });
1507
+ };
1508
+
1438
1509
  this.relay.onAgentExited = (agent) => {
1510
+ this.emit({
1511
+ type: 'broker:event',
1512
+ runId,
1513
+ event: {
1514
+ kind: 'agent_exited',
1515
+ name: agent.name,
1516
+ code: agent.exitCode,
1517
+ signal: agent.exitSignal,
1518
+ } as BrokerEvent,
1519
+ });
1439
1520
  this.lastActivity.delete(agent.name);
1440
1521
  this.lastIdleLog.delete(agent.name);
1441
1522
  if (!this.activeAgentHandles.has(agent.name)) {
@@ -1443,7 +1524,20 @@ export class WorkflowRunner {
1443
1524
  }
1444
1525
  };
1445
1526
 
1527
+ this.relay.onDeliveryUpdate = (event) => {
1528
+ this.emit({ type: 'broker:event', runId, event });
1529
+ };
1530
+
1446
1531
  this.relay.onAgentIdle = ({ name, idleSecs }) => {
1532
+ this.emit({
1533
+ type: 'broker:event',
1534
+ runId,
1535
+ event: {
1536
+ kind: 'agent_idle',
1537
+ name,
1538
+ idle_secs: idleSecs,
1539
+ } as BrokerEvent,
1540
+ });
1447
1541
  // Only log at 30s multiples to avoid watchdog spam
1448
1542
  const bucket = Math.floor(idleSecs / 30) * 30;
1449
1543
  if (bucket >= 30 && this.lastIdleLog.get(name) !== bucket) {
@@ -1461,20 +1555,22 @@ export class WorkflowRunner {
1461
1555
  console.log(`[broker] ${line}`);
1462
1556
  });
1463
1557
 
1464
- this.log(`Creating channel: ${channel}...`);
1465
- if (isResume) {
1466
- await this.createAndJoinRelaycastChannel(channel);
1467
- } else {
1468
- await this.createAndJoinRelaycastChannel(channel, workflow.description);
1469
- }
1470
- this.log('Channel ready');
1558
+ if (!relaycastDisabled) {
1559
+ this.log(`Creating channel: ${channel}...`);
1560
+ if (isResume) {
1561
+ await this.createAndJoinRelaycastChannel(channel);
1562
+ } else {
1563
+ await this.createAndJoinRelaycastChannel(channel, workflow.description);
1564
+ }
1565
+ this.log('Channel ready');
1471
1566
 
1472
- if (isResume) {
1473
- this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
1474
- } else {
1475
- this.postToChannel(
1476
- `Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`
1477
- );
1567
+ if (isResume) {
1568
+ this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
1569
+ } else {
1570
+ this.postToChannel(
1571
+ `Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`
1572
+ );
1573
+ }
1478
1574
  }
1479
1575
  }
1480
1576
 
@@ -1491,8 +1587,15 @@ export class WorkflowRunner {
1491
1587
  this.log(`Executing ${workflow.steps.length} steps (pattern: ${config.swarm.pattern})`);
1492
1588
  await this.executeSteps(workflow, stepStates, agentMap, config.errorHandling, runId);
1493
1589
 
1590
+ const errorStrategy =
1591
+ config.errorHandling?.strategy ?? workflow.onError ?? 'fail-fast';
1592
+ const continueOnError =
1593
+ errorStrategy === 'continue' || errorStrategy === 'skip';
1494
1594
  const allCompleted = [...stepStates.values()].every(
1495
- (s) => s.row.status === 'completed' || s.row.status === 'skipped'
1595
+ (s) =>
1596
+ s.row.status === 'completed' ||
1597
+ s.row.status === 'skipped' ||
1598
+ (continueOnError && s.row.status === 'failed')
1496
1599
  );
1497
1600
 
1498
1601
  if (allCompleted) {
@@ -1517,9 +1620,18 @@ export class WorkflowRunner {
1517
1620
  this.emit({ type: 'run:failed', runId, error: errorMsg });
1518
1621
 
1519
1622
  const outcomes = this.collectOutcomes(stepStates, workflow.steps);
1623
+ const summary = this.trajectory.buildRunSummary(outcomes);
1624
+ const confidence = this.trajectory.computeConfidence(outcomes);
1625
+ const learnings = this.trajectory.extractLearnings(outcomes);
1626
+ const challenges = this.trajectory.extractChallenges(outcomes);
1520
1627
  this.postFailureReport(workflow.name, outcomes, errorMsg);
1521
1628
  this.logRunSummary(workflow.name, outcomes, runId);
1522
- await this.trajectory.abandon(errorMsg);
1629
+ await this.trajectory.abandon(errorMsg, {
1630
+ summary,
1631
+ confidence,
1632
+ learnings,
1633
+ challenges,
1634
+ });
1523
1635
  }
1524
1636
  } catch (err) {
1525
1637
  const errorMsg = err instanceof Error ? err.message : String(err);
@@ -1528,15 +1640,35 @@ export class WorkflowRunner {
1528
1640
  await this.updateRunStatus(runId, status, errorMsg);
1529
1641
 
1530
1642
  if (status === 'cancelled') {
1643
+ // Mark any pending or in-progress steps as failed due to cancellation
1644
+ for (const [stepName, state] of stepStates) {
1645
+ if (state.row.status === 'pending' || state.row.status === 'running') {
1646
+ state.row.status = 'failed';
1647
+ state.row.error = 'Cancelled';
1648
+ await this.db.updateStep(state.row.id, {
1649
+ status: 'failed',
1650
+ error: 'Cancelled',
1651
+ updatedAt: new Date().toISOString(),
1652
+ });
1653
+ this.emit({ type: 'step:failed', runId, stepName, error: 'Cancelled' });
1654
+ }
1655
+ }
1531
1656
  this.emit({ type: 'run:cancelled', runId });
1532
1657
  this.postToChannel(`Workflow **${workflow.name}** cancelled`);
1533
1658
  await this.trajectory.abandon('Cancelled by user');
1534
1659
  } else {
1535
1660
  this.emit({ type: 'run:failed', runId, error: errorMsg });
1536
1661
  this.postToChannel(`Workflow failed: ${errorMsg}`);
1537
- await this.trajectory.abandon(errorMsg);
1662
+ const outcomes = this.collectOutcomes(stepStates, workflow.steps);
1663
+ await this.trajectory.abandon(errorMsg, {
1664
+ summary: this.trajectory.buildRunSummary(outcomes),
1665
+ confidence: this.trajectory.computeConfidence(outcomes),
1666
+ learnings: this.trajectory.extractLearnings(outcomes),
1667
+ challenges: this.trajectory.extractChallenges(outcomes),
1668
+ });
1538
1669
  }
1539
1670
  } finally {
1671
+ this.lastFailedStepOutput.clear();
1540
1672
  for (const stream of this.ptyLogStreams.values()) stream.end();
1541
1673
  this.ptyLogStreams.clear();
1542
1674
  this.ptyOutputBuffers.clear();
@@ -1549,9 +1681,11 @@ export class WorkflowRunner {
1549
1681
  if (this.relay) {
1550
1682
  this.relay.onMessageReceived = null;
1551
1683
  this.relay.onAgentSpawned = null;
1684
+ this.relay.onAgentReleased = null;
1552
1685
  this.relay.onAgentExited = null;
1553
1686
  this.relay.onAgentIdle = null;
1554
1687
  this.relay.onWorkerOutput = null;
1688
+ this.relay.onDeliveryUpdate = null;
1555
1689
  }
1556
1690
  this.lastIdleLog.clear();
1557
1691
  this.lastActivity.clear();
@@ -1867,7 +2001,7 @@ export class WorkflowRunner {
1867
2001
  ): Promise<void> {
1868
2002
  // Branch: deterministic steps execute shell commands
1869
2003
  if (this.isDeterministicStep(step)) {
1870
- return this.executeDeterministicStep(step, stepStates, runId);
2004
+ return this.executeDeterministicStep(step, stepStates, runId, errorHandling);
1871
2005
  }
1872
2006
 
1873
2007
  // Branch: worktree steps set up git worktrees
@@ -1886,167 +2020,199 @@ export class WorkflowRunner {
1886
2020
  private async executeDeterministicStep(
1887
2021
  step: WorkflowStep,
1888
2022
  stepStates: Map<string, StepState>,
1889
- runId: string
2023
+ runId: string,
2024
+ errorHandling: ErrorHandlingConfig | undefined
1890
2025
  ): Promise<void> {
1891
2026
  const state = stepStates.get(step.name);
1892
2027
  if (!state) throw new Error(`Step state not found: ${step.name}`);
1893
2028
 
1894
- this.checkAborted();
1895
-
1896
- // Mark step as running
1897
- state.row.status = 'running';
1898
- state.row.startedAt = new Date().toISOString();
1899
- await this.db.updateStep(state.row.id, {
1900
- status: 'running',
1901
- startedAt: state.row.startedAt,
1902
- updatedAt: new Date().toISOString(),
1903
- });
1904
- this.emit({ type: 'step:started', runId, stepName: step.name });
1905
- this.postToChannel(`**[${step.name}]** Started (deterministic)`);
1906
-
1907
- // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
1908
- const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
1909
- let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
1910
-
1911
- // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
1912
- resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key: string) => {
1913
- if (key.startsWith('steps.')) return _match; // Already handled above
1914
- const value = this.resolveDotPath(key, stepOutputContext);
1915
- return value !== undefined ? String(value) : _match;
1916
- });
1917
-
1918
- // Resolve step workdir (named path reference) for deterministic steps
1919
- const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
2029
+ const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
2030
+ const retryDelay = errorHandling?.retryDelayMs ?? 1000;
2031
+ let lastError: string | undefined;
1920
2032
 
1921
- try {
1922
- // Delegate to executor if present
1923
- if (this.executor?.executeDeterministicStep) {
1924
- const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
1925
- const failOnError = step.failOnError !== false;
1926
- if (failOnError && result.exitCode !== 0) {
1927
- throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
1928
- }
1929
- const output =
1930
- step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
2033
+ for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
2034
+ this.checkAborted();
1931
2035
 
1932
- // Mark completed
1933
- state.row.status = 'completed';
1934
- state.row.output = output;
1935
- state.row.completedAt = new Date().toISOString();
2036
+ if (attempt > 0) {
2037
+ this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
2038
+ this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
2039
+ state.row.retryCount = attempt;
1936
2040
  await this.db.updateStep(state.row.id, {
1937
- status: 'completed',
1938
- output,
1939
- completedAt: state.row.completedAt,
2041
+ retryCount: attempt,
1940
2042
  updatedAt: new Date().toISOString(),
1941
2043
  });
1942
- await this.persistStepOutput(runId, step.name, output);
1943
- this.emit({ type: 'step:completed', runId, stepName: step.name, output });
1944
- return;
2044
+ await this.delay(retryDelay);
1945
2045
  }
1946
2046
 
1947
- const output = await new Promise<string>((resolve, reject) => {
1948
- const child = cpSpawn('sh', ['-c', resolvedCommand], {
1949
- stdio: 'pipe',
1950
- cwd: stepCwd,
1951
- env: { ...process.env },
1952
- });
2047
+ // Mark step as running
2048
+ state.row.status = 'running';
2049
+ state.row.startedAt = new Date().toISOString();
2050
+ await this.db.updateStep(state.row.id, {
2051
+ status: 'running',
2052
+ startedAt: state.row.startedAt,
2053
+ updatedAt: new Date().toISOString(),
2054
+ });
2055
+ this.emit({ type: 'step:started', runId, stepName: step.name });
2056
+ this.postToChannel(`**[${step.name}]** Started (deterministic)`);
2057
+
2058
+ // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
2059
+ const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
2060
+ let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
2061
+
2062
+ // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
2063
+ resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key: string) => {
2064
+ if (key.startsWith('steps.')) return _match; // Already handled above
2065
+ const value = this.resolveDotPath(key, stepOutputContext);
2066
+ return value !== undefined ? String(value) : _match;
2067
+ });
1953
2068
 
1954
- const stdoutChunks: string[] = [];
1955
- const stderrChunks: string[] = [];
2069
+ // Resolve step workdir (named path reference) for deterministic steps
2070
+ const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
1956
2071
 
1957
- // Wire abort signal
1958
- const abortSignal = this.abortController?.signal;
1959
- let abortHandler: (() => void) | undefined;
1960
- if (abortSignal && !abortSignal.aborted) {
1961
- abortHandler = () => {
1962
- child.kill('SIGTERM');
1963
- setTimeout(() => child.kill('SIGKILL'), 5000);
1964
- };
1965
- abortSignal.addEventListener('abort', abortHandler, { once: true });
1966
- }
2072
+ try {
2073
+ // Delegate to executor if present
2074
+ if (this.executor?.executeDeterministicStep) {
2075
+ const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
2076
+ const failOnError = step.failOnError !== false;
2077
+ if (failOnError && result.exitCode !== 0) {
2078
+ throw new Error(
2079
+ `Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`
2080
+ );
2081
+ }
2082
+ const output =
2083
+ step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
2084
+ if (step.verification) {
2085
+ this.runVerification(step.verification, output, step.name);
2086
+ }
1967
2087
 
1968
- // Handle timeout
1969
- let timedOut = false;
1970
- let timer: ReturnType<typeof setTimeout> | undefined;
1971
- if (step.timeoutMs) {
1972
- timer = setTimeout(() => {
1973
- timedOut = true;
1974
- child.kill('SIGTERM');
1975
- setTimeout(() => child.kill('SIGKILL'), 5000);
1976
- }, step.timeoutMs);
2088
+ // Mark completed
2089
+ state.row.status = 'completed';
2090
+ state.row.output = output;
2091
+ state.row.completedAt = new Date().toISOString();
2092
+ await this.db.updateStep(state.row.id, {
2093
+ status: 'completed',
2094
+ output,
2095
+ completedAt: state.row.completedAt,
2096
+ updatedAt: new Date().toISOString(),
2097
+ });
2098
+ await this.persistStepOutput(runId, step.name, output);
2099
+ this.emit({ type: 'step:completed', runId, stepName: step.name, output });
2100
+ return;
1977
2101
  }
1978
2102
 
1979
- child.stdout?.on('data', (chunk: Buffer) => {
1980
- stdoutChunks.push(chunk.toString());
1981
- });
2103
+ const output = await new Promise<string>((resolve, reject) => {
2104
+ const child = cpSpawn('sh', ['-c', resolvedCommand], {
2105
+ stdio: 'pipe',
2106
+ cwd: stepCwd,
2107
+ env: { ...process.env },
2108
+ });
1982
2109
 
1983
- child.stderr?.on('data', (chunk: Buffer) => {
1984
- stderrChunks.push(chunk.toString());
1985
- });
2110
+ const stdoutChunks: string[] = [];
2111
+ const stderrChunks: string[] = [];
1986
2112
 
1987
- child.on('close', (code) => {
1988
- if (timer) clearTimeout(timer);
1989
- if (abortHandler && abortSignal) {
1990
- abortSignal.removeEventListener('abort', abortHandler);
2113
+ // Wire abort signal
2114
+ const abortSignal = this.abortController?.signal;
2115
+ let abortHandler: (() => void) | undefined;
2116
+ if (abortSignal && !abortSignal.aborted) {
2117
+ abortHandler = () => {
2118
+ child.kill('SIGTERM');
2119
+ setTimeout(() => child.kill('SIGKILL'), 5000);
2120
+ };
2121
+ abortSignal.addEventListener('abort', abortHandler, { once: true });
1991
2122
  }
1992
2123
 
1993
- if (abortSignal?.aborted) {
1994
- reject(new Error(`Step "${step.name}" aborted`));
1995
- return;
2124
+ // Handle timeout
2125
+ let timedOut = false;
2126
+ let timer: ReturnType<typeof setTimeout> | undefined;
2127
+ if (step.timeoutMs) {
2128
+ timer = setTimeout(() => {
2129
+ timedOut = true;
2130
+ child.kill('SIGTERM');
2131
+ setTimeout(() => child.kill('SIGKILL'), 5000);
2132
+ }, step.timeoutMs);
1996
2133
  }
1997
2134
 
1998
- if (timedOut) {
1999
- reject(
2000
- new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`)
2001
- );
2002
- return;
2003
- }
2135
+ child.stdout?.on('data', (chunk: Buffer) => {
2136
+ stdoutChunks.push(chunk.toString());
2137
+ });
2004
2138
 
2005
- const stdout = stdoutChunks.join('');
2006
- const stderr = stderrChunks.join('');
2139
+ child.stderr?.on('data', (chunk: Buffer) => {
2140
+ stderrChunks.push(chunk.toString());
2141
+ });
2007
2142
 
2008
- // Check exit code unless failOnError is explicitly false
2009
- const failOnError = step.failOnError !== false;
2010
- if (failOnError && code !== 0 && code !== null) {
2011
- reject(
2012
- new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`)
2013
- );
2014
- return;
2015
- }
2143
+ child.on('close', (code) => {
2144
+ if (timer) clearTimeout(timer);
2145
+ if (abortHandler && abortSignal) {
2146
+ abortSignal.removeEventListener('abort', abortHandler);
2147
+ }
2016
2148
 
2017
- resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
2018
- });
2149
+ if (abortSignal?.aborted) {
2150
+ reject(new Error(`Step "${step.name}" aborted`));
2151
+ return;
2152
+ }
2019
2153
 
2020
- child.on('error', (err) => {
2021
- if (timer) clearTimeout(timer);
2022
- if (abortHandler && abortSignal) {
2023
- abortSignal.removeEventListener('abort', abortHandler);
2024
- }
2025
- reject(new Error(`Failed to execute command: ${err.message}`));
2154
+ if (timedOut) {
2155
+ reject(
2156
+ new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`)
2157
+ );
2158
+ return;
2159
+ }
2160
+
2161
+ const stdout = stdoutChunks.join('');
2162
+ const stderr = stderrChunks.join('');
2163
+
2164
+ // Check exit code unless failOnError is explicitly false
2165
+ const failOnError = step.failOnError !== false;
2166
+ if (failOnError && code !== 0 && code !== null) {
2167
+ reject(
2168
+ new Error(
2169
+ `Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`
2170
+ )
2171
+ );
2172
+ return;
2173
+ }
2174
+
2175
+ resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
2176
+ });
2177
+
2178
+ child.on('error', (err) => {
2179
+ if (timer) clearTimeout(timer);
2180
+ if (abortHandler && abortSignal) {
2181
+ abortSignal.removeEventListener('abort', abortHandler);
2182
+ }
2183
+ reject(new Error(`Failed to execute command: ${err.message}`));
2184
+ });
2026
2185
  });
2027
- });
2028
2186
 
2029
- // Mark completed
2030
- state.row.status = 'completed';
2031
- state.row.output = output;
2032
- state.row.completedAt = new Date().toISOString();
2033
- await this.db.updateStep(state.row.id, {
2034
- status: 'completed',
2035
- output,
2036
- completedAt: state.row.completedAt,
2037
- updatedAt: new Date().toISOString(),
2038
- });
2187
+ if (step.verification) {
2188
+ this.runVerification(step.verification, output, step.name);
2189
+ }
2039
2190
 
2040
- // Persist step output
2041
- await this.persistStepOutput(runId, step.name, output);
2191
+ // Mark completed
2192
+ state.row.status = 'completed';
2193
+ state.row.output = output;
2194
+ state.row.completedAt = new Date().toISOString();
2195
+ await this.db.updateStep(state.row.id, {
2196
+ status: 'completed',
2197
+ output,
2198
+ completedAt: state.row.completedAt,
2199
+ updatedAt: new Date().toISOString(),
2200
+ });
2042
2201
 
2043
- this.emit({ type: 'step:completed', runId, stepName: step.name, output });
2044
- } catch (err) {
2045
- const errorMsg = err instanceof Error ? err.message : String(err);
2046
- this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
2047
- await this.markStepFailed(state, errorMsg, runId);
2048
- throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
2202
+ // Persist step output
2203
+ await this.persistStepOutput(runId, step.name, output);
2204
+
2205
+ this.emit({ type: 'step:completed', runId, stepName: step.name, output });
2206
+ return;
2207
+ } catch (err) {
2208
+ lastError = err instanceof Error ? err.message : String(err);
2209
+ }
2049
2210
  }
2211
+
2212
+ const errorMsg = lastError ?? 'Unknown error';
2213
+ this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
2214
+ await this.markStepFailed(state, errorMsg, runId);
2215
+ throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
2050
2216
  }
2051
2217
 
2052
2218
  /**
@@ -2286,10 +2452,16 @@ export class WorkflowRunner {
2286
2452
  this.currentConfig?.swarm?.timeoutMs;
2287
2453
 
2288
2454
  let lastError: string | undefined;
2455
+ let lastExitCode: number | undefined;
2456
+ let lastExitSignal: string | undefined;
2289
2457
 
2290
2458
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
2291
2459
  this.checkAborted();
2292
2460
 
2461
+ // Reset per-attempt exit info so stale values don't leak across retries
2462
+ lastExitCode = undefined;
2463
+ lastExitSignal = undefined;
2464
+
2293
2465
  if (attempt > 0) {
2294
2466
  this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
2295
2467
  this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
@@ -2336,6 +2508,16 @@ export class WorkflowRunner {
2336
2508
  const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
2337
2509
  let resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
2338
2510
 
2511
+ // On retry attempts, prepend failure context so the agent knows what went wrong
2512
+ if (attempt > 0 && lastError) {
2513
+ const priorOutput = (this.lastFailedStepOutput.get(step.name) ?? '').slice(-2000);
2514
+ resolvedTask =
2515
+ `[RETRY — Attempt ${attempt + 1}/${maxRetries + 1}]\n` +
2516
+ `Previous attempt failed: ${lastError}\n` +
2517
+ (priorOutput ? `Previous output (last 2000 chars):\n${priorOutput}\n` : '') +
2518
+ `---\n${resolvedTask}`;
2519
+ }
2520
+
2339
2521
  // If this is an interactive agent, append awareness of non-interactive workers
2340
2522
  // so the lead knows not to message them and to use step output chaining instead
2341
2523
  if (specialistDef.interactive !== false || ownerDef.interactive !== false) {
@@ -2378,9 +2560,12 @@ export class WorkflowRunner {
2378
2560
  this.log(`[${step.name}] Spawning owner "${effectiveOwner.name}" (cli: ${effectiveOwner.cli})${step.workdir ? ` [workdir: ${step.workdir}]` : ''}`);
2379
2561
  const resolvedStep = { ...step, task: ownerTask };
2380
2562
  const ownerStartTime = Date.now();
2381
- const output = this.executor
2563
+ const spawnResult = this.executor
2382
2564
  ? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
2383
2565
  : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs);
2566
+ const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
2567
+ lastExitCode = typeof spawnResult === 'string' ? undefined : spawnResult.exitCode;
2568
+ lastExitSignal = typeof spawnResult === 'string' ? undefined : spawnResult.exitSignal;
2384
2569
  ownerElapsed = Date.now() - ownerStartTime;
2385
2570
  this.log(`[${step.name}] Owner "${effectiveOwner.name}" exited`);
2386
2571
  if (usesOwnerFlow) {
@@ -2392,7 +2577,12 @@ export class WorkflowRunner {
2392
2577
 
2393
2578
  // Run verification if configured
2394
2579
  if (step.verification) {
2395
- this.runVerification(step.verification, specialistOutput, step.name, resolvedTask);
2580
+ this.runVerification(
2581
+ step.verification,
2582
+ specialistOutput,
2583
+ step.name,
2584
+ effectiveOwner.interactive === false ? undefined : resolvedTask
2585
+ );
2396
2586
  }
2397
2587
 
2398
2588
  // Every interactive step gets a review pass; pick a dedicated reviewer when available.
@@ -2425,11 +2615,15 @@ export class WorkflowRunner {
2425
2615
  // Persist step output to disk so it survives restarts and is inspectable
2426
2616
  await this.persistStepOutput(runId, step.name, combinedOutput);
2427
2617
 
2428
- this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput });
2618
+ this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput, exitCode: lastExitCode, exitSignal: lastExitSignal });
2429
2619
  await this.trajectory?.stepCompleted(step, combinedOutput, attempt + 1);
2430
2620
  return;
2431
2621
  } catch (err) {
2432
2622
  lastError = err instanceof Error ? err.message : String(err);
2623
+ if (err instanceof SpawnExitError) {
2624
+ lastExitCode = err.exitCode;
2625
+ lastExitSignal = err.exitSignal;
2626
+ }
2433
2627
  const ownerTimedOut = usesDedicatedOwner
2434
2628
  ? /\bowner timed out\b/i.test(lastError)
2435
2629
  : /\btimed out\b/i.test(lastError) && !lastError.includes(`${step.name}-review`);
@@ -2452,7 +2646,10 @@ export class WorkflowRunner {
2452
2646
  verificationValue,
2453
2647
  });
2454
2648
  this.postToChannel(`**[${step.name}]** Failed: ${lastError ?? 'Unknown error'}`);
2455
- await this.markStepFailed(state, lastError ?? 'Unknown error', runId);
2649
+ await this.markStepFailed(state, lastError ?? 'Unknown error', runId, {
2650
+ exitCode: lastExitCode,
2651
+ exitSignal: lastExitSignal,
2652
+ });
2456
2653
  throw new Error(
2457
2654
  `Step "${step.name}" failed after ${maxRetries} retries: ${lastError ?? 'Unknown error'}`
2458
2655
  );
@@ -2615,10 +2812,10 @@ export class WorkflowRunner {
2615
2812
 
2616
2813
  const workerSettled = workerPromise.catch(() => undefined);
2617
2814
  workerPromise
2618
- .then((output) => {
2815
+ .then((result) => {
2619
2816
  workerReleased = true;
2620
2817
  this.postToChannel(`**[${step.name}]** Worker \`${workerRuntimeName}\` exited`);
2621
- if (step.verification?.type === 'output_contains' && output.includes(step.verification.value)) {
2818
+ if (step.verification?.type === 'output_contains' && result.output.includes(step.verification.value)) {
2622
2819
  this.postToChannel(
2623
2820
  `**[${step.name}]** Verification gate observed: output contains ${JSON.stringify(step.verification.value)}`
2624
2821
  );
@@ -2645,7 +2842,7 @@ export class WorkflowRunner {
2645
2842
  const ownerStartTime = Date.now();
2646
2843
 
2647
2844
  try {
2648
- const ownerOutput = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
2845
+ const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
2649
2846
  agentNameSuffix: 'owner',
2650
2847
  onSpawned: ({ actualName }) => {
2651
2848
  this.supervisedRuntimeAgents.set(actualName, {
@@ -2659,10 +2856,11 @@ export class WorkflowRunner {
2659
2856
  },
2660
2857
  });
2661
2858
  const ownerElapsed = Date.now() - ownerStartTime;
2859
+ const ownerOutput = ownerResultObj.output;
2662
2860
  this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
2663
2861
  this.assertOwnerCompletionMarker(step, ownerOutput, supervisorTask);
2664
2862
 
2665
- const specialistOutput = await workerPromise;
2863
+ const specialistOutput = (await workerPromise).output;
2666
2864
  return { specialistOutput, ownerOutput, ownerElapsed };
2667
2865
  } catch (error) {
2668
2866
  const message = error instanceof Error ? error.message : String(error);
@@ -2909,7 +3107,7 @@ export class WorkflowRunner {
2909
3107
  };
2910
3108
 
2911
3109
  try {
2912
- reviewOutput = await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
3110
+ await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
2913
3111
  onSpawned: ({ agent }) => {
2914
3112
  reviewerHandle = agent;
2915
3113
  },
@@ -3089,7 +3287,7 @@ export class WorkflowRunner {
3089
3287
  agentDef: AgentDefinition,
3090
3288
  step: WorkflowStep,
3091
3289
  timeoutMs?: number
3092
- ): Promise<string> {
3290
+ ): Promise<SpawnResult> {
3093
3291
  const agentName = `${step.name}-${this.generateShortId()}`;
3094
3292
  const modelArgs = agentDef.constraints?.model ? ['--model', agentDef.constraints.model] : [];
3095
3293
 
@@ -3152,7 +3350,7 @@ export class WorkflowRunner {
3152
3350
  const stderrChunks: string[] = [];
3153
3351
 
3154
3352
  try {
3155
- const output = await new Promise<string>((resolve, reject) => {
3353
+ const { stdout: output, exitCode, exitSignal } = await new Promise<{ stdout: string; exitCode?: number; exitSignal?: string }>((resolve, reject) => {
3156
3354
  const child = cpSpawn(cmd, args, {
3157
3355
  stdio: ['ignore', 'pipe', 'pipe'],
3158
3356
  cwd: this.resolveAgentCwd(agentDef),
@@ -3217,7 +3415,7 @@ export class WorkflowRunner {
3217
3415
  }, timeoutMs);
3218
3416
  }
3219
3417
 
3220
- child.on('close', (code) => {
3418
+ child.on('close', (code, signal) => {
3221
3419
  clearInterval(heartbeat);
3222
3420
  if (timer) clearTimeout(timer);
3223
3421
  if (abortHandler && abortSignal) {
@@ -3238,14 +3436,20 @@ export class WorkflowRunner {
3238
3436
  if (code !== 0 && code !== null) {
3239
3437
  const stderr = stderrChunks.join('');
3240
3438
  reject(
3241
- new Error(
3242
- `Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`
3439
+ new SpawnExitError(
3440
+ `Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`,
3441
+ code,
3442
+ signal
3243
3443
  )
3244
3444
  );
3245
3445
  return;
3246
3446
  }
3247
3447
 
3248
- resolve(stdout);
3448
+ resolve({
3449
+ stdout,
3450
+ exitCode: code ?? undefined,
3451
+ exitSignal: signal ?? undefined,
3452
+ });
3249
3453
  });
3250
3454
 
3251
3455
  child.on('error', (err) => {
@@ -3258,8 +3462,10 @@ export class WorkflowRunner {
3258
3462
  });
3259
3463
  });
3260
3464
 
3261
- return output;
3465
+ return { output, exitCode, exitSignal };
3262
3466
  } finally {
3467
+ const combinedOutput = stdoutChunks.join('') + stderrChunks.join('');
3468
+ this.lastFailedStepOutput.set(step.name, combinedOutput);
3263
3469
  stopHeartbeat?.();
3264
3470
  logStream.end();
3265
3471
  this.unregisterWorker(agentName);
@@ -3271,7 +3477,7 @@ export class WorkflowRunner {
3271
3477
  step: WorkflowStep,
3272
3478
  timeoutMs?: number,
3273
3479
  options: SpawnAndWaitOptions = {}
3274
- ): Promise<string> {
3480
+ ): Promise<SpawnResult> {
3275
3481
  // Branch: non-interactive agents run as simple subprocesses
3276
3482
  if (agentDef.interactive === false) {
3277
3483
  return this.execNonInteractive(agentDef, step, timeoutMs);
@@ -3456,9 +3662,16 @@ export class WorkflowRunner {
3456
3662
  throw new Error(`Step "${step.name}" timed out after ${timeoutMs ?? 'unknown'}ms`);
3457
3663
  }
3458
3664
  }
3665
+
3666
+ if (exitResult === 'force-released') {
3667
+ throw new Error(
3668
+ `Step "${step.name}" failed — agent was force-released after exhausting idle nudges without completing`
3669
+ );
3670
+ }
3459
3671
  } finally {
3460
3672
  // Snapshot PTY chunks before cleanup — we need them for output reading below
3461
3673
  ptyChunks = this.ptyOutputBuffers.get(agentName) ?? [];
3674
+ this.lastFailedStepOutput.set(step.name, ptyChunks.join(''));
3462
3675
 
3463
3676
  // Always clean up PTY resources — prevents fd leaks if spawnPty or waitForExit throws
3464
3677
  stopHeartbeat?.();
@@ -3485,11 +3698,15 @@ export class WorkflowRunner {
3485
3698
  : exitResult === 'timeout'
3486
3699
  ? 'Agent completed (released after idle timeout)'
3487
3700
  : exitResult === 'released'
3488
- ? 'Agent completed (force-released after idle nudging)'
3701
+ ? 'Agent completed (idle treated as done)'
3489
3702
  : `Agent exited (${exitResult})`;
3490
3703
  }
3491
3704
 
3492
- return output;
3705
+ return {
3706
+ output,
3707
+ exitCode: agent?.exitCode,
3708
+ exitSignal: agent?.exitSignal,
3709
+ };
3493
3710
  }
3494
3711
 
3495
3712
  // ── Idle nudging ────────────────────────────────────────────────────────
@@ -3525,7 +3742,7 @@ export class WorkflowRunner {
3525
3742
  agentDef: AgentDefinition,
3526
3743
  step: WorkflowStep,
3527
3744
  timeoutMs?: number
3528
- ): Promise<'exited' | 'timeout' | 'released'> {
3745
+ ): Promise<'exited' | 'timeout' | 'released' | 'force-released'> {
3529
3746
  const nudgeConfig = this.currentConfig?.swarm.idleNudge;
3530
3747
  if (!nudgeConfig) {
3531
3748
  // Idle = done: race exit against idle. Whichever fires first completes the step.
@@ -3576,7 +3793,7 @@ export class WorkflowRunner {
3576
3793
  }
3577
3794
 
3578
3795
  // Agent is still running after the window expired.
3579
- if (remaining !== undefined && Date.now() - startTime >= remaining) {
3796
+ if (timeoutMs !== undefined && Date.now() - startTime >= timeoutMs) {
3580
3797
  return 'timeout';
3581
3798
  }
3582
3799
 
@@ -3595,7 +3812,7 @@ export class WorkflowRunner {
3595
3812
  );
3596
3813
  this.emit({ type: 'step:force-released', runId: this.currentRunId ?? '', stepName: step.name });
3597
3814
  await agent.release();
3598
- return 'released';
3815
+ return 'force-released';
3599
3816
  }
3600
3817
  }
3601
3818
 
@@ -3731,7 +3948,12 @@ export class WorkflowRunner {
3731
3948
  await this.db.updateRun(runId, patch);
3732
3949
  }
3733
3950
 
3734
- private async markStepFailed(state: StepState, error: string, runId: string): Promise<void> {
3951
+ private async markStepFailed(
3952
+ state: StepState,
3953
+ error: string,
3954
+ runId: string,
3955
+ exitInfo?: { exitCode?: number; exitSignal?: string }
3956
+ ): Promise<void> {
3735
3957
  state.row.status = 'failed';
3736
3958
  state.row.error = error;
3737
3959
  state.row.completedAt = new Date().toISOString();
@@ -3741,7 +3963,14 @@ export class WorkflowRunner {
3741
3963
  completedAt: state.row.completedAt,
3742
3964
  updatedAt: new Date().toISOString(),
3743
3965
  });
3744
- this.emit({ type: 'step:failed', runId, stepName: state.row.stepName, error });
3966
+ this.emit({
3967
+ type: 'step:failed',
3968
+ runId,
3969
+ stepName: state.row.stepName,
3970
+ error,
3971
+ exitCode: exitInfo?.exitCode,
3972
+ exitSignal: exitInfo?.exitSignal,
3973
+ });
3745
3974
  }
3746
3975
 
3747
3976
  private async markDownstreamSkipped(