npm - @agent-relay/sdk - Versions diffs - 3.1.19 → 3.1.21 - Mend

@agent-relay/sdk 3.1.19 → 3.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/bin/agent-relay-broker-darwin-arm64 +0 -0
package/bin/agent-relay-broker-darwin-x64 +0 -0
package/bin/agent-relay-broker-linux-arm64 +0 -0
package/bin/agent-relay-broker-linux-x64 +0 -0
package/dist/__tests__/integration.test.js +35 -0
package/dist/__tests__/integration.test.js.map +1 -1
package/dist/client.d.ts +9 -0
package/dist/client.d.ts.map +1 -1
package/dist/client.js +33 -22
package/dist/client.js.map +1 -1
package/dist/protocol.d.ts +1 -0
package/dist/protocol.d.ts.map +1 -1
package/dist/relay.d.ts +8 -0
package/dist/relay.d.ts.map +1 -1
package/dist/relay.js +50 -5
package/dist/relay.js.map +1 -1
package/dist/workflows/cli.js +2 -0
package/dist/workflows/cli.js.map +1 -1
package/dist/workflows/runner.d.ts +11 -0
package/dist/workflows/runner.d.ts.map +1 -1
package/dist/workflows/runner.js +350 -167
package/dist/workflows/runner.js.map +1 -1
package/dist/workflows/trajectory.d.ts +6 -1
package/dist/workflows/trajectory.d.ts.map +1 -1
package/dist/workflows/trajectory.js +16 -2
package/dist/workflows/trajectory.js.map +1 -1
package/package.json +2 -2

package/dist/workflows/runner.js CHANGED

@@ -17,6 +17,17 @@ import { WorkflowTrajectory } from './trajectory.js';
 // Import from sub-paths to avoid pulling in the full @relaycast/sdk dependency.
 import { AgentRelay } from '../relay.js';
 import { RelayCast, RelayError } from '@relaycast/sdk';
+/** Error carrying exit code/signal from a failed subprocess spawn. */
+class SpawnExitError extends Error {
+    exitCode;
+    exitSignal;
+    constructor(message, exitCode, exitSignal) {
+        super(message);
+        this.name = 'SpawnExitError';
+        this.exitCode = exitCode;
+        this.exitSignal = exitSignal ?? undefined;
+    }
+}
 // ── CLI resolution ───────────────────────────────────────────────────────────
 /**
  * Resolve `cursor` to the concrete cursor agent binary available in PATH.
@@ -70,6 +81,8 @@ export class WorkflowRunner {
     activeAgentHandles = new Map();
     // PTY-based output capture: accumulate terminal output per-agent
     ptyOutputBuffers = new Map();
+    /** Snapshot of PTY output from the most recent failed attempt, keyed by step name. */
+    lastFailedStepOutput = new Map();
     ptyListeners = new Map();
     ptyLogStreams = new Map();
     /** Path to workers.json so `agents:kill` can find workflow-spawned agents */
@@ -865,7 +878,12 @@ export class WorkflowRunner {
     // ── Execution ───────────────────────────────────────────────────────────
     /** Execute a named workflow from a validated config. */
     async execute(config, workflowName, vars) {
+        // Set up abort controller early so callers can abort() even during setup
+        this.abortController = new AbortController();
+        this.paused = false;
         const resolved = vars ? this.resolveVariables(config, vars) : config;
+        // Validate config (catches cycles, missing deps, invalid steps, etc.)
+        this.validateConfig(resolved);
         // Resolve and validate named paths from the top-level `paths` config
         const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
         if (pathResult.errors.length > 0) {
@@ -935,6 +953,9 @@ export class WorkflowRunner {
     }
     /** Resume a previously paused or partially completed run. */
     async resume(runId, vars) {
+        // Set up abort controller early so callers can abort() even during setup
+        this.abortController = new AbortController();
+        this.paused = false;
         const run = await this.db.getRun(runId);
         if (!run) {
             throw new Error(`Run "${runId}" not found`);
@@ -982,9 +1003,7 @@ export class WorkflowRunner {
     async runWorkflowCore(input) {
         const { run, workflow, config, stepStates, isResume } = input;
         const runId = run.id;
-        // Start execution
-        this.abortController = new AbortController();
-        this.paused = false;
+        // Start execution (abortController already set by execute()/resume())
         this.currentConfig = config;
         this.currentRunId = runId;
         this.runStartTime = Date.now();
@@ -1012,15 +1031,20 @@ export class WorkflowRunner {
                 config.swarm.channel = channel;
                 await this.db.updateRun(runId, { config });
             }
+            const relaycastDisabled = this.relayOptions.env?.AGENT_RELAY_WORKFLOW_DISABLE_RELAYCAST === '1';
+            const requiresBroker = !this.executor &&
+                workflow.steps.some((step) => step.type !== 'deterministic' && step.type !== 'worktree');
             // Skip broker/relay init when an external executor handles agent spawning
-            if (!this.executor) {
-                this.log('Resolving Relaycast API key...');
-                await this.ensureRelaycastApiKey(channel);
-                this.log('API key resolved');
-                if (this.relayApiKeyAutoCreated && this.relayApiKey) {
-                    this.log(`Workspace created — follow this run in Relaycast:`);
-                    this.log(`  Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
-                    this.log(`  Channel: ${channel}`);
+            if (requiresBroker) {
+                if (!relaycastDisabled) {
+                    this.log('Resolving Relaycast API key...');
+                    await this.ensureRelaycastApiKey(channel);
+                    this.log('API key resolved');
+                    if (this.relayApiKeyAutoCreated && this.relayApiKey) {
+                        this.log(`Workspace created — follow this run in Relaycast:`);
+                        this.log(`  Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
+                        this.log(`  Channel: ${channel}`);
+                    }
                 }
                 this.log('Starting broker...');
                 // Include a short run ID suffix in the broker name so each workflow execution
@@ -1031,7 +1055,7 @@ export class WorkflowRunner {
                 this.relay = new AgentRelay({
                     ...this.relayOptions,
                     brokerName,
-                    channels: [channel],
+                    channels: relaycastDisabled ? [] : [channel],
                     env: this.getRelayEnv(),
                     // Workflows spawn agents across multiple waves; each spawn requires a PTY +
                     // Relaycast registration. 60s is too tight when the broker is saturated with
@@ -1092,6 +1116,18 @@ export class WorkflowRunner {
                 };
                 // Wire relay event hooks for rich console logging
                 this.relay.onMessageReceived = (msg) => {
+                    this.emit({
+                        type: 'broker:event',
+                        runId,
+                        event: {
+                            kind: 'relay_inbound',
+                            event_id: msg.eventId,
+                            from: msg.from,
+                            target: msg.to,
+                            body: msg.text,
+                            thread_id: msg.threadId,
+                        },
+                    });
                     const body = msg.text.length > 120 ? msg.text.slice(0, 117) + '...' : msg.text;
                     const fromShort = msg.from.replace(/-[a-f0-9]{6,}$/, '');
                     const toShort = msg.to.replace(/-[a-f0-9]{6,}$/, '');
@@ -1102,19 +1138,60 @@ export class WorkflowRunner {
                     }
                 };
                 this.relay.onAgentSpawned = (agent) => {
+                    this.emit({
+                        type: 'broker:event',
+                        runId,
+                        event: {
+                            kind: 'agent_spawned',
+                            name: agent.name,
+                            runtime: agent.runtime,
+                        },
+                    });
                     // Skip agents already managed by step execution
                     if (!this.activeAgentHandles.has(agent.name)) {
                         this.log(`[spawned] ${agent.name} (${agent.runtime})`);
                     }
                 };
+                this.relay.onAgentReleased = (agent) => {
+                    this.emit({
+                        type: 'broker:event',
+                        runId,
+                        event: {
+                            kind: 'agent_released',
+                            name: agent.name,
+                        },
+                    });
+                };
                 this.relay.onAgentExited = (agent) => {
+                    this.emit({
+                        type: 'broker:event',
+                        runId,
+                        event: {
+                            kind: 'agent_exited',
+                            name: agent.name,
+                            code: agent.exitCode,
+                            signal: agent.exitSignal,
+                        },
+                    });
                     this.lastActivity.delete(agent.name);
                     this.lastIdleLog.delete(agent.name);
                     if (!this.activeAgentHandles.has(agent.name)) {
                         this.log(`[exited] ${agent.name} (code: ${agent.exitCode ?? '?'})`);
                     }
                 };
+                this.relay.onDeliveryUpdate = (event) => {
+                    this.emit({ type: 'broker:event', runId, event });
+                };
                 this.relay.onAgentIdle = ({ name, idleSecs }) => {
+                    this.emit({
+                        type: 'broker:event',
+                        runId,
+                        event: {
+                            kind: 'agent_idle',
+                            name,
+                            idle_secs: idleSecs,
+                        },
+                    });
                     // Only log at 30s multiples to avoid watchdog spam
                     const bucket = Math.floor(idleSecs / 30) * 30;
                     if (bucket >= 30 && this.lastIdleLog.get(name) !== bucket) {
@@ -1129,19 +1206,21 @@ export class WorkflowRunner {
                 this.unsubBrokerStderr = this.relay.onBrokerStderr((line) => {
                     console.log(`[broker] ${line}`);
                 });
-                this.log(`Creating channel: ${channel}...`);
-                if (isResume) {
-                    await this.createAndJoinRelaycastChannel(channel);
-                }
-                else {
-                    await this.createAndJoinRelaycastChannel(channel, workflow.description);
-                }
-                this.log('Channel ready');
-                if (isResume) {
-                    this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
-                }
-                else {
-                    this.postToChannel(`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`);
+                if (!relaycastDisabled) {
+                    this.log(`Creating channel: ${channel}...`);
+                    if (isResume) {
+                        await this.createAndJoinRelaycastChannel(channel);
+                    }
+                    else {
+                        await this.createAndJoinRelaycastChannel(channel, workflow.description);
+                    }
+                    this.log('Channel ready');
+                    if (isResume) {
+                        this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
+                    }
+                    else {
+                        this.postToChannel(`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`);
+                    }
                 }
             }
             const agentMap = new Map();
@@ -1154,7 +1233,11 @@ export class WorkflowRunner {
             }
             this.log(`Executing ${workflow.steps.length} steps (pattern: ${config.swarm.pattern})`);
             await this.executeSteps(workflow, stepStates, agentMap, config.errorHandling, runId);
-            const allCompleted = [...stepStates.values()].every((s) => s.row.status === 'completed' || s.row.status === 'skipped');
+            const errorStrategy = config.errorHandling?.strategy ?? workflow.onError ?? 'fail-fast';
+            const continueOnError = errorStrategy === 'continue' || errorStrategy === 'skip';
+            const allCompleted = [...stepStates.values()].every((s) => s.row.status === 'completed' ||
+                s.row.status === 'skipped' ||
+                (continueOnError && s.row.status === 'failed'));
             if (allCompleted) {
                 this.log('Workflow completed successfully');
                 await this.updateRunStatus(runId, 'completed');
@@ -1175,9 +1258,18 @@ export class WorkflowRunner {
                 await this.updateRunStatus(runId, 'failed', errorMsg);
                 this.emit({ type: 'run:failed', runId, error: errorMsg });
                 const outcomes = this.collectOutcomes(stepStates, workflow.steps);
+                const summary = this.trajectory.buildRunSummary(outcomes);
+                const confidence = this.trajectory.computeConfidence(outcomes);
+                const learnings = this.trajectory.extractLearnings(outcomes);
+                const challenges = this.trajectory.extractChallenges(outcomes);
                 this.postFailureReport(workflow.name, outcomes, errorMsg);
                 this.logRunSummary(workflow.name, outcomes, runId);
-                await this.trajectory.abandon(errorMsg);
+                await this.trajectory.abandon(errorMsg, {
+                    summary,
+                    confidence,
+                    learnings,
+                    challenges,
+                });
             }
         }
         catch (err) {
@@ -1185,6 +1277,19 @@ export class WorkflowRunner {
             const status = !isResume && this.abortController?.signal.aborted ? 'cancelled' : 'failed';
             await this.updateRunStatus(runId, status, errorMsg);
             if (status === 'cancelled') {
+                // Mark any pending or in-progress steps as failed due to cancellation
+                for (const [stepName, state] of stepStates) {
+                    if (state.row.status === 'pending' || state.row.status === 'running') {
+                        state.row.status = 'failed';
+                        state.row.error = 'Cancelled';
+                        await this.db.updateStep(state.row.id, {
+                            status: 'failed',
+                            error: 'Cancelled',
+                            updatedAt: new Date().toISOString(),
+                        });
+                        this.emit({ type: 'step:failed', runId, stepName, error: 'Cancelled' });
+                    }
+                }
                 this.emit({ type: 'run:cancelled', runId });
                 this.postToChannel(`Workflow **${workflow.name}** cancelled`);
                 await this.trajectory.abandon('Cancelled by user');
@@ -1192,10 +1297,17 @@ export class WorkflowRunner {
             else {
                 this.emit({ type: 'run:failed', runId, error: errorMsg });
                 this.postToChannel(`Workflow failed: ${errorMsg}`);
-                await this.trajectory.abandon(errorMsg);
+                const outcomes = this.collectOutcomes(stepStates, workflow.steps);
+                await this.trajectory.abandon(errorMsg, {
+                    summary: this.trajectory.buildRunSummary(outcomes),
+                    confidence: this.trajectory.computeConfidence(outcomes),
+                    learnings: this.trajectory.extractLearnings(outcomes),
+                    challenges: this.trajectory.extractChallenges(outcomes),
+                });
             }
         }
         finally {
+            this.lastFailedStepOutput.clear();
             for (const stream of this.ptyLogStreams.values())
                 stream.end();
             this.ptyLogStreams.clear();
@@ -1207,9 +1319,11 @@ export class WorkflowRunner {
             if (this.relay) {
                 this.relay.onMessageReceived = null;
                 this.relay.onAgentSpawned = null;
+                this.relay.onAgentReleased = null;
                 this.relay.onAgentExited = null;
                 this.relay.onAgentIdle = null;
                 this.relay.onWorkerOutput = null;
+                this.relay.onDeliveryUpdate = null;
             }
             this.lastIdleLog.clear();
             this.lastActivity.clear();
@@ -1461,7 +1575,7 @@ export class WorkflowRunner {
     async executeStep(step, stepStates, agentMap, errorHandling, runId) {
         // Branch: deterministic steps execute shell commands
         if (this.isDeterministicStep(step)) {
-            return this.executeDeterministicStep(step, stepStates, runId);
+            return this.executeDeterministicStep(step, stepStates, runId, errorHandling);
         }
         // Branch: worktree steps set up git worktrees
         if (this.isWorktreeStep(step)) {
@@ -1474,42 +1588,143 @@ export class WorkflowRunner {
      * Execute a deterministic step (shell command).
      * Fast, reliable, $0 LLM cost.
      */
-    async executeDeterministicStep(step, stepStates, runId) {
+    async executeDeterministicStep(step, stepStates, runId, errorHandling) {
         const state = stepStates.get(step.name);
         if (!state)
             throw new Error(`Step state not found: ${step.name}`);
-        this.checkAborted();
-        // Mark step as running
-        state.row.status = 'running';
-        state.row.startedAt = new Date().toISOString();
-        await this.db.updateStep(state.row.id, {
-            status: 'running',
-            startedAt: state.row.startedAt,
-            updatedAt: new Date().toISOString(),
-        });
-        this.emit({ type: 'step:started', runId, stepName: step.name });
-        this.postToChannel(`**[${step.name}]** Started (deterministic)`);
-        // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
-        const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
-        let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
-        // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
-        resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key) => {
-            if (key.startsWith('steps.'))
-                return _match; // Already handled above
-            const value = this.resolveDotPath(key, stepOutputContext);
-            return value !== undefined ? String(value) : _match;
-        });
-        // Resolve step workdir (named path reference) for deterministic steps
-        const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
-        try {
-            // Delegate to executor if present
-            if (this.executor?.executeDeterministicStep) {
-                const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
-                const failOnError = step.failOnError !== false;
-                if (failOnError && result.exitCode !== 0) {
-                    throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
+        const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
+        const retryDelay = errorHandling?.retryDelayMs ?? 1000;
+        let lastError;
+        for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
+            this.checkAborted();
+            if (attempt > 0) {
+                this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
+                this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
+                state.row.retryCount = attempt;
+                await this.db.updateStep(state.row.id, {
+                    retryCount: attempt,
+                    updatedAt: new Date().toISOString(),
+                });
+                await this.delay(retryDelay);
+            }
+            // Mark step as running
+            state.row.status = 'running';
+            state.row.startedAt = new Date().toISOString();
+            await this.db.updateStep(state.row.id, {
+                status: 'running',
+                startedAt: state.row.startedAt,
+                updatedAt: new Date().toISOString(),
+            });
+            this.emit({ type: 'step:started', runId, stepName: step.name });
+            this.postToChannel(`**[${step.name}]** Started (deterministic)`);
+            // Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
+            const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
+            let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
+            // Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
+            resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key) => {
+                if (key.startsWith('steps.'))
+                    return _match; // Already handled above
+                const value = this.resolveDotPath(key, stepOutputContext);
+                return value !== undefined ? String(value) : _match;
+            });
+            // Resolve step workdir (named path reference) for deterministic steps
+            const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
+            try {
+                // Delegate to executor if present
+                if (this.executor?.executeDeterministicStep) {
+                    const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
+                    const failOnError = step.failOnError !== false;
+                    if (failOnError && result.exitCode !== 0) {
+                        throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
+                    }
+                    const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
+                    if (step.verification) {
+                        this.runVerification(step.verification, output, step.name);
+                    }
+                    // Mark completed
+                    state.row.status = 'completed';
+                    state.row.output = output;
+                    state.row.completedAt = new Date().toISOString();
+                    await this.db.updateStep(state.row.id, {
+                        status: 'completed',
+                        output,
+                        completedAt: state.row.completedAt,
+                        updatedAt: new Date().toISOString(),
+                    });
+                    await this.persistStepOutput(runId, step.name, output);
+                    this.emit({ type: 'step:completed', runId, stepName: step.name, output });
+                    return;
+                }
+                const output = await new Promise((resolve, reject) => {
+                    const child = cpSpawn('sh', ['-c', resolvedCommand], {
+                        stdio: 'pipe',
+                        cwd: stepCwd,
+                        env: { ...process.env },
+                    });
+                    const stdoutChunks = [];
+                    const stderrChunks = [];
+                    // Wire abort signal
+                    const abortSignal = this.abortController?.signal;
+                    let abortHandler;
+                    if (abortSignal && !abortSignal.aborted) {
+                        abortHandler = () => {
+                            child.kill('SIGTERM');
+                            setTimeout(() => child.kill('SIGKILL'), 5000);
+                        };
+                        abortSignal.addEventListener('abort', abortHandler, { once: true });
+                    }
+                    // Handle timeout
+                    let timedOut = false;
+                    let timer;
+                    if (step.timeoutMs) {
+                        timer = setTimeout(() => {
+                            timedOut = true;
+                            child.kill('SIGTERM');
+                            setTimeout(() => child.kill('SIGKILL'), 5000);
+                        }, step.timeoutMs);
+                    }
+                    child.stdout?.on('data', (chunk) => {
+                        stdoutChunks.push(chunk.toString());
+                    });
+                    child.stderr?.on('data', (chunk) => {
+                        stderrChunks.push(chunk.toString());
+                    });
+                    child.on('close', (code) => {
+                        if (timer)
+                            clearTimeout(timer);
+                        if (abortHandler && abortSignal) {
+                            abortSignal.removeEventListener('abort', abortHandler);
+                        }
+                        if (abortSignal?.aborted) {
+                            reject(new Error(`Step "${step.name}" aborted`));
+                            return;
+                        }
+                        if (timedOut) {
+                            reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
+                            return;
+                        }
+                        const stdout = stdoutChunks.join('');
+                        const stderr = stderrChunks.join('');
+                        // Check exit code unless failOnError is explicitly false
+                        const failOnError = step.failOnError !== false;
+                        if (failOnError && code !== 0 && code !== null) {
+                            reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
+                            return;
+                        }
+                        resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
+                    });
+                    child.on('error', (err) => {
+                        if (timer)
+                            clearTimeout(timer);
+                        if (abortHandler && abortSignal) {
+                            abortSignal.removeEventListener('abort', abortHandler);
+                        }
+                        reject(new Error(`Failed to execute command: ${err.message}`));
+                    });
+                });
+                if (step.verification) {
+                    this.runVerification(step.verification, output, step.name);
                 }
-                const output = step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
                 // Mark completed
                 state.row.status = 'completed';
                 state.row.output = output;
@@ -1520,97 +1735,19 @@ export class WorkflowRunner {
                     completedAt: state.row.completedAt,
                     updatedAt: new Date().toISOString(),
                 });
+                // Persist step output
                 await this.persistStepOutput(runId, step.name, output);
                 this.emit({ type: 'step:completed', runId, stepName: step.name, output });
                 return;
             }
-            const output = await new Promise((resolve, reject) => {
-                const child = cpSpawn('sh', ['-c', resolvedCommand], {
-                    stdio: 'pipe',
-                    cwd: stepCwd,
-                    env: { ...process.env },
-                });
-                const stdoutChunks = [];
-                const stderrChunks = [];
-                // Wire abort signal
-                const abortSignal = this.abortController?.signal;
-                let abortHandler;
-                if (abortSignal && !abortSignal.aborted) {
-                    abortHandler = () => {
-                        child.kill('SIGTERM');
-                        setTimeout(() => child.kill('SIGKILL'), 5000);
-                    };
-                    abortSignal.addEventListener('abort', abortHandler, { once: true });
-                }
-                // Handle timeout
-                let timedOut = false;
-                let timer;
-                if (step.timeoutMs) {
-                    timer = setTimeout(() => {
-                        timedOut = true;
-                        child.kill('SIGTERM');
-                        setTimeout(() => child.kill('SIGKILL'), 5000);
-                    }, step.timeoutMs);
-                }
-                child.stdout?.on('data', (chunk) => {
-                    stdoutChunks.push(chunk.toString());
-                });
-                child.stderr?.on('data', (chunk) => {
-                    stderrChunks.push(chunk.toString());
-                });
-                child.on('close', (code) => {
-                    if (timer)
-                        clearTimeout(timer);
-                    if (abortHandler && abortSignal) {
-                        abortSignal.removeEventListener('abort', abortHandler);
-                    }
-                    if (abortSignal?.aborted) {
-                        reject(new Error(`Step "${step.name}" aborted`));
-                        return;
-                    }
-                    if (timedOut) {
-                        reject(new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`));
-                        return;
-                    }
-                    const stdout = stdoutChunks.join('');
-                    const stderr = stderrChunks.join('');
-                    // Check exit code unless failOnError is explicitly false
-                    const failOnError = step.failOnError !== false;
-                    if (failOnError && code !== 0 && code !== null) {
-                        reject(new Error(`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
-                        return;
-                    }
-                    resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
-                });
-                child.on('error', (err) => {
-                    if (timer)
-                        clearTimeout(timer);
-                    if (abortHandler && abortSignal) {
-                        abortSignal.removeEventListener('abort', abortHandler);
-                    }
-                    reject(new Error(`Failed to execute command: ${err.message}`));
-                });
-            });
-            // Mark completed
-            state.row.status = 'completed';
-            state.row.output = output;
-            state.row.completedAt = new Date().toISOString();
-            await this.db.updateStep(state.row.id, {
-                status: 'completed',
-                output,
-                completedAt: state.row.completedAt,
-                updatedAt: new Date().toISOString(),
-            });
-            // Persist step output
-            await this.persistStepOutput(runId, step.name, output);
-            this.emit({ type: 'step:completed', runId, stepName: step.name, output });
-        }
-        catch (err) {
-            const errorMsg = err instanceof Error ? err.message : String(err);
-            this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
-            await this.markStepFailed(state, errorMsg, runId);
-            throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
+            catch (err) {
+                lastError = err instanceof Error ? err.message : String(err);
+            }
         }
+        const errorMsg = lastError ?? 'Unknown error';
+        this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
+        await this.markStepFailed(state, errorMsg, runId);
+        throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
     }
     /**
      * Execute a worktree step (git worktree setup).
@@ -1807,8 +1944,13 @@ export class WorkflowRunner {
             specialistDef.constraints?.timeoutMs ??
             this.currentConfig?.swarm?.timeoutMs;
         let lastError;
+        let lastExitCode;
+        let lastExitSignal;
         for (let attempt = 0; attempt <= maxRetries; attempt++) {
             this.checkAborted();
+            // Reset per-attempt exit info so stale values don't leak across retries
+            lastExitCode = undefined;
+            lastExitSignal = undefined;
             if (attempt > 0) {
                 this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
                 this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
@@ -1850,6 +1992,15 @@ export class WorkflowRunner {
                 // Resolve step-output variables (e.g. {{steps.plan.output}}) at execution time
                 const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
                 let resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
+                // On retry attempts, prepend failure context so the agent knows what went wrong
+                if (attempt > 0 && lastError) {
+                    const priorOutput = (this.lastFailedStepOutput.get(step.name) ?? '').slice(-2000);
+                    resolvedTask =
+                        `[RETRY — Attempt ${attempt + 1}/${maxRetries + 1}]\n` +
+                            `Previous attempt failed: ${lastError}\n` +
+                            (priorOutput ? `Previous output (last 2000 chars):\n${priorOutput}\n` : '') +
+                            `---\n${resolvedTask}`;
+                }
                 // If this is an interactive agent, append awareness of non-interactive workers
                 // so the lead knows not to message them and to use step output chaining instead
                 if (specialistDef.interactive !== false || ownerDef.interactive !== false) {
@@ -1884,9 +2035,12 @@ export class WorkflowRunner {
                     this.log(`[${step.name}] Spawning owner "${effectiveOwner.name}" (cli: ${effectiveOwner.cli})${step.workdir ? ` [workdir: ${step.workdir}]` : ''}`);
                     const resolvedStep = { ...step, task: ownerTask };
                     const ownerStartTime = Date.now();
-                    const output = this.executor
+                    const spawnResult = this.executor
                         ? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
                         : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs);
+                    const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
+                    lastExitCode = typeof spawnResult === 'string' ? undefined : spawnResult.exitCode;
+                    lastExitSignal = typeof spawnResult === 'string' ? undefined : spawnResult.exitSignal;
                     ownerElapsed = Date.now() - ownerStartTime;
                     this.log(`[${step.name}] Owner "${effectiveOwner.name}" exited`);
                     if (usesOwnerFlow) {
@@ -1897,7 +2051,7 @@ export class WorkflowRunner {
                 }
                 // Run verification if configured
                 if (step.verification) {
-                    this.runVerification(step.verification, specialistOutput, step.name, resolvedTask);
+                    this.runVerification(step.verification, specialistOutput, step.name, effectiveOwner.interactive === false ? undefined : resolvedTask);
                 }
                 // Every interactive step gets a review pass; pick a dedicated reviewer when available.
                 let combinedOutput = specialistOutput;
@@ -1918,12 +2072,16 @@ export class WorkflowRunner {
                 });
                 // Persist step output to disk so it survives restarts and is inspectable
                 await this.persistStepOutput(runId, step.name, combinedOutput);
-                this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput });
+                this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput, exitCode: lastExitCode, exitSignal: lastExitSignal });
                 await this.trajectory?.stepCompleted(step, combinedOutput, attempt + 1);
                 return;
             }
             catch (err) {
                 lastError = err instanceof Error ? err.message : String(err);
+                if (err instanceof SpawnExitError) {
+                    lastExitCode = err.exitCode;
+                    lastExitSignal = err.exitSignal;
+                }
                 const ownerTimedOut = usesDedicatedOwner
                     ? /\bowner timed out\b/i.test(lastError)
                     : /\btimed out\b/i.test(lastError) && !lastError.includes(`${step.name}-review`);
@@ -1943,7 +2101,10 @@ export class WorkflowRunner {
             verificationValue,
         });
         this.postToChannel(`**[${step.name}]** Failed: ${lastError ?? 'Unknown error'}`);
-        await this.markStepFailed(state, lastError ?? 'Unknown error', runId);
+        await this.markStepFailed(state, lastError ?? 'Unknown error', runId, {
+            exitCode: lastExitCode,
+            exitSignal: lastExitSignal,
+        });
         throw new Error(`Step "${step.name}" failed after ${maxRetries} retries: ${lastError ?? 'Unknown error'}`);
     }
     injectStepOwnerContract(step, resolvedTask, ownerDef, specialistDef) {
@@ -2058,10 +2219,10 @@ export class WorkflowRunner {
         });
         const workerSettled = workerPromise.catch(() => undefined);
         workerPromise
-            .then((output) => {
+            .then((result) => {
             workerReleased = true;
             this.postToChannel(`**[${step.name}]** Worker \`${workerRuntimeName}\` exited`);
-            if (step.verification?.type === 'output_contains' && output.includes(step.verification.value)) {
+            if (step.verification?.type === 'output_contains' && result.output.includes(step.verification.value)) {
                 this.postToChannel(`**[${step.name}]** Verification gate observed: output contains ${JSON.stringify(step.verification.value)}`);
             }
         })
@@ -2080,7 +2241,7 @@ export class WorkflowRunner {
         this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
         const ownerStartTime = Date.now();
         try {
-            const ownerOutput = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
+            const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
                 agentNameSuffix: 'owner',
                 onSpawned: ({ actualName }) => {
                     this.supervisedRuntimeAgents.set(actualName, {
@@ -2094,9 +2255,10 @@ export class WorkflowRunner {
                 },
             });
             const ownerElapsed = Date.now() - ownerStartTime;
+            const ownerOutput = ownerResultObj.output;
             this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
             this.assertOwnerCompletionMarker(step, ownerOutput, supervisorTask);
-            const specialistOutput = await workerPromise;
+            const specialistOutput = (await workerPromise).output;
             return { specialistOutput, ownerOutput, ownerElapsed };
         }
         catch (error) {
@@ -2307,7 +2469,7 @@ export class WorkflowRunner {
             })();
         };
         try {
-            reviewOutput = await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
+            await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
                 onSpawned: ({ agent }) => {
                     reviewerHandle = agent;
                 },
@@ -2502,7 +2664,7 @@ export class WorkflowRunner {
         const stdoutChunks = [];
         const stderrChunks = [];
         try {
-            const output = await new Promise((resolve, reject) => {
+            const { stdout: output, exitCode, exitSignal } = await new Promise((resolve, reject) => {
                 const child = cpSpawn(cmd, args, {
                     stdio: ['ignore', 'pipe', 'pipe'],
                     cwd: this.resolveAgentCwd(agentDef),
@@ -2560,7 +2722,7 @@ export class WorkflowRunner {
                         setTimeout(() => child.kill('SIGKILL'), 5000);
                     }, timeoutMs);
                 }
-                child.on('close', (code) => {
+                child.on('close', (code, signal) => {
                     clearInterval(heartbeat);
                     if (timer)
                         clearTimeout(timer);
@@ -2578,10 +2740,14 @@ export class WorkflowRunner {
                     }
                     if (code !== 0 && code !== null) {
                         const stderr = stderrChunks.join('');
-                        reject(new Error(`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`));
+                        reject(new SpawnExitError(`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`, code, signal));
                         return;
                     }
-                    resolve(stdout);
+                    resolve({
+                        stdout,
+                        exitCode: code ?? undefined,
+                        exitSignal: signal ?? undefined,
+                    });
                 });
                 child.on('error', (err) => {
                     clearInterval(heartbeat);
@@ -2593,9 +2759,11 @@ export class WorkflowRunner {
                     reject(new Error(`Failed to spawn ${cmd}: ${err.message}`));
                 });
             });
-            return output;
+            return { output, exitCode, exitSignal };
         }
         finally {
+            const combinedOutput = stdoutChunks.join('') + stderrChunks.join('');
+            this.lastFailedStepOutput.set(step.name, combinedOutput);
             stopHeartbeat?.();
             logStream.end();
             this.unregisterWorker(agentName);
@@ -2755,10 +2923,14 @@ export class WorkflowRunner {
                     throw new Error(`Step "${step.name}" timed out after ${timeoutMs ?? 'unknown'}ms`);
                 }
             }
+            if (exitResult === 'force-released') {
+                throw new Error(`Step "${step.name}" failed — agent was force-released after exhausting idle nudges without completing`);
+            }
         }
         finally {
             // Snapshot PTY chunks before cleanup — we need them for output reading below
             ptyChunks = this.ptyOutputBuffers.get(agentName) ?? [];
+            this.lastFailedStepOutput.set(step.name, ptyChunks.join(''));
             // Always clean up PTY resources — prevents fd leaks if spawnPty or waitForExit throws
             stopHeartbeat?.();
             this.activeAgentHandles.delete(agentName);
@@ -2784,10 +2956,14 @@ export class WorkflowRunner {
                 : exitResult === 'timeout'
                     ? 'Agent completed (released after idle timeout)'
                     : exitResult === 'released'
-                        ? 'Agent completed (force-released after idle nudging)'
+                        ? 'Agent completed (idle — treated as done)'
                         : `Agent exited (${exitResult})`;
         }
-        return output;
+        return {
+            output,
+            exitCode: agent?.exitCode,
+            exitSignal: agent?.exitSignal,
+        };
     }
     // ── Idle nudging ────────────────────────────────────────────────────────
     /** Patterns where a hub agent coordinates spoke agents. */
@@ -2858,7 +3034,7 @@ export class WorkflowRunner {
                 return exitResult;
             }
             // Agent is still running after the window expired.
-            if (remaining !== undefined && Date.now() - startTime >= remaining) {
+            if (timeoutMs !== undefined && Date.now() - startTime >= timeoutMs) {
                 return 'timeout';
             }
             // Nudge if we haven't exhausted the limit
@@ -2873,7 +3049,7 @@ export class WorkflowRunner {
             this.postToChannel(`**[${step.name}]** Agent \`${agent.name}\` still idle after ${nudgeCount} nudge(s) — force-releasing`);
             this.emit({ type: 'step:force-released', runId: this.currentRunId ?? '', stepName: step.name });
             await agent.release();
-            return 'released';
+            return 'force-released';
         }
     }
     /**
@@ -2988,7 +3164,7 @@ export class WorkflowRunner {
         }
         await this.db.updateRun(runId, patch);
     }
-    async markStepFailed(state, error, runId) {
+    async markStepFailed(state, error, runId, exitInfo) {
         state.row.status = 'failed';
         state.row.error = error;
         state.row.completedAt = new Date().toISOString();
@@ -2998,7 +3174,14 @@ export class WorkflowRunner {
             completedAt: state.row.completedAt,
             updatedAt: new Date().toISOString(),
         });
-        this.emit({ type: 'step:failed', runId, stepName: state.row.stepName, error });
+        this.emit({
+            type: 'step:failed',
+            runId,
+            stepName: state.row.stepName,
+            error,
+            exitCode: exitInfo?.exitCode,
+            exitSignal: exitInfo?.exitSignal,
+        });
     }
     async markDownstreamSkipped(failedStepName, allSteps, stepStates, runId) {
         const queue = [failedStepName];