@agent-relay/sdk 6.0.11 → 6.0.13

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/bin/agent-relay-broker-win32-x64.exe +0 -0
  2. package/dist/client.d.ts.map +1 -1
  3. package/dist/client.js +25 -5
  4. package/dist/client.js.map +1 -1
  5. package/dist/github.d.ts +9 -1
  6. package/dist/github.d.ts.map +1 -1
  7. package/dist/github.js +9 -1
  8. package/dist/github.js.map +1 -1
  9. package/dist/index.d.ts +4 -0
  10. package/dist/index.d.ts.map +1 -1
  11. package/dist/index.js +4 -0
  12. package/dist/index.js.map +1 -1
  13. package/dist/slack.d.ts +24 -0
  14. package/dist/slack.d.ts.map +1 -0
  15. package/dist/slack.js +24 -0
  16. package/dist/slack.js.map +1 -0
  17. package/dist/types.d.ts +21 -6
  18. package/dist/types.d.ts.map +1 -1
  19. package/dist/workflows/__tests__/workflow-reliability-contract.test.d.ts +2 -0
  20. package/dist/workflows/__tests__/workflow-reliability-contract.test.d.ts.map +1 -0
  21. package/dist/workflows/__tests__/workflow-reliability-contract.test.js +536 -0
  22. package/dist/workflows/__tests__/workflow-reliability-contract.test.js.map +1 -0
  23. package/dist/workflows/__tests__/workflow-reliability-e2e.test.d.ts +2 -0
  24. package/dist/workflows/__tests__/workflow-reliability-e2e.test.d.ts.map +1 -0
  25. package/dist/workflows/__tests__/workflow-reliability-e2e.test.js +199 -0
  26. package/dist/workflows/__tests__/workflow-reliability-e2e.test.js.map +1 -0
  27. package/dist/workflows/builder.d.ts +10 -0
  28. package/dist/workflows/builder.d.ts.map +1 -1
  29. package/dist/workflows/builder.js +23 -1
  30. package/dist/workflows/builder.js.map +1 -1
  31. package/dist/workflows/runner.d.ts +7 -0
  32. package/dist/workflows/runner.d.ts.map +1 -1
  33. package/dist/workflows/runner.js +355 -98
  34. package/dist/workflows/runner.js.map +1 -1
  35. package/dist/workflows/types.d.ts +4 -0
  36. package/dist/workflows/types.d.ts.map +1 -1
  37. package/package.json +17 -11
@@ -101,6 +101,9 @@ class SpawnExitError extends Error {
101
101
  this.exitSignal = exitSignal ?? undefined;
102
102
  }
103
103
  }
104
+ const DEFAULT_WORKFLOW_MAX_RETRIES = 2;
105
+ const DEFAULT_WORKFLOW_REPAIR_RETRIES = 2;
106
+ const DEFAULT_WORKFLOW_RETRY_DELAY_MS = 1000;
104
107
  // ── CLI resolution ───────────────────────────────────────────────────────────
105
108
  /**
106
109
  * Resolve `cursor` to the concrete cursor agent binary available in PATH.
@@ -1486,6 +1489,30 @@ export class WorkflowRunner {
1486
1489
  }
1487
1490
  return config;
1488
1491
  }
1492
+ applyReliabilityDefaults(config) {
1493
+ const existing = config.errorHandling;
1494
+ if (existing?.strategy === 'fail-fast' || existing?.strategy === 'continue') {
1495
+ return config;
1496
+ }
1497
+ const hasRepairAgentCandidate = (config.agents ?? []).length > 0;
1498
+ const maxRetries = existing?.maxRetries ??
1499
+ existing?.repairRetries ??
1500
+ (existing ? DEFAULT_WORKFLOW_MAX_RETRIES : DEFAULT_WORKFLOW_MAX_RETRIES);
1501
+ const repairRetries = existing?.repairRetries ??
1502
+ (hasRepairAgentCandidate
1503
+ ? existing?.maxRetries ?? DEFAULT_WORKFLOW_REPAIR_RETRIES
1504
+ : existing?.repairRetries);
1505
+ return {
1506
+ ...config,
1507
+ errorHandling: {
1508
+ ...existing,
1509
+ strategy: 'retry',
1510
+ maxRetries,
1511
+ retryDelayMs: existing?.retryDelayMs ?? DEFAULT_WORKFLOW_RETRY_DELAY_MS,
1512
+ ...(repairRetries !== undefined ? { repairRetries } : {}),
1513
+ },
1514
+ };
1515
+ }
1489
1516
  /** Validate a config object against the RelayYamlConfig shape. */
1490
1517
  validateConfig(config, source = '<config>') {
1491
1518
  if (typeof config !== 'object' || config === null) {
@@ -1861,6 +1888,11 @@ export class WorkflowRunner {
1861
1888
  throw new Error(`${source}: deterministic step "${s.name}" must have a "command" field`);
1862
1889
  }
1863
1890
  }
1891
+ else if (s.type === 'worktree') {
1892
+ if (typeof s.branch !== 'string' || s.branch.trim().length === 0) {
1893
+ throw new Error(`${source}: worktree step "${s.name}" must have a "branch" string field`);
1894
+ }
1895
+ }
1864
1896
  else if (s.type === 'integration') {
1865
1897
  // Integration steps require integration and action
1866
1898
  if (typeof s.integration !== 'string') {
@@ -2085,7 +2117,8 @@ export class WorkflowRunner {
2085
2117
  const resolved = this.applyPermissionProfiles(vars ? this.resolveVariables(config, vars) : config);
2086
2118
  // Validate config (catches cycles, missing deps, invalid steps, etc.)
2087
2119
  this.validateConfig(resolved);
2088
- const permissionResult = this.validatePermissions(resolved.agents, resolved.permission_profiles);
2120
+ const runtimeConfig = this.applyReliabilityDefaults(resolved);
2121
+ const permissionResult = this.validatePermissions(runtimeConfig.agents, runtimeConfig.permission_profiles);
2089
2122
  if (permissionResult.errors.length > 0) {
2090
2123
  throw new Error(`Permission validation failed:\n ${permissionResult.errors.join('\n ')}`);
2091
2124
  }
@@ -2093,7 +2126,7 @@ export class WorkflowRunner {
2093
2126
  console.warn(`[WorkflowRunner] Warning: ${warning}`);
2094
2127
  }
2095
2128
  // Resolve and validate named paths from the top-level `paths` config
2096
- const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
2129
+ const pathResult = this.resolvePathDefinitions(runtimeConfig.paths, this.cwd);
2097
2130
  if (pathResult.errors.length > 0) {
2098
2131
  throw new Error(`Path validation failed:\n ${pathResult.errors.join('\n ')}`);
2099
2132
  }
@@ -2103,7 +2136,7 @@ export class WorkflowRunner {
2103
2136
  console.log(`[workflow] path "${name}" → ${abs}`);
2104
2137
  }
2105
2138
  }
2106
- const workflows = resolved.workflows ?? [];
2139
+ const workflows = runtimeConfig.workflows ?? [];
2107
2140
  const workflow = workflowName ? workflows.find((w) => w.name === workflowName) : workflows[0];
2108
2141
  if (!workflow) {
2109
2142
  throw new Error(workflowName ? `Workflow "${workflowName}" not found in config` : 'No workflows defined in config');
@@ -2118,9 +2151,9 @@ export class WorkflowRunner {
2118
2151
  id: runId,
2119
2152
  workspaceId: this.workspaceId,
2120
2153
  workflowName: resolvedWorkflow.name,
2121
- pattern: resolved.swarm.pattern,
2154
+ pattern: runtimeConfig.swarm.pattern,
2122
2155
  status: 'pending',
2123
- config: resolved,
2156
+ config: runtimeConfig,
2124
2157
  startedAt: now,
2125
2158
  createdAt: now,
2126
2159
  updatedAt: now,
@@ -2191,7 +2224,7 @@ export class WorkflowRunner {
2191
2224
  return this.runWorkflowCore({
2192
2225
  run,
2193
2226
  workflow: resolvedWorkflow,
2194
- config: resolved,
2227
+ config: runtimeConfig,
2195
2228
  stepStates,
2196
2229
  isResume: false,
2197
2230
  });
@@ -2220,7 +2253,7 @@ export class WorkflowRunner {
2220
2253
  if (run.status !== 'running' && run.status !== 'failed') {
2221
2254
  throw new Error(`Run "${runId}" is in status "${run.status}" and cannot be resumed`);
2222
2255
  }
2223
- const resolvedConfig = vars ? this.resolveVariables(run.config, vars) : run.config;
2256
+ const resolvedConfig = this.applyReliabilityDefaults(vars ? this.resolveVariables(run.config, vars) : run.config);
2224
2257
  // Resolve path definitions (same as execute()) so workdir lookups work on resume
2225
2258
  const pathResult = this.resolvePathDefinitions(resolvedConfig.paths, this.cwd);
2226
2259
  if (pathResult.errors.length > 0) {
@@ -2799,7 +2832,7 @@ export class WorkflowRunner {
2799
2832
  async executeStep(step, state, stepStates, agentMap, errorHandling, runId, lifecycle) {
2800
2833
  // Branch: deterministic steps execute shell commands
2801
2834
  if (this.isDeterministicStep(step)) {
2802
- return this.executeDeterministicStep(step, state, stepStates, runId, errorHandling, lifecycle);
2835
+ return this.executeDeterministicStep(step, state, stepStates, agentMap, runId, errorHandling, lifecycle);
2803
2836
  }
2804
2837
  // Branch: worktree steps set up git worktrees
2805
2838
  if (this.isWorktreeStep(step)) {
@@ -2816,13 +2849,20 @@ export class WorkflowRunner {
2816
2849
  * Execute a deterministic step (shell command).
2817
2850
  * Fast, reliable, $0 LLM cost.
2818
2851
  */
2819
- async executeDeterministicStep(step, state, stepStates, runId, errorHandling, lifecycle) {
2820
- const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
2852
+ async executeDeterministicStep(step, state, stepStates, agentMap, runId, errorHandling, lifecycle) {
2853
+ const repairRetries = errorHandling?.strategy === 'retry' ? errorHandling.repairRetries ?? 0 : 0;
2854
+ const repairAgent = repairRetries > 0
2855
+ ? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
2856
+ : undefined;
2857
+ const maxRetries = step.retries ?? errorHandling?.maxRetries ?? (repairAgent ? repairRetries : 0);
2821
2858
  const retryDelay = errorHandling?.retryDelayMs ?? 1000;
2822
2859
  let lastError = 'Unknown error';
2823
2860
  let lastCompletionReason;
2824
2861
  let lastExitCode;
2825
2862
  let lastExitSignal;
2863
+ let lastResolvedCommand = step.command ?? '';
2864
+ let lastStepCwd = this.cwd;
2865
+ let lastCommandOutput = '';
2826
2866
  const result = await lifecycle.monitorStep(step, state, {
2827
2867
  maxRetries,
2828
2868
  retryDelayMs: retryDelay,
@@ -2835,6 +2875,20 @@ export class WorkflowRunner {
2835
2875
  detail: `Retrying attempt ${attempt + 1}/${total + 1}`,
2836
2876
  raw: { attempt, maxRetries: total },
2837
2877
  });
2878
+ if (repairAgent) {
2879
+ await this.runDeterministicRepairAgent({
2880
+ step,
2881
+ agentDef: repairAgent,
2882
+ attempt,
2883
+ maxRetries: total,
2884
+ command: lastResolvedCommand,
2885
+ cwd: lastStepCwd,
2886
+ error: lastError,
2887
+ output: lastCommandOutput,
2888
+ exitCode: lastExitCode,
2889
+ exitSignal: lastExitSignal,
2890
+ });
2891
+ }
2838
2892
  },
2839
2893
  execute: async () => {
2840
2894
  const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
@@ -2846,12 +2900,15 @@ export class WorkflowRunner {
2846
2900
  return value !== undefined ? String(value) : _match;
2847
2901
  });
2848
2902
  const stepCwd = this.resolveEffectiveCwd(step);
2903
+ lastResolvedCommand = resolvedCommand;
2904
+ lastStepCwd = stepCwd;
2849
2905
  this.beginStepEvidence(step.name, [stepCwd], state.row.startedAt);
2850
2906
  this.log(`[${step.name}] Running: ${resolvedCommand.slice(0, 200)}${resolvedCommand.length > 200 ? '...' : ''}`);
2851
2907
  if (this.executor?.executeDeterministicStep) {
2852
2908
  const executorResult = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
2853
2909
  lastExitCode = executorResult.exitCode;
2854
2910
  lastExitSignal = undefined;
2911
+ lastCommandOutput = executorResult.output;
2855
2912
  const failOnError = step.failOnError !== false;
2856
2913
  if (failOnError && executorResult.exitCode !== 0) {
2857
2914
  this.log(`[${step.name}] Command failed (exit code ${executorResult.exitCode})`);
@@ -2926,6 +2983,7 @@ export class WorkflowRunner {
2926
2983
  commandStderr = stderr;
2927
2984
  lastExitCode = code ?? undefined;
2928
2985
  lastExitSignal = signal ?? undefined;
2986
+ lastCommandOutput = [stdout, stderr].filter(Boolean).join('\n');
2929
2987
  const failOnError = step.failOnError !== false;
2930
2988
  if (failOnError && code !== 0 && code !== null) {
2931
2989
  this.log(`[${step.name}] Command failed (exit code ${code})`);
@@ -2957,6 +3015,7 @@ export class WorkflowRunner {
2957
3015
  const verificationResult = step.verification
2958
3016
  ? this.runVerification(step.verification, output, step.name)
2959
3017
  : undefined;
3018
+ lastCommandOutput = [commandStdout || output, commandStderr].filter(Boolean).join('\n');
2960
3019
  return {
2961
3020
  output,
2962
3021
  completionReason: verificationResult?.completionReason,
@@ -2989,6 +3048,212 @@ export class WorkflowRunner {
2989
3048
  throw new Error(`Step "${step.name}" failed: ${result.error ?? 'Unknown error'}`);
2990
3049
  }
2991
3050
  }
3051
+ resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling) {
3052
+ const explicitName = errorHandling?.repairAgent?.trim();
3053
+ if (explicitName) {
3054
+ const explicitAgent = agentMap.get(explicitName);
3055
+ if (explicitAgent)
3056
+ return WorkflowRunner.resolveAgentDef(explicitAgent);
3057
+ this.log(`[${step.name}] repairAgent "${explicitName}" not found; falling back to workflow agents`);
3058
+ }
3059
+ if (step.agent) {
3060
+ const stepAgent = agentMap.get(step.agent);
3061
+ if (stepAgent)
3062
+ return WorkflowRunner.resolveAgentDef(stepAgent);
3063
+ }
3064
+ for (const dependency of [...(step.dependsOn ?? [])].reverse()) {
3065
+ const dependencyAgent = stepStates.get(dependency)?.row.agentName;
3066
+ if (!dependencyAgent)
3067
+ continue;
3068
+ const agent = agentMap.get(dependencyAgent);
3069
+ if (agent)
3070
+ return WorkflowRunner.resolveAgentDef(agent);
3071
+ }
3072
+ const candidates = [...agentMap.values()].map((agent) => WorkflowRunner.resolveAgentDef(agent));
3073
+ candidates.sort((a, b) => this.scoreRepairAgent(b) - this.scoreRepairAgent(a));
3074
+ return candidates[0];
3075
+ }
3076
+ scoreRepairAgent(agent) {
3077
+ const text = `${agent.name} ${agent.role ?? ''} ${agent.preset ?? ''}`.toLowerCase();
3078
+ let score = 0;
3079
+ if (/\b(repair|fix|implement|implementation|engineer|developer|coder|worker|owner|lead|coordinator)\b/.test(text)) {
3080
+ score += 10;
3081
+ }
3082
+ if (agent.interactive === false || ['worker', 'analyst'].includes(agent.preset ?? '')) {
3083
+ score += 2;
3084
+ }
3085
+ if (/\b(review|reviewer|audit|security|analyst)\b/.test(text)) {
3086
+ score -= 4;
3087
+ }
3088
+ if (agent.permissions?.access === 'readonly') {
3089
+ score -= 20;
3090
+ }
3091
+ return score;
3092
+ }
3093
+ async runDeterministicRepairAgent(context) {
3094
+ const repairAgent = {
3095
+ ...context.agentDef,
3096
+ interactive: false,
3097
+ };
3098
+ const repairPrompt = this.buildDeterministicRepairPrompt(context);
3099
+ const repairStep = {
3100
+ name: `${context.step.name}-repair-${context.attempt}`,
3101
+ type: 'agent',
3102
+ agent: repairAgent.name,
3103
+ task: repairPrompt,
3104
+ cwd: context.cwd,
3105
+ workdir: undefined,
3106
+ retries: 0,
3107
+ };
3108
+ const timeoutMs = repairAgent.constraints?.timeoutMs ?? context.step.timeoutMs ?? this.currentConfig?.swarm?.timeoutMs;
3109
+ this.log(`[${context.step.name}] Deterministic gate failed; asking "${repairAgent.name}" to repair before retry ${context.attempt + 1}/${context.maxRetries + 1}`);
3110
+ this.postToChannel(`**[${context.step.name}]** Deterministic gate failed; assigning repair to \`${repairAgent.name}\``);
3111
+ this.recordStepToolSideEffect(context.step.name, {
3112
+ type: 'custom',
3113
+ detail: `Assigned deterministic gate repair to ${repairAgent.name}`,
3114
+ raw: {
3115
+ repairAgent: repairAgent.name,
3116
+ attempt: context.attempt,
3117
+ maxRetries: context.maxRetries,
3118
+ exitCode: context.exitCode,
3119
+ exitSignal: context.exitSignal,
3120
+ },
3121
+ });
3122
+ try {
3123
+ this.ensureBudgetAllowsSpawn(context.step.name, repairAgent.name);
3124
+ let repairOutput;
3125
+ if (this.executor) {
3126
+ repairOutput = await this.executor.executeAgentStep(repairStep, repairAgent, repairPrompt, timeoutMs);
3127
+ }
3128
+ else if (repairAgent.cli === 'api') {
3129
+ repairOutput = await executeApiStep(repairAgent.constraints?.model ?? 'claude-sonnet-4-20250514', repairPrompt, {
3130
+ envSecrets: this.envSecrets,
3131
+ skills: repairAgent.skills,
3132
+ defaultMaxTokens: repairAgent.constraints?.maxTokens,
3133
+ });
3134
+ }
3135
+ else {
3136
+ const result = await this.execNonInteractive(repairAgent, repairStep, timeoutMs);
3137
+ repairOutput = result.output;
3138
+ }
3139
+ this.recordStepToolSideEffect(context.step.name, {
3140
+ type: 'custom',
3141
+ detail: `Repair agent ${repairAgent.name} completed before deterministic retry`,
3142
+ raw: { repairAgent: repairAgent.name, output: repairOutput.slice(0, 1000) },
3143
+ });
3144
+ }
3145
+ catch (error) {
3146
+ if (error instanceof BudgetExceededError || this.abortController?.signal.aborted) {
3147
+ throw error;
3148
+ }
3149
+ const message = error instanceof Error ? error.message : String(error);
3150
+ this.log(`[${context.step.name}] Repair agent "${repairAgent.name}" failed: ${message}`);
3151
+ this.postToChannel(`**[${context.step.name}]** Repair agent \`${repairAgent.name}\` failed; retrying gate anyway`);
3152
+ this.recordStepToolSideEffect(context.step.name, {
3153
+ type: 'custom',
3154
+ detail: `Repair agent ${repairAgent.name} failed before deterministic retry: ${message}`,
3155
+ raw: { repairAgent: repairAgent.name, error: message },
3156
+ });
3157
+ }
3158
+ }
3159
+ buildDeterministicRepairPrompt(context) {
3160
+ const output = context.output.trim();
3161
+ const clippedOutput = output.length > 4000 ? output.slice(-4000) : output;
3162
+ return (`A deterministic workflow gate failed after an agent/team step. Fix the repository or workflow state so the same gate passes on the next retry.\n\n` +
3163
+ `Step: ${context.step.name}\n` +
3164
+ `Working directory: ${context.cwd}\n` +
3165
+ `Command:\n${context.command}\n\n` +
3166
+ `Failure:\n${context.error}\n` +
3167
+ `Exit code: ${context.exitCode ?? 'unknown'}\n` +
3168
+ `Exit signal: ${context.exitSignal ?? 'none'}\n\n` +
3169
+ `Command output:\n${clippedOutput || '(no output captured)'}\n\n` +
3170
+ `Repair only what is needed for this gate to pass. Preserve unrelated user changes. ` +
3171
+ `After making the fix, report the files changed and the reason the gate should pass.`);
3172
+ }
3173
+ async runAgentStepRepairAgent(context) {
3174
+ const repairAgent = {
3175
+ ...context.agentDef,
3176
+ interactive: false,
3177
+ };
3178
+ const repairPrompt = this.buildAgentStepRepairPrompt(context);
3179
+ const repairStep = {
3180
+ name: `${context.step.name}-repair-${context.attempt}`,
3181
+ type: 'agent',
3182
+ agent: repairAgent.name,
3183
+ task: repairPrompt,
3184
+ cwd: context.cwd,
3185
+ workdir: undefined,
3186
+ retries: 0,
3187
+ };
3188
+ const timeoutMs = repairAgent.constraints?.timeoutMs ?? context.step.timeoutMs ?? this.currentConfig?.swarm?.timeoutMs;
3189
+ this.log(`[${context.step.name}] Agent step failed; asking "${repairAgent.name}" to repair before retry ${context.attempt + 1}/${context.maxRetries + 1}`);
3190
+ this.postToChannel(`**[${context.step.name}]** Agent step failed; assigning repair to \`${repairAgent.name}\``);
3191
+ this.recordStepToolSideEffect(context.step.name, {
3192
+ type: 'custom',
3193
+ detail: `Assigned agent-step repair to ${repairAgent.name}`,
3194
+ raw: {
3195
+ repairAgent: repairAgent.name,
3196
+ attempt: context.attempt,
3197
+ maxRetries: context.maxRetries,
3198
+ completionReason: context.completionReason,
3199
+ exitCode: context.exitCode,
3200
+ exitSignal: context.exitSignal,
3201
+ },
3202
+ });
3203
+ try {
3204
+ this.ensureBudgetAllowsSpawn(context.step.name, repairAgent.name);
3205
+ let repairOutput;
3206
+ if (this.executor) {
3207
+ repairOutput = await this.executor.executeAgentStep(repairStep, repairAgent, repairPrompt, timeoutMs);
3208
+ }
3209
+ else if (repairAgent.cli === 'api') {
3210
+ repairOutput = await executeApiStep(repairAgent.constraints?.model ?? 'claude-sonnet-4-20250514', repairPrompt, {
3211
+ envSecrets: this.envSecrets,
3212
+ skills: repairAgent.skills,
3213
+ defaultMaxTokens: repairAgent.constraints?.maxTokens,
3214
+ });
3215
+ }
3216
+ else {
3217
+ const result = await this.execNonInteractive(repairAgent, repairStep, timeoutMs);
3218
+ repairOutput = result.output;
3219
+ }
3220
+ this.recordStepToolSideEffect(context.step.name, {
3221
+ type: 'custom',
3222
+ detail: `Repair agent ${repairAgent.name} completed before agent retry`,
3223
+ raw: { repairAgent: repairAgent.name, output: repairOutput.slice(0, 1000) },
3224
+ });
3225
+ }
3226
+ catch (error) {
3227
+ if (error instanceof BudgetExceededError || this.abortController?.signal.aborted) {
3228
+ throw error;
3229
+ }
3230
+ const message = error instanceof Error ? error.message : String(error);
3231
+ this.log(`[${context.step.name}] Repair agent "${repairAgent.name}" failed: ${message}`);
3232
+ this.postToChannel(`**[${context.step.name}]** Repair agent \`${repairAgent.name}\` failed; retrying agent step anyway`);
3233
+ this.recordStepToolSideEffect(context.step.name, {
3234
+ type: 'custom',
3235
+ detail: `Repair agent ${repairAgent.name} failed before agent retry: ${message}`,
3236
+ raw: { repairAgent: repairAgent.name, error: message },
3237
+ });
3238
+ }
3239
+ }
3240
+ buildAgentStepRepairPrompt(context) {
3241
+ const output = context.output.trim();
3242
+ const clippedOutput = output.length > 4000 ? output.slice(-4000) : output;
3243
+ const task = (context.step.task ?? '').trim();
3244
+ const clippedTask = task.length > 3000 ? task.slice(0, 3000) : task;
3245
+ return (`A workflow agent step failed or produced an invalid artifact. Repair the repository, workflow state, or step instructions so the step can succeed on the next retry.\n\n` +
3246
+ `Step: ${context.step.name}\n` +
3247
+ `Working directory: ${context.cwd}\n` +
3248
+ `Completion reason: ${context.completionReason ?? 'unknown'}\n` +
3249
+ `Failure:\n${context.error}\n` +
3250
+ `Exit code: ${context.exitCode ?? 'unknown'}\n` +
3251
+ `Exit signal: ${context.exitSignal ?? 'none'}\n\n` +
3252
+ `Step task:\n${clippedTask || '(no task captured)'}\n\n` +
3253
+ `Previous output:\n${clippedOutput || '(no output captured)'}\n\n` +
3254
+ `Repair only what is needed for this step to produce the required artifact or evidence. ` +
3255
+ `Preserve unrelated user changes. After making the fix, report the files changed and why the retry should pass.`);
3256
+ }
2992
3257
  /**
2993
3258
  * Execute a worktree step (git worktree setup).
2994
3259
  * Fast, reliable, $0 LLM cost.
@@ -3201,56 +3466,7 @@ export class WorkflowRunner {
3201
3466
  throw new Error(`Agent "${agentName}" not found in config`);
3202
3467
  }
3203
3468
  const specialistDef = WorkflowRunner.resolveAgentDef(rawAgentDef);
3204
- // API-mode agents: execute via direct API call instead of spawning a PTY/subprocess.
3205
- if (specialistDef.cli === 'api') {
3206
- this.ensureBudgetAllowsSpawn(step.name, agentName);
3207
- const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
3208
- const resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
3209
- state.row.status = 'running';
3210
- state.row.startedAt = new Date().toISOString();
3211
- await this.db.updateStep(state.row.id, {
3212
- status: 'running',
3213
- startedAt: state.row.startedAt,
3214
- updatedAt: new Date().toISOString(),
3215
- });
3216
- this.emit({ type: 'step:started', runId, stepName: step.name });
3217
- this.postToChannel(`**[${step.name}]** Started (api)`);
3218
- try {
3219
- const output = await executeApiStep(specialistDef.constraints?.model ?? 'claude-sonnet-4-20250514', resolvedTask, {
3220
- envSecrets: this.envSecrets,
3221
- skills: specialistDef.skills,
3222
- defaultMaxTokens: specialistDef.constraints?.maxTokens,
3223
- });
3224
- state.row.status = 'completed';
3225
- state.row.output = output;
3226
- state.row.completedAt = new Date().toISOString();
3227
- await this.db.updateStep(state.row.id, {
3228
- status: 'completed',
3229
- output,
3230
- completedAt: state.row.completedAt,
3231
- updatedAt: new Date().toISOString(),
3232
- });
3233
- await this.persistStepOutput(runId, step.name, output);
3234
- this.emit({ type: 'step:completed', runId, stepName: step.name, output });
3235
- }
3236
- catch (apiError) {
3237
- const errorMessage = apiError instanceof Error ? apiError.message : String(apiError);
3238
- state.row.status = 'failed';
3239
- state.row.error = errorMessage;
3240
- state.row.completedAt = new Date().toISOString();
3241
- await this.db.updateStep(state.row.id, {
3242
- status: 'failed',
3243
- error: errorMessage,
3244
- completedAt: state.row.completedAt,
3245
- updatedAt: new Date().toISOString(),
3246
- });
3247
- this.emit({ type: 'step:failed', runId, stepName: step.name, error: errorMessage });
3248
- this.postToChannel(`**[${step.name}]** Failed (api): ${errorMessage}`);
3249
- throw apiError;
3250
- }
3251
- return;
3252
- }
3253
- const usesOwnerFlow = specialistDef.interactive !== false;
3469
+ const usesOwnerFlow = specialistDef.cli !== 'api' && specialistDef.interactive !== false;
3254
3470
  const currentPattern = this.currentConfig?.swarm?.pattern ?? '';
3255
3471
  const isHubPattern = WorkflowRunner.HUB_PATTERNS.has(currentPattern);
3256
3472
  const usesAutoHardening = usesOwnerFlow && isHubPattern && !this.isExplicitInteractiveWorker(specialistDef);
@@ -3274,6 +3490,10 @@ export class WorkflowRunner {
3274
3490
  ownerDef.constraints?.timeoutMs ??
3275
3491
  specialistDef.constraints?.timeoutMs ??
3276
3492
  this.currentConfig?.swarm?.timeoutMs;
3493
+ const repairRetries = errorHandling?.strategy === 'retry' ? (errorHandling.repairRetries ?? 0) : 0;
3494
+ const repairAgent = repairRetries > 0
3495
+ ? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
3496
+ : undefined;
3277
3497
  let lastError;
3278
3498
  let lastExitCode;
3279
3499
  let lastExitSignal;
@@ -3312,6 +3532,20 @@ export class WorkflowRunner {
3312
3532
  updatedAt: new Date().toISOString(),
3313
3533
  });
3314
3534
  await this.trajectory?.stepRetrying(step, attempt, maxRetries);
3535
+ if (repairAgent && attempt <= repairRetries) {
3536
+ await this.runAgentStepRepairAgent({
3537
+ step,
3538
+ agentDef: repairAgent,
3539
+ attempt,
3540
+ maxRetries,
3541
+ cwd: lastEffectiveCwd ?? this.resolveEffectiveCwd(step, specialistDef),
3542
+ error: lastError ?? 'Unknown error',
3543
+ output: this.lastFailedStepOutput.get(step.name) ?? '',
3544
+ exitCode: lastExitCode,
3545
+ exitSignal: lastExitSignal,
3546
+ completionReason: lastCompletionReason,
3547
+ });
3548
+ }
3315
3549
  await this.delay(retryDelay);
3316
3550
  }
3317
3551
  try {
@@ -3436,28 +3670,38 @@ export class WorkflowRunner {
3436
3670
  // executors still take precedence. See process-backend-executor.ts.
3437
3671
  const spawnResult = this.executor
3438
3672
  ? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
3439
- : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs, {
3440
- retryAttempt: attempt,
3441
- evidenceStepName: step.name,
3442
- evidenceRole: usesOwnerFlow ? 'owner' : 'specialist',
3443
- preserveOnIdle: !isHubPattern || !this.isLeadLikeAgent(effectiveOwner) ? false : undefined,
3444
- logicalName: effectiveOwner.name,
3445
- onSpawned: explicitInteractiveWorker
3446
- ? ({ agent }) => {
3447
- explicitWorkerHandle = agent;
3448
- }
3449
- : undefined,
3450
- onChunk: explicitInteractiveWorker
3451
- ? ({ chunk }) => {
3452
- explicitWorkerOutput += WorkflowRunner.stripAnsi(chunk);
3453
- if (!explicitWorkerCompleted &&
3454
- this.hasExplicitInteractiveWorkerCompletionEvidence(step, explicitWorkerOutput, ownerTask, resolvedTask)) {
3455
- explicitWorkerCompleted = true;
3456
- void explicitWorkerHandle?.release().catch(() => undefined);
3673
+ : effectiveOwner.cli === 'api'
3674
+ ? {
3675
+ output: await executeApiStep(effectiveOwner.constraints?.model ?? 'claude-sonnet-4-20250514', ownerTask, {
3676
+ envSecrets: this.envSecrets,
3677
+ skills: effectiveOwner.skills,
3678
+ defaultMaxTokens: effectiveOwner.constraints?.maxTokens,
3679
+ }),
3680
+ exitCode: 0,
3681
+ promptTaskText: ownerTask,
3682
+ }
3683
+ : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs, {
3684
+ retryAttempt: attempt,
3685
+ evidenceStepName: step.name,
3686
+ evidenceRole: usesOwnerFlow ? 'owner' : 'specialist',
3687
+ preserveOnIdle: !isHubPattern || !this.isLeadLikeAgent(effectiveOwner) ? false : undefined,
3688
+ logicalName: effectiveOwner.name,
3689
+ onSpawned: explicitInteractiveWorker
3690
+ ? ({ agent }) => {
3691
+ explicitWorkerHandle = agent;
3457
3692
  }
3458
- }
3459
- : undefined,
3460
- });
3693
+ : undefined,
3694
+ onChunk: explicitInteractiveWorker
3695
+ ? ({ chunk }) => {
3696
+ explicitWorkerOutput += WorkflowRunner.stripAnsi(chunk);
3697
+ if (!explicitWorkerCompleted &&
3698
+ this.hasExplicitInteractiveWorkerCompletionEvidence(step, explicitWorkerOutput, ownerTask, resolvedTask)) {
3699
+ explicitWorkerCompleted = true;
3700
+ void explicitWorkerHandle?.release().catch(() => undefined);
3701
+ }
3702
+ }
3703
+ : undefined,
3704
+ });
3461
3705
  const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
3462
3706
  promptTaskText =
3463
3707
  typeof spawnResult === 'string'
@@ -3569,6 +3813,9 @@ export class WorkflowRunner {
3569
3813
  catch (err) {
3570
3814
  lastError = err instanceof Error ? err.message : String(err);
3571
3815
  lastCompletionReason = err instanceof WorkflowCompletionError ? err.completionReason : undefined;
3816
+ if (stepOutputForDiagnostic) {
3817
+ this.lastFailedStepOutput.set(step.name, stepOutputForDiagnostic);
3818
+ }
3572
3819
  const diagnosticVerification = step.verification;
3573
3820
  if (err instanceof WorkflowCompletionError &&
3574
3821
  err.completionReason === 'failed_verification' &&
@@ -3911,23 +4158,33 @@ export class WorkflowRunner {
3911
4158
  this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
3912
4159
  const ownerStartTime = Date.now();
3913
4160
  try {
3914
- const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
3915
- agentNameSuffix: 'owner',
3916
- retryAttempt,
3917
- evidenceStepName: step.name,
3918
- evidenceRole: 'owner',
3919
- logicalName: supervised.owner.name,
3920
- onSpawned: ({ actualName }) => {
3921
- this.supervisedRuntimeAgents.set(actualName, {
3922
- stepName: step.name,
3923
- role: 'owner',
3924
- logicalName: supervised.owner.name,
3925
- });
3926
- },
3927
- onChunk: ({ chunk }) => {
3928
- void this.recordOwnerMonitoringChunk(step, supervised.owner, chunk);
3929
- },
3930
- });
4161
+ const ownerResultObj = supervised.owner.cli === 'api'
4162
+ ? {
4163
+ output: await executeApiStep(supervised.owner.constraints?.model ?? 'claude-sonnet-4-20250514', supervisorTask, {
4164
+ envSecrets: this.envSecrets,
4165
+ skills: supervised.owner.skills,
4166
+ defaultMaxTokens: supervised.owner.constraints?.maxTokens,
4167
+ }),
4168
+ exitCode: 0,
4169
+ promptTaskText: supervisorTask,
4170
+ }
4171
+ : await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
4172
+ agentNameSuffix: 'owner',
4173
+ retryAttempt,
4174
+ evidenceStepName: step.name,
4175
+ evidenceRole: 'owner',
4176
+ logicalName: supervised.owner.name,
4177
+ onSpawned: ({ actualName }) => {
4178
+ this.supervisedRuntimeAgents.set(actualName, {
4179
+ stepName: step.name,
4180
+ role: 'owner',
4181
+ logicalName: supervised.owner.name,
4182
+ });
4183
+ },
4184
+ onChunk: ({ chunk }) => {
4185
+ void this.recordOwnerMonitoringChunk(step, supervised.owner, chunk);
4186
+ },
4187
+ });
3931
4188
  const ownerElapsed = Date.now() - ownerStartTime;
3932
4189
  const ownerOutput = ownerResultObj.output;
3933
4190
  this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);