@agent-relay/sdk 6.0.12 → 6.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,6 +101,9 @@ class SpawnExitError extends Error {
101
101
  this.exitSignal = exitSignal ?? undefined;
102
102
  }
103
103
  }
104
/** Fallback for `errorHandling.maxRetries` when a workflow config does not set it. */
const DEFAULT_WORKFLOW_MAX_RETRIES = 2;
/** Fallback for `errorHandling.repairRetries` when a workflow config does not set it. */
const DEFAULT_WORKFLOW_REPAIR_RETRIES = 2;
/** Fallback for `errorHandling.retryDelayMs` (milliseconds) when a workflow config does not set it. */
const DEFAULT_WORKFLOW_RETRY_DELAY_MS = 1000;
104
107
  // ── CLI resolution ───────────────────────────────────────────────────────────
105
108
  /**
106
109
  * Resolve `cursor` to the concrete cursor agent binary available in PATH.
@@ -1486,6 +1489,30 @@ export class WorkflowRunner {
1486
1489
  }
1487
1490
  return config;
1488
1491
  }
1492
+ applyReliabilityDefaults(config) {
1493
+ const existing = config.errorHandling;
1494
+ if (existing?.strategy === 'fail-fast' || existing?.strategy === 'continue') {
1495
+ return config;
1496
+ }
1497
+ const hasRepairAgentCandidate = (config.agents ?? []).length > 0;
1498
+ const maxRetries = existing?.maxRetries ??
1499
+ existing?.repairRetries ??
1500
+ (existing ? DEFAULT_WORKFLOW_MAX_RETRIES : DEFAULT_WORKFLOW_MAX_RETRIES);
1501
+ const repairRetries = existing?.repairRetries ??
1502
+ (hasRepairAgentCandidate
1503
+ ? existing?.maxRetries ?? DEFAULT_WORKFLOW_REPAIR_RETRIES
1504
+ : existing?.repairRetries);
1505
+ return {
1506
+ ...config,
1507
+ errorHandling: {
1508
+ ...existing,
1509
+ strategy: 'retry',
1510
+ maxRetries,
1511
+ retryDelayMs: existing?.retryDelayMs ?? DEFAULT_WORKFLOW_RETRY_DELAY_MS,
1512
+ ...(repairRetries !== undefined ? { repairRetries } : {}),
1513
+ },
1514
+ };
1515
+ }
1489
1516
  /** Validate a config object against the RelayYamlConfig shape. */
1490
1517
  validateConfig(config, source = '<config>') {
1491
1518
  if (typeof config !== 'object' || config === null) {
@@ -1861,6 +1888,11 @@ export class WorkflowRunner {
1861
1888
  throw new Error(`${source}: deterministic step "${s.name}" must have a "command" field`);
1862
1889
  }
1863
1890
  }
1891
+ else if (s.type === 'worktree') {
1892
+ if (typeof s.branch !== 'string' || s.branch.trim().length === 0) {
1893
+ throw new Error(`${source}: worktree step "${s.name}" must have a "branch" string field`);
1894
+ }
1895
+ }
1864
1896
  else if (s.type === 'integration') {
1865
1897
  // Integration steps require integration and action
1866
1898
  if (typeof s.integration !== 'string') {
@@ -2085,7 +2117,8 @@ export class WorkflowRunner {
2085
2117
  const resolved = this.applyPermissionProfiles(vars ? this.resolveVariables(config, vars) : config);
2086
2118
  // Validate config (catches cycles, missing deps, invalid steps, etc.)
2087
2119
  this.validateConfig(resolved);
2088
- const permissionResult = this.validatePermissions(resolved.agents, resolved.permission_profiles);
2120
+ const runtimeConfig = this.applyReliabilityDefaults(resolved);
2121
+ const permissionResult = this.validatePermissions(runtimeConfig.agents, runtimeConfig.permission_profiles);
2089
2122
  if (permissionResult.errors.length > 0) {
2090
2123
  throw new Error(`Permission validation failed:\n ${permissionResult.errors.join('\n ')}`);
2091
2124
  }
@@ -2093,7 +2126,7 @@ export class WorkflowRunner {
2093
2126
  console.warn(`[WorkflowRunner] Warning: ${warning}`);
2094
2127
  }
2095
2128
  // Resolve and validate named paths from the top-level `paths` config
2096
- const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
2129
+ const pathResult = this.resolvePathDefinitions(runtimeConfig.paths, this.cwd);
2097
2130
  if (pathResult.errors.length > 0) {
2098
2131
  throw new Error(`Path validation failed:\n ${pathResult.errors.join('\n ')}`);
2099
2132
  }
@@ -2103,7 +2136,7 @@ export class WorkflowRunner {
2103
2136
  console.log(`[workflow] path "${name}" → ${abs}`);
2104
2137
  }
2105
2138
  }
2106
- const workflows = resolved.workflows ?? [];
2139
+ const workflows = runtimeConfig.workflows ?? [];
2107
2140
  const workflow = workflowName ? workflows.find((w) => w.name === workflowName) : workflows[0];
2108
2141
  if (!workflow) {
2109
2142
  throw new Error(workflowName ? `Workflow "${workflowName}" not found in config` : 'No workflows defined in config');
@@ -2118,9 +2151,9 @@ export class WorkflowRunner {
2118
2151
  id: runId,
2119
2152
  workspaceId: this.workspaceId,
2120
2153
  workflowName: resolvedWorkflow.name,
2121
- pattern: resolved.swarm.pattern,
2154
+ pattern: runtimeConfig.swarm.pattern,
2122
2155
  status: 'pending',
2123
- config: resolved,
2156
+ config: runtimeConfig,
2124
2157
  startedAt: now,
2125
2158
  createdAt: now,
2126
2159
  updatedAt: now,
@@ -2191,7 +2224,7 @@ export class WorkflowRunner {
2191
2224
  return this.runWorkflowCore({
2192
2225
  run,
2193
2226
  workflow: resolvedWorkflow,
2194
- config: resolved,
2227
+ config: runtimeConfig,
2195
2228
  stepStates,
2196
2229
  isResume: false,
2197
2230
  });
@@ -2220,7 +2253,7 @@ export class WorkflowRunner {
2220
2253
  if (run.status !== 'running' && run.status !== 'failed') {
2221
2254
  throw new Error(`Run "${runId}" is in status "${run.status}" and cannot be resumed`);
2222
2255
  }
2223
- const resolvedConfig = vars ? this.resolveVariables(run.config, vars) : run.config;
2256
+ const resolvedConfig = this.applyReliabilityDefaults(vars ? this.resolveVariables(run.config, vars) : run.config);
2224
2257
  // Resolve path definitions (same as execute()) so workdir lookups work on resume
2225
2258
  const pathResult = this.resolvePathDefinitions(resolvedConfig.paths, this.cwd);
2226
2259
  if (pathResult.errors.length > 0) {
@@ -2819,7 +2852,7 @@ export class WorkflowRunner {
2819
2852
  async executeDeterministicStep(step, state, stepStates, agentMap, runId, errorHandling, lifecycle) {
2820
2853
  const repairRetries = errorHandling?.strategy === 'retry' ? errorHandling.repairRetries ?? 0 : 0;
2821
2854
  const repairAgent = repairRetries > 0
2822
- ? this.resolveDeterministicRepairAgent(step, stepStates, agentMap, errorHandling)
2855
+ ? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
2823
2856
  : undefined;
2824
2857
  const maxRetries = step.retries ?? errorHandling?.maxRetries ?? (repairAgent ? repairRetries : 0);
2825
2858
  const retryDelay = errorHandling?.retryDelayMs ?? 1000;
@@ -3015,7 +3048,7 @@ export class WorkflowRunner {
3015
3048
  throw new Error(`Step "${step.name}" failed: ${result.error ?? 'Unknown error'}`);
3016
3049
  }
3017
3050
  }
3018
- resolveDeterministicRepairAgent(step, stepStates, agentMap, errorHandling) {
3051
+ resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling) {
3019
3052
  const explicitName = errorHandling?.repairAgent?.trim();
3020
3053
  if (explicitName) {
3021
3054
  const explicitAgent = agentMap.get(explicitName);
@@ -3137,6 +3170,90 @@ export class WorkflowRunner {
3137
3170
  `Repair only what is needed for this gate to pass. Preserve unrelated user changes. ` +
3138
3171
  `After making the fix, report the files changed and the reason the gate should pass.`);
3139
3172
  }
3173
+ async runAgentStepRepairAgent(context) {
3174
+ const repairAgent = {
3175
+ ...context.agentDef,
3176
+ interactive: false,
3177
+ };
3178
+ const repairPrompt = this.buildAgentStepRepairPrompt(context);
3179
+ const repairStep = {
3180
+ name: `${context.step.name}-repair-${context.attempt}`,
3181
+ type: 'agent',
3182
+ agent: repairAgent.name,
3183
+ task: repairPrompt,
3184
+ cwd: context.cwd,
3185
+ workdir: undefined,
3186
+ retries: 0,
3187
+ };
3188
+ const timeoutMs = repairAgent.constraints?.timeoutMs ?? context.step.timeoutMs ?? this.currentConfig?.swarm?.timeoutMs;
3189
+ this.log(`[${context.step.name}] Agent step failed; asking "${repairAgent.name}" to repair before retry ${context.attempt + 1}/${context.maxRetries + 1}`);
3190
+ this.postToChannel(`**[${context.step.name}]** Agent step failed; assigning repair to \`${repairAgent.name}\``);
3191
+ this.recordStepToolSideEffect(context.step.name, {
3192
+ type: 'custom',
3193
+ detail: `Assigned agent-step repair to ${repairAgent.name}`,
3194
+ raw: {
3195
+ repairAgent: repairAgent.name,
3196
+ attempt: context.attempt,
3197
+ maxRetries: context.maxRetries,
3198
+ completionReason: context.completionReason,
3199
+ exitCode: context.exitCode,
3200
+ exitSignal: context.exitSignal,
3201
+ },
3202
+ });
3203
+ try {
3204
+ this.ensureBudgetAllowsSpawn(context.step.name, repairAgent.name);
3205
+ let repairOutput;
3206
+ if (this.executor) {
3207
+ repairOutput = await this.executor.executeAgentStep(repairStep, repairAgent, repairPrompt, timeoutMs);
3208
+ }
3209
+ else if (repairAgent.cli === 'api') {
3210
+ repairOutput = await executeApiStep(repairAgent.constraints?.model ?? 'claude-sonnet-4-20250514', repairPrompt, {
3211
+ envSecrets: this.envSecrets,
3212
+ skills: repairAgent.skills,
3213
+ defaultMaxTokens: repairAgent.constraints?.maxTokens,
3214
+ });
3215
+ }
3216
+ else {
3217
+ const result = await this.execNonInteractive(repairAgent, repairStep, timeoutMs);
3218
+ repairOutput = result.output;
3219
+ }
3220
+ this.recordStepToolSideEffect(context.step.name, {
3221
+ type: 'custom',
3222
+ detail: `Repair agent ${repairAgent.name} completed before agent retry`,
3223
+ raw: { repairAgent: repairAgent.name, output: repairOutput.slice(0, 1000) },
3224
+ });
3225
+ }
3226
+ catch (error) {
3227
+ if (error instanceof BudgetExceededError || this.abortController?.signal.aborted) {
3228
+ throw error;
3229
+ }
3230
+ const message = error instanceof Error ? error.message : String(error);
3231
+ this.log(`[${context.step.name}] Repair agent "${repairAgent.name}" failed: ${message}`);
3232
+ this.postToChannel(`**[${context.step.name}]** Repair agent \`${repairAgent.name}\` failed; retrying agent step anyway`);
3233
+ this.recordStepToolSideEffect(context.step.name, {
3234
+ type: 'custom',
3235
+ detail: `Repair agent ${repairAgent.name} failed before agent retry: ${message}`,
3236
+ raw: { repairAgent: repairAgent.name, error: message },
3237
+ });
3238
+ }
3239
+ }
3240
+ buildAgentStepRepairPrompt(context) {
3241
+ const output = context.output.trim();
3242
+ const clippedOutput = output.length > 4000 ? output.slice(-4000) : output;
3243
+ const task = (context.step.task ?? '').trim();
3244
+ const clippedTask = task.length > 3000 ? task.slice(0, 3000) : task;
3245
+ return (`A workflow agent step failed or produced an invalid artifact. Repair the repository, workflow state, or step instructions so the step can succeed on the next retry.\n\n` +
3246
+ `Step: ${context.step.name}\n` +
3247
+ `Working directory: ${context.cwd}\n` +
3248
+ `Completion reason: ${context.completionReason ?? 'unknown'}\n` +
3249
+ `Failure:\n${context.error}\n` +
3250
+ `Exit code: ${context.exitCode ?? 'unknown'}\n` +
3251
+ `Exit signal: ${context.exitSignal ?? 'none'}\n\n` +
3252
+ `Step task:\n${clippedTask || '(no task captured)'}\n\n` +
3253
+ `Previous output:\n${clippedOutput || '(no output captured)'}\n\n` +
3254
+ `Repair only what is needed for this step to produce the required artifact or evidence. ` +
3255
+ `Preserve unrelated user changes. After making the fix, report the files changed and why the retry should pass.`);
3256
+ }
3140
3257
  /**
3141
3258
  * Execute a worktree step (git worktree setup).
3142
3259
  * Fast, reliable, $0 LLM cost.
@@ -3349,56 +3466,7 @@ export class WorkflowRunner {
3349
3466
  throw new Error(`Agent "${agentName}" not found in config`);
3350
3467
  }
3351
3468
  const specialistDef = WorkflowRunner.resolveAgentDef(rawAgentDef);
3352
- // API-mode agents: execute via direct API call instead of spawning a PTY/subprocess.
3353
- if (specialistDef.cli === 'api') {
3354
- this.ensureBudgetAllowsSpawn(step.name, agentName);
3355
- const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
3356
- const resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
3357
- state.row.status = 'running';
3358
- state.row.startedAt = new Date().toISOString();
3359
- await this.db.updateStep(state.row.id, {
3360
- status: 'running',
3361
- startedAt: state.row.startedAt,
3362
- updatedAt: new Date().toISOString(),
3363
- });
3364
- this.emit({ type: 'step:started', runId, stepName: step.name });
3365
- this.postToChannel(`**[${step.name}]** Started (api)`);
3366
- try {
3367
- const output = await executeApiStep(specialistDef.constraints?.model ?? 'claude-sonnet-4-20250514', resolvedTask, {
3368
- envSecrets: this.envSecrets,
3369
- skills: specialistDef.skills,
3370
- defaultMaxTokens: specialistDef.constraints?.maxTokens,
3371
- });
3372
- state.row.status = 'completed';
3373
- state.row.output = output;
3374
- state.row.completedAt = new Date().toISOString();
3375
- await this.db.updateStep(state.row.id, {
3376
- status: 'completed',
3377
- output,
3378
- completedAt: state.row.completedAt,
3379
- updatedAt: new Date().toISOString(),
3380
- });
3381
- await this.persistStepOutput(runId, step.name, output);
3382
- this.emit({ type: 'step:completed', runId, stepName: step.name, output });
3383
- }
3384
- catch (apiError) {
3385
- const errorMessage = apiError instanceof Error ? apiError.message : String(apiError);
3386
- state.row.status = 'failed';
3387
- state.row.error = errorMessage;
3388
- state.row.completedAt = new Date().toISOString();
3389
- await this.db.updateStep(state.row.id, {
3390
- status: 'failed',
3391
- error: errorMessage,
3392
- completedAt: state.row.completedAt,
3393
- updatedAt: new Date().toISOString(),
3394
- });
3395
- this.emit({ type: 'step:failed', runId, stepName: step.name, error: errorMessage });
3396
- this.postToChannel(`**[${step.name}]** Failed (api): ${errorMessage}`);
3397
- throw apiError;
3398
- }
3399
- return;
3400
- }
3401
- const usesOwnerFlow = specialistDef.interactive !== false;
3469
+ const usesOwnerFlow = specialistDef.cli !== 'api' && specialistDef.interactive !== false;
3402
3470
  const currentPattern = this.currentConfig?.swarm?.pattern ?? '';
3403
3471
  const isHubPattern = WorkflowRunner.HUB_PATTERNS.has(currentPattern);
3404
3472
  const usesAutoHardening = usesOwnerFlow && isHubPattern && !this.isExplicitInteractiveWorker(specialistDef);
@@ -3422,6 +3490,10 @@ export class WorkflowRunner {
3422
3490
  ownerDef.constraints?.timeoutMs ??
3423
3491
  specialistDef.constraints?.timeoutMs ??
3424
3492
  this.currentConfig?.swarm?.timeoutMs;
3493
+ const repairRetries = errorHandling?.strategy === 'retry' ? (errorHandling.repairRetries ?? 0) : 0;
3494
+ const repairAgent = repairRetries > 0
3495
+ ? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
3496
+ : undefined;
3425
3497
  let lastError;
3426
3498
  let lastExitCode;
3427
3499
  let lastExitSignal;
@@ -3460,6 +3532,20 @@ export class WorkflowRunner {
3460
3532
  updatedAt: new Date().toISOString(),
3461
3533
  });
3462
3534
  await this.trajectory?.stepRetrying(step, attempt, maxRetries);
3535
+ if (repairAgent && attempt <= repairRetries) {
3536
+ await this.runAgentStepRepairAgent({
3537
+ step,
3538
+ agentDef: repairAgent,
3539
+ attempt,
3540
+ maxRetries,
3541
+ cwd: lastEffectiveCwd ?? this.resolveEffectiveCwd(step, specialistDef),
3542
+ error: lastError ?? 'Unknown error',
3543
+ output: this.lastFailedStepOutput.get(step.name) ?? '',
3544
+ exitCode: lastExitCode,
3545
+ exitSignal: lastExitSignal,
3546
+ completionReason: lastCompletionReason,
3547
+ });
3548
+ }
3463
3549
  await this.delay(retryDelay);
3464
3550
  }
3465
3551
  try {
@@ -3584,28 +3670,38 @@ export class WorkflowRunner {
3584
3670
  // executors still take precedence. See process-backend-executor.ts.
3585
3671
  const spawnResult = this.executor
3586
3672
  ? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
3587
- : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs, {
3588
- retryAttempt: attempt,
3589
- evidenceStepName: step.name,
3590
- evidenceRole: usesOwnerFlow ? 'owner' : 'specialist',
3591
- preserveOnIdle: !isHubPattern || !this.isLeadLikeAgent(effectiveOwner) ? false : undefined,
3592
- logicalName: effectiveOwner.name,
3593
- onSpawned: explicitInteractiveWorker
3594
- ? ({ agent }) => {
3595
- explicitWorkerHandle = agent;
3596
- }
3597
- : undefined,
3598
- onChunk: explicitInteractiveWorker
3599
- ? ({ chunk }) => {
3600
- explicitWorkerOutput += WorkflowRunner.stripAnsi(chunk);
3601
- if (!explicitWorkerCompleted &&
3602
- this.hasExplicitInteractiveWorkerCompletionEvidence(step, explicitWorkerOutput, ownerTask, resolvedTask)) {
3603
- explicitWorkerCompleted = true;
3604
- void explicitWorkerHandle?.release().catch(() => undefined);
3673
+ : effectiveOwner.cli === 'api'
3674
+ ? {
3675
+ output: await executeApiStep(effectiveOwner.constraints?.model ?? 'claude-sonnet-4-20250514', ownerTask, {
3676
+ envSecrets: this.envSecrets,
3677
+ skills: effectiveOwner.skills,
3678
+ defaultMaxTokens: effectiveOwner.constraints?.maxTokens,
3679
+ }),
3680
+ exitCode: 0,
3681
+ promptTaskText: ownerTask,
3682
+ }
3683
+ : await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs, {
3684
+ retryAttempt: attempt,
3685
+ evidenceStepName: step.name,
3686
+ evidenceRole: usesOwnerFlow ? 'owner' : 'specialist',
3687
+ preserveOnIdle: !isHubPattern || !this.isLeadLikeAgent(effectiveOwner) ? false : undefined,
3688
+ logicalName: effectiveOwner.name,
3689
+ onSpawned: explicitInteractiveWorker
3690
+ ? ({ agent }) => {
3691
+ explicitWorkerHandle = agent;
3605
3692
  }
3606
- }
3607
- : undefined,
3608
- });
3693
+ : undefined,
3694
+ onChunk: explicitInteractiveWorker
3695
+ ? ({ chunk }) => {
3696
+ explicitWorkerOutput += WorkflowRunner.stripAnsi(chunk);
3697
+ if (!explicitWorkerCompleted &&
3698
+ this.hasExplicitInteractiveWorkerCompletionEvidence(step, explicitWorkerOutput, ownerTask, resolvedTask)) {
3699
+ explicitWorkerCompleted = true;
3700
+ void explicitWorkerHandle?.release().catch(() => undefined);
3701
+ }
3702
+ }
3703
+ : undefined,
3704
+ });
3609
3705
  const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
3610
3706
  promptTaskText =
3611
3707
  typeof spawnResult === 'string'
@@ -3717,6 +3813,9 @@ export class WorkflowRunner {
3717
3813
  catch (err) {
3718
3814
  lastError = err instanceof Error ? err.message : String(err);
3719
3815
  lastCompletionReason = err instanceof WorkflowCompletionError ? err.completionReason : undefined;
3816
+ if (stepOutputForDiagnostic) {
3817
+ this.lastFailedStepOutput.set(step.name, stepOutputForDiagnostic);
3818
+ }
3720
3819
  const diagnosticVerification = step.verification;
3721
3820
  if (err instanceof WorkflowCompletionError &&
3722
3821
  err.completionReason === 'failed_verification' &&
@@ -4059,23 +4158,33 @@ export class WorkflowRunner {
4059
4158
  this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
4060
4159
  const ownerStartTime = Date.now();
4061
4160
  try {
4062
- const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
4063
- agentNameSuffix: 'owner',
4064
- retryAttempt,
4065
- evidenceStepName: step.name,
4066
- evidenceRole: 'owner',
4067
- logicalName: supervised.owner.name,
4068
- onSpawned: ({ actualName }) => {
4069
- this.supervisedRuntimeAgents.set(actualName, {
4070
- stepName: step.name,
4071
- role: 'owner',
4072
- logicalName: supervised.owner.name,
4073
- });
4074
- },
4075
- onChunk: ({ chunk }) => {
4076
- void this.recordOwnerMonitoringChunk(step, supervised.owner, chunk);
4077
- },
4078
- });
4161
+ const ownerResultObj = supervised.owner.cli === 'api'
4162
+ ? {
4163
+ output: await executeApiStep(supervised.owner.constraints?.model ?? 'claude-sonnet-4-20250514', supervisorTask, {
4164
+ envSecrets: this.envSecrets,
4165
+ skills: supervised.owner.skills,
4166
+ defaultMaxTokens: supervised.owner.constraints?.maxTokens,
4167
+ }),
4168
+ exitCode: 0,
4169
+ promptTaskText: supervisorTask,
4170
+ }
4171
+ : await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
4172
+ agentNameSuffix: 'owner',
4173
+ retryAttempt,
4174
+ evidenceStepName: step.name,
4175
+ evidenceRole: 'owner',
4176
+ logicalName: supervised.owner.name,
4177
+ onSpawned: ({ actualName }) => {
4178
+ this.supervisedRuntimeAgents.set(actualName, {
4179
+ stepName: step.name,
4180
+ role: 'owner',
4181
+ logicalName: supervised.owner.name,
4182
+ });
4183
+ },
4184
+ onChunk: ({ chunk }) => {
4185
+ void this.recordOwnerMonitoringChunk(step, supervised.owner, chunk);
4186
+ },
4187
+ });
4079
4188
  const ownerElapsed = Date.now() - ownerStartTime;
4080
4189
  const ownerOutput = ownerResultObj.output;
4081
4190
  this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);