@agent-relay/sdk 6.0.11 → 6.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/agent-relay-broker-win32-x64.exe +0 -0
- package/dist/client.d.ts.map +1 -1
- package/dist/client.js +25 -5
- package/dist/client.js.map +1 -1
- package/dist/github.d.ts +9 -1
- package/dist/github.d.ts.map +1 -1
- package/dist/github.js +9 -1
- package/dist/github.js.map +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -1
- package/dist/slack.d.ts +24 -0
- package/dist/slack.d.ts.map +1 -0
- package/dist/slack.js +24 -0
- package/dist/slack.js.map +1 -0
- package/dist/types.d.ts +21 -6
- package/dist/types.d.ts.map +1 -1
- package/dist/workflows/__tests__/workflow-reliability-contract.test.d.ts +2 -0
- package/dist/workflows/__tests__/workflow-reliability-contract.test.d.ts.map +1 -0
- package/dist/workflows/__tests__/workflow-reliability-contract.test.js +536 -0
- package/dist/workflows/__tests__/workflow-reliability-contract.test.js.map +1 -0
- package/dist/workflows/__tests__/workflow-reliability-e2e.test.d.ts +2 -0
- package/dist/workflows/__tests__/workflow-reliability-e2e.test.d.ts.map +1 -0
- package/dist/workflows/__tests__/workflow-reliability-e2e.test.js +199 -0
- package/dist/workflows/__tests__/workflow-reliability-e2e.test.js.map +1 -0
- package/dist/workflows/builder.d.ts +10 -0
- package/dist/workflows/builder.d.ts.map +1 -1
- package/dist/workflows/builder.js +23 -1
- package/dist/workflows/builder.js.map +1 -1
- package/dist/workflows/runner.d.ts +7 -0
- package/dist/workflows/runner.d.ts.map +1 -1
- package/dist/workflows/runner.js +355 -98
- package/dist/workflows/runner.js.map +1 -1
- package/dist/workflows/types.d.ts +4 -0
- package/dist/workflows/types.d.ts.map +1 -1
- package/package.json +17 -11
package/dist/workflows/runner.js
CHANGED
|
@@ -101,6 +101,9 @@ class SpawnExitError extends Error {
|
|
|
101
101
|
this.exitSignal = exitSignal ?? undefined;
|
|
102
102
|
}
|
|
103
103
|
}
|
|
104
|
+
const DEFAULT_WORKFLOW_MAX_RETRIES = 2;
|
|
105
|
+
const DEFAULT_WORKFLOW_REPAIR_RETRIES = 2;
|
|
106
|
+
const DEFAULT_WORKFLOW_RETRY_DELAY_MS = 1000;
|
|
104
107
|
// ── CLI resolution ───────────────────────────────────────────────────────────
|
|
105
108
|
/**
|
|
106
109
|
* Resolve `cursor` to the concrete cursor agent binary available in PATH.
|
|
@@ -1486,6 +1489,30 @@ export class WorkflowRunner {
|
|
|
1486
1489
|
}
|
|
1487
1490
|
return config;
|
|
1488
1491
|
}
|
|
1492
|
+
applyReliabilityDefaults(config) {
|
|
1493
|
+
const existing = config.errorHandling;
|
|
1494
|
+
if (existing?.strategy === 'fail-fast' || existing?.strategy === 'continue') {
|
|
1495
|
+
return config;
|
|
1496
|
+
}
|
|
1497
|
+
const hasRepairAgentCandidate = (config.agents ?? []).length > 0;
|
|
1498
|
+
const maxRetries = existing?.maxRetries ??
|
|
1499
|
+
existing?.repairRetries ??
|
|
1500
|
+
(existing ? DEFAULT_WORKFLOW_MAX_RETRIES : DEFAULT_WORKFLOW_MAX_RETRIES);
|
|
1501
|
+
const repairRetries = existing?.repairRetries ??
|
|
1502
|
+
(hasRepairAgentCandidate
|
|
1503
|
+
? existing?.maxRetries ?? DEFAULT_WORKFLOW_REPAIR_RETRIES
|
|
1504
|
+
: existing?.repairRetries);
|
|
1505
|
+
return {
|
|
1506
|
+
...config,
|
|
1507
|
+
errorHandling: {
|
|
1508
|
+
...existing,
|
|
1509
|
+
strategy: 'retry',
|
|
1510
|
+
maxRetries,
|
|
1511
|
+
retryDelayMs: existing?.retryDelayMs ?? DEFAULT_WORKFLOW_RETRY_DELAY_MS,
|
|
1512
|
+
...(repairRetries !== undefined ? { repairRetries } : {}),
|
|
1513
|
+
},
|
|
1514
|
+
};
|
|
1515
|
+
}
|
|
1489
1516
|
/** Validate a config object against the RelayYamlConfig shape. */
|
|
1490
1517
|
validateConfig(config, source = '<config>') {
|
|
1491
1518
|
if (typeof config !== 'object' || config === null) {
|
|
@@ -1861,6 +1888,11 @@ export class WorkflowRunner {
|
|
|
1861
1888
|
throw new Error(`${source}: deterministic step "${s.name}" must have a "command" field`);
|
|
1862
1889
|
}
|
|
1863
1890
|
}
|
|
1891
|
+
else if (s.type === 'worktree') {
|
|
1892
|
+
if (typeof s.branch !== 'string' || s.branch.trim().length === 0) {
|
|
1893
|
+
throw new Error(`${source}: worktree step "${s.name}" must have a "branch" string field`);
|
|
1894
|
+
}
|
|
1895
|
+
}
|
|
1864
1896
|
else if (s.type === 'integration') {
|
|
1865
1897
|
// Integration steps require integration and action
|
|
1866
1898
|
if (typeof s.integration !== 'string') {
|
|
@@ -2085,7 +2117,8 @@ export class WorkflowRunner {
|
|
|
2085
2117
|
const resolved = this.applyPermissionProfiles(vars ? this.resolveVariables(config, vars) : config);
|
|
2086
2118
|
// Validate config (catches cycles, missing deps, invalid steps, etc.)
|
|
2087
2119
|
this.validateConfig(resolved);
|
|
2088
|
-
const
|
|
2120
|
+
const runtimeConfig = this.applyReliabilityDefaults(resolved);
|
|
2121
|
+
const permissionResult = this.validatePermissions(runtimeConfig.agents, runtimeConfig.permission_profiles);
|
|
2089
2122
|
if (permissionResult.errors.length > 0) {
|
|
2090
2123
|
throw new Error(`Permission validation failed:\n ${permissionResult.errors.join('\n ')}`);
|
|
2091
2124
|
}
|
|
@@ -2093,7 +2126,7 @@ export class WorkflowRunner {
|
|
|
2093
2126
|
console.warn(`[WorkflowRunner] Warning: ${warning}`);
|
|
2094
2127
|
}
|
|
2095
2128
|
// Resolve and validate named paths from the top-level `paths` config
|
|
2096
|
-
const pathResult = this.resolvePathDefinitions(
|
|
2129
|
+
const pathResult = this.resolvePathDefinitions(runtimeConfig.paths, this.cwd);
|
|
2097
2130
|
if (pathResult.errors.length > 0) {
|
|
2098
2131
|
throw new Error(`Path validation failed:\n ${pathResult.errors.join('\n ')}`);
|
|
2099
2132
|
}
|
|
@@ -2103,7 +2136,7 @@ export class WorkflowRunner {
|
|
|
2103
2136
|
console.log(`[workflow] path "${name}" → ${abs}`);
|
|
2104
2137
|
}
|
|
2105
2138
|
}
|
|
2106
|
-
const workflows =
|
|
2139
|
+
const workflows = runtimeConfig.workflows ?? [];
|
|
2107
2140
|
const workflow = workflowName ? workflows.find((w) => w.name === workflowName) : workflows[0];
|
|
2108
2141
|
if (!workflow) {
|
|
2109
2142
|
throw new Error(workflowName ? `Workflow "${workflowName}" not found in config` : 'No workflows defined in config');
|
|
@@ -2118,9 +2151,9 @@ export class WorkflowRunner {
|
|
|
2118
2151
|
id: runId,
|
|
2119
2152
|
workspaceId: this.workspaceId,
|
|
2120
2153
|
workflowName: resolvedWorkflow.name,
|
|
2121
|
-
pattern:
|
|
2154
|
+
pattern: runtimeConfig.swarm.pattern,
|
|
2122
2155
|
status: 'pending',
|
|
2123
|
-
config:
|
|
2156
|
+
config: runtimeConfig,
|
|
2124
2157
|
startedAt: now,
|
|
2125
2158
|
createdAt: now,
|
|
2126
2159
|
updatedAt: now,
|
|
@@ -2191,7 +2224,7 @@ export class WorkflowRunner {
|
|
|
2191
2224
|
return this.runWorkflowCore({
|
|
2192
2225
|
run,
|
|
2193
2226
|
workflow: resolvedWorkflow,
|
|
2194
|
-
config:
|
|
2227
|
+
config: runtimeConfig,
|
|
2195
2228
|
stepStates,
|
|
2196
2229
|
isResume: false,
|
|
2197
2230
|
});
|
|
@@ -2220,7 +2253,7 @@ export class WorkflowRunner {
|
|
|
2220
2253
|
if (run.status !== 'running' && run.status !== 'failed') {
|
|
2221
2254
|
throw new Error(`Run "${runId}" is in status "${run.status}" and cannot be resumed`);
|
|
2222
2255
|
}
|
|
2223
|
-
const resolvedConfig = vars ? this.resolveVariables(run.config, vars) : run.config;
|
|
2256
|
+
const resolvedConfig = this.applyReliabilityDefaults(vars ? this.resolveVariables(run.config, vars) : run.config);
|
|
2224
2257
|
// Resolve path definitions (same as execute()) so workdir lookups work on resume
|
|
2225
2258
|
const pathResult = this.resolvePathDefinitions(resolvedConfig.paths, this.cwd);
|
|
2226
2259
|
if (pathResult.errors.length > 0) {
|
|
@@ -2799,7 +2832,7 @@ export class WorkflowRunner {
|
|
|
2799
2832
|
async executeStep(step, state, stepStates, agentMap, errorHandling, runId, lifecycle) {
|
|
2800
2833
|
// Branch: deterministic steps execute shell commands
|
|
2801
2834
|
if (this.isDeterministicStep(step)) {
|
|
2802
|
-
return this.executeDeterministicStep(step, state, stepStates, runId, errorHandling, lifecycle);
|
|
2835
|
+
return this.executeDeterministicStep(step, state, stepStates, agentMap, runId, errorHandling, lifecycle);
|
|
2803
2836
|
}
|
|
2804
2837
|
// Branch: worktree steps set up git worktrees
|
|
2805
2838
|
if (this.isWorktreeStep(step)) {
|
|
@@ -2816,13 +2849,20 @@ export class WorkflowRunner {
|
|
|
2816
2849
|
* Execute a deterministic step (shell command).
|
|
2817
2850
|
* Fast, reliable, $0 LLM cost.
|
|
2818
2851
|
*/
|
|
2819
|
-
async executeDeterministicStep(step, state, stepStates, runId, errorHandling, lifecycle) {
|
|
2820
|
-
const
|
|
2852
|
+
async executeDeterministicStep(step, state, stepStates, agentMap, runId, errorHandling, lifecycle) {
|
|
2853
|
+
const repairRetries = errorHandling?.strategy === 'retry' ? errorHandling.repairRetries ?? 0 : 0;
|
|
2854
|
+
const repairAgent = repairRetries > 0
|
|
2855
|
+
? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
|
|
2856
|
+
: undefined;
|
|
2857
|
+
const maxRetries = step.retries ?? errorHandling?.maxRetries ?? (repairAgent ? repairRetries : 0);
|
|
2821
2858
|
const retryDelay = errorHandling?.retryDelayMs ?? 1000;
|
|
2822
2859
|
let lastError = 'Unknown error';
|
|
2823
2860
|
let lastCompletionReason;
|
|
2824
2861
|
let lastExitCode;
|
|
2825
2862
|
let lastExitSignal;
|
|
2863
|
+
let lastResolvedCommand = step.command ?? '';
|
|
2864
|
+
let lastStepCwd = this.cwd;
|
|
2865
|
+
let lastCommandOutput = '';
|
|
2826
2866
|
const result = await lifecycle.monitorStep(step, state, {
|
|
2827
2867
|
maxRetries,
|
|
2828
2868
|
retryDelayMs: retryDelay,
|
|
@@ -2835,6 +2875,20 @@ export class WorkflowRunner {
|
|
|
2835
2875
|
detail: `Retrying attempt ${attempt + 1}/${total + 1}`,
|
|
2836
2876
|
raw: { attempt, maxRetries: total },
|
|
2837
2877
|
});
|
|
2878
|
+
if (repairAgent) {
|
|
2879
|
+
await this.runDeterministicRepairAgent({
|
|
2880
|
+
step,
|
|
2881
|
+
agentDef: repairAgent,
|
|
2882
|
+
attempt,
|
|
2883
|
+
maxRetries: total,
|
|
2884
|
+
command: lastResolvedCommand,
|
|
2885
|
+
cwd: lastStepCwd,
|
|
2886
|
+
error: lastError,
|
|
2887
|
+
output: lastCommandOutput,
|
|
2888
|
+
exitCode: lastExitCode,
|
|
2889
|
+
exitSignal: lastExitSignal,
|
|
2890
|
+
});
|
|
2891
|
+
}
|
|
2838
2892
|
},
|
|
2839
2893
|
execute: async () => {
|
|
2840
2894
|
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
@@ -2846,12 +2900,15 @@ export class WorkflowRunner {
|
|
|
2846
2900
|
return value !== undefined ? String(value) : _match;
|
|
2847
2901
|
});
|
|
2848
2902
|
const stepCwd = this.resolveEffectiveCwd(step);
|
|
2903
|
+
lastResolvedCommand = resolvedCommand;
|
|
2904
|
+
lastStepCwd = stepCwd;
|
|
2849
2905
|
this.beginStepEvidence(step.name, [stepCwd], state.row.startedAt);
|
|
2850
2906
|
this.log(`[${step.name}] Running: ${resolvedCommand.slice(0, 200)}${resolvedCommand.length > 200 ? '...' : ''}`);
|
|
2851
2907
|
if (this.executor?.executeDeterministicStep) {
|
|
2852
2908
|
const executorResult = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
|
|
2853
2909
|
lastExitCode = executorResult.exitCode;
|
|
2854
2910
|
lastExitSignal = undefined;
|
|
2911
|
+
lastCommandOutput = executorResult.output;
|
|
2855
2912
|
const failOnError = step.failOnError !== false;
|
|
2856
2913
|
if (failOnError && executorResult.exitCode !== 0) {
|
|
2857
2914
|
this.log(`[${step.name}] Command failed (exit code ${executorResult.exitCode})`);
|
|
@@ -2926,6 +2983,7 @@ export class WorkflowRunner {
|
|
|
2926
2983
|
commandStderr = stderr;
|
|
2927
2984
|
lastExitCode = code ?? undefined;
|
|
2928
2985
|
lastExitSignal = signal ?? undefined;
|
|
2986
|
+
lastCommandOutput = [stdout, stderr].filter(Boolean).join('\n');
|
|
2929
2987
|
const failOnError = step.failOnError !== false;
|
|
2930
2988
|
if (failOnError && code !== 0 && code !== null) {
|
|
2931
2989
|
this.log(`[${step.name}] Command failed (exit code ${code})`);
|
|
@@ -2957,6 +3015,7 @@ export class WorkflowRunner {
|
|
|
2957
3015
|
const verificationResult = step.verification
|
|
2958
3016
|
? this.runVerification(step.verification, output, step.name)
|
|
2959
3017
|
: undefined;
|
|
3018
|
+
lastCommandOutput = [commandStdout || output, commandStderr].filter(Boolean).join('\n');
|
|
2960
3019
|
return {
|
|
2961
3020
|
output,
|
|
2962
3021
|
completionReason: verificationResult?.completionReason,
|
|
@@ -2989,6 +3048,212 @@ export class WorkflowRunner {
|
|
|
2989
3048
|
throw new Error(`Step "${step.name}" failed: ${result.error ?? 'Unknown error'}`);
|
|
2990
3049
|
}
|
|
2991
3050
|
}
|
|
3051
|
+
resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling) {
|
|
3052
|
+
const explicitName = errorHandling?.repairAgent?.trim();
|
|
3053
|
+
if (explicitName) {
|
|
3054
|
+
const explicitAgent = agentMap.get(explicitName);
|
|
3055
|
+
if (explicitAgent)
|
|
3056
|
+
return WorkflowRunner.resolveAgentDef(explicitAgent);
|
|
3057
|
+
this.log(`[${step.name}] repairAgent "${explicitName}" not found; falling back to workflow agents`);
|
|
3058
|
+
}
|
|
3059
|
+
if (step.agent) {
|
|
3060
|
+
const stepAgent = agentMap.get(step.agent);
|
|
3061
|
+
if (stepAgent)
|
|
3062
|
+
return WorkflowRunner.resolveAgentDef(stepAgent);
|
|
3063
|
+
}
|
|
3064
|
+
for (const dependency of [...(step.dependsOn ?? [])].reverse()) {
|
|
3065
|
+
const dependencyAgent = stepStates.get(dependency)?.row.agentName;
|
|
3066
|
+
if (!dependencyAgent)
|
|
3067
|
+
continue;
|
|
3068
|
+
const agent = agentMap.get(dependencyAgent);
|
|
3069
|
+
if (agent)
|
|
3070
|
+
return WorkflowRunner.resolveAgentDef(agent);
|
|
3071
|
+
}
|
|
3072
|
+
const candidates = [...agentMap.values()].map((agent) => WorkflowRunner.resolveAgentDef(agent));
|
|
3073
|
+
candidates.sort((a, b) => this.scoreRepairAgent(b) - this.scoreRepairAgent(a));
|
|
3074
|
+
return candidates[0];
|
|
3075
|
+
}
|
|
3076
|
+
scoreRepairAgent(agent) {
|
|
3077
|
+
const text = `${agent.name} ${agent.role ?? ''} ${agent.preset ?? ''}`.toLowerCase();
|
|
3078
|
+
let score = 0;
|
|
3079
|
+
if (/\b(repair|fix|implement|implementation|engineer|developer|coder|worker|owner|lead|coordinator)\b/.test(text)) {
|
|
3080
|
+
score += 10;
|
|
3081
|
+
}
|
|
3082
|
+
if (agent.interactive === false || ['worker', 'analyst'].includes(agent.preset ?? '')) {
|
|
3083
|
+
score += 2;
|
|
3084
|
+
}
|
|
3085
|
+
if (/\b(review|reviewer|audit|security|analyst)\b/.test(text)) {
|
|
3086
|
+
score -= 4;
|
|
3087
|
+
}
|
|
3088
|
+
if (agent.permissions?.access === 'readonly') {
|
|
3089
|
+
score -= 20;
|
|
3090
|
+
}
|
|
3091
|
+
return score;
|
|
3092
|
+
}
|
|
3093
|
+
async runDeterministicRepairAgent(context) {
|
|
3094
|
+
const repairAgent = {
|
|
3095
|
+
...context.agentDef,
|
|
3096
|
+
interactive: false,
|
|
3097
|
+
};
|
|
3098
|
+
const repairPrompt = this.buildDeterministicRepairPrompt(context);
|
|
3099
|
+
const repairStep = {
|
|
3100
|
+
name: `${context.step.name}-repair-${context.attempt}`,
|
|
3101
|
+
type: 'agent',
|
|
3102
|
+
agent: repairAgent.name,
|
|
3103
|
+
task: repairPrompt,
|
|
3104
|
+
cwd: context.cwd,
|
|
3105
|
+
workdir: undefined,
|
|
3106
|
+
retries: 0,
|
|
3107
|
+
};
|
|
3108
|
+
const timeoutMs = repairAgent.constraints?.timeoutMs ?? context.step.timeoutMs ?? this.currentConfig?.swarm?.timeoutMs;
|
|
3109
|
+
this.log(`[${context.step.name}] Deterministic gate failed; asking "${repairAgent.name}" to repair before retry ${context.attempt + 1}/${context.maxRetries + 1}`);
|
|
3110
|
+
this.postToChannel(`**[${context.step.name}]** Deterministic gate failed; assigning repair to \`${repairAgent.name}\``);
|
|
3111
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3112
|
+
type: 'custom',
|
|
3113
|
+
detail: `Assigned deterministic gate repair to ${repairAgent.name}`,
|
|
3114
|
+
raw: {
|
|
3115
|
+
repairAgent: repairAgent.name,
|
|
3116
|
+
attempt: context.attempt,
|
|
3117
|
+
maxRetries: context.maxRetries,
|
|
3118
|
+
exitCode: context.exitCode,
|
|
3119
|
+
exitSignal: context.exitSignal,
|
|
3120
|
+
},
|
|
3121
|
+
});
|
|
3122
|
+
try {
|
|
3123
|
+
this.ensureBudgetAllowsSpawn(context.step.name, repairAgent.name);
|
|
3124
|
+
let repairOutput;
|
|
3125
|
+
if (this.executor) {
|
|
3126
|
+
repairOutput = await this.executor.executeAgentStep(repairStep, repairAgent, repairPrompt, timeoutMs);
|
|
3127
|
+
}
|
|
3128
|
+
else if (repairAgent.cli === 'api') {
|
|
3129
|
+
repairOutput = await executeApiStep(repairAgent.constraints?.model ?? 'claude-sonnet-4-20250514', repairPrompt, {
|
|
3130
|
+
envSecrets: this.envSecrets,
|
|
3131
|
+
skills: repairAgent.skills,
|
|
3132
|
+
defaultMaxTokens: repairAgent.constraints?.maxTokens,
|
|
3133
|
+
});
|
|
3134
|
+
}
|
|
3135
|
+
else {
|
|
3136
|
+
const result = await this.execNonInteractive(repairAgent, repairStep, timeoutMs);
|
|
3137
|
+
repairOutput = result.output;
|
|
3138
|
+
}
|
|
3139
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3140
|
+
type: 'custom',
|
|
3141
|
+
detail: `Repair agent ${repairAgent.name} completed before deterministic retry`,
|
|
3142
|
+
raw: { repairAgent: repairAgent.name, output: repairOutput.slice(0, 1000) },
|
|
3143
|
+
});
|
|
3144
|
+
}
|
|
3145
|
+
catch (error) {
|
|
3146
|
+
if (error instanceof BudgetExceededError || this.abortController?.signal.aborted) {
|
|
3147
|
+
throw error;
|
|
3148
|
+
}
|
|
3149
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3150
|
+
this.log(`[${context.step.name}] Repair agent "${repairAgent.name}" failed: ${message}`);
|
|
3151
|
+
this.postToChannel(`**[${context.step.name}]** Repair agent \`${repairAgent.name}\` failed; retrying gate anyway`);
|
|
3152
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3153
|
+
type: 'custom',
|
|
3154
|
+
detail: `Repair agent ${repairAgent.name} failed before deterministic retry: ${message}`,
|
|
3155
|
+
raw: { repairAgent: repairAgent.name, error: message },
|
|
3156
|
+
});
|
|
3157
|
+
}
|
|
3158
|
+
}
|
|
3159
|
+
buildDeterministicRepairPrompt(context) {
|
|
3160
|
+
const output = context.output.trim();
|
|
3161
|
+
const clippedOutput = output.length > 4000 ? output.slice(-4000) : output;
|
|
3162
|
+
return (`A deterministic workflow gate failed after an agent/team step. Fix the repository or workflow state so the same gate passes on the next retry.\n\n` +
|
|
3163
|
+
`Step: ${context.step.name}\n` +
|
|
3164
|
+
`Working directory: ${context.cwd}\n` +
|
|
3165
|
+
`Command:\n${context.command}\n\n` +
|
|
3166
|
+
`Failure:\n${context.error}\n` +
|
|
3167
|
+
`Exit code: ${context.exitCode ?? 'unknown'}\n` +
|
|
3168
|
+
`Exit signal: ${context.exitSignal ?? 'none'}\n\n` +
|
|
3169
|
+
`Command output:\n${clippedOutput || '(no output captured)'}\n\n` +
|
|
3170
|
+
`Repair only what is needed for this gate to pass. Preserve unrelated user changes. ` +
|
|
3171
|
+
`After making the fix, report the files changed and the reason the gate should pass.`);
|
|
3172
|
+
}
|
|
3173
|
+
async runAgentStepRepairAgent(context) {
|
|
3174
|
+
const repairAgent = {
|
|
3175
|
+
...context.agentDef,
|
|
3176
|
+
interactive: false,
|
|
3177
|
+
};
|
|
3178
|
+
const repairPrompt = this.buildAgentStepRepairPrompt(context);
|
|
3179
|
+
const repairStep = {
|
|
3180
|
+
name: `${context.step.name}-repair-${context.attempt}`,
|
|
3181
|
+
type: 'agent',
|
|
3182
|
+
agent: repairAgent.name,
|
|
3183
|
+
task: repairPrompt,
|
|
3184
|
+
cwd: context.cwd,
|
|
3185
|
+
workdir: undefined,
|
|
3186
|
+
retries: 0,
|
|
3187
|
+
};
|
|
3188
|
+
const timeoutMs = repairAgent.constraints?.timeoutMs ?? context.step.timeoutMs ?? this.currentConfig?.swarm?.timeoutMs;
|
|
3189
|
+
this.log(`[${context.step.name}] Agent step failed; asking "${repairAgent.name}" to repair before retry ${context.attempt + 1}/${context.maxRetries + 1}`);
|
|
3190
|
+
this.postToChannel(`**[${context.step.name}]** Agent step failed; assigning repair to \`${repairAgent.name}\``);
|
|
3191
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3192
|
+
type: 'custom',
|
|
3193
|
+
detail: `Assigned agent-step repair to ${repairAgent.name}`,
|
|
3194
|
+
raw: {
|
|
3195
|
+
repairAgent: repairAgent.name,
|
|
3196
|
+
attempt: context.attempt,
|
|
3197
|
+
maxRetries: context.maxRetries,
|
|
3198
|
+
completionReason: context.completionReason,
|
|
3199
|
+
exitCode: context.exitCode,
|
|
3200
|
+
exitSignal: context.exitSignal,
|
|
3201
|
+
},
|
|
3202
|
+
});
|
|
3203
|
+
try {
|
|
3204
|
+
this.ensureBudgetAllowsSpawn(context.step.name, repairAgent.name);
|
|
3205
|
+
let repairOutput;
|
|
3206
|
+
if (this.executor) {
|
|
3207
|
+
repairOutput = await this.executor.executeAgentStep(repairStep, repairAgent, repairPrompt, timeoutMs);
|
|
3208
|
+
}
|
|
3209
|
+
else if (repairAgent.cli === 'api') {
|
|
3210
|
+
repairOutput = await executeApiStep(repairAgent.constraints?.model ?? 'claude-sonnet-4-20250514', repairPrompt, {
|
|
3211
|
+
envSecrets: this.envSecrets,
|
|
3212
|
+
skills: repairAgent.skills,
|
|
3213
|
+
defaultMaxTokens: repairAgent.constraints?.maxTokens,
|
|
3214
|
+
});
|
|
3215
|
+
}
|
|
3216
|
+
else {
|
|
3217
|
+
const result = await this.execNonInteractive(repairAgent, repairStep, timeoutMs);
|
|
3218
|
+
repairOutput = result.output;
|
|
3219
|
+
}
|
|
3220
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3221
|
+
type: 'custom',
|
|
3222
|
+
detail: `Repair agent ${repairAgent.name} completed before agent retry`,
|
|
3223
|
+
raw: { repairAgent: repairAgent.name, output: repairOutput.slice(0, 1000) },
|
|
3224
|
+
});
|
|
3225
|
+
}
|
|
3226
|
+
catch (error) {
|
|
3227
|
+
if (error instanceof BudgetExceededError || this.abortController?.signal.aborted) {
|
|
3228
|
+
throw error;
|
|
3229
|
+
}
|
|
3230
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3231
|
+
this.log(`[${context.step.name}] Repair agent "${repairAgent.name}" failed: ${message}`);
|
|
3232
|
+
this.postToChannel(`**[${context.step.name}]** Repair agent \`${repairAgent.name}\` failed; retrying agent step anyway`);
|
|
3233
|
+
this.recordStepToolSideEffect(context.step.name, {
|
|
3234
|
+
type: 'custom',
|
|
3235
|
+
detail: `Repair agent ${repairAgent.name} failed before agent retry: ${message}`,
|
|
3236
|
+
raw: { repairAgent: repairAgent.name, error: message },
|
|
3237
|
+
});
|
|
3238
|
+
}
|
|
3239
|
+
}
|
|
3240
|
+
buildAgentStepRepairPrompt(context) {
|
|
3241
|
+
const output = context.output.trim();
|
|
3242
|
+
const clippedOutput = output.length > 4000 ? output.slice(-4000) : output;
|
|
3243
|
+
const task = (context.step.task ?? '').trim();
|
|
3244
|
+
const clippedTask = task.length > 3000 ? task.slice(0, 3000) : task;
|
|
3245
|
+
return (`A workflow agent step failed or produced an invalid artifact. Repair the repository, workflow state, or step instructions so the step can succeed on the next retry.\n\n` +
|
|
3246
|
+
`Step: ${context.step.name}\n` +
|
|
3247
|
+
`Working directory: ${context.cwd}\n` +
|
|
3248
|
+
`Completion reason: ${context.completionReason ?? 'unknown'}\n` +
|
|
3249
|
+
`Failure:\n${context.error}\n` +
|
|
3250
|
+
`Exit code: ${context.exitCode ?? 'unknown'}\n` +
|
|
3251
|
+
`Exit signal: ${context.exitSignal ?? 'none'}\n\n` +
|
|
3252
|
+
`Step task:\n${clippedTask || '(no task captured)'}\n\n` +
|
|
3253
|
+
`Previous output:\n${clippedOutput || '(no output captured)'}\n\n` +
|
|
3254
|
+
`Repair only what is needed for this step to produce the required artifact or evidence. ` +
|
|
3255
|
+
`Preserve unrelated user changes. After making the fix, report the files changed and why the retry should pass.`);
|
|
3256
|
+
}
|
|
2992
3257
|
/**
|
|
2993
3258
|
* Execute a worktree step (git worktree setup).
|
|
2994
3259
|
* Fast, reliable, $0 LLM cost.
|
|
@@ -3201,56 +3466,7 @@ export class WorkflowRunner {
|
|
|
3201
3466
|
throw new Error(`Agent "${agentName}" not found in config`);
|
|
3202
3467
|
}
|
|
3203
3468
|
const specialistDef = WorkflowRunner.resolveAgentDef(rawAgentDef);
|
|
3204
|
-
|
|
3205
|
-
if (specialistDef.cli === 'api') {
|
|
3206
|
-
this.ensureBudgetAllowsSpawn(step.name, agentName);
|
|
3207
|
-
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
3208
|
-
const resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
|
|
3209
|
-
state.row.status = 'running';
|
|
3210
|
-
state.row.startedAt = new Date().toISOString();
|
|
3211
|
-
await this.db.updateStep(state.row.id, {
|
|
3212
|
-
status: 'running',
|
|
3213
|
-
startedAt: state.row.startedAt,
|
|
3214
|
-
updatedAt: new Date().toISOString(),
|
|
3215
|
-
});
|
|
3216
|
-
this.emit({ type: 'step:started', runId, stepName: step.name });
|
|
3217
|
-
this.postToChannel(`**[${step.name}]** Started (api)`);
|
|
3218
|
-
try {
|
|
3219
|
-
const output = await executeApiStep(specialistDef.constraints?.model ?? 'claude-sonnet-4-20250514', resolvedTask, {
|
|
3220
|
-
envSecrets: this.envSecrets,
|
|
3221
|
-
skills: specialistDef.skills,
|
|
3222
|
-
defaultMaxTokens: specialistDef.constraints?.maxTokens,
|
|
3223
|
-
});
|
|
3224
|
-
state.row.status = 'completed';
|
|
3225
|
-
state.row.output = output;
|
|
3226
|
-
state.row.completedAt = new Date().toISOString();
|
|
3227
|
-
await this.db.updateStep(state.row.id, {
|
|
3228
|
-
status: 'completed',
|
|
3229
|
-
output,
|
|
3230
|
-
completedAt: state.row.completedAt,
|
|
3231
|
-
updatedAt: new Date().toISOString(),
|
|
3232
|
-
});
|
|
3233
|
-
await this.persistStepOutput(runId, step.name, output);
|
|
3234
|
-
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
3235
|
-
}
|
|
3236
|
-
catch (apiError) {
|
|
3237
|
-
const errorMessage = apiError instanceof Error ? apiError.message : String(apiError);
|
|
3238
|
-
state.row.status = 'failed';
|
|
3239
|
-
state.row.error = errorMessage;
|
|
3240
|
-
state.row.completedAt = new Date().toISOString();
|
|
3241
|
-
await this.db.updateStep(state.row.id, {
|
|
3242
|
-
status: 'failed',
|
|
3243
|
-
error: errorMessage,
|
|
3244
|
-
completedAt: state.row.completedAt,
|
|
3245
|
-
updatedAt: new Date().toISOString(),
|
|
3246
|
-
});
|
|
3247
|
-
this.emit({ type: 'step:failed', runId, stepName: step.name, error: errorMessage });
|
|
3248
|
-
this.postToChannel(`**[${step.name}]** Failed (api): ${errorMessage}`);
|
|
3249
|
-
throw apiError;
|
|
3250
|
-
}
|
|
3251
|
-
return;
|
|
3252
|
-
}
|
|
3253
|
-
const usesOwnerFlow = specialistDef.interactive !== false;
|
|
3469
|
+
const usesOwnerFlow = specialistDef.cli !== 'api' && specialistDef.interactive !== false;
|
|
3254
3470
|
const currentPattern = this.currentConfig?.swarm?.pattern ?? '';
|
|
3255
3471
|
const isHubPattern = WorkflowRunner.HUB_PATTERNS.has(currentPattern);
|
|
3256
3472
|
const usesAutoHardening = usesOwnerFlow && isHubPattern && !this.isExplicitInteractiveWorker(specialistDef);
|
|
@@ -3274,6 +3490,10 @@ export class WorkflowRunner {
|
|
|
3274
3490
|
ownerDef.constraints?.timeoutMs ??
|
|
3275
3491
|
specialistDef.constraints?.timeoutMs ??
|
|
3276
3492
|
this.currentConfig?.swarm?.timeoutMs;
|
|
3493
|
+
const repairRetries = errorHandling?.strategy === 'retry' ? (errorHandling.repairRetries ?? 0) : 0;
|
|
3494
|
+
const repairAgent = repairRetries > 0
|
|
3495
|
+
? this.resolveWorkflowRepairAgent(step, stepStates, agentMap, errorHandling)
|
|
3496
|
+
: undefined;
|
|
3277
3497
|
let lastError;
|
|
3278
3498
|
let lastExitCode;
|
|
3279
3499
|
let lastExitSignal;
|
|
@@ -3312,6 +3532,20 @@ export class WorkflowRunner {
|
|
|
3312
3532
|
updatedAt: new Date().toISOString(),
|
|
3313
3533
|
});
|
|
3314
3534
|
await this.trajectory?.stepRetrying(step, attempt, maxRetries);
|
|
3535
|
+
if (repairAgent && attempt <= repairRetries) {
|
|
3536
|
+
await this.runAgentStepRepairAgent({
|
|
3537
|
+
step,
|
|
3538
|
+
agentDef: repairAgent,
|
|
3539
|
+
attempt,
|
|
3540
|
+
maxRetries,
|
|
3541
|
+
cwd: lastEffectiveCwd ?? this.resolveEffectiveCwd(step, specialistDef),
|
|
3542
|
+
error: lastError ?? 'Unknown error',
|
|
3543
|
+
output: this.lastFailedStepOutput.get(step.name) ?? '',
|
|
3544
|
+
exitCode: lastExitCode,
|
|
3545
|
+
exitSignal: lastExitSignal,
|
|
3546
|
+
completionReason: lastCompletionReason,
|
|
3547
|
+
});
|
|
3548
|
+
}
|
|
3315
3549
|
await this.delay(retryDelay);
|
|
3316
3550
|
}
|
|
3317
3551
|
try {
|
|
@@ -3436,28 +3670,38 @@ export class WorkflowRunner {
|
|
|
3436
3670
|
// executors still take precedence. See process-backend-executor.ts.
|
|
3437
3671
|
const spawnResult = this.executor
|
|
3438
3672
|
? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
|
|
3439
|
-
:
|
|
3440
|
-
|
|
3441
|
-
|
|
3442
|
-
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
|
|
3454
|
-
|
|
3455
|
-
|
|
3456
|
-
|
|
3673
|
+
: effectiveOwner.cli === 'api'
|
|
3674
|
+
? {
|
|
3675
|
+
output: await executeApiStep(effectiveOwner.constraints?.model ?? 'claude-sonnet-4-20250514', ownerTask, {
|
|
3676
|
+
envSecrets: this.envSecrets,
|
|
3677
|
+
skills: effectiveOwner.skills,
|
|
3678
|
+
defaultMaxTokens: effectiveOwner.constraints?.maxTokens,
|
|
3679
|
+
}),
|
|
3680
|
+
exitCode: 0,
|
|
3681
|
+
promptTaskText: ownerTask,
|
|
3682
|
+
}
|
|
3683
|
+
: await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs, {
|
|
3684
|
+
retryAttempt: attempt,
|
|
3685
|
+
evidenceStepName: step.name,
|
|
3686
|
+
evidenceRole: usesOwnerFlow ? 'owner' : 'specialist',
|
|
3687
|
+
preserveOnIdle: !isHubPattern || !this.isLeadLikeAgent(effectiveOwner) ? false : undefined,
|
|
3688
|
+
logicalName: effectiveOwner.name,
|
|
3689
|
+
onSpawned: explicitInteractiveWorker
|
|
3690
|
+
? ({ agent }) => {
|
|
3691
|
+
explicitWorkerHandle = agent;
|
|
3457
3692
|
}
|
|
3458
|
-
|
|
3459
|
-
:
|
|
3460
|
-
|
|
3693
|
+
: undefined,
|
|
3694
|
+
onChunk: explicitInteractiveWorker
|
|
3695
|
+
? ({ chunk }) => {
|
|
3696
|
+
explicitWorkerOutput += WorkflowRunner.stripAnsi(chunk);
|
|
3697
|
+
if (!explicitWorkerCompleted &&
|
|
3698
|
+
this.hasExplicitInteractiveWorkerCompletionEvidence(step, explicitWorkerOutput, ownerTask, resolvedTask)) {
|
|
3699
|
+
explicitWorkerCompleted = true;
|
|
3700
|
+
void explicitWorkerHandle?.release().catch(() => undefined);
|
|
3701
|
+
}
|
|
3702
|
+
}
|
|
3703
|
+
: undefined,
|
|
3704
|
+
});
|
|
3461
3705
|
const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
|
|
3462
3706
|
promptTaskText =
|
|
3463
3707
|
typeof spawnResult === 'string'
|
|
@@ -3569,6 +3813,9 @@ export class WorkflowRunner {
|
|
|
3569
3813
|
catch (err) {
|
|
3570
3814
|
lastError = err instanceof Error ? err.message : String(err);
|
|
3571
3815
|
lastCompletionReason = err instanceof WorkflowCompletionError ? err.completionReason : undefined;
|
|
3816
|
+
if (stepOutputForDiagnostic) {
|
|
3817
|
+
this.lastFailedStepOutput.set(step.name, stepOutputForDiagnostic);
|
|
3818
|
+
}
|
|
3572
3819
|
const diagnosticVerification = step.verification;
|
|
3573
3820
|
if (err instanceof WorkflowCompletionError &&
|
|
3574
3821
|
err.completionReason === 'failed_verification' &&
|
|
@@ -3911,23 +4158,33 @@ export class WorkflowRunner {
|
|
|
3911
4158
|
this.log(`[${step.name}] Spawning owner "${supervised.owner.name}" (cli: ${supervised.owner.cli})`);
|
|
3912
4159
|
const ownerStartTime = Date.now();
|
|
3913
4160
|
try {
|
|
3914
|
-
const ownerResultObj =
|
|
3915
|
-
|
|
3916
|
-
|
|
3917
|
-
|
|
3918
|
-
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
|
|
3923
|
-
|
|
3924
|
-
|
|
3925
|
-
|
|
3926
|
-
|
|
3927
|
-
|
|
3928
|
-
|
|
3929
|
-
|
|
3930
|
-
|
|
4161
|
+
const ownerResultObj = supervised.owner.cli === 'api'
|
|
4162
|
+
? {
|
|
4163
|
+
output: await executeApiStep(supervised.owner.constraints?.model ?? 'claude-sonnet-4-20250514', supervisorTask, {
|
|
4164
|
+
envSecrets: this.envSecrets,
|
|
4165
|
+
skills: supervised.owner.skills,
|
|
4166
|
+
defaultMaxTokens: supervised.owner.constraints?.maxTokens,
|
|
4167
|
+
}),
|
|
4168
|
+
exitCode: 0,
|
|
4169
|
+
promptTaskText: supervisorTask,
|
|
4170
|
+
}
|
|
4171
|
+
: await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
|
|
4172
|
+
agentNameSuffix: 'owner',
|
|
4173
|
+
retryAttempt,
|
|
4174
|
+
evidenceStepName: step.name,
|
|
4175
|
+
evidenceRole: 'owner',
|
|
4176
|
+
logicalName: supervised.owner.name,
|
|
4177
|
+
onSpawned: ({ actualName }) => {
|
|
4178
|
+
this.supervisedRuntimeAgents.set(actualName, {
|
|
4179
|
+
stepName: step.name,
|
|
4180
|
+
role: 'owner',
|
|
4181
|
+
logicalName: supervised.owner.name,
|
|
4182
|
+
});
|
|
4183
|
+
},
|
|
4184
|
+
onChunk: ({ chunk }) => {
|
|
4185
|
+
void this.recordOwnerMonitoringChunk(step, supervised.owner, chunk);
|
|
4186
|
+
},
|
|
4187
|
+
});
|
|
3931
4188
|
const ownerElapsed = Date.now() - ownerStartTime;
|
|
3932
4189
|
const ownerOutput = ownerResultObj.output;
|
|
3933
4190
|
this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
|