agent-relay 3.1.19 → 3.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -1
- package/bin/agent-relay-broker-darwin-arm64 +0 -0
- package/bin/agent-relay-broker-darwin-x64 +0 -0
- package/bin/agent-relay-broker-linux-arm64 +0 -0
- package/bin/agent-relay-broker-linux-x64 +0 -0
- package/dist/index.cjs +435 -190
- package/dist/src/cli/bootstrap.js +0 -15
- package/dist/src/cli/bootstrap.js.map +1 -1
- package/dist/src/cli/commands/agent-management.d.ts +1 -0
- package/dist/src/cli/commands/agent-management.d.ts.map +1 -1
- package/dist/src/cli/commands/agent-management.js +235 -16
- package/dist/src/cli/commands/agent-management.js.map +1 -1
- package/dist/src/cli/commands/core.js +1 -1
- package/dist/src/cli/commands/core.js.map +1 -1
- package/dist/src/cli/index.d.ts.map +1 -1
- package/dist/src/cli/index.js +13 -1
- package/dist/src/cli/index.js.map +1 -1
- package/dist/src/cli/lib/broker-lifecycle.d.ts.map +1 -1
- package/dist/src/cli/lib/broker-lifecycle.js +3 -5
- package/dist/src/cli/lib/broker-lifecycle.js.map +1 -1
- package/dist/src/cli/lib/connect-daytona.js +2 -2
- package/dist/src/cli/lib/connect-daytona.js.map +1 -1
- package/install.sh +9 -3
- package/package.json +13 -13
- package/packages/acp-bridge/package.json +2 -2
- package/packages/config/package.json +1 -1
- package/packages/hooks/package.json +4 -4
- package/packages/memory/package.json +2 -2
- package/packages/openclaw/dist/cli.js +79 -2
- package/packages/openclaw/dist/cli.js.map +1 -1
- package/packages/openclaw/dist/config.d.ts +28 -1
- package/packages/openclaw/dist/config.d.ts.map +1 -1
- package/packages/openclaw/dist/config.js +145 -0
- package/packages/openclaw/dist/config.js.map +1 -1
- package/packages/openclaw/dist/index.d.ts +2 -2
- package/packages/openclaw/dist/index.d.ts.map +1 -1
- package/packages/openclaw/dist/index.js +1 -1
- package/packages/openclaw/dist/index.js.map +1 -1
- package/packages/openclaw/dist/setup.d.ts.map +1 -1
- package/packages/openclaw/dist/setup.js +24 -1
- package/packages/openclaw/dist/setup.js.map +1 -1
- package/packages/openclaw/dist/types.d.ts +23 -0
- package/packages/openclaw/dist/types.d.ts.map +1 -1
- package/packages/openclaw/package.json +2 -2
- package/packages/openclaw/skill/SKILL.md +46 -0
- package/packages/openclaw/src/cli.ts +90 -2
- package/packages/openclaw/src/config.ts +165 -1
- package/packages/openclaw/src/index.ts +7 -1
- package/packages/openclaw/src/setup.ts +26 -1
- package/packages/openclaw/src/types.ts +25 -0
- package/packages/policy/package.json +2 -2
- package/packages/sdk/dist/__tests__/integration.test.js +35 -0
- package/packages/sdk/dist/__tests__/integration.test.js.map +1 -1
- package/packages/sdk/dist/client.d.ts +9 -0
- package/packages/sdk/dist/client.d.ts.map +1 -1
- package/packages/sdk/dist/client.js +33 -22
- package/packages/sdk/dist/client.js.map +1 -1
- package/packages/sdk/dist/protocol.d.ts +1 -0
- package/packages/sdk/dist/protocol.d.ts.map +1 -1
- package/packages/sdk/dist/relay.d.ts +8 -0
- package/packages/sdk/dist/relay.d.ts.map +1 -1
- package/packages/sdk/dist/relay.js +50 -5
- package/packages/sdk/dist/relay.js.map +1 -1
- package/packages/sdk/dist/workflows/cli.js +2 -0
- package/packages/sdk/dist/workflows/cli.js.map +1 -1
- package/packages/sdk/dist/workflows/runner.d.ts +11 -0
- package/packages/sdk/dist/workflows/runner.d.ts.map +1 -1
- package/packages/sdk/dist/workflows/runner.js +350 -167
- package/packages/sdk/dist/workflows/runner.js.map +1 -1
- package/packages/sdk/dist/workflows/trajectory.d.ts +6 -1
- package/packages/sdk/dist/workflows/trajectory.d.ts.map +1 -1
- package/packages/sdk/dist/workflows/trajectory.js +16 -2
- package/packages/sdk/dist/workflows/trajectory.js.map +1 -1
- package/packages/sdk/package.json +2 -2
- package/packages/sdk/src/__tests__/integration.test.ts +49 -0
- package/packages/sdk/src/__tests__/orchestration-upgrades.test.ts +50 -1
- package/packages/sdk/src/client.ts +44 -21
- package/packages/sdk/src/protocol.ts +1 -1
- package/packages/sdk/src/relay.ts +70 -5
- package/packages/sdk/src/workflows/cli.ts +2 -0
- package/packages/sdk/src/workflows/runner.ts +414 -185
- package/packages/sdk/src/workflows/trajectory.ts +22 -2
- package/packages/sdk-py/pyproject.toml +1 -1
- package/packages/sdk-py/src/agent_relay/client.py +18 -1
- package/packages/sdk-py/src/agent_relay/relay.py +4 -0
- package/packages/sdk-py/src/agent_relay/types.py +4 -0
- package/packages/telemetry/package.json +1 -1
- package/packages/trajectory/package.json +2 -2
- package/packages/user-directory/package.json +2 -2
- package/packages/utils/package.json +2 -2
|
@@ -13,6 +13,7 @@ import path from 'node:path';
|
|
|
13
13
|
|
|
14
14
|
import { parse as parseYaml } from 'yaml';
|
|
15
15
|
import { stripAnsi as stripAnsiFn } from '../pty.js';
|
|
16
|
+
import type { BrokerEvent } from '../protocol.js';
|
|
16
17
|
|
|
17
18
|
import {
|
|
18
19
|
loadCustomSteps,
|
|
@@ -64,6 +65,25 @@ export interface WorkflowDb {
|
|
|
64
65
|
getStepsByRunId(runId: string): Promise<WorkflowStepRow[]>;
|
|
65
66
|
}
|
|
66
67
|
|
|
68
|
+
/** Result returned by spawnAndWait / execNonInteractive with optional process exit info. */
|
|
69
|
+
interface SpawnResult {
|
|
70
|
+
output: string;
|
|
71
|
+
exitCode?: number;
|
|
72
|
+
exitSignal?: string;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/** Error carrying exit code/signal from a failed subprocess spawn. */
|
|
76
|
+
class SpawnExitError extends Error {
|
|
77
|
+
exitCode?: number;
|
|
78
|
+
exitSignal?: string;
|
|
79
|
+
constructor(message: string, exitCode?: number, exitSignal?: string | null) {
|
|
80
|
+
super(message);
|
|
81
|
+
this.name = 'SpawnExitError';
|
|
82
|
+
this.exitCode = exitCode;
|
|
83
|
+
this.exitSignal = exitSignal ?? undefined;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
67
87
|
// ── Events ──────────────────────────────────────────────────────────────────
|
|
68
88
|
|
|
69
89
|
export type WorkflowEvent =
|
|
@@ -71,6 +91,7 @@ export type WorkflowEvent =
|
|
|
71
91
|
| { type: 'run:completed'; runId: string }
|
|
72
92
|
| { type: 'run:failed'; runId: string; error: string }
|
|
73
93
|
| { type: 'run:cancelled'; runId: string }
|
|
94
|
+
| { type: 'broker:event'; runId: string; event: BrokerEvent }
|
|
74
95
|
| { type: 'step:started'; runId: string; stepName: string }
|
|
75
96
|
| {
|
|
76
97
|
type: 'step:owner-assigned';
|
|
@@ -79,7 +100,7 @@ export type WorkflowEvent =
|
|
|
79
100
|
ownerName: string;
|
|
80
101
|
specialistName: string;
|
|
81
102
|
}
|
|
82
|
-
| { type: 'step:completed'; runId: string; stepName: string; output?: string }
|
|
103
|
+
| { type: 'step:completed'; runId: string; stepName: string; output?: string; exitCode?: number; exitSignal?: string }
|
|
83
104
|
| {
|
|
84
105
|
type: 'step:review-completed';
|
|
85
106
|
runId: string;
|
|
@@ -88,7 +109,7 @@ export type WorkflowEvent =
|
|
|
88
109
|
decision: 'approved' | 'rejected';
|
|
89
110
|
}
|
|
90
111
|
| { type: 'step:owner-timeout'; runId: string; stepName: string; ownerName: string }
|
|
91
|
-
| { type: 'step:failed'; runId: string; stepName: string; error: string }
|
|
112
|
+
| { type: 'step:failed'; runId: string; stepName: string; error: string; exitCode?: number; exitSignal?: string }
|
|
92
113
|
| { type: 'step:skipped'; runId: string; stepName: string }
|
|
93
114
|
| { type: 'step:retrying'; runId: string; stepName: string; attempt: number }
|
|
94
115
|
| { type: 'step:nudged'; runId: string; stepName: string; nudgeCount: number }
|
|
@@ -223,6 +244,8 @@ export class WorkflowRunner {
|
|
|
223
244
|
|
|
224
245
|
// PTY-based output capture: accumulate terminal output per-agent
|
|
225
246
|
private readonly ptyOutputBuffers = new Map<string, string[]>();
|
|
247
|
+
/** Snapshot of PTY output from the most recent failed attempt, keyed by step name. */
|
|
248
|
+
private readonly lastFailedStepOutput = new Map<string, string>();
|
|
226
249
|
private readonly ptyListeners = new Map<string, (chunk: string) => void>();
|
|
227
250
|
private readonly ptyLogStreams = new Map<string, WriteStream>();
|
|
228
251
|
/** Path to workers.json so `agents:kill` can find workflow-spawned agents */
|
|
@@ -1142,8 +1165,15 @@ export class WorkflowRunner {
|
|
|
1142
1165
|
workflowName?: string,
|
|
1143
1166
|
vars?: VariableContext
|
|
1144
1167
|
): Promise<WorkflowRunRow> {
|
|
1168
|
+
// Set up abort controller early so callers can abort() even during setup
|
|
1169
|
+
this.abortController = new AbortController();
|
|
1170
|
+
this.paused = false;
|
|
1171
|
+
|
|
1145
1172
|
const resolved = vars ? this.resolveVariables(config, vars) : config;
|
|
1146
1173
|
|
|
1174
|
+
// Validate config (catches cycles, missing deps, invalid steps, etc.)
|
|
1175
|
+
this.validateConfig(resolved);
|
|
1176
|
+
|
|
1147
1177
|
// Resolve and validate named paths from the top-level `paths` config
|
|
1148
1178
|
const pathResult = this.resolvePathDefinitions(resolved.paths, this.cwd);
|
|
1149
1179
|
if (pathResult.errors.length > 0) {
|
|
@@ -1227,6 +1257,10 @@ export class WorkflowRunner {
|
|
|
1227
1257
|
|
|
1228
1258
|
/** Resume a previously paused or partially completed run. */
|
|
1229
1259
|
async resume(runId: string, vars?: VariableContext): Promise<WorkflowRunRow> {
|
|
1260
|
+
// Set up abort controller early so callers can abort() even during setup
|
|
1261
|
+
this.abortController = new AbortController();
|
|
1262
|
+
this.paused = false;
|
|
1263
|
+
|
|
1230
1264
|
const run = await this.db.getRun(runId);
|
|
1231
1265
|
if (!run) {
|
|
1232
1266
|
throw new Error(`Run "${runId}" not found`);
|
|
@@ -1289,9 +1323,7 @@ export class WorkflowRunner {
|
|
|
1289
1323
|
const { run, workflow, config, stepStates, isResume } = input;
|
|
1290
1324
|
const runId = run.id;
|
|
1291
1325
|
|
|
1292
|
-
// Start execution
|
|
1293
|
-
this.abortController = new AbortController();
|
|
1294
|
-
this.paused = false;
|
|
1326
|
+
// Start execution (abortController already set by execute()/resume())
|
|
1295
1327
|
this.currentConfig = config;
|
|
1296
1328
|
this.currentRunId = runId;
|
|
1297
1329
|
this.runStartTime = Date.now();
|
|
@@ -1336,15 +1368,22 @@ export class WorkflowRunner {
|
|
|
1336
1368
|
config.swarm.channel = channel;
|
|
1337
1369
|
await this.db.updateRun(runId, { config });
|
|
1338
1370
|
}
|
|
1371
|
+
const relaycastDisabled =
|
|
1372
|
+
this.relayOptions.env?.AGENT_RELAY_WORKFLOW_DISABLE_RELAYCAST === '1';
|
|
1373
|
+
const requiresBroker =
|
|
1374
|
+
!this.executor &&
|
|
1375
|
+
workflow.steps.some((step) => step.type !== 'deterministic' && step.type !== 'worktree');
|
|
1339
1376
|
// Skip broker/relay init when an external executor handles agent spawning
|
|
1340
|
-
if (
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
this.
|
|
1346
|
-
|
|
1347
|
-
|
|
1377
|
+
if (requiresBroker) {
|
|
1378
|
+
if (!relaycastDisabled) {
|
|
1379
|
+
this.log('Resolving Relaycast API key...');
|
|
1380
|
+
await this.ensureRelaycastApiKey(channel);
|
|
1381
|
+
this.log('API key resolved');
|
|
1382
|
+
if (this.relayApiKeyAutoCreated && this.relayApiKey) {
|
|
1383
|
+
this.log(`Workspace created — follow this run in Relaycast:`);
|
|
1384
|
+
this.log(` Observer: https://agentrelay.dev/observer?key=${this.relayApiKey}`);
|
|
1385
|
+
this.log(` Channel: ${channel}`);
|
|
1386
|
+
}
|
|
1348
1387
|
}
|
|
1349
1388
|
|
|
1350
1389
|
this.log('Starting broker...');
|
|
@@ -1356,7 +1395,7 @@ export class WorkflowRunner {
|
|
|
1356
1395
|
this.relay = new AgentRelay({
|
|
1357
1396
|
...this.relayOptions,
|
|
1358
1397
|
brokerName,
|
|
1359
|
-
channels: [channel],
|
|
1398
|
+
channels: relaycastDisabled ? [] : [channel],
|
|
1360
1399
|
env: this.getRelayEnv(),
|
|
1361
1400
|
// Workflows spawn agents across multiple waves; each spawn requires a PTY +
|
|
1362
1401
|
// Relaycast registration. 60s is too tight when the broker is saturated with
|
|
@@ -1412,6 +1451,18 @@ export class WorkflowRunner {
|
|
|
1412
1451
|
|
|
1413
1452
|
// Wire relay event hooks for rich console logging
|
|
1414
1453
|
this.relay.onMessageReceived = (msg) => {
|
|
1454
|
+
this.emit({
|
|
1455
|
+
type: 'broker:event',
|
|
1456
|
+
runId,
|
|
1457
|
+
event: {
|
|
1458
|
+
kind: 'relay_inbound',
|
|
1459
|
+
event_id: msg.eventId,
|
|
1460
|
+
from: msg.from,
|
|
1461
|
+
target: msg.to,
|
|
1462
|
+
body: msg.text,
|
|
1463
|
+
thread_id: msg.threadId,
|
|
1464
|
+
} as BrokerEvent,
|
|
1465
|
+
});
|
|
1415
1466
|
const body = msg.text.length > 120 ? msg.text.slice(0, 117) + '...' : msg.text;
|
|
1416
1467
|
const fromShort = msg.from.replace(/-[a-f0-9]{6,}$/, '');
|
|
1417
1468
|
const toShort = msg.to.replace(/-[a-f0-9]{6,}$/, '');
|
|
@@ -1429,13 +1480,43 @@ export class WorkflowRunner {
|
|
|
1429
1480
|
};
|
|
1430
1481
|
|
|
1431
1482
|
this.relay.onAgentSpawned = (agent) => {
|
|
1483
|
+
this.emit({
|
|
1484
|
+
type: 'broker:event',
|
|
1485
|
+
runId,
|
|
1486
|
+
event: {
|
|
1487
|
+
kind: 'agent_spawned',
|
|
1488
|
+
name: agent.name,
|
|
1489
|
+
runtime: agent.runtime,
|
|
1490
|
+
} as BrokerEvent,
|
|
1491
|
+
});
|
|
1432
1492
|
// Skip agents already managed by step execution
|
|
1433
1493
|
if (!this.activeAgentHandles.has(agent.name)) {
|
|
1434
1494
|
this.log(`[spawned] ${agent.name} (${agent.runtime})`);
|
|
1435
1495
|
}
|
|
1436
1496
|
};
|
|
1437
1497
|
|
|
1498
|
+
this.relay.onAgentReleased = (agent) => {
|
|
1499
|
+
this.emit({
|
|
1500
|
+
type: 'broker:event',
|
|
1501
|
+
runId,
|
|
1502
|
+
event: {
|
|
1503
|
+
kind: 'agent_released',
|
|
1504
|
+
name: agent.name,
|
|
1505
|
+
} as BrokerEvent,
|
|
1506
|
+
});
|
|
1507
|
+
};
|
|
1508
|
+
|
|
1438
1509
|
this.relay.onAgentExited = (agent) => {
|
|
1510
|
+
this.emit({
|
|
1511
|
+
type: 'broker:event',
|
|
1512
|
+
runId,
|
|
1513
|
+
event: {
|
|
1514
|
+
kind: 'agent_exited',
|
|
1515
|
+
name: agent.name,
|
|
1516
|
+
code: agent.exitCode,
|
|
1517
|
+
signal: agent.exitSignal,
|
|
1518
|
+
} as BrokerEvent,
|
|
1519
|
+
});
|
|
1439
1520
|
this.lastActivity.delete(agent.name);
|
|
1440
1521
|
this.lastIdleLog.delete(agent.name);
|
|
1441
1522
|
if (!this.activeAgentHandles.has(agent.name)) {
|
|
@@ -1443,7 +1524,20 @@ export class WorkflowRunner {
|
|
|
1443
1524
|
}
|
|
1444
1525
|
};
|
|
1445
1526
|
|
|
1527
|
+
this.relay.onDeliveryUpdate = (event) => {
|
|
1528
|
+
this.emit({ type: 'broker:event', runId, event });
|
|
1529
|
+
};
|
|
1530
|
+
|
|
1446
1531
|
this.relay.onAgentIdle = ({ name, idleSecs }) => {
|
|
1532
|
+
this.emit({
|
|
1533
|
+
type: 'broker:event',
|
|
1534
|
+
runId,
|
|
1535
|
+
event: {
|
|
1536
|
+
kind: 'agent_idle',
|
|
1537
|
+
name,
|
|
1538
|
+
idle_secs: idleSecs,
|
|
1539
|
+
} as BrokerEvent,
|
|
1540
|
+
});
|
|
1447
1541
|
// Only log at 30s multiples to avoid watchdog spam
|
|
1448
1542
|
const bucket = Math.floor(idleSecs / 30) * 30;
|
|
1449
1543
|
if (bucket >= 30 && this.lastIdleLog.get(name) !== bucket) {
|
|
@@ -1461,20 +1555,22 @@ export class WorkflowRunner {
|
|
|
1461
1555
|
console.log(`[broker] ${line}`);
|
|
1462
1556
|
});
|
|
1463
1557
|
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1558
|
+
if (!relaycastDisabled) {
|
|
1559
|
+
this.log(`Creating channel: ${channel}...`);
|
|
1560
|
+
if (isResume) {
|
|
1561
|
+
await this.createAndJoinRelaycastChannel(channel);
|
|
1562
|
+
} else {
|
|
1563
|
+
await this.createAndJoinRelaycastChannel(channel, workflow.description);
|
|
1564
|
+
}
|
|
1565
|
+
this.log('Channel ready');
|
|
1471
1566
|
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1567
|
+
if (isResume) {
|
|
1568
|
+
this.postToChannel(`Workflow **${workflow.name}** resumed — ${pendingCount} pending steps`);
|
|
1569
|
+
} else {
|
|
1570
|
+
this.postToChannel(
|
|
1571
|
+
`Workflow **${workflow.name}** started — ${workflow.steps.length} steps, pattern: ${config.swarm.pattern}`
|
|
1572
|
+
);
|
|
1573
|
+
}
|
|
1478
1574
|
}
|
|
1479
1575
|
}
|
|
1480
1576
|
|
|
@@ -1491,8 +1587,15 @@ export class WorkflowRunner {
|
|
|
1491
1587
|
this.log(`Executing ${workflow.steps.length} steps (pattern: ${config.swarm.pattern})`);
|
|
1492
1588
|
await this.executeSteps(workflow, stepStates, agentMap, config.errorHandling, runId);
|
|
1493
1589
|
|
|
1590
|
+
const errorStrategy =
|
|
1591
|
+
config.errorHandling?.strategy ?? workflow.onError ?? 'fail-fast';
|
|
1592
|
+
const continueOnError =
|
|
1593
|
+
errorStrategy === 'continue' || errorStrategy === 'skip';
|
|
1494
1594
|
const allCompleted = [...stepStates.values()].every(
|
|
1495
|
-
(s) =>
|
|
1595
|
+
(s) =>
|
|
1596
|
+
s.row.status === 'completed' ||
|
|
1597
|
+
s.row.status === 'skipped' ||
|
|
1598
|
+
(continueOnError && s.row.status === 'failed')
|
|
1496
1599
|
);
|
|
1497
1600
|
|
|
1498
1601
|
if (allCompleted) {
|
|
@@ -1517,9 +1620,18 @@ export class WorkflowRunner {
|
|
|
1517
1620
|
this.emit({ type: 'run:failed', runId, error: errorMsg });
|
|
1518
1621
|
|
|
1519
1622
|
const outcomes = this.collectOutcomes(stepStates, workflow.steps);
|
|
1623
|
+
const summary = this.trajectory.buildRunSummary(outcomes);
|
|
1624
|
+
const confidence = this.trajectory.computeConfidence(outcomes);
|
|
1625
|
+
const learnings = this.trajectory.extractLearnings(outcomes);
|
|
1626
|
+
const challenges = this.trajectory.extractChallenges(outcomes);
|
|
1520
1627
|
this.postFailureReport(workflow.name, outcomes, errorMsg);
|
|
1521
1628
|
this.logRunSummary(workflow.name, outcomes, runId);
|
|
1522
|
-
await this.trajectory.abandon(errorMsg
|
|
1629
|
+
await this.trajectory.abandon(errorMsg, {
|
|
1630
|
+
summary,
|
|
1631
|
+
confidence,
|
|
1632
|
+
learnings,
|
|
1633
|
+
challenges,
|
|
1634
|
+
});
|
|
1523
1635
|
}
|
|
1524
1636
|
} catch (err) {
|
|
1525
1637
|
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
@@ -1528,15 +1640,35 @@ export class WorkflowRunner {
|
|
|
1528
1640
|
await this.updateRunStatus(runId, status, errorMsg);
|
|
1529
1641
|
|
|
1530
1642
|
if (status === 'cancelled') {
|
|
1643
|
+
// Mark any pending or in-progress steps as failed due to cancellation
|
|
1644
|
+
for (const [stepName, state] of stepStates) {
|
|
1645
|
+
if (state.row.status === 'pending' || state.row.status === 'running') {
|
|
1646
|
+
state.row.status = 'failed';
|
|
1647
|
+
state.row.error = 'Cancelled';
|
|
1648
|
+
await this.db.updateStep(state.row.id, {
|
|
1649
|
+
status: 'failed',
|
|
1650
|
+
error: 'Cancelled',
|
|
1651
|
+
updatedAt: new Date().toISOString(),
|
|
1652
|
+
});
|
|
1653
|
+
this.emit({ type: 'step:failed', runId, stepName, error: 'Cancelled' });
|
|
1654
|
+
}
|
|
1655
|
+
}
|
|
1531
1656
|
this.emit({ type: 'run:cancelled', runId });
|
|
1532
1657
|
this.postToChannel(`Workflow **${workflow.name}** cancelled`);
|
|
1533
1658
|
await this.trajectory.abandon('Cancelled by user');
|
|
1534
1659
|
} else {
|
|
1535
1660
|
this.emit({ type: 'run:failed', runId, error: errorMsg });
|
|
1536
1661
|
this.postToChannel(`Workflow failed: ${errorMsg}`);
|
|
1537
|
-
|
|
1662
|
+
const outcomes = this.collectOutcomes(stepStates, workflow.steps);
|
|
1663
|
+
await this.trajectory.abandon(errorMsg, {
|
|
1664
|
+
summary: this.trajectory.buildRunSummary(outcomes),
|
|
1665
|
+
confidence: this.trajectory.computeConfidence(outcomes),
|
|
1666
|
+
learnings: this.trajectory.extractLearnings(outcomes),
|
|
1667
|
+
challenges: this.trajectory.extractChallenges(outcomes),
|
|
1668
|
+
});
|
|
1538
1669
|
}
|
|
1539
1670
|
} finally {
|
|
1671
|
+
this.lastFailedStepOutput.clear();
|
|
1540
1672
|
for (const stream of this.ptyLogStreams.values()) stream.end();
|
|
1541
1673
|
this.ptyLogStreams.clear();
|
|
1542
1674
|
this.ptyOutputBuffers.clear();
|
|
@@ -1549,9 +1681,11 @@ export class WorkflowRunner {
|
|
|
1549
1681
|
if (this.relay) {
|
|
1550
1682
|
this.relay.onMessageReceived = null;
|
|
1551
1683
|
this.relay.onAgentSpawned = null;
|
|
1684
|
+
this.relay.onAgentReleased = null;
|
|
1552
1685
|
this.relay.onAgentExited = null;
|
|
1553
1686
|
this.relay.onAgentIdle = null;
|
|
1554
1687
|
this.relay.onWorkerOutput = null;
|
|
1688
|
+
this.relay.onDeliveryUpdate = null;
|
|
1555
1689
|
}
|
|
1556
1690
|
this.lastIdleLog.clear();
|
|
1557
1691
|
this.lastActivity.clear();
|
|
@@ -1867,7 +2001,7 @@ export class WorkflowRunner {
|
|
|
1867
2001
|
): Promise<void> {
|
|
1868
2002
|
// Branch: deterministic steps execute shell commands
|
|
1869
2003
|
if (this.isDeterministicStep(step)) {
|
|
1870
|
-
return this.executeDeterministicStep(step, stepStates, runId);
|
|
2004
|
+
return this.executeDeterministicStep(step, stepStates, runId, errorHandling);
|
|
1871
2005
|
}
|
|
1872
2006
|
|
|
1873
2007
|
// Branch: worktree steps set up git worktrees
|
|
@@ -1886,167 +2020,199 @@ export class WorkflowRunner {
|
|
|
1886
2020
|
private async executeDeterministicStep(
|
|
1887
2021
|
step: WorkflowStep,
|
|
1888
2022
|
stepStates: Map<string, StepState>,
|
|
1889
|
-
runId: string
|
|
2023
|
+
runId: string,
|
|
2024
|
+
errorHandling: ErrorHandlingConfig | undefined
|
|
1890
2025
|
): Promise<void> {
|
|
1891
2026
|
const state = stepStates.get(step.name);
|
|
1892
2027
|
if (!state) throw new Error(`Step state not found: ${step.name}`);
|
|
1893
2028
|
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
state.row.status = 'running';
|
|
1898
|
-
state.row.startedAt = new Date().toISOString();
|
|
1899
|
-
await this.db.updateStep(state.row.id, {
|
|
1900
|
-
status: 'running',
|
|
1901
|
-
startedAt: state.row.startedAt,
|
|
1902
|
-
updatedAt: new Date().toISOString(),
|
|
1903
|
-
});
|
|
1904
|
-
this.emit({ type: 'step:started', runId, stepName: step.name });
|
|
1905
|
-
this.postToChannel(`**[${step.name}]** Started (deterministic)`);
|
|
1906
|
-
|
|
1907
|
-
// Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
|
|
1908
|
-
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
1909
|
-
let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
|
|
1910
|
-
|
|
1911
|
-
// Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
|
|
1912
|
-
resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key: string) => {
|
|
1913
|
-
if (key.startsWith('steps.')) return _match; // Already handled above
|
|
1914
|
-
const value = this.resolveDotPath(key, stepOutputContext);
|
|
1915
|
-
return value !== undefined ? String(value) : _match;
|
|
1916
|
-
});
|
|
1917
|
-
|
|
1918
|
-
// Resolve step workdir (named path reference) for deterministic steps
|
|
1919
|
-
const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
|
|
2029
|
+
const maxRetries = step.retries ?? errorHandling?.maxRetries ?? 0;
|
|
2030
|
+
const retryDelay = errorHandling?.retryDelayMs ?? 1000;
|
|
2031
|
+
let lastError: string | undefined;
|
|
1920
2032
|
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
if (this.executor?.executeDeterministicStep) {
|
|
1924
|
-
const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
|
|
1925
|
-
const failOnError = step.failOnError !== false;
|
|
1926
|
-
if (failOnError && result.exitCode !== 0) {
|
|
1927
|
-
throw new Error(`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`);
|
|
1928
|
-
}
|
|
1929
|
-
const output =
|
|
1930
|
-
step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
|
|
2033
|
+
for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
|
|
2034
|
+
this.checkAborted();
|
|
1931
2035
|
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
state.row.
|
|
2036
|
+
if (attempt > 0) {
|
|
2037
|
+
this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
|
|
2038
|
+
this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
|
|
2039
|
+
state.row.retryCount = attempt;
|
|
1936
2040
|
await this.db.updateStep(state.row.id, {
|
|
1937
|
-
|
|
1938
|
-
output,
|
|
1939
|
-
completedAt: state.row.completedAt,
|
|
2041
|
+
retryCount: attempt,
|
|
1940
2042
|
updatedAt: new Date().toISOString(),
|
|
1941
2043
|
});
|
|
1942
|
-
await this.
|
|
1943
|
-
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
1944
|
-
return;
|
|
2044
|
+
await this.delay(retryDelay);
|
|
1945
2045
|
}
|
|
1946
2046
|
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
2047
|
+
// Mark step as running
|
|
2048
|
+
state.row.status = 'running';
|
|
2049
|
+
state.row.startedAt = new Date().toISOString();
|
|
2050
|
+
await this.db.updateStep(state.row.id, {
|
|
2051
|
+
status: 'running',
|
|
2052
|
+
startedAt: state.row.startedAt,
|
|
2053
|
+
updatedAt: new Date().toISOString(),
|
|
2054
|
+
});
|
|
2055
|
+
this.emit({ type: 'step:started', runId, stepName: step.name });
|
|
2056
|
+
this.postToChannel(`**[${step.name}]** Started (deterministic)`);
|
|
2057
|
+
|
|
2058
|
+
// Resolve variables in the command (e.g., {{steps.plan.output}}, {{branch-name}})
|
|
2059
|
+
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
2060
|
+
let resolvedCommand = this.interpolateStepTask(step.command ?? '', stepOutputContext);
|
|
2061
|
+
|
|
2062
|
+
// Also resolve simple {{variable}} placeholders (already resolved in top-level config but safe to re-run)
|
|
2063
|
+
resolvedCommand = resolvedCommand.replace(/\{\{([\w][\w.\-]*)\}\}/g, (_match, key: string) => {
|
|
2064
|
+
if (key.startsWith('steps.')) return _match; // Already handled above
|
|
2065
|
+
const value = this.resolveDotPath(key, stepOutputContext);
|
|
2066
|
+
return value !== undefined ? String(value) : _match;
|
|
2067
|
+
});
|
|
1953
2068
|
|
|
1954
|
-
|
|
1955
|
-
|
|
2069
|
+
// Resolve step workdir (named path reference) for deterministic steps
|
|
2070
|
+
const stepCwd = this.resolveStepWorkdir(step) ?? this.cwd;
|
|
1956
2071
|
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
2072
|
+
try {
|
|
2073
|
+
// Delegate to executor if present
|
|
2074
|
+
if (this.executor?.executeDeterministicStep) {
|
|
2075
|
+
const result = await this.executor.executeDeterministicStep(step, resolvedCommand, stepCwd);
|
|
2076
|
+
const failOnError = step.failOnError !== false;
|
|
2077
|
+
if (failOnError && result.exitCode !== 0) {
|
|
2078
|
+
throw new Error(
|
|
2079
|
+
`Command failed with exit code ${result.exitCode}: ${result.output.slice(0, 500)}`
|
|
2080
|
+
);
|
|
2081
|
+
}
|
|
2082
|
+
const output =
|
|
2083
|
+
step.captureOutput !== false ? result.output : `Command completed (exit code ${result.exitCode})`;
|
|
2084
|
+
if (step.verification) {
|
|
2085
|
+
this.runVerification(step.verification, output, step.name);
|
|
2086
|
+
}
|
|
1967
2087
|
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
2088
|
+
// Mark completed
|
|
2089
|
+
state.row.status = 'completed';
|
|
2090
|
+
state.row.output = output;
|
|
2091
|
+
state.row.completedAt = new Date().toISOString();
|
|
2092
|
+
await this.db.updateStep(state.row.id, {
|
|
2093
|
+
status: 'completed',
|
|
2094
|
+
output,
|
|
2095
|
+
completedAt: state.row.completedAt,
|
|
2096
|
+
updatedAt: new Date().toISOString(),
|
|
2097
|
+
});
|
|
2098
|
+
await this.persistStepOutput(runId, step.name, output);
|
|
2099
|
+
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
2100
|
+
return;
|
|
1977
2101
|
}
|
|
1978
2102
|
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
2103
|
+
const output = await new Promise<string>((resolve, reject) => {
|
|
2104
|
+
const child = cpSpawn('sh', ['-c', resolvedCommand], {
|
|
2105
|
+
stdio: 'pipe',
|
|
2106
|
+
cwd: stepCwd,
|
|
2107
|
+
env: { ...process.env },
|
|
2108
|
+
});
|
|
1982
2109
|
|
|
1983
|
-
|
|
1984
|
-
stderrChunks
|
|
1985
|
-
});
|
|
2110
|
+
const stdoutChunks: string[] = [];
|
|
2111
|
+
const stderrChunks: string[] = [];
|
|
1986
2112
|
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
1990
|
-
|
|
2113
|
+
// Wire abort signal
|
|
2114
|
+
const abortSignal = this.abortController?.signal;
|
|
2115
|
+
let abortHandler: (() => void) | undefined;
|
|
2116
|
+
if (abortSignal && !abortSignal.aborted) {
|
|
2117
|
+
abortHandler = () => {
|
|
2118
|
+
child.kill('SIGTERM');
|
|
2119
|
+
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
2120
|
+
};
|
|
2121
|
+
abortSignal.addEventListener('abort', abortHandler, { once: true });
|
|
1991
2122
|
}
|
|
1992
2123
|
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
2124
|
+
// Handle timeout
|
|
2125
|
+
let timedOut = false;
|
|
2126
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
2127
|
+
if (step.timeoutMs) {
|
|
2128
|
+
timer = setTimeout(() => {
|
|
2129
|
+
timedOut = true;
|
|
2130
|
+
child.kill('SIGTERM');
|
|
2131
|
+
setTimeout(() => child.kill('SIGKILL'), 5000);
|
|
2132
|
+
}, step.timeoutMs);
|
|
1996
2133
|
}
|
|
1997
2134
|
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
);
|
|
2002
|
-
return;
|
|
2003
|
-
}
|
|
2135
|
+
child.stdout?.on('data', (chunk: Buffer) => {
|
|
2136
|
+
stdoutChunks.push(chunk.toString());
|
|
2137
|
+
});
|
|
2004
2138
|
|
|
2005
|
-
|
|
2006
|
-
|
|
2139
|
+
child.stderr?.on('data', (chunk: Buffer) => {
|
|
2140
|
+
stderrChunks.push(chunk.toString());
|
|
2141
|
+
});
|
|
2007
2142
|
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
);
|
|
2014
|
-
return;
|
|
2015
|
-
}
|
|
2143
|
+
child.on('close', (code) => {
|
|
2144
|
+
if (timer) clearTimeout(timer);
|
|
2145
|
+
if (abortHandler && abortSignal) {
|
|
2146
|
+
abortSignal.removeEventListener('abort', abortHandler);
|
|
2147
|
+
}
|
|
2016
2148
|
|
|
2017
|
-
|
|
2018
|
-
|
|
2149
|
+
if (abortSignal?.aborted) {
|
|
2150
|
+
reject(new Error(`Step "${step.name}" aborted`));
|
|
2151
|
+
return;
|
|
2152
|
+
}
|
|
2019
2153
|
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2154
|
+
if (timedOut) {
|
|
2155
|
+
reject(
|
|
2156
|
+
new Error(`Step "${step.name}" timed out (no step timeout set, check global swarm.timeoutMs)`)
|
|
2157
|
+
);
|
|
2158
|
+
return;
|
|
2159
|
+
}
|
|
2160
|
+
|
|
2161
|
+
const stdout = stdoutChunks.join('');
|
|
2162
|
+
const stderr = stderrChunks.join('');
|
|
2163
|
+
|
|
2164
|
+
// Check exit code unless failOnError is explicitly false
|
|
2165
|
+
const failOnError = step.failOnError !== false;
|
|
2166
|
+
if (failOnError && code !== 0 && code !== null) {
|
|
2167
|
+
reject(
|
|
2168
|
+
new Error(
|
|
2169
|
+
`Command failed with exit code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`
|
|
2170
|
+
)
|
|
2171
|
+
);
|
|
2172
|
+
return;
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
resolve(step.captureOutput !== false ? stdout : `Command completed (exit code ${code ?? 0})`);
|
|
2176
|
+
});
|
|
2177
|
+
|
|
2178
|
+
child.on('error', (err) => {
|
|
2179
|
+
if (timer) clearTimeout(timer);
|
|
2180
|
+
if (abortHandler && abortSignal) {
|
|
2181
|
+
abortSignal.removeEventListener('abort', abortHandler);
|
|
2182
|
+
}
|
|
2183
|
+
reject(new Error(`Failed to execute command: ${err.message}`));
|
|
2184
|
+
});
|
|
2026
2185
|
});
|
|
2027
|
-
});
|
|
2028
2186
|
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2032
|
-
state.row.completedAt = new Date().toISOString();
|
|
2033
|
-
await this.db.updateStep(state.row.id, {
|
|
2034
|
-
status: 'completed',
|
|
2035
|
-
output,
|
|
2036
|
-
completedAt: state.row.completedAt,
|
|
2037
|
-
updatedAt: new Date().toISOString(),
|
|
2038
|
-
});
|
|
2187
|
+
if (step.verification) {
|
|
2188
|
+
this.runVerification(step.verification, output, step.name);
|
|
2189
|
+
}
|
|
2039
2190
|
|
|
2040
|
-
|
|
2041
|
-
|
|
2191
|
+
// Mark completed
|
|
2192
|
+
state.row.status = 'completed';
|
|
2193
|
+
state.row.output = output;
|
|
2194
|
+
state.row.completedAt = new Date().toISOString();
|
|
2195
|
+
await this.db.updateStep(state.row.id, {
|
|
2196
|
+
status: 'completed',
|
|
2197
|
+
output,
|
|
2198
|
+
completedAt: state.row.completedAt,
|
|
2199
|
+
updatedAt: new Date().toISOString(),
|
|
2200
|
+
});
|
|
2042
2201
|
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2202
|
+
// Persist step output
|
|
2203
|
+
await this.persistStepOutput(runId, step.name, output);
|
|
2204
|
+
|
|
2205
|
+
this.emit({ type: 'step:completed', runId, stepName: step.name, output });
|
|
2206
|
+
return;
|
|
2207
|
+
} catch (err) {
|
|
2208
|
+
lastError = err instanceof Error ? err.message : String(err);
|
|
2209
|
+
}
|
|
2049
2210
|
}
|
|
2211
|
+
|
|
2212
|
+
const errorMsg = lastError ?? 'Unknown error';
|
|
2213
|
+
this.postToChannel(`**[${step.name}]** Failed: ${errorMsg}`);
|
|
2214
|
+
await this.markStepFailed(state, errorMsg, runId);
|
|
2215
|
+
throw new Error(`Step "${step.name}" failed: ${errorMsg}`);
|
|
2050
2216
|
}
|
|
2051
2217
|
|
|
2052
2218
|
/**
|
|
@@ -2286,10 +2452,16 @@ export class WorkflowRunner {
|
|
|
2286
2452
|
this.currentConfig?.swarm?.timeoutMs;
|
|
2287
2453
|
|
|
2288
2454
|
let lastError: string | undefined;
|
|
2455
|
+
let lastExitCode: number | undefined;
|
|
2456
|
+
let lastExitSignal: string | undefined;
|
|
2289
2457
|
|
|
2290
2458
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
2291
2459
|
this.checkAborted();
|
|
2292
2460
|
|
|
2461
|
+
// Reset per-attempt exit info so stale values don't leak across retries
|
|
2462
|
+
lastExitCode = undefined;
|
|
2463
|
+
lastExitSignal = undefined;
|
|
2464
|
+
|
|
2293
2465
|
if (attempt > 0) {
|
|
2294
2466
|
this.emit({ type: 'step:retrying', runId, stepName: step.name, attempt });
|
|
2295
2467
|
this.postToChannel(`**[${step.name}]** Retrying (attempt ${attempt + 1}/${maxRetries + 1})`);
|
|
@@ -2336,6 +2508,16 @@ export class WorkflowRunner {
|
|
|
2336
2508
|
const stepOutputContext = this.buildStepOutputContext(stepStates, runId);
|
|
2337
2509
|
let resolvedTask = this.interpolateStepTask(step.task ?? '', stepOutputContext);
|
|
2338
2510
|
|
|
2511
|
+
// On retry attempts, prepend failure context so the agent knows what went wrong
|
|
2512
|
+
if (attempt > 0 && lastError) {
|
|
2513
|
+
const priorOutput = (this.lastFailedStepOutput.get(step.name) ?? '').slice(-2000);
|
|
2514
|
+
resolvedTask =
|
|
2515
|
+
`[RETRY — Attempt ${attempt + 1}/${maxRetries + 1}]\n` +
|
|
2516
|
+
`Previous attempt failed: ${lastError}\n` +
|
|
2517
|
+
(priorOutput ? `Previous output (last 2000 chars):\n${priorOutput}\n` : '') +
|
|
2518
|
+
`---\n${resolvedTask}`;
|
|
2519
|
+
}
|
|
2520
|
+
|
|
2339
2521
|
// If this is an interactive agent, append awareness of non-interactive workers
|
|
2340
2522
|
// so the lead knows not to message them and to use step output chaining instead
|
|
2341
2523
|
if (specialistDef.interactive !== false || ownerDef.interactive !== false) {
|
|
@@ -2378,9 +2560,12 @@ export class WorkflowRunner {
|
|
|
2378
2560
|
this.log(`[${step.name}] Spawning owner "${effectiveOwner.name}" (cli: ${effectiveOwner.cli})${step.workdir ? ` [workdir: ${step.workdir}]` : ''}`);
|
|
2379
2561
|
const resolvedStep = { ...step, task: ownerTask };
|
|
2380
2562
|
const ownerStartTime = Date.now();
|
|
2381
|
-
const
|
|
2563
|
+
const spawnResult = this.executor
|
|
2382
2564
|
? await this.executor.executeAgentStep(resolvedStep, effectiveOwner, ownerTask, timeoutMs)
|
|
2383
2565
|
: await this.spawnAndWait(effectiveOwner, resolvedStep, timeoutMs);
|
|
2566
|
+
const output = typeof spawnResult === 'string' ? spawnResult : spawnResult.output;
|
|
2567
|
+
lastExitCode = typeof spawnResult === 'string' ? undefined : spawnResult.exitCode;
|
|
2568
|
+
lastExitSignal = typeof spawnResult === 'string' ? undefined : spawnResult.exitSignal;
|
|
2384
2569
|
ownerElapsed = Date.now() - ownerStartTime;
|
|
2385
2570
|
this.log(`[${step.name}] Owner "${effectiveOwner.name}" exited`);
|
|
2386
2571
|
if (usesOwnerFlow) {
|
|
@@ -2392,7 +2577,12 @@ export class WorkflowRunner {
|
|
|
2392
2577
|
|
|
2393
2578
|
// Run verification if configured
|
|
2394
2579
|
if (step.verification) {
|
|
2395
|
-
this.runVerification(
|
|
2580
|
+
this.runVerification(
|
|
2581
|
+
step.verification,
|
|
2582
|
+
specialistOutput,
|
|
2583
|
+
step.name,
|
|
2584
|
+
effectiveOwner.interactive === false ? undefined : resolvedTask
|
|
2585
|
+
);
|
|
2396
2586
|
}
|
|
2397
2587
|
|
|
2398
2588
|
// Every interactive step gets a review pass; pick a dedicated reviewer when available.
|
|
@@ -2425,11 +2615,15 @@ export class WorkflowRunner {
|
|
|
2425
2615
|
// Persist step output to disk so it survives restarts and is inspectable
|
|
2426
2616
|
await this.persistStepOutput(runId, step.name, combinedOutput);
|
|
2427
2617
|
|
|
2428
|
-
this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput });
|
|
2618
|
+
this.emit({ type: 'step:completed', runId, stepName: step.name, output: combinedOutput, exitCode: lastExitCode, exitSignal: lastExitSignal });
|
|
2429
2619
|
await this.trajectory?.stepCompleted(step, combinedOutput, attempt + 1);
|
|
2430
2620
|
return;
|
|
2431
2621
|
} catch (err) {
|
|
2432
2622
|
lastError = err instanceof Error ? err.message : String(err);
|
|
2623
|
+
if (err instanceof SpawnExitError) {
|
|
2624
|
+
lastExitCode = err.exitCode;
|
|
2625
|
+
lastExitSignal = err.exitSignal;
|
|
2626
|
+
}
|
|
2433
2627
|
const ownerTimedOut = usesDedicatedOwner
|
|
2434
2628
|
? /\bowner timed out\b/i.test(lastError)
|
|
2435
2629
|
: /\btimed out\b/i.test(lastError) && !lastError.includes(`${step.name}-review`);
|
|
@@ -2452,7 +2646,10 @@ export class WorkflowRunner {
|
|
|
2452
2646
|
verificationValue,
|
|
2453
2647
|
});
|
|
2454
2648
|
this.postToChannel(`**[${step.name}]** Failed: ${lastError ?? 'Unknown error'}`);
|
|
2455
|
-
await this.markStepFailed(state, lastError ?? 'Unknown error', runId
|
|
2649
|
+
await this.markStepFailed(state, lastError ?? 'Unknown error', runId, {
|
|
2650
|
+
exitCode: lastExitCode,
|
|
2651
|
+
exitSignal: lastExitSignal,
|
|
2652
|
+
});
|
|
2456
2653
|
throw new Error(
|
|
2457
2654
|
`Step "${step.name}" failed after ${maxRetries} retries: ${lastError ?? 'Unknown error'}`
|
|
2458
2655
|
);
|
|
@@ -2615,10 +2812,10 @@ export class WorkflowRunner {
|
|
|
2615
2812
|
|
|
2616
2813
|
const workerSettled = workerPromise.catch(() => undefined);
|
|
2617
2814
|
workerPromise
|
|
2618
|
-
.then((
|
|
2815
|
+
.then((result) => {
|
|
2619
2816
|
workerReleased = true;
|
|
2620
2817
|
this.postToChannel(`**[${step.name}]** Worker \`${workerRuntimeName}\` exited`);
|
|
2621
|
-
if (step.verification?.type === 'output_contains' && output.includes(step.verification.value)) {
|
|
2818
|
+
if (step.verification?.type === 'output_contains' && result.output.includes(step.verification.value)) {
|
|
2622
2819
|
this.postToChannel(
|
|
2623
2820
|
`**[${step.name}]** Verification gate observed: output contains ${JSON.stringify(step.verification.value)}`
|
|
2624
2821
|
);
|
|
@@ -2645,7 +2842,7 @@ export class WorkflowRunner {
|
|
|
2645
2842
|
const ownerStartTime = Date.now();
|
|
2646
2843
|
|
|
2647
2844
|
try {
|
|
2648
|
-
const
|
|
2845
|
+
const ownerResultObj = await this.spawnAndWait(supervised.owner, ownerStep, timeoutMs, {
|
|
2649
2846
|
agentNameSuffix: 'owner',
|
|
2650
2847
|
onSpawned: ({ actualName }) => {
|
|
2651
2848
|
this.supervisedRuntimeAgents.set(actualName, {
|
|
@@ -2659,10 +2856,11 @@ export class WorkflowRunner {
|
|
|
2659
2856
|
},
|
|
2660
2857
|
});
|
|
2661
2858
|
const ownerElapsed = Date.now() - ownerStartTime;
|
|
2859
|
+
const ownerOutput = ownerResultObj.output;
|
|
2662
2860
|
this.log(`[${step.name}] Owner "${supervised.owner.name}" exited`);
|
|
2663
2861
|
this.assertOwnerCompletionMarker(step, ownerOutput, supervisorTask);
|
|
2664
2862
|
|
|
2665
|
-
const specialistOutput = await workerPromise;
|
|
2863
|
+
const specialistOutput = (await workerPromise).output;
|
|
2666
2864
|
return { specialistOutput, ownerOutput, ownerElapsed };
|
|
2667
2865
|
} catch (error) {
|
|
2668
2866
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -2909,7 +3107,7 @@ export class WorkflowRunner {
|
|
|
2909
3107
|
};
|
|
2910
3108
|
|
|
2911
3109
|
try {
|
|
2912
|
-
|
|
3110
|
+
await this.spawnAndWait(reviewerDef, reviewStep, safetyTimeoutMs, {
|
|
2913
3111
|
onSpawned: ({ agent }) => {
|
|
2914
3112
|
reviewerHandle = agent;
|
|
2915
3113
|
},
|
|
@@ -3089,7 +3287,7 @@ export class WorkflowRunner {
|
|
|
3089
3287
|
agentDef: AgentDefinition,
|
|
3090
3288
|
step: WorkflowStep,
|
|
3091
3289
|
timeoutMs?: number
|
|
3092
|
-
): Promise<
|
|
3290
|
+
): Promise<SpawnResult> {
|
|
3093
3291
|
const agentName = `${step.name}-${this.generateShortId()}`;
|
|
3094
3292
|
const modelArgs = agentDef.constraints?.model ? ['--model', agentDef.constraints.model] : [];
|
|
3095
3293
|
|
|
@@ -3152,7 +3350,7 @@ export class WorkflowRunner {
|
|
|
3152
3350
|
const stderrChunks: string[] = [];
|
|
3153
3351
|
|
|
3154
3352
|
try {
|
|
3155
|
-
const output = await new Promise<string>((resolve, reject) => {
|
|
3353
|
+
const { stdout: output, exitCode, exitSignal } = await new Promise<{ stdout: string; exitCode?: number; exitSignal?: string }>((resolve, reject) => {
|
|
3156
3354
|
const child = cpSpawn(cmd, args, {
|
|
3157
3355
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
3158
3356
|
cwd: this.resolveAgentCwd(agentDef),
|
|
@@ -3217,7 +3415,7 @@ export class WorkflowRunner {
|
|
|
3217
3415
|
}, timeoutMs);
|
|
3218
3416
|
}
|
|
3219
3417
|
|
|
3220
|
-
child.on('close', (code) => {
|
|
3418
|
+
child.on('close', (code, signal) => {
|
|
3221
3419
|
clearInterval(heartbeat);
|
|
3222
3420
|
if (timer) clearTimeout(timer);
|
|
3223
3421
|
if (abortHandler && abortSignal) {
|
|
@@ -3238,14 +3436,20 @@ export class WorkflowRunner {
|
|
|
3238
3436
|
if (code !== 0 && code !== null) {
|
|
3239
3437
|
const stderr = stderrChunks.join('');
|
|
3240
3438
|
reject(
|
|
3241
|
-
new
|
|
3242
|
-
`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}
|
|
3439
|
+
new SpawnExitError(
|
|
3440
|
+
`Step "${step.name}" exited with code ${code}${stderr ? `: ${stderr.slice(0, 500)}` : ''}`,
|
|
3441
|
+
code,
|
|
3442
|
+
signal
|
|
3243
3443
|
)
|
|
3244
3444
|
);
|
|
3245
3445
|
return;
|
|
3246
3446
|
}
|
|
3247
3447
|
|
|
3248
|
-
resolve(
|
|
3448
|
+
resolve({
|
|
3449
|
+
stdout,
|
|
3450
|
+
exitCode: code ?? undefined,
|
|
3451
|
+
exitSignal: signal ?? undefined,
|
|
3452
|
+
});
|
|
3249
3453
|
});
|
|
3250
3454
|
|
|
3251
3455
|
child.on('error', (err) => {
|
|
@@ -3258,8 +3462,10 @@ export class WorkflowRunner {
|
|
|
3258
3462
|
});
|
|
3259
3463
|
});
|
|
3260
3464
|
|
|
3261
|
-
return output;
|
|
3465
|
+
return { output, exitCode, exitSignal };
|
|
3262
3466
|
} finally {
|
|
3467
|
+
const combinedOutput = stdoutChunks.join('') + stderrChunks.join('');
|
|
3468
|
+
this.lastFailedStepOutput.set(step.name, combinedOutput);
|
|
3263
3469
|
stopHeartbeat?.();
|
|
3264
3470
|
logStream.end();
|
|
3265
3471
|
this.unregisterWorker(agentName);
|
|
@@ -3271,7 +3477,7 @@ export class WorkflowRunner {
|
|
|
3271
3477
|
step: WorkflowStep,
|
|
3272
3478
|
timeoutMs?: number,
|
|
3273
3479
|
options: SpawnAndWaitOptions = {}
|
|
3274
|
-
): Promise<
|
|
3480
|
+
): Promise<SpawnResult> {
|
|
3275
3481
|
// Branch: non-interactive agents run as simple subprocesses
|
|
3276
3482
|
if (agentDef.interactive === false) {
|
|
3277
3483
|
return this.execNonInteractive(agentDef, step, timeoutMs);
|
|
@@ -3456,9 +3662,16 @@ export class WorkflowRunner {
|
|
|
3456
3662
|
throw new Error(`Step "${step.name}" timed out after ${timeoutMs ?? 'unknown'}ms`);
|
|
3457
3663
|
}
|
|
3458
3664
|
}
|
|
3665
|
+
|
|
3666
|
+
if (exitResult === 'force-released') {
|
|
3667
|
+
throw new Error(
|
|
3668
|
+
`Step "${step.name}" failed — agent was force-released after exhausting idle nudges without completing`
|
|
3669
|
+
);
|
|
3670
|
+
}
|
|
3459
3671
|
} finally {
|
|
3460
3672
|
// Snapshot PTY chunks before cleanup — we need them for output reading below
|
|
3461
3673
|
ptyChunks = this.ptyOutputBuffers.get(agentName) ?? [];
|
|
3674
|
+
this.lastFailedStepOutput.set(step.name, ptyChunks.join(''));
|
|
3462
3675
|
|
|
3463
3676
|
// Always clean up PTY resources — prevents fd leaks if spawnPty or waitForExit throws
|
|
3464
3677
|
stopHeartbeat?.();
|
|
@@ -3485,11 +3698,15 @@ export class WorkflowRunner {
|
|
|
3485
3698
|
: exitResult === 'timeout'
|
|
3486
3699
|
? 'Agent completed (released after idle timeout)'
|
|
3487
3700
|
: exitResult === 'released'
|
|
3488
|
-
? 'Agent completed (
|
|
3701
|
+
? 'Agent completed (idle — treated as done)'
|
|
3489
3702
|
: `Agent exited (${exitResult})`;
|
|
3490
3703
|
}
|
|
3491
3704
|
|
|
3492
|
-
return
|
|
3705
|
+
return {
|
|
3706
|
+
output,
|
|
3707
|
+
exitCode: agent?.exitCode,
|
|
3708
|
+
exitSignal: agent?.exitSignal,
|
|
3709
|
+
};
|
|
3493
3710
|
}
|
|
3494
3711
|
|
|
3495
3712
|
// ── Idle nudging ────────────────────────────────────────────────────────
|
|
@@ -3525,7 +3742,7 @@ export class WorkflowRunner {
|
|
|
3525
3742
|
agentDef: AgentDefinition,
|
|
3526
3743
|
step: WorkflowStep,
|
|
3527
3744
|
timeoutMs?: number
|
|
3528
|
-
): Promise<'exited' | 'timeout' | 'released'> {
|
|
3745
|
+
): Promise<'exited' | 'timeout' | 'released' | 'force-released'> {
|
|
3529
3746
|
const nudgeConfig = this.currentConfig?.swarm.idleNudge;
|
|
3530
3747
|
if (!nudgeConfig) {
|
|
3531
3748
|
// Idle = done: race exit against idle. Whichever fires first completes the step.
|
|
@@ -3576,7 +3793,7 @@ export class WorkflowRunner {
|
|
|
3576
3793
|
}
|
|
3577
3794
|
|
|
3578
3795
|
// Agent is still running after the window expired.
|
|
3579
|
-
if (
|
|
3796
|
+
if (timeoutMs !== undefined && Date.now() - startTime >= timeoutMs) {
|
|
3580
3797
|
return 'timeout';
|
|
3581
3798
|
}
|
|
3582
3799
|
|
|
@@ -3595,7 +3812,7 @@ export class WorkflowRunner {
|
|
|
3595
3812
|
);
|
|
3596
3813
|
this.emit({ type: 'step:force-released', runId: this.currentRunId ?? '', stepName: step.name });
|
|
3597
3814
|
await agent.release();
|
|
3598
|
-
return 'released';
|
|
3815
|
+
return 'force-released';
|
|
3599
3816
|
}
|
|
3600
3817
|
}
|
|
3601
3818
|
|
|
@@ -3731,7 +3948,12 @@ export class WorkflowRunner {
|
|
|
3731
3948
|
await this.db.updateRun(runId, patch);
|
|
3732
3949
|
}
|
|
3733
3950
|
|
|
3734
|
-
private async markStepFailed(
|
|
3951
|
+
private async markStepFailed(
|
|
3952
|
+
state: StepState,
|
|
3953
|
+
error: string,
|
|
3954
|
+
runId: string,
|
|
3955
|
+
exitInfo?: { exitCode?: number; exitSignal?: string }
|
|
3956
|
+
): Promise<void> {
|
|
3735
3957
|
state.row.status = 'failed';
|
|
3736
3958
|
state.row.error = error;
|
|
3737
3959
|
state.row.completedAt = new Date().toISOString();
|
|
@@ -3741,7 +3963,14 @@ export class WorkflowRunner {
|
|
|
3741
3963
|
completedAt: state.row.completedAt,
|
|
3742
3964
|
updatedAt: new Date().toISOString(),
|
|
3743
3965
|
});
|
|
3744
|
-
this.emit({
|
|
3966
|
+
this.emit({
|
|
3967
|
+
type: 'step:failed',
|
|
3968
|
+
runId,
|
|
3969
|
+
stepName: state.row.stepName,
|
|
3970
|
+
error,
|
|
3971
|
+
exitCode: exitInfo?.exitCode,
|
|
3972
|
+
exitSignal: exitInfo?.exitSignal,
|
|
3973
|
+
});
|
|
3745
3974
|
}
|
|
3746
3975
|
|
|
3747
3976
|
private async markDownstreamSkipped(
|