@forwardimpact/libeval 0.1.28 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +17 -3
- package/bin/fit-trace.js +14 -5
- package/package.json +2 -2
- package/src/commands/run.js +17 -0
- package/src/commands/supervise.js +16 -0
- package/src/commands/trace.js +24 -51
- package/src/facilitator.js +12 -6
- package/src/orchestration-toolkit.js +8 -6
- package/src/render/tool-hints.js +21 -53
- package/src/render/turn-renderer.js +12 -7
- package/src/supervisor.js +27 -11
- package/src/tee-writer.js +6 -1
- package/src/trace-collector.js +18 -2
- package/src/trace-github.js +15 -8
package/bin/fit-eval.js
CHANGED
|
@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
|
|
|
10
10
|
import { runSuperviseCommand } from "../src/commands/supervise.js";
|
|
11
11
|
import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
)
|
|
13
|
+
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
14
|
+
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
15
|
+
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
16
|
+
const VERSION =
|
|
17
|
+
process.env.FIT_EVAL_VERSION ||
|
|
18
|
+
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
19
|
+
.version;
|
|
16
20
|
|
|
17
21
|
const definition = {
|
|
18
22
|
name: "fit-eval",
|
|
@@ -55,6 +59,11 @@ const definition = {
|
|
|
55
59
|
type: "string",
|
|
56
60
|
description: "Comma-separated tool allowlist",
|
|
57
61
|
},
|
|
62
|
+
"mcp-server": {
|
|
63
|
+
type: "string",
|
|
64
|
+
description:
|
|
65
|
+
"Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
|
|
66
|
+
},
|
|
58
67
|
},
|
|
59
68
|
},
|
|
60
69
|
{
|
|
@@ -102,6 +111,11 @@ const definition = {
|
|
|
102
111
|
type: "string",
|
|
103
112
|
description: "Supervisor tool allowlist",
|
|
104
113
|
},
|
|
114
|
+
"mcp-server": {
|
|
115
|
+
type: "string",
|
|
116
|
+
description:
|
|
117
|
+
"Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
|
|
118
|
+
},
|
|
105
119
|
},
|
|
106
120
|
},
|
|
107
121
|
{
|
package/bin/fit-trace.js
CHANGED
|
@@ -26,9 +26,13 @@ import {
|
|
|
26
26
|
runSplitCommand,
|
|
27
27
|
} from "../src/commands/trace.js";
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
)
|
|
29
|
+
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
30
|
+
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
31
|
+
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
32
|
+
const VERSION =
|
|
33
|
+
process.env.FIT_TRACE_VERSION ||
|
|
34
|
+
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
35
|
+
.version;
|
|
32
36
|
|
|
33
37
|
const definition = {
|
|
34
38
|
name: "fit-trace",
|
|
@@ -178,11 +182,16 @@ const definition = {
|
|
|
178
182
|
name: "split",
|
|
179
183
|
args: "<file>",
|
|
180
184
|
description:
|
|
181
|
-
"Split a combined trace into per-source files
|
|
185
|
+
"Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
|
|
182
186
|
options: {
|
|
183
187
|
mode: {
|
|
184
188
|
type: "string",
|
|
185
|
-
description: "Execution mode: run
|
|
189
|
+
description: "Execution mode: run, supervise, or facilitate",
|
|
190
|
+
},
|
|
191
|
+
case: {
|
|
192
|
+
type: "string",
|
|
193
|
+
description:
|
|
194
|
+
"Case identifier embedded in output filenames (default: default)",
|
|
186
195
|
},
|
|
187
196
|
"output-dir": {
|
|
188
197
|
type: "string",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.28",
|
|
3
|
+
"version": "0.1.31",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -51,7 +51,7 @@
|
|
|
51
51
|
"@forwardimpact/libcli": "^0.1.0",
|
|
52
52
|
"@forwardimpact/libconfig": "^0.1.0",
|
|
53
53
|
"@forwardimpact/libtelemetry": "^0.1.22",
|
|
54
|
-
"zod": "^4.4.
|
|
54
|
+
"zod": "^4.4.3"
|
|
55
55
|
},
|
|
56
56
|
"devDependencies": {
|
|
57
57
|
"@forwardimpact/libharness": "^0.1.14"
|
package/src/commands/run.js
CHANGED
|
@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
|
|
|
5
5
|
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
6
6
|
import { createTeeWriter } from "../tee-writer.js";
|
|
7
7
|
import { SequenceCounter } from "../sequence-counter.js";
|
|
8
|
+
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
8
9
|
|
|
9
10
|
/**
|
|
10
11
|
* Parse and validate run command options from parsed values.
|
|
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
|
|
|
35
36
|
values["allowed-tools"] ??
|
|
36
37
|
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
|
|
37
38
|
).split(","),
|
|
39
|
+
mcpServer: values["mcp-server"] ?? undefined,
|
|
38
40
|
};
|
|
39
41
|
}
|
|
40
42
|
|
|
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
|
|
|
56
58
|
outputPath,
|
|
57
59
|
agentProfile,
|
|
58
60
|
allowedTools,
|
|
61
|
+
mcpServer,
|
|
59
62
|
} = parseRunOptions(values);
|
|
60
63
|
|
|
61
64
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
|
|
|
78
81
|
);
|
|
79
82
|
};
|
|
80
83
|
|
|
84
|
+
let mcpServers = null;
|
|
85
|
+
if (mcpServer) {
|
|
86
|
+
const mcpConfig = await createServiceConfig("mcp");
|
|
87
|
+
mcpServers = {
|
|
88
|
+
[mcpServer]: {
|
|
89
|
+
type: "http",
|
|
90
|
+
url: mcpConfig.url,
|
|
91
|
+
headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
|
|
92
|
+
},
|
|
93
|
+
};
|
|
94
|
+
allowedTools.push(`mcp__${mcpServer}__*`);
|
|
95
|
+
}
|
|
96
|
+
|
|
81
97
|
if (agentProfile) {
|
|
82
98
|
process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
|
|
83
99
|
}
|
|
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
|
|
|
100
116
|
settingSources: ["project"],
|
|
101
117
|
systemPrompt,
|
|
102
118
|
taskAmend,
|
|
119
|
+
mcpServers,
|
|
103
120
|
});
|
|
104
121
|
|
|
105
122
|
const result = await runner.run(taskContent);
|
package/src/commands/supervise.js
CHANGED
|
@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
|
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { createSupervisor } from "../supervisor.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* Parse all supervise flags from parsed values into an options object.
|
|
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
|
|
|
44
45
|
supervisorAllowedTools: supervisorAllowedToolsRaw
|
|
45
46
|
? supervisorAllowedToolsRaw.split(",")
|
|
46
47
|
: undefined,
|
|
48
|
+
mcpServer: values["mcp-server"] ?? undefined,
|
|
47
49
|
};
|
|
48
50
|
}
|
|
49
51
|
|
|
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
71
73
|
})
|
|
72
74
|
: process.stdout;
|
|
73
75
|
|
|
76
|
+
let agentMcpServers = null;
|
|
77
|
+
if (opts.mcpServer) {
|
|
78
|
+
const mcpConfig = await createServiceConfig("mcp");
|
|
79
|
+
agentMcpServers = {
|
|
80
|
+
[opts.mcpServer]: {
|
|
81
|
+
type: "http",
|
|
82
|
+
url: mcpConfig.url,
|
|
83
|
+
headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
|
|
84
|
+
},
|
|
85
|
+
};
|
|
86
|
+
opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
|
|
87
|
+
}
|
|
88
|
+
|
|
74
89
|
if (opts.agentProfile) {
|
|
75
90
|
process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
|
|
76
91
|
}
|
|
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
88
103
|
supervisorProfile: opts.supervisorProfile,
|
|
89
104
|
agentProfile: opts.agentProfile,
|
|
90
105
|
taskAmend: opts.taskAmend,
|
|
106
|
+
agentMcpServers,
|
|
91
107
|
});
|
|
92
108
|
|
|
93
109
|
const result = await supervisor.run(opts.taskContent);
|
package/src/commands/trace.js
CHANGED
|
@@ -152,11 +152,22 @@ export async function runFilterCommand(values, args) {
|
|
|
152
152
|
|
|
153
153
|
// --- Split command ---
|
|
154
154
|
|
|
155
|
-
/** Valid
|
|
155
|
+
/** Valid source name pattern: lowercase letter, then lowercase alphanumeric or hyphen. */
|
|
156
156
|
const VALID_SOURCE_NAME = /^[a-z][a-z0-9-]*$/;
|
|
157
157
|
|
|
158
|
+
/** Sources whose name is itself a structural role; classified into the role they represent. */
|
|
159
|
+
const STRUCTURAL_ROLES = new Set(["agent", "supervisor", "facilitator"]);
|
|
160
|
+
|
|
158
161
|
/**
|
|
159
|
-
* Split a combined NDJSON trace into per-source files
|
|
162
|
+
* Split a combined NDJSON trace into per-source files using the
|
|
163
|
+
* `trace--<case>--<participant>.<role>.ndjson` convention.
|
|
164
|
+
*
|
|
165
|
+
* Each valid envelope source becomes one output file. Structural sources
|
|
166
|
+
* (`agent`, `supervisor`, `facilitator`) classify into the matching role and
|
|
167
|
+
* use their own name as participant; profile-named sources (e.g.
|
|
168
|
+
* `staff-engineer`) classify as agents with the profile in the participant
|
|
169
|
+
* slot. Orchestrator events and invalid source names are dropped.
|
|
170
|
+
*
|
|
160
171
|
* @param {object} values - Parsed option values
|
|
161
172
|
* @param {string[]} args - [file]
|
|
162
173
|
*/
|
|
@@ -166,24 +177,24 @@ export async function runSplitCommand(values, args) {
|
|
|
166
177
|
|
|
167
178
|
const mode = values.mode;
|
|
168
179
|
if (!mode) throw new Error("split: --mode is required");
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
process.stdout.write(
|
|
172
|
-
"run mode: trace is already in final form, no split needed\n",
|
|
173
|
-
);
|
|
174
|
-
return;
|
|
180
|
+
if (!["run", "supervise", "facilitate"].includes(mode)) {
|
|
181
|
+
throw new Error(`split: invalid --mode "${mode}"`);
|
|
175
182
|
}
|
|
176
183
|
|
|
184
|
+
const caseId = values.case ?? "default";
|
|
177
185
|
const outputDir = values["output-dir"] || dirname(file);
|
|
178
186
|
mkdirSync(outputDir, { recursive: true });
|
|
179
187
|
|
|
180
188
|
const buckets = parseBuckets(readFileSync(file, "utf8"));
|
|
181
189
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
190
|
+
for (const [source, lines] of buckets.entries()) {
|
|
191
|
+
if (!VALID_SOURCE_NAME.test(source)) continue;
|
|
192
|
+
const role = STRUCTURAL_ROLES.has(source) ? source : "agent";
|
|
193
|
+
const outPath = join(
|
|
194
|
+
outputDir,
|
|
195
|
+
`trace--${caseId}--${source}.${role}.ndjson`,
|
|
196
|
+
);
|
|
197
|
+
writeFileSync(outPath, lines.join("\n") + "\n");
|
|
187
198
|
}
|
|
188
199
|
}
|
|
189
200
|
|
|
@@ -219,44 +230,6 @@ function parseBuckets(content) {
|
|
|
219
230
|
return buckets;
|
|
220
231
|
}
|
|
221
232
|
|
|
222
|
-
/**
|
|
223
|
-
* Write facilitated mode split: facilitator, per-agent, and combined agent files.
|
|
224
|
-
* @param {Map<string, string[]>} buckets
|
|
225
|
-
* @param {string} outputDir
|
|
226
|
-
*/
|
|
227
|
-
function splitFacilitated(buckets, outputDir) {
|
|
228
|
-
writeBucket(buckets, "facilitator", outputDir);
|
|
229
|
-
|
|
230
|
-
const agentSources = [...buckets.keys()].filter(
|
|
231
|
-
(s) => s !== "facilitator" && VALID_SOURCE_NAME.test(s),
|
|
232
|
-
);
|
|
233
|
-
|
|
234
|
-
for (const name of agentSources) {
|
|
235
|
-
writeBucket(buckets, name, outputDir);
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
const combinedLines = agentSources.flatMap((n) => buckets.get(n) ?? []);
|
|
239
|
-
if (combinedLines.length > 0) {
|
|
240
|
-
writeFileSync(
|
|
241
|
-
join(outputDir, "trace-agent.ndjson"),
|
|
242
|
-
combinedLines.join("\n") + "\n",
|
|
243
|
-
);
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
/**
|
|
248
|
-
* Write a single source bucket to a trace-{name}.ndjson file.
|
|
249
|
-
* @param {Map<string, string[]>} buckets
|
|
250
|
-
* @param {string} name
|
|
251
|
-
* @param {string} outputDir
|
|
252
|
-
*/
|
|
253
|
-
function writeBucket(buckets, name, outputDir) {
|
|
254
|
-
const lines = buckets.get(name);
|
|
255
|
-
if (!lines || lines.length === 0) return;
|
|
256
|
-
const outPath = join(outputDir, `trace-${name}.ndjson`);
|
|
257
|
-
writeFileSync(outPath, lines.join("\n") + "\n");
|
|
258
|
-
}
|
|
259
|
-
|
|
260
233
|
// --- Shared helpers ---
|
|
261
234
|
|
|
262
235
|
/**
|
package/src/facilitator.js
CHANGED
|
@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
|
|
|
26
26
|
"Announce sends a message with no reply obligation. " +
|
|
27
27
|
"Redirect interrupts a participant with replacement instructions. " +
|
|
28
28
|
"RollCall lists participants. " +
|
|
29
|
-
"Conclude ends the session with a summary
|
|
29
|
+
"Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
|
|
30
|
+
"the verdict reflects whether the session met the criteria stated in the task.";
|
|
30
31
|
|
|
31
32
|
/** System prompt appended for facilitated agent runners. */
|
|
32
33
|
export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
@@ -106,12 +107,14 @@ export class Facilitator {
|
|
|
106
107
|
// messages and started processing concurrently.
|
|
107
108
|
this.concludeResolve();
|
|
108
109
|
await Promise.allSettled(agentPromises);
|
|
110
|
+
const success = this.ctx.verdict === "success";
|
|
109
111
|
this.emitSummary({
|
|
110
|
-
success
|
|
112
|
+
success,
|
|
113
|
+
verdict: this.ctx.verdict,
|
|
111
114
|
turns: this.facilitatorTurns,
|
|
112
115
|
summary: this.ctx.summary,
|
|
113
116
|
});
|
|
114
|
-
return { success
|
|
117
|
+
return { success, turns: this.facilitatorTurns };
|
|
115
118
|
}
|
|
116
119
|
|
|
117
120
|
// Abort agents promptly when Conclude is called during the event loop
|
|
@@ -134,12 +137,14 @@ export class Facilitator {
|
|
|
134
137
|
throw err;
|
|
135
138
|
}
|
|
136
139
|
|
|
140
|
+
const success = this.ctx.concluded && this.ctx.verdict === "success";
|
|
137
141
|
const result = {
|
|
138
|
-
success
|
|
142
|
+
success,
|
|
139
143
|
turns: this.facilitatorTurns,
|
|
140
144
|
};
|
|
141
145
|
this.emitSummary({
|
|
142
|
-
success
|
|
146
|
+
success,
|
|
147
|
+
verdict: this.ctx.verdict,
|
|
143
148
|
turns: result.turns,
|
|
144
149
|
summary: this.ctx.summary,
|
|
145
150
|
});
|
|
@@ -344,7 +349,7 @@ export class Facilitator {
|
|
|
344
349
|
}
|
|
345
350
|
|
|
346
351
|
/**
|
|
347
|
-
* @param {{success: boolean, turns: number, summary?: string}} result
|
|
352
|
+
* @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
|
|
348
353
|
*/
|
|
349
354
|
emitSummary(result) {
|
|
350
355
|
this.output.write(
|
|
@@ -354,6 +359,7 @@ export class Facilitator {
|
|
|
354
359
|
event: {
|
|
355
360
|
type: "summary",
|
|
356
361
|
success: result.success,
|
|
362
|
+
...(result.verdict && { verdict: result.verdict }),
|
|
357
363
|
turns: result.turns,
|
|
358
364
|
...(result.summary && { summary: result.summary }),
|
|
359
365
|
},
|
package/src/orchestration-toolkit.js
CHANGED
|
@@ -22,6 +22,7 @@ import { z } from "zod";
|
|
|
22
22
|
export function createOrchestrationContext() {
|
|
23
23
|
return {
|
|
24
24
|
concluded: false,
|
|
25
|
+
verdict: null,
|
|
25
26
|
summary: null,
|
|
26
27
|
redirect: null,
|
|
27
28
|
participants: [],
|
|
@@ -37,10 +38,11 @@ export function createOrchestrationContext() {
|
|
|
37
38
|
|
|
38
39
|
// --- Handler factories ---
|
|
39
40
|
|
|
40
|
-
/** Create a handler that marks the session as concluded and records the summary. */
|
|
41
|
+
/** Create a handler that marks the session as concluded and records the verdict and summary. */
|
|
41
42
|
export function createConcludeHandler(ctx) {
|
|
42
|
-
return async ({ summary }) => {
|
|
43
|
+
return async ({ verdict, summary }) => {
|
|
43
44
|
ctx.concluded = true;
|
|
45
|
+
ctx.verdict = verdict;
|
|
44
46
|
ctx.summary = summary;
|
|
45
47
|
return { content: [{ type: "text", text: "Session concluded." }] };
|
|
46
48
|
};
|
|
@@ -220,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
|
|
|
220
222
|
),
|
|
221
223
|
tool(
|
|
222
224
|
"Conclude",
|
|
223
|
-
"End the session with a summary.",
|
|
224
|
-
{ summary: z.string() },
|
|
225
|
+
"End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
|
|
226
|
+
{ verdict: z.enum(["success", "failure"]), summary: z.string() },
|
|
225
227
|
createConcludeHandler(ctx),
|
|
226
228
|
),
|
|
227
229
|
tool(
|
|
@@ -307,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
|
|
|
307
309
|
),
|
|
308
310
|
tool(
|
|
309
311
|
"Conclude",
|
|
310
|
-
"End the session with a summary.",
|
|
311
|
-
{ summary: z.string() },
|
|
312
|
+
"End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
|
|
313
|
+
{ verdict: z.enum(["success", "failure"]), summary: z.string() },
|
|
312
314
|
createConcludeHandler(ctx),
|
|
313
315
|
),
|
|
314
316
|
tool(
|
package/src/render/tool-hints.js
CHANGED
|
@@ -6,6 +6,11 @@
|
|
|
6
6
|
* tool (file path, command, pattern, …) sanitized to strip JSON punctuation
|
|
7
7
|
* (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
|
|
8
8
|
*
|
|
9
|
+
* MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
|
|
10
|
+
* the full input rendered as compact single-line JSON, so `{` and `"` do
|
|
11
|
+
* appear on those lines. Readers of GitHub workflow logs need the full MCP
|
|
12
|
+
* payload to know what was actually sent across the protocol.
|
|
13
|
+
*
|
|
9
14
|
* `previewForResult(content, isError)` collapses a tool result to a single
|
|
10
15
|
* line ≤ 80 chars and flags errors so the renderer can apply the reserved
|
|
11
16
|
* error color and the `Error:` label.
|
|
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
|
|
|
91
96
|
return parts.slice(2).join("__");
|
|
92
97
|
}
|
|
93
98
|
|
|
94
|
-
/**
|
|
95
|
-
* MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
|
|
96
|
-
* handler path. The method name itself is surfaced via `simplifyToolName`,
|
|
97
|
-
* so this only adds the `to/from` decorators for orchestration calls.
|
|
98
|
-
* Returns null if the name does not match any MCP prefix.
|
|
99
|
-
* @param {string} name
|
|
100
|
-
* @param {object} input
|
|
101
|
-
* @returns {string|null}
|
|
102
|
-
*/
|
|
103
|
-
function hintForMcp(name, input) {
|
|
104
|
-
if (name.startsWith("mcp__orchestration__")) {
|
|
105
|
-
const parts = [];
|
|
106
|
-
if (input.to) parts.push(`to ${sanitize(input.to)}`);
|
|
107
|
-
if (input.from) parts.push(`from ${sanitize(input.from)}`);
|
|
108
|
-
return truncate(parts.join(" "));
|
|
109
|
-
}
|
|
110
|
-
if (name.startsWith("mcp__")) {
|
|
111
|
-
return "";
|
|
112
|
-
}
|
|
113
|
-
return null;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
99
|
/**
|
|
117
100
|
* Map a tool name and input to a one-line human hint.
|
|
118
101
|
*
|
|
119
|
-
*
|
|
120
|
-
*
|
|
121
|
-
*
|
|
122
|
-
*
|
|
102
|
+
* Three branches, in priority order:
|
|
103
|
+
* - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
|
|
104
|
+
* `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
|
|
105
|
+
* - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
|
|
106
|
+
* single-line JSON; `{` and `"` intentionally appear so readers see
|
|
107
|
+
* the actual MCP payload.
|
|
108
|
+
* - Anything else → "" (the caller still shows the bare tool name).
|
|
123
109
|
*
|
|
124
110
|
* @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
|
|
125
111
|
* @param {object|null|undefined} input - Raw tool input object from the trace
|
|
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
|
|
|
132
118
|
const handler = HINT_HANDLERS[name];
|
|
133
119
|
if (handler) return handler(safeInput);
|
|
134
120
|
|
|
135
|
-
|
|
136
|
-
if (mcp !== null) return mcp;
|
|
121
|
+
if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
|
|
137
122
|
|
|
138
123
|
return "";
|
|
139
124
|
}
|
|
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
|
|
|
154
139
|
: typeof content === "string"
|
|
155
140
|
? content
|
|
156
141
|
: JSON.stringify(content);
|
|
157
|
-
const
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
break;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
if (isError) {
|
|
167
|
-
const body = firstNonBlank || "(no output)";
|
|
168
|
-
return {
|
|
169
|
-
text:
|
|
170
|
-
body.length <= MAX_HINT_CHARS
|
|
171
|
-
? body
|
|
172
|
-
: body.slice(0, MAX_HINT_CHARS - 3) + "...",
|
|
173
|
-
isError: true,
|
|
174
|
-
};
|
|
175
|
-
}
|
|
142
|
+
const firstNonBlank =
|
|
143
|
+
normalized
|
|
144
|
+
.split(/\r?\n/)
|
|
145
|
+
.map((l) => l.trim())
|
|
146
|
+
.find((l) => l.length > 0) ?? "";
|
|
176
147
|
|
|
177
|
-
|
|
148
|
+
const fallback = isError ? "(no output)" : "(ok)";
|
|
178
149
|
return {
|
|
179
|
-
text:
|
|
180
|
-
|
|
181
|
-
? firstNonBlank
|
|
182
|
-
: firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
|
|
183
|
-
isError: false,
|
|
150
|
+
text: truncate(firstNonBlank || fallback),
|
|
151
|
+
isError,
|
|
184
152
|
};
|
|
185
153
|
}
|
package/src/render/turn-renderer.js
CHANGED
|
@@ -25,12 +25,7 @@ import {
|
|
|
25
25
|
* @returns {string[]} Array of rendered line strings
|
|
26
26
|
*/
|
|
27
27
|
export function renderTurnLines(turn, withPrefix) {
|
|
28
|
-
|
|
29
|
-
if (turn.role === "tool_result")
|
|
30
|
-
return renderToolResultTurn(turn, withPrefix);
|
|
31
|
-
if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
|
|
32
|
-
if (turn.role === "user") return renderUserTurn(turn, withPrefix);
|
|
33
|
-
return [];
|
|
28
|
+
return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
|
|
34
29
|
}
|
|
35
30
|
|
|
36
31
|
/** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
|
|
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
|
|
|
57
52
|
|
|
58
53
|
/** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
|
|
59
54
|
function renderToolResultTurn(turn, withPrefix) {
|
|
55
|
+
// Successful tool results emit no preview line — the trace document keeps
|
|
56
|
+
// the structured turn, but readers of the streamed log only see errors.
|
|
57
|
+
if (!turn.isError) return [];
|
|
60
58
|
return [
|
|
61
59
|
renderToolResultLine({
|
|
62
60
|
source: turn.source,
|
|
63
|
-
preview: previewForResult(turn.content,
|
|
61
|
+
preview: previewForResult(turn.content, true),
|
|
64
62
|
withPrefix,
|
|
65
63
|
}),
|
|
66
64
|
];
|
|
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
|
|
|
90
88
|
}
|
|
91
89
|
return lines;
|
|
92
90
|
}
|
|
91
|
+
|
|
92
|
+
const TURN_RENDERERS = {
|
|
93
|
+
assistant: renderAssistantTurn,
|
|
94
|
+
tool_result: renderToolResultTurn,
|
|
95
|
+
system: renderSystemTurn,
|
|
96
|
+
user: renderUserTurn,
|
|
97
|
+
};
|
package/src/supervisor.js
CHANGED
|
@@ -36,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
|
|
|
36
36
|
"Answer replies to an ask the agent addressed to you. " +
|
|
37
37
|
"Announce sends a message with no reply obligation. " +
|
|
38
38
|
"Redirect interrupts the agent with replacement instructions. " +
|
|
39
|
-
"Conclude ends the session with a summary
|
|
39
|
+
"Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
|
|
40
|
+
"the verdict reflects whether the agent's work meets the criteria stated in the task.";
|
|
40
41
|
|
|
41
42
|
/** System prompt appended for the agent runner in supervise mode. */
|
|
42
43
|
export const AGENT_SYSTEM_PROMPT =
|
|
@@ -110,8 +111,14 @@ export class Supervisor {
|
|
|
110
111
|
}
|
|
111
112
|
|
|
112
113
|
if (this.ctx.concluded) {
|
|
113
|
-
|
|
114
|
-
|
|
114
|
+
const success = this.ctx.verdict === "success";
|
|
115
|
+
this.emitSummary({
|
|
116
|
+
success,
|
|
117
|
+
verdict: this.ctx.verdict,
|
|
118
|
+
turns: 0,
|
|
119
|
+
summary: this.ctx.summary,
|
|
120
|
+
});
|
|
121
|
+
return { success, turns: 0 };
|
|
115
122
|
}
|
|
116
123
|
|
|
117
124
|
let pendingRelay = null;
|
|
@@ -214,12 +221,14 @@ export class Supervisor {
|
|
|
214
221
|
}
|
|
215
222
|
|
|
216
223
|
if (this.ctx.concluded) {
|
|
224
|
+
const success = this.ctx.verdict === "success";
|
|
217
225
|
this.emitSummary({
|
|
218
|
-
success
|
|
226
|
+
success,
|
|
227
|
+
verdict: this.ctx.verdict,
|
|
219
228
|
turns: turn,
|
|
220
229
|
summary: this.ctx.summary,
|
|
221
230
|
});
|
|
222
|
-
return { type: "exit", exit: { success
|
|
231
|
+
return { type: "exit", exit: { success, turns: turn } };
|
|
223
232
|
}
|
|
224
233
|
|
|
225
234
|
if (agentResult.aborted && this.ctx.redirect) {
|
|
@@ -308,12 +317,14 @@ export class Supervisor {
|
|
|
308
317
|
}
|
|
309
318
|
|
|
310
319
|
if (this.ctx.concluded) {
|
|
320
|
+
const success = this.ctx.verdict === "success";
|
|
311
321
|
this.emitSummary({
|
|
312
|
-
success
|
|
322
|
+
success,
|
|
323
|
+
verdict: this.ctx.verdict,
|
|
313
324
|
turns: turn,
|
|
314
325
|
summary: this.ctx.summary,
|
|
315
326
|
});
|
|
316
|
-
return { exit: { success
|
|
327
|
+
return { exit: { success, turns: turn } };
|
|
317
328
|
}
|
|
318
329
|
|
|
319
330
|
if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
|
|
@@ -323,12 +334,14 @@ export class Supervisor {
|
|
|
323
334
|
formatMessages(reminders),
|
|
324
335
|
);
|
|
325
336
|
if (this.ctx.concluded) {
|
|
337
|
+
const success = this.ctx.verdict === "success";
|
|
326
338
|
this.emitSummary({
|
|
327
|
-
success
|
|
339
|
+
success,
|
|
340
|
+
verdict: this.ctx.verdict,
|
|
328
341
|
turns: turn,
|
|
329
342
|
summary: this.ctx.summary,
|
|
330
343
|
});
|
|
331
|
-
return { exit: { success
|
|
344
|
+
return { exit: { success, turns: turn } };
|
|
332
345
|
}
|
|
333
346
|
this.#checkAsk("supervisor");
|
|
334
347
|
}
|
|
@@ -426,7 +439,7 @@ export class Supervisor {
|
|
|
426
439
|
|
|
427
440
|
/**
|
|
428
441
|
* Emit a final orchestrator summary line, wrapped in the universal envelope.
|
|
429
|
-
* @param {{success: boolean, turns: number, summary?: string}} result
|
|
442
|
+
* @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
|
|
430
443
|
*/
|
|
431
444
|
emitSummary(result) {
|
|
432
445
|
this.output.write(
|
|
@@ -436,6 +449,7 @@ export class Supervisor {
|
|
|
436
449
|
event: {
|
|
437
450
|
type: "summary",
|
|
438
451
|
success: result.success,
|
|
452
|
+
...(result.verdict && { verdict: result.verdict }),
|
|
439
453
|
turns: result.turns,
|
|
440
454
|
...(result.summary && { summary: result.summary }),
|
|
441
455
|
},
|
|
@@ -466,6 +480,7 @@ const devNull = new Writable({
|
|
|
466
480
|
* @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
|
|
467
481
|
* @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
|
|
468
482
|
* @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
|
|
483
|
+
* @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
|
|
469
484
|
* @returns {Supervisor}
|
|
470
485
|
*/
|
|
471
486
|
export function createSupervisor({
|
|
@@ -482,6 +497,7 @@ export function createSupervisor({
|
|
|
482
497
|
agentProfile,
|
|
483
498
|
profilesDir,
|
|
484
499
|
taskAmend,
|
|
500
|
+
agentMcpServers,
|
|
485
501
|
}) {
|
|
486
502
|
const resolvedProfilesDir =
|
|
487
503
|
profilesDir ?? resolve(supervisorCwd, ".claude/agents");
|
|
@@ -521,7 +537,7 @@ export function createSupervisor({
|
|
|
521
537
|
onLine,
|
|
522
538
|
settingSources: ["project"],
|
|
523
539
|
systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
|
|
524
|
-
mcpServers: { orchestration: agentServer },
|
|
540
|
+
mcpServers: { orchestration: agentServer, ...agentMcpServers },
|
|
525
541
|
});
|
|
526
542
|
|
|
527
543
|
const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
|
package/src/tee-writer.js
CHANGED
|
@@ -100,6 +100,12 @@ export class TeeWriter extends Writable {
|
|
|
100
100
|
|
|
101
101
|
// Universal envelope: { source, seq, event }
|
|
102
102
|
if (parsed.event) {
|
|
103
|
+
// Always forward to the collector so it can capture orchestrator
|
|
104
|
+
// metadata (e.g. the summary verdict for the result footer); the
|
|
105
|
+
// collector adds no turn for suppressed events, so flushTurns stays
|
|
106
|
+
// a no-op when we skip it below.
|
|
107
|
+
this.collector.addLine(line);
|
|
108
|
+
|
|
103
109
|
// Orchestrator lifecycle events are suppressed from the text stream
|
|
104
110
|
// entirely (spec 540). They still reached fileStream above.
|
|
105
111
|
if (
|
|
@@ -108,7 +114,6 @@ export class TeeWriter extends Writable {
|
|
|
108
114
|
) {
|
|
109
115
|
return;
|
|
110
116
|
}
|
|
111
|
-
this.collector.addLine(line);
|
|
112
117
|
this.flushTurns();
|
|
113
118
|
return;
|
|
114
119
|
}
|
package/src/trace-collector.js
CHANGED
|
@@ -27,6 +27,8 @@ export class TraceCollector {
|
|
|
27
27
|
this.turns = [];
|
|
28
28
|
/** @type {object|null} */
|
|
29
29
|
this.result = null;
|
|
30
|
+
/** @type {{verdict?: string, summary?: string, turns?: number}|null} */
|
|
31
|
+
this.orchestratorSummary = null;
|
|
30
32
|
/** @type {number} */
|
|
31
33
|
this.turnIndex = 0;
|
|
32
34
|
/** @type {object|null} */
|
|
@@ -62,6 +64,16 @@ export class TraceCollector {
|
|
|
62
64
|
// Orchestrator lifecycle events carry no content and are suppressed
|
|
63
65
|
// from turns entirely — the NDJSON artifact keeps them separately.
|
|
64
66
|
if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
|
|
67
|
+
// The summary event carries the supervisor/facilitator verdict —
|
|
68
|
+
// capture it before dropping the event, so the result footer can
|
|
69
|
+
// surface verdict="failure" instead of the SDK's per-runner status.
|
|
70
|
+
if (event.type === "summary") {
|
|
71
|
+
this.orchestratorSummary = {
|
|
72
|
+
...(event.verdict && { verdict: event.verdict }),
|
|
73
|
+
...(typeof event.summary === "string" && { summary: event.summary }),
|
|
74
|
+
...(typeof event.turns === "number" && { turns: event.turns }),
|
|
75
|
+
};
|
|
76
|
+
}
|
|
65
77
|
return;
|
|
66
78
|
}
|
|
67
79
|
|
|
@@ -277,16 +289,20 @@ export class TraceCollector {
|
|
|
277
289
|
}
|
|
278
290
|
|
|
279
291
|
/**
|
|
280
|
-
* Format the trailing result summary line (spec 540).
|
|
292
|
+
* Format the trailing result summary line (spec 540). When an orchestrator
|
|
293
|
+
* summary is present (supervised / facilitated mode), the headline word is
|
|
294
|
+
* the supervisor's verdict ("success" / "failure") rather than the SDK's
|
|
295
|
+
* per-runner subtype, so the footer aligns with the CI exit code.
|
|
281
296
|
* @returns {string}
|
|
282
297
|
*/
|
|
283
298
|
#formatResultTail() {
|
|
284
299
|
if (!this.result) return "";
|
|
285
300
|
const duration = formatDuration(this.result.durationMs);
|
|
286
301
|
const cost = Number(this.result.totalCostUsd).toFixed(4);
|
|
302
|
+
const headline = this.orchestratorSummary?.verdict ?? this.result.result;
|
|
287
303
|
return (
|
|
288
304
|
"\n" +
|
|
289
|
-
`--- Result: ${
|
|
305
|
+
`--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
|
|
290
306
|
);
|
|
291
307
|
}
|
|
292
308
|
}
|
package/src/trace-github.js
CHANGED
|
@@ -65,8 +65,10 @@ export class TraceGitHub {
|
|
|
65
65
|
/**
|
|
66
66
|
* Download a trace artifact from a workflow run and extract it.
|
|
67
67
|
*
|
|
68
|
-
*
|
|
69
|
-
*
|
|
68
|
+
* When `opts.name` is set, looks up that exact artifact. Otherwise picks the
|
|
69
|
+
* best match from the unified `trace--<case>--<participant>.<role>` naming
|
|
70
|
+
* convention: prefer a `*.raw` artifact (combined log), then any `*.agent`,
|
|
71
|
+
* then the first `trace--*` artifact found.
|
|
70
72
|
*
|
|
71
73
|
* @param {number|string} runId
|
|
72
74
|
* @param {object} [opts]
|
|
@@ -84,13 +86,18 @@ export class TraceGitHub {
|
|
|
84
86
|
const artifacts = data.artifacts ?? [];
|
|
85
87
|
|
|
86
88
|
// Find the trace artifact.
|
|
87
|
-
const preferredNames = opts.name
|
|
88
|
-
? [opts.name]
|
|
89
|
-
: ["combined-trace", "agent-trace"];
|
|
90
89
|
let artifact = null;
|
|
91
|
-
|
|
92
|
-
artifact = artifacts.find((a) => a.name === name);
|
|
93
|
-
|
|
90
|
+
if (opts.name) {
|
|
91
|
+
artifact = artifacts.find((a) => a.name === opts.name);
|
|
92
|
+
} else {
|
|
93
|
+
const traceArtifacts = artifacts.filter((a) =>
|
|
94
|
+
a.name.startsWith("trace--"),
|
|
95
|
+
);
|
|
96
|
+
artifact =
|
|
97
|
+
traceArtifacts.find((a) => a.name.endsWith(".raw")) ??
|
|
98
|
+
traceArtifacts.find((a) => a.name.endsWith(".agent")) ??
|
|
99
|
+
traceArtifacts[0] ??
|
|
100
|
+
null;
|
|
94
101
|
}
|
|
95
102
|
|
|
96
103
|
if (!artifact) {
|