@forwardimpact/libeval 0.1.28 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
10
10
  import { runSuperviseCommand } from "../src/commands/supervise.js";
11
11
  import { runFacilitateCommand } from "../src/commands/facilitate.js";
12
12
 
13
- const { version: VERSION } = JSON.parse(
14
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
15
- );
13
+ // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
14
+ // the readFileSync branch in the compiled binary (which would ENOENT against
15
+ // the bunfs virtual mount). Source execution falls through to package.json.
16
+ const VERSION =
17
+ process.env.FIT_EVAL_VERSION ||
18
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
19
+ .version;
16
20
 
17
21
  const definition = {
18
22
  name: "fit-eval",
@@ -55,6 +59,11 @@ const definition = {
55
59
  type: "string",
56
60
  description: "Comma-separated tool allowlist",
57
61
  },
62
+ "mcp-server": {
63
+ type: "string",
64
+ description:
65
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
66
+ },
58
67
  },
59
68
  },
60
69
  {
@@ -102,6 +111,11 @@ const definition = {
102
111
  type: "string",
103
112
  description: "Supervisor tool allowlist",
104
113
  },
114
+ "mcp-server": {
115
+ type: "string",
116
+ description:
117
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
118
+ },
105
119
  },
106
120
  },
107
121
  {
package/bin/fit-trace.js CHANGED
@@ -26,9 +26,13 @@ import {
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
28
 
29
- const { version: VERSION } = JSON.parse(
30
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
31
- );
29
+ // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
30
+ // the readFileSync branch in the compiled binary (which would ENOENT against
31
+ // the bunfs virtual mount). Source execution falls through to package.json.
32
+ const VERSION =
33
+ process.env.FIT_TRACE_VERSION ||
34
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
35
+ .version;
32
36
 
33
37
  const definition = {
34
38
  name: "fit-trace",
@@ -178,11 +182,16 @@ const definition = {
178
182
  name: "split",
179
183
  args: "<file>",
180
184
  description:
181
- "Split a combined trace into per-source files (one per agent or supervisor)",
185
+ "Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
182
186
  options: {
183
187
  mode: {
184
188
  type: "string",
185
- description: "Execution mode: run (no-op), supervise, or facilitate",
189
+ description: "Execution mode: run, supervise, or facilitate",
190
+ },
191
+ case: {
192
+ type: "string",
193
+ description:
194
+ "Case identifier embedded in output filenames (default: default)",
186
195
  },
187
196
  "output-dir": {
188
197
  type: "string",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.28",
3
+ "version": "0.1.31",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -51,7 +51,7 @@
51
51
  "@forwardimpact/libcli": "^0.1.0",
52
52
  "@forwardimpact/libconfig": "^0.1.0",
53
53
  "@forwardimpact/libtelemetry": "^0.1.22",
54
- "zod": "^4.4.1"
54
+ "zod": "^4.4.3"
55
55
  },
56
56
  "devDependencies": {
57
57
  "@forwardimpact/libharness": "^0.1.14"
@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
5
5
  import { composeProfilePrompt } from "../profile-prompt.js";
6
6
  import { createTeeWriter } from "../tee-writer.js";
7
7
  import { SequenceCounter } from "../sequence-counter.js";
8
+ import { createServiceConfig } from "@forwardimpact/libconfig";
8
9
 
9
10
  /**
10
11
  * Parse and validate run command options from parsed values.
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
35
36
  values["allowed-tools"] ??
36
37
  "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
37
38
  ).split(","),
39
+ mcpServer: values["mcp-server"] ?? undefined,
38
40
  };
39
41
  }
40
42
 
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
56
58
  outputPath,
57
59
  agentProfile,
58
60
  allowedTools,
61
+ mcpServer,
59
62
  } = parseRunOptions(values);
60
63
 
61
64
  // When --output is specified, stream text to stdout while writing NDJSON to file.
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
78
81
  );
79
82
  };
80
83
 
84
+ let mcpServers = null;
85
+ if (mcpServer) {
86
+ const mcpConfig = await createServiceConfig("mcp");
87
+ mcpServers = {
88
+ [mcpServer]: {
89
+ type: "http",
90
+ url: mcpConfig.url,
91
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
92
+ },
93
+ };
94
+ allowedTools.push(`mcp__${mcpServer}__*`);
95
+ }
96
+
81
97
  if (agentProfile) {
82
98
  process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
83
99
  }
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
100
116
  settingSources: ["project"],
101
117
  systemPrompt,
102
118
  taskAmend,
119
+ mcpServers,
103
120
  });
104
121
 
105
122
  const result = await runner.run(taskContent);
@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
3
3
  import { tmpdir } from "node:os";
4
4
  import { createSupervisor } from "../supervisor.js";
5
5
  import { createTeeWriter } from "../tee-writer.js";
6
+ import { createServiceConfig } from "@forwardimpact/libconfig";
6
7
 
7
8
  /**
8
9
  * Parse all supervise flags from parsed values into an options object.
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
44
45
  supervisorAllowedTools: supervisorAllowedToolsRaw
45
46
  ? supervisorAllowedToolsRaw.split(",")
46
47
  : undefined,
48
+ mcpServer: values["mcp-server"] ?? undefined,
47
49
  };
48
50
  }
49
51
 
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
71
73
  })
72
74
  : process.stdout;
73
75
 
76
+ let agentMcpServers = null;
77
+ if (opts.mcpServer) {
78
+ const mcpConfig = await createServiceConfig("mcp");
79
+ agentMcpServers = {
80
+ [opts.mcpServer]: {
81
+ type: "http",
82
+ url: mcpConfig.url,
83
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
84
+ },
85
+ };
86
+ opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
87
+ }
88
+
74
89
  if (opts.agentProfile) {
75
90
  process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
76
91
  }
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
88
103
  supervisorProfile: opts.supervisorProfile,
89
104
  agentProfile: opts.agentProfile,
90
105
  taskAmend: opts.taskAmend,
106
+ agentMcpServers,
91
107
  });
92
108
 
93
109
  const result = await supervisor.run(opts.taskContent);
@@ -152,11 +152,22 @@ export async function runFilterCommand(values, args) {
152
152
 
153
153
  // --- Split command ---
154
154
 
155
- /** Valid agent source name pattern: lowercase letter, then lowercase alphanumeric or hyphen */
155
+ /** Valid source name pattern: lowercase letter, then lowercase alphanumeric or hyphen. */
156
156
  const VALID_SOURCE_NAME = /^[a-z][a-z0-9-]*$/;
157
157
 
158
+ /** Sources whose name is itself a structural role; classified into the role they represent. */
159
+ const STRUCTURAL_ROLES = new Set(["agent", "supervisor", "facilitator"]);
160
+
158
161
  /**
159
- * Split a combined NDJSON trace into per-source files.
162
+ * Split a combined NDJSON trace into per-source files using the
163
+ * `trace--<case>--<participant>.<role>.ndjson` convention.
164
+ *
165
+ * Each valid envelope source becomes one output file. Structural sources
166
+ * (`agent`, `supervisor`, `facilitator`) classify into the matching role and
167
+ * use their own name as participant; profile-named sources (e.g.
168
+ * `staff-engineer`) classify as agents with the profile in the participant
169
+ * slot. Orchestrator events and invalid source names are dropped.
170
+ *
160
171
  * @param {object} values - Parsed option values
161
172
  * @param {string[]} args - [file]
162
173
  */
@@ -166,24 +177,24 @@ export async function runSplitCommand(values, args) {
166
177
 
167
178
  const mode = values.mode;
168
179
  if (!mode) throw new Error("split: --mode is required");
169
-
170
- if (mode === "run") {
171
- process.stdout.write(
172
- "run mode: trace is already in final form, no split needed\n",
173
- );
174
- return;
180
+ if (!["run", "supervise", "facilitate"].includes(mode)) {
181
+ throw new Error(`split: invalid --mode "${mode}"`);
175
182
  }
176
183
 
184
+ const caseId = values.case ?? "default";
177
185
  const outputDir = values["output-dir"] || dirname(file);
178
186
  mkdirSync(outputDir, { recursive: true });
179
187
 
180
188
  const buckets = parseBuckets(readFileSync(file, "utf8"));
181
189
 
182
- if (mode === "supervise") {
183
- writeBucket(buckets, "agent", outputDir);
184
- writeBucket(buckets, "supervisor", outputDir);
185
- } else if (mode === "facilitate") {
186
- splitFacilitated(buckets, outputDir);
190
+ for (const [source, lines] of buckets.entries()) {
191
+ if (!VALID_SOURCE_NAME.test(source)) continue;
192
+ const role = STRUCTURAL_ROLES.has(source) ? source : "agent";
193
+ const outPath = join(
194
+ outputDir,
195
+ `trace--${caseId}--${source}.${role}.ndjson`,
196
+ );
197
+ writeFileSync(outPath, lines.join("\n") + "\n");
187
198
  }
188
199
  }
189
200
 
@@ -219,44 +230,6 @@ function parseBuckets(content) {
219
230
  return buckets;
220
231
  }
221
232
 
222
- /**
223
- * Write facilitated mode split: facilitator, per-agent, and combined agent files.
224
- * @param {Map<string, string[]>} buckets
225
- * @param {string} outputDir
226
- */
227
- function splitFacilitated(buckets, outputDir) {
228
- writeBucket(buckets, "facilitator", outputDir);
229
-
230
- const agentSources = [...buckets.keys()].filter(
231
- (s) => s !== "facilitator" && VALID_SOURCE_NAME.test(s),
232
- );
233
-
234
- for (const name of agentSources) {
235
- writeBucket(buckets, name, outputDir);
236
- }
237
-
238
- const combinedLines = agentSources.flatMap((n) => buckets.get(n) ?? []);
239
- if (combinedLines.length > 0) {
240
- writeFileSync(
241
- join(outputDir, "trace-agent.ndjson"),
242
- combinedLines.join("\n") + "\n",
243
- );
244
- }
245
- }
246
-
247
- /**
248
- * Write a single source bucket to a trace-{name}.ndjson file.
249
- * @param {Map<string, string[]>} buckets
250
- * @param {string} name
251
- * @param {string} outputDir
252
- */
253
- function writeBucket(buckets, name, outputDir) {
254
- const lines = buckets.get(name);
255
- if (!lines || lines.length === 0) return;
256
- const outPath = join(outputDir, `trace-${name}.ndjson`);
257
- writeFileSync(outPath, lines.join("\n") + "\n");
258
- }
259
-
260
233
  // --- Shared helpers ---
261
234
 
262
235
  /**
@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
26
26
  "Announce sends a message with no reply obligation. " +
27
27
  "Redirect interrupts a participant with replacement instructions. " +
28
28
  "RollCall lists participants. " +
29
- "Conclude ends the session with a summary.";
29
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
30
+ "the verdict reflects whether the session met the criteria stated in the task.";
30
31
 
31
32
  /** System prompt appended for facilitated agent runners. */
32
33
  export const FACILITATED_AGENT_SYSTEM_PROMPT =
@@ -106,12 +107,14 @@ export class Facilitator {
106
107
  // messages and started processing concurrently.
107
108
  this.concludeResolve();
108
109
  await Promise.allSettled(agentPromises);
110
+ const success = this.ctx.verdict === "success";
109
111
  this.emitSummary({
110
- success: true,
112
+ success,
113
+ verdict: this.ctx.verdict,
111
114
  turns: this.facilitatorTurns,
112
115
  summary: this.ctx.summary,
113
116
  });
114
- return { success: true, turns: this.facilitatorTurns };
117
+ return { success, turns: this.facilitatorTurns };
115
118
  }
116
119
 
117
120
  // Abort agents promptly when Conclude is called during the event loop
@@ -134,12 +137,14 @@ export class Facilitator {
134
137
  throw err;
135
138
  }
136
139
 
140
+ const success = this.ctx.concluded && this.ctx.verdict === "success";
137
141
  const result = {
138
- success: this.ctx.concluded,
142
+ success,
139
143
  turns: this.facilitatorTurns,
140
144
  };
141
145
  this.emitSummary({
142
- success: result.success,
146
+ success,
147
+ verdict: this.ctx.verdict,
143
148
  turns: result.turns,
144
149
  summary: this.ctx.summary,
145
150
  });
@@ -344,7 +349,7 @@ export class Facilitator {
344
349
  }
345
350
 
346
351
  /**
347
- * @param {{success: boolean, turns: number, summary?: string}} result
352
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
348
353
  */
349
354
  emitSummary(result) {
350
355
  this.output.write(
@@ -354,6 +359,7 @@ export class Facilitator {
354
359
  event: {
355
360
  type: "summary",
356
361
  success: result.success,
362
+ ...(result.verdict && { verdict: result.verdict }),
357
363
  turns: result.turns,
358
364
  ...(result.summary && { summary: result.summary }),
359
365
  },
@@ -22,6 +22,7 @@ import { z } from "zod";
22
22
  export function createOrchestrationContext() {
23
23
  return {
24
24
  concluded: false,
25
+ verdict: null,
25
26
  summary: null,
26
27
  redirect: null,
27
28
  participants: [],
@@ -37,10 +38,11 @@ export function createOrchestrationContext() {
37
38
 
38
39
  // --- Handler factories ---
39
40
 
40
- /** Create a handler that marks the session as concluded and records the summary. */
41
+ /** Create a handler that marks the session as concluded and records the verdict and summary. */
41
42
  export function createConcludeHandler(ctx) {
42
- return async ({ summary }) => {
43
+ return async ({ verdict, summary }) => {
43
44
  ctx.concluded = true;
45
+ ctx.verdict = verdict;
44
46
  ctx.summary = summary;
45
47
  return { content: [{ type: "text", text: "Session concluded." }] };
46
48
  };
@@ -220,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
220
222
  ),
221
223
  tool(
222
224
  "Conclude",
223
- "End the session with a summary.",
224
- { summary: z.string() },
225
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
226
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
225
227
  createConcludeHandler(ctx),
226
228
  ),
227
229
  tool(
@@ -307,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
307
309
  ),
308
310
  tool(
309
311
  "Conclude",
310
- "End the session with a summary.",
311
- { summary: z.string() },
312
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
313
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
312
314
  createConcludeHandler(ctx),
313
315
  ),
314
316
  tool(
@@ -6,6 +6,11 @@
6
6
  * tool (file path, command, pattern, …) sanitized to strip JSON punctuation
7
7
  * (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
8
8
  *
9
+ * MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
10
+ * the full input rendered as compact single-line JSON, so `{` and `"` do
11
+ * appear on those lines. Readers of GitHub workflow logs need the full MCP
12
+ * payload to know what was actually sent across the protocol.
13
+ *
9
14
  * `previewForResult(content, isError)` collapses a tool result to a single
10
15
  * line ≤ 80 chars and flags errors so the renderer can apply the reserved
11
16
  * error color and the `Error:` label.
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
91
96
  return parts.slice(2).join("__");
92
97
  }
93
98
 
94
- /**
95
- * MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
96
- * handler path. The method name itself is surfaced via `simplifyToolName`,
97
- * so this only adds the `to/from` decorators for orchestration calls.
98
- * Returns null if the name does not match any MCP prefix.
99
- * @param {string} name
100
- * @param {object} input
101
- * @returns {string|null}
102
- */
103
- function hintForMcp(name, input) {
104
- if (name.startsWith("mcp__orchestration__")) {
105
- const parts = [];
106
- if (input.to) parts.push(`to ${sanitize(input.to)}`);
107
- if (input.from) parts.push(`from ${sanitize(input.from)}`);
108
- return truncate(parts.join(" "));
109
- }
110
- if (name.startsWith("mcp__")) {
111
- return "";
112
- }
113
- return null;
114
- }
115
-
116
99
  /**
117
100
  * Map a tool name and input to a one-line human hint.
118
101
  *
119
- * Unknown tools return an empty hint — the caller still shows the tool
120
- * name, just without extra detail. Sanitization is uniform: every branch
121
- * ends with `sanitize`, so the output is guaranteed free of `{`, `}`, `"`
122
- * from the input object (success criterion #2).
102
+ * Three branches, in priority order:
103
+ * - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
104
+ * `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
105
+ * - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
106
+ * single-line JSON; `{` and `"` intentionally appear so readers see
107
+ * the actual MCP payload.
108
+ * - Anything else → "" (the caller still shows the bare tool name).
123
109
  *
124
110
  * @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
125
111
  * @param {object|null|undefined} input - Raw tool input object from the trace
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
132
118
  const handler = HINT_HANDLERS[name];
133
119
  if (handler) return handler(safeInput);
134
120
 
135
- const mcp = hintForMcp(name, safeInput);
136
- if (mcp !== null) return mcp;
121
+ if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
137
122
 
138
123
  return "";
139
124
  }
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
154
139
  : typeof content === "string"
155
140
  ? content
156
141
  : JSON.stringify(content);
157
- const lines = normalized.split(/\r?\n/);
158
- let firstNonBlank = "";
159
- for (const line of lines) {
160
- if (line.trim().length > 0) {
161
- firstNonBlank = line.trim();
162
- break;
163
- }
164
- }
165
-
166
- if (isError) {
167
- const body = firstNonBlank || "(no output)";
168
- return {
169
- text:
170
- body.length <= MAX_HINT_CHARS
171
- ? body
172
- : body.slice(0, MAX_HINT_CHARS - 3) + "...",
173
- isError: true,
174
- };
175
- }
142
+ const firstNonBlank =
143
+ normalized
144
+ .split(/\r?\n/)
145
+ .map((l) => l.trim())
146
+ .find((l) => l.length > 0) ?? "";
176
147
 
177
- if (!firstNonBlank) return { text: "(ok)", isError: false };
148
+ const fallback = isError ? "(no output)" : "(ok)";
178
149
  return {
179
- text:
180
- firstNonBlank.length <= MAX_HINT_CHARS
181
- ? firstNonBlank
182
- : firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
183
- isError: false,
150
+ text: truncate(firstNonBlank || fallback),
151
+ isError,
184
152
  };
185
153
  }
@@ -25,12 +25,7 @@ import {
25
25
  * @returns {string[]} Array of rendered line strings
26
26
  */
27
27
  export function renderTurnLines(turn, withPrefix) {
28
- if (turn.role === "assistant") return renderAssistantTurn(turn, withPrefix);
29
- if (turn.role === "tool_result")
30
- return renderToolResultTurn(turn, withPrefix);
31
- if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
32
- if (turn.role === "user") return renderUserTurn(turn, withPrefix);
33
- return [];
28
+ return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
34
29
  }
35
30
 
36
31
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
57
52
 
58
53
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
59
54
  function renderToolResultTurn(turn, withPrefix) {
55
+ // Successful tool results emit no preview line — the trace document keeps
56
+ // the structured turn, but readers of the streamed log only see errors.
57
+ if (!turn.isError) return [];
60
58
  return [
61
59
  renderToolResultLine({
62
60
  source: turn.source,
63
- preview: previewForResult(turn.content, turn.isError),
61
+ preview: previewForResult(turn.content, true),
64
62
  withPrefix,
65
63
  }),
66
64
  ];
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
90
88
  }
91
89
  return lines;
92
90
  }
91
+
92
+ const TURN_RENDERERS = {
93
+ assistant: renderAssistantTurn,
94
+ tool_result: renderToolResultTurn,
95
+ system: renderSystemTurn,
96
+ user: renderUserTurn,
97
+ };
package/src/supervisor.js CHANGED
@@ -36,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
36
36
  "Answer replies to an ask the agent addressed to you. " +
37
37
  "Announce sends a message with no reply obligation. " +
38
38
  "Redirect interrupts the agent with replacement instructions. " +
39
- "Conclude ends the session with a summary.";
39
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
40
+ "the verdict reflects whether the agent's work meets the criteria stated in the task.";
40
41
 
41
42
  /** System prompt appended for the agent runner in supervise mode. */
42
43
  export const AGENT_SYSTEM_PROMPT =
@@ -110,8 +111,14 @@ export class Supervisor {
110
111
  }
111
112
 
112
113
  if (this.ctx.concluded) {
113
- this.emitSummary({ success: true, turns: 0, summary: this.ctx.summary });
114
- return { success: true, turns: 0 };
114
+ const success = this.ctx.verdict === "success";
115
+ this.emitSummary({
116
+ success,
117
+ verdict: this.ctx.verdict,
118
+ turns: 0,
119
+ summary: this.ctx.summary,
120
+ });
121
+ return { success, turns: 0 };
115
122
  }
116
123
 
117
124
  let pendingRelay = null;
@@ -214,12 +221,14 @@ export class Supervisor {
214
221
  }
215
222
 
216
223
  if (this.ctx.concluded) {
224
+ const success = this.ctx.verdict === "success";
217
225
  this.emitSummary({
218
- success: true,
226
+ success,
227
+ verdict: this.ctx.verdict,
219
228
  turns: turn,
220
229
  summary: this.ctx.summary,
221
230
  });
222
- return { type: "exit", exit: { success: true, turns: turn } };
231
+ return { type: "exit", exit: { success, turns: turn } };
223
232
  }
224
233
 
225
234
  if (agentResult.aborted && this.ctx.redirect) {
@@ -308,12 +317,14 @@ export class Supervisor {
308
317
  }
309
318
 
310
319
  if (this.ctx.concluded) {
320
+ const success = this.ctx.verdict === "success";
311
321
  this.emitSummary({
312
- success: true,
322
+ success,
323
+ verdict: this.ctx.verdict,
313
324
  turns: turn,
314
325
  summary: this.ctx.summary,
315
326
  });
316
- return { exit: { success: true, turns: turn } };
327
+ return { exit: { success, turns: turn } };
317
328
  }
318
329
 
319
330
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
@@ -323,12 +334,14 @@ export class Supervisor {
323
334
  formatMessages(reminders),
324
335
  );
325
336
  if (this.ctx.concluded) {
337
+ const success = this.ctx.verdict === "success";
326
338
  this.emitSummary({
327
- success: true,
339
+ success,
340
+ verdict: this.ctx.verdict,
328
341
  turns: turn,
329
342
  summary: this.ctx.summary,
330
343
  });
331
- return { exit: { success: true, turns: turn } };
344
+ return { exit: { success, turns: turn } };
332
345
  }
333
346
  this.#checkAsk("supervisor");
334
347
  }
@@ -426,7 +439,7 @@ export class Supervisor {
426
439
 
427
440
  /**
428
441
  * Emit a final orchestrator summary line, wrapped in the universal envelope.
429
- * @param {{success: boolean, turns: number, summary?: string}} result
442
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
430
443
  */
431
444
  emitSummary(result) {
432
445
  this.output.write(
@@ -436,6 +449,7 @@ export class Supervisor {
436
449
  event: {
437
450
  type: "summary",
438
451
  success: result.success,
452
+ ...(result.verdict && { verdict: result.verdict }),
439
453
  turns: result.turns,
440
454
  ...(result.summary && { summary: result.summary }),
441
455
  },
@@ -466,6 +480,7 @@ const devNull = new Writable({
466
480
  * @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
467
481
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
468
482
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
483
+ * @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
469
484
  * @returns {Supervisor}
470
485
  */
471
486
  export function createSupervisor({
@@ -482,6 +497,7 @@ export function createSupervisor({
482
497
  agentProfile,
483
498
  profilesDir,
484
499
  taskAmend,
500
+ agentMcpServers,
485
501
  }) {
486
502
  const resolvedProfilesDir =
487
503
  profilesDir ?? resolve(supervisorCwd, ".claude/agents");
@@ -521,7 +537,7 @@ export function createSupervisor({
521
537
  onLine,
522
538
  settingSources: ["project"],
523
539
  systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
524
- mcpServers: { orchestration: agentServer },
540
+ mcpServers: { orchestration: agentServer, ...agentMcpServers },
525
541
  });
526
542
 
527
543
  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
package/src/tee-writer.js CHANGED
@@ -100,6 +100,12 @@ export class TeeWriter extends Writable {
100
100
 
101
101
  // Universal envelope: { source, seq, event }
102
102
  if (parsed.event) {
103
+ // Always forward to the collector so it can capture orchestrator
104
+ // metadata (e.g. the summary verdict for the result footer); the
105
+ // collector adds no turn for suppressed events, so flushTurns stays
106
+ // a no-op when we skip it below.
107
+ this.collector.addLine(line);
108
+
103
109
  // Orchestrator lifecycle events are suppressed from the text stream
104
110
  // entirely (spec 540). They still reached fileStream above.
105
111
  if (
@@ -108,7 +114,6 @@ export class TeeWriter extends Writable {
108
114
  ) {
109
115
  return;
110
116
  }
111
- this.collector.addLine(line);
112
117
  this.flushTurns();
113
118
  return;
114
119
  }
@@ -27,6 +27,8 @@ export class TraceCollector {
27
27
  this.turns = [];
28
28
  /** @type {object|null} */
29
29
  this.result = null;
30
+ /** @type {{verdict?: string, summary?: string, turns?: number}|null} */
31
+ this.orchestratorSummary = null;
30
32
  /** @type {number} */
31
33
  this.turnIndex = 0;
32
34
  /** @type {object|null} */
@@ -62,6 +64,16 @@ export class TraceCollector {
62
64
  // Orchestrator lifecycle events carry no content and are suppressed
63
65
  // from turns entirely — the NDJSON artifact keeps them separately.
64
66
  if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
67
+ // The summary event carries the supervisor/facilitator verdict —
68
+ // capture it before dropping the event, so the result footer can
69
+ // surface verdict="failure" instead of the SDK's per-runner status.
70
+ if (event.type === "summary") {
71
+ this.orchestratorSummary = {
72
+ ...(event.verdict && { verdict: event.verdict }),
73
+ ...(typeof event.summary === "string" && { summary: event.summary }),
74
+ ...(typeof event.turns === "number" && { turns: event.turns }),
75
+ };
76
+ }
65
77
  return;
66
78
  }
67
79
 
@@ -277,16 +289,20 @@ export class TraceCollector {
277
289
  }
278
290
 
279
291
  /**
280
- * Format the trailing result summary line (spec 540).
292
+ * Format the trailing result summary line (spec 540). When an orchestrator
293
+ * summary is present (supervised / facilitated mode), the headline word is
294
+ * the supervisor's verdict ("success" / "failure") rather than the SDK's
295
+ * per-runner subtype, so the footer aligns with the CI exit code.
281
296
  * @returns {string}
282
297
  */
283
298
  #formatResultTail() {
284
299
  if (!this.result) return "";
285
300
  const duration = formatDuration(this.result.durationMs);
286
301
  const cost = Number(this.result.totalCostUsd).toFixed(4);
302
+ const headline = this.orchestratorSummary?.verdict ?? this.result.result;
287
303
  return (
288
304
  "\n" +
289
- `--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
305
+ `--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
290
306
  );
291
307
  }
292
308
  }
@@ -65,8 +65,10 @@ export class TraceGitHub {
65
65
  /**
66
66
  * Download a trace artifact from a workflow run and extract it.
67
67
  *
68
- * Tries artifact names in order: combined-trace, agent-trace.
69
- * The artifact zip is downloaded and extracted to the output directory.
68
+ * When `opts.name` is set, looks up that exact artifact. Otherwise picks the
69
+ * best match from the unified `trace--<case>--<participant>.<role>` naming
70
+ * convention: prefer a `*.raw` artifact (combined log), then any `*.agent`,
71
+ * then the first `trace--*` artifact found.
70
72
  *
71
73
  * @param {number|string} runId
72
74
  * @param {object} [opts]
@@ -84,13 +86,18 @@ export class TraceGitHub {
84
86
  const artifacts = data.artifacts ?? [];
85
87
 
86
88
  // Find the trace artifact.
87
- const preferredNames = opts.name
88
- ? [opts.name]
89
- : ["combined-trace", "agent-trace"];
90
89
  let artifact = null;
91
- for (const name of preferredNames) {
92
- artifact = artifacts.find((a) => a.name === name);
93
- if (artifact) break;
90
+ if (opts.name) {
91
+ artifact = artifacts.find((a) => a.name === opts.name);
92
+ } else {
93
+ const traceArtifacts = artifacts.filter((a) =>
94
+ a.name.startsWith("trace--"),
95
+ );
96
+ artifact =
97
+ traceArtifacts.find((a) => a.name.endsWith(".raw")) ??
98
+ traceArtifacts.find((a) => a.name.endsWith(".agent")) ??
99
+ traceArtifacts[0] ??
100
+ null;
94
101
  }
95
102
 
96
103
  if (!artifact) {