@forwardimpact/libeval 0.1.28 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
10
10
  import { runSuperviseCommand } from "../src/commands/supervise.js";
11
11
  import { runFacilitateCommand } from "../src/commands/facilitate.js";
12
12
 
13
- const { version: VERSION } = JSON.parse(
14
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
15
- );
13
+ // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
14
+ // the readFileSync branch in the compiled binary (which would ENOENT against
15
+ // the bunfs virtual mount). Source execution falls through to package.json.
16
+ const VERSION =
17
+ process.env.FIT_EVAL_VERSION ||
18
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
19
+ .version;
16
20
 
17
21
  const definition = {
18
22
  name: "fit-eval",
@@ -55,6 +59,11 @@ const definition = {
55
59
  type: "string",
56
60
  description: "Comma-separated tool allowlist",
57
61
  },
62
+ "mcp-server": {
63
+ type: "string",
64
+ description:
65
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
66
+ },
58
67
  },
59
68
  },
60
69
  {
@@ -102,6 +111,11 @@ const definition = {
102
111
  type: "string",
103
112
  description: "Supervisor tool allowlist",
104
113
  },
114
+ "mcp-server": {
115
+ type: "string",
116
+ description:
117
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
118
+ },
105
119
  },
106
120
  },
107
121
  {
package/bin/fit-trace.js CHANGED
@@ -26,9 +26,13 @@ import {
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
28
 
29
- const { version: VERSION } = JSON.parse(
30
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
31
- );
29
+ // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
30
+ // the readFileSync branch in the compiled binary (which would ENOENT against
31
+ // the bunfs virtual mount). Source execution falls through to package.json.
32
+ const VERSION =
33
+ process.env.FIT_TRACE_VERSION ||
34
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
35
+ .version;
32
36
 
33
37
  const definition = {
34
38
  name: "fit-trace",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.28",
3
+ "version": "0.1.30",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -51,7 +51,7 @@
51
51
  "@forwardimpact/libcli": "^0.1.0",
52
52
  "@forwardimpact/libconfig": "^0.1.0",
53
53
  "@forwardimpact/libtelemetry": "^0.1.22",
54
- "zod": "^4.4.1"
54
+ "zod": "^4.4.3"
55
55
  },
56
56
  "devDependencies": {
57
57
  "@forwardimpact/libharness": "^0.1.14"
@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
5
5
  import { composeProfilePrompt } from "../profile-prompt.js";
6
6
  import { createTeeWriter } from "../tee-writer.js";
7
7
  import { SequenceCounter } from "../sequence-counter.js";
8
+ import { createServiceConfig } from "@forwardimpact/libconfig";
8
9
 
9
10
  /**
10
11
  * Parse and validate run command options from parsed values.
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
35
36
  values["allowed-tools"] ??
36
37
  "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
37
38
  ).split(","),
39
+ mcpServer: values["mcp-server"] ?? undefined,
38
40
  };
39
41
  }
40
42
 
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
56
58
  outputPath,
57
59
  agentProfile,
58
60
  allowedTools,
61
+ mcpServer,
59
62
  } = parseRunOptions(values);
60
63
 
61
64
  // When --output is specified, stream text to stdout while writing NDJSON to file.
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
78
81
  );
79
82
  };
80
83
 
84
+ let mcpServers = null;
85
+ if (mcpServer) {
86
+ const mcpConfig = await createServiceConfig("mcp");
87
+ mcpServers = {
88
+ [mcpServer]: {
89
+ type: "http",
90
+ url: mcpConfig.url,
91
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
92
+ },
93
+ };
94
+ allowedTools.push(`mcp__${mcpServer}__*`);
95
+ }
96
+
81
97
  if (agentProfile) {
82
98
  process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
83
99
  }
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
100
116
  settingSources: ["project"],
101
117
  systemPrompt,
102
118
  taskAmend,
119
+ mcpServers,
103
120
  });
104
121
 
105
122
  const result = await runner.run(taskContent);
@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
3
3
  import { tmpdir } from "node:os";
4
4
  import { createSupervisor } from "../supervisor.js";
5
5
  import { createTeeWriter } from "../tee-writer.js";
6
+ import { createServiceConfig } from "@forwardimpact/libconfig";
6
7
 
7
8
  /**
8
9
  * Parse all supervise flags from parsed values into an options object.
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
44
45
  supervisorAllowedTools: supervisorAllowedToolsRaw
45
46
  ? supervisorAllowedToolsRaw.split(",")
46
47
  : undefined,
48
+ mcpServer: values["mcp-server"] ?? undefined,
47
49
  };
48
50
  }
49
51
 
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
71
73
  })
72
74
  : process.stdout;
73
75
 
76
+ let agentMcpServers = null;
77
+ if (opts.mcpServer) {
78
+ const mcpConfig = await createServiceConfig("mcp");
79
+ agentMcpServers = {
80
+ [opts.mcpServer]: {
81
+ type: "http",
82
+ url: mcpConfig.url,
83
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
84
+ },
85
+ };
86
+ opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
87
+ }
88
+
74
89
  if (opts.agentProfile) {
75
90
  process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
76
91
  }
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
88
103
  supervisorProfile: opts.supervisorProfile,
89
104
  agentProfile: opts.agentProfile,
90
105
  taskAmend: opts.taskAmend,
106
+ agentMcpServers,
91
107
  });
92
108
 
93
109
  const result = await supervisor.run(opts.taskContent);
@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
26
26
  "Announce sends a message with no reply obligation. " +
27
27
  "Redirect interrupts a participant with replacement instructions. " +
28
28
  "RollCall lists participants. " +
29
- "Conclude ends the session with a summary.";
29
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
30
+ "the verdict reflects whether the session met the criteria stated in the task.";
30
31
 
31
32
  /** System prompt appended for facilitated agent runners. */
32
33
  export const FACILITATED_AGENT_SYSTEM_PROMPT =
@@ -106,12 +107,14 @@ export class Facilitator {
106
107
  // messages and started processing concurrently.
107
108
  this.concludeResolve();
108
109
  await Promise.allSettled(agentPromises);
110
+ const success = this.ctx.verdict === "success";
109
111
  this.emitSummary({
110
- success: true,
112
+ success,
113
+ verdict: this.ctx.verdict,
111
114
  turns: this.facilitatorTurns,
112
115
  summary: this.ctx.summary,
113
116
  });
114
- return { success: true, turns: this.facilitatorTurns };
117
+ return { success, turns: this.facilitatorTurns };
115
118
  }
116
119
 
117
120
  // Abort agents promptly when Conclude is called during the event loop
@@ -134,12 +137,14 @@ export class Facilitator {
134
137
  throw err;
135
138
  }
136
139
 
140
+ const success = this.ctx.concluded && this.ctx.verdict === "success";
137
141
  const result = {
138
- success: this.ctx.concluded,
142
+ success,
139
143
  turns: this.facilitatorTurns,
140
144
  };
141
145
  this.emitSummary({
142
- success: result.success,
146
+ success,
147
+ verdict: this.ctx.verdict,
143
148
  turns: result.turns,
144
149
  summary: this.ctx.summary,
145
150
  });
@@ -344,7 +349,7 @@ export class Facilitator {
344
349
  }
345
350
 
346
351
  /**
347
- * @param {{success: boolean, turns: number, summary?: string}} result
352
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
348
353
  */
349
354
  emitSummary(result) {
350
355
  this.output.write(
@@ -354,6 +359,7 @@ export class Facilitator {
354
359
  event: {
355
360
  type: "summary",
356
361
  success: result.success,
362
+ ...(result.verdict && { verdict: result.verdict }),
357
363
  turns: result.turns,
358
364
  ...(result.summary && { summary: result.summary }),
359
365
  },
@@ -22,6 +22,7 @@ import { z } from "zod";
22
22
  export function createOrchestrationContext() {
23
23
  return {
24
24
  concluded: false,
25
+ verdict: null,
25
26
  summary: null,
26
27
  redirect: null,
27
28
  participants: [],
@@ -37,10 +38,11 @@ export function createOrchestrationContext() {
37
38
 
38
39
  // --- Handler factories ---
39
40
 
40
- /** Create a handler that marks the session as concluded and records the summary. */
41
+ /** Create a handler that marks the session as concluded and records the verdict and summary. */
41
42
  export function createConcludeHandler(ctx) {
42
- return async ({ summary }) => {
43
+ return async ({ verdict, summary }) => {
43
44
  ctx.concluded = true;
45
+ ctx.verdict = verdict;
44
46
  ctx.summary = summary;
45
47
  return { content: [{ type: "text", text: "Session concluded." }] };
46
48
  };
@@ -220,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
220
222
  ),
221
223
  tool(
222
224
  "Conclude",
223
- "End the session with a summary.",
224
- { summary: z.string() },
225
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
226
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
225
227
  createConcludeHandler(ctx),
226
228
  ),
227
229
  tool(
@@ -307,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
307
309
  ),
308
310
  tool(
309
311
  "Conclude",
310
- "End the session with a summary.",
311
- { summary: z.string() },
312
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
313
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
312
314
  createConcludeHandler(ctx),
313
315
  ),
314
316
  tool(
@@ -6,6 +6,11 @@
6
6
  * tool (file path, command, pattern, …) sanitized to strip JSON punctuation
7
7
  * (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
8
8
  *
9
+ * MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
10
+ * the full input rendered as compact single-line JSON, so `{` and `"` do
11
+ * appear on those lines. Readers of GitHub workflow logs need the full MCP
12
+ * payload to know what was actually sent across the protocol.
13
+ *
9
14
  * `previewForResult(content, isError)` collapses a tool result to a single
10
15
  * line ≤ 80 chars and flags errors so the renderer can apply the reserved
11
16
  * error color and the `Error:` label.
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
91
96
  return parts.slice(2).join("__");
92
97
  }
93
98
 
94
- /**
95
- * MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
96
- * handler path. The method name itself is surfaced via `simplifyToolName`,
97
- * so this only adds the `to/from` decorators for orchestration calls.
98
- * Returns null if the name does not match any MCP prefix.
99
- * @param {string} name
100
- * @param {object} input
101
- * @returns {string|null}
102
- */
103
- function hintForMcp(name, input) {
104
- if (name.startsWith("mcp__orchestration__")) {
105
- const parts = [];
106
- if (input.to) parts.push(`to ${sanitize(input.to)}`);
107
- if (input.from) parts.push(`from ${sanitize(input.from)}`);
108
- return truncate(parts.join(" "));
109
- }
110
- if (name.startsWith("mcp__")) {
111
- return "";
112
- }
113
- return null;
114
- }
115
-
116
99
  /**
117
100
  * Map a tool name and input to a one-line human hint.
118
101
  *
119
- * Unknown tools return an empty hint — the caller still shows the tool
120
- * name, just without extra detail. Sanitization is uniform: every branch
121
- * ends with `sanitize`, so the output is guaranteed free of `{`, `}`, `"`
122
- * from the input object (success criterion #2).
102
+ * Three branches, in priority order:
103
+ * - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
104
+ * `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
105
+ * - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
106
+ * single-line JSON; `{` and `"` intentionally appear so readers see
107
+ * the actual MCP payload.
108
+ * - Anything else → "" (the caller still shows the bare tool name).
123
109
  *
124
110
  * @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
125
111
  * @param {object|null|undefined} input - Raw tool input object from the trace
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
132
118
  const handler = HINT_HANDLERS[name];
133
119
  if (handler) return handler(safeInput);
134
120
 
135
- const mcp = hintForMcp(name, safeInput);
136
- if (mcp !== null) return mcp;
121
+ if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
137
122
 
138
123
  return "";
139
124
  }
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
154
139
  : typeof content === "string"
155
140
  ? content
156
141
  : JSON.stringify(content);
157
- const lines = normalized.split(/\r?\n/);
158
- let firstNonBlank = "";
159
- for (const line of lines) {
160
- if (line.trim().length > 0) {
161
- firstNonBlank = line.trim();
162
- break;
163
- }
164
- }
165
-
166
- if (isError) {
167
- const body = firstNonBlank || "(no output)";
168
- return {
169
- text:
170
- body.length <= MAX_HINT_CHARS
171
- ? body
172
- : body.slice(0, MAX_HINT_CHARS - 3) + "...",
173
- isError: true,
174
- };
175
- }
142
+ const firstNonBlank =
143
+ normalized
144
+ .split(/\r?\n/)
145
+ .map((l) => l.trim())
146
+ .find((l) => l.length > 0) ?? "";
176
147
 
177
- if (!firstNonBlank) return { text: "(ok)", isError: false };
148
+ const fallback = isError ? "(no output)" : "(ok)";
178
149
  return {
179
- text:
180
- firstNonBlank.length <= MAX_HINT_CHARS
181
- ? firstNonBlank
182
- : firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
183
- isError: false,
150
+ text: truncate(firstNonBlank || fallback),
151
+ isError,
184
152
  };
185
153
  }
@@ -25,12 +25,7 @@ import {
25
25
  * @returns {string[]} Array of rendered line strings
26
26
  */
27
27
  export function renderTurnLines(turn, withPrefix) {
28
- if (turn.role === "assistant") return renderAssistantTurn(turn, withPrefix);
29
- if (turn.role === "tool_result")
30
- return renderToolResultTurn(turn, withPrefix);
31
- if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
32
- if (turn.role === "user") return renderUserTurn(turn, withPrefix);
33
- return [];
28
+ return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
34
29
  }
35
30
 
36
31
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
57
52
 
58
53
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
59
54
  function renderToolResultTurn(turn, withPrefix) {
55
+ // Successful tool results emit no preview line — the trace document keeps
56
+ // the structured turn, but readers of the streamed log only see errors.
57
+ if (!turn.isError) return [];
60
58
  return [
61
59
  renderToolResultLine({
62
60
  source: turn.source,
63
- preview: previewForResult(turn.content, turn.isError),
61
+ preview: previewForResult(turn.content, true),
64
62
  withPrefix,
65
63
  }),
66
64
  ];
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
90
88
  }
91
89
  return lines;
92
90
  }
91
+
92
+ const TURN_RENDERERS = {
93
+ assistant: renderAssistantTurn,
94
+ tool_result: renderToolResultTurn,
95
+ system: renderSystemTurn,
96
+ user: renderUserTurn,
97
+ };
package/src/supervisor.js CHANGED
@@ -36,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
36
36
  "Answer replies to an ask the agent addressed to you. " +
37
37
  "Announce sends a message with no reply obligation. " +
38
38
  "Redirect interrupts the agent with replacement instructions. " +
39
- "Conclude ends the session with a summary.";
39
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
40
+ "the verdict reflects whether the agent's work meets the criteria stated in the task.";
40
41
 
41
42
  /** System prompt appended for the agent runner in supervise mode. */
42
43
  export const AGENT_SYSTEM_PROMPT =
@@ -110,8 +111,14 @@ export class Supervisor {
110
111
  }
111
112
 
112
113
  if (this.ctx.concluded) {
113
- this.emitSummary({ success: true, turns: 0, summary: this.ctx.summary });
114
- return { success: true, turns: 0 };
114
+ const success = this.ctx.verdict === "success";
115
+ this.emitSummary({
116
+ success,
117
+ verdict: this.ctx.verdict,
118
+ turns: 0,
119
+ summary: this.ctx.summary,
120
+ });
121
+ return { success, turns: 0 };
115
122
  }
116
123
 
117
124
  let pendingRelay = null;
@@ -214,12 +221,14 @@ export class Supervisor {
214
221
  }
215
222
 
216
223
  if (this.ctx.concluded) {
224
+ const success = this.ctx.verdict === "success";
217
225
  this.emitSummary({
218
- success: true,
226
+ success,
227
+ verdict: this.ctx.verdict,
219
228
  turns: turn,
220
229
  summary: this.ctx.summary,
221
230
  });
222
- return { type: "exit", exit: { success: true, turns: turn } };
231
+ return { type: "exit", exit: { success, turns: turn } };
223
232
  }
224
233
 
225
234
  if (agentResult.aborted && this.ctx.redirect) {
@@ -308,12 +317,14 @@ export class Supervisor {
308
317
  }
309
318
 
310
319
  if (this.ctx.concluded) {
320
+ const success = this.ctx.verdict === "success";
311
321
  this.emitSummary({
312
- success: true,
322
+ success,
323
+ verdict: this.ctx.verdict,
313
324
  turns: turn,
314
325
  summary: this.ctx.summary,
315
326
  });
316
- return { exit: { success: true, turns: turn } };
327
+ return { exit: { success, turns: turn } };
317
328
  }
318
329
 
319
330
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
@@ -323,12 +334,14 @@ export class Supervisor {
323
334
  formatMessages(reminders),
324
335
  );
325
336
  if (this.ctx.concluded) {
337
+ const success = this.ctx.verdict === "success";
326
338
  this.emitSummary({
327
- success: true,
339
+ success,
340
+ verdict: this.ctx.verdict,
328
341
  turns: turn,
329
342
  summary: this.ctx.summary,
330
343
  });
331
- return { exit: { success: true, turns: turn } };
344
+ return { exit: { success, turns: turn } };
332
345
  }
333
346
  this.#checkAsk("supervisor");
334
347
  }
@@ -426,7 +439,7 @@ export class Supervisor {
426
439
 
427
440
  /**
428
441
  * Emit a final orchestrator summary line, wrapped in the universal envelope.
429
- * @param {{success: boolean, turns: number, summary?: string}} result
442
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
430
443
  */
431
444
  emitSummary(result) {
432
445
  this.output.write(
@@ -436,6 +449,7 @@ export class Supervisor {
436
449
  event: {
437
450
  type: "summary",
438
451
  success: result.success,
452
+ ...(result.verdict && { verdict: result.verdict }),
439
453
  turns: result.turns,
440
454
  ...(result.summary && { summary: result.summary }),
441
455
  },
@@ -466,6 +480,7 @@ const devNull = new Writable({
466
480
  * @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
467
481
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
468
482
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
483
+ * @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
469
484
  * @returns {Supervisor}
470
485
  */
471
486
  export function createSupervisor({
@@ -482,6 +497,7 @@ export function createSupervisor({
482
497
  agentProfile,
483
498
  profilesDir,
484
499
  taskAmend,
500
+ agentMcpServers,
485
501
  }) {
486
502
  const resolvedProfilesDir =
487
503
  profilesDir ?? resolve(supervisorCwd, ".claude/agents");
@@ -521,7 +537,7 @@ export function createSupervisor({
521
537
  onLine,
522
538
  settingSources: ["project"],
523
539
  systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
524
- mcpServers: { orchestration: agentServer },
540
+ mcpServers: { orchestration: agentServer, ...agentMcpServers },
525
541
  });
526
542
 
527
543
  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
package/src/tee-writer.js CHANGED
@@ -100,6 +100,12 @@ export class TeeWriter extends Writable {
100
100
 
101
101
  // Universal envelope: { source, seq, event }
102
102
  if (parsed.event) {
103
+ // Always forward to the collector so it can capture orchestrator
104
+ // metadata (e.g. the summary verdict for the result footer); the
105
+ // collector adds no turn for suppressed events, so flushTurns stays
106
+ // a no-op when we skip it below.
107
+ this.collector.addLine(line);
108
+
103
109
  // Orchestrator lifecycle events are suppressed from the text stream
104
110
  // entirely (spec 540). They still reached fileStream above.
105
111
  if (
@@ -108,7 +114,6 @@ export class TeeWriter extends Writable {
108
114
  ) {
109
115
  return;
110
116
  }
111
- this.collector.addLine(line);
112
117
  this.flushTurns();
113
118
  return;
114
119
  }
@@ -27,6 +27,8 @@ export class TraceCollector {
27
27
  this.turns = [];
28
28
  /** @type {object|null} */
29
29
  this.result = null;
30
+ /** @type {{verdict?: string, summary?: string, turns?: number}|null} */
31
+ this.orchestratorSummary = null;
30
32
  /** @type {number} */
31
33
  this.turnIndex = 0;
32
34
  /** @type {object|null} */
@@ -62,6 +64,16 @@ export class TraceCollector {
62
64
  // Orchestrator lifecycle events carry no content and are suppressed
63
65
  // from turns entirely — the NDJSON artifact keeps them separately.
64
66
  if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
67
+ // The summary event carries the supervisor/facilitator verdict —
68
+ // capture it before dropping the event, so the result footer can
69
+ // surface verdict="failure" instead of the SDK's per-runner status.
70
+ if (event.type === "summary") {
71
+ this.orchestratorSummary = {
72
+ ...(event.verdict && { verdict: event.verdict }),
73
+ ...(typeof event.summary === "string" && { summary: event.summary }),
74
+ ...(typeof event.turns === "number" && { turns: event.turns }),
75
+ };
76
+ }
65
77
  return;
66
78
  }
67
79
 
@@ -277,16 +289,20 @@ export class TraceCollector {
277
289
  }
278
290
 
279
291
  /**
280
- * Format the trailing result summary line (spec 540).
292
+ * Format the trailing result summary line (spec 540). When an orchestrator
293
+ * summary is present (supervised / facilitated mode), the headline word is
294
+ * the supervisor's verdict ("success" / "failure") rather than the SDK's
295
+ * per-runner subtype, so the footer aligns with the CI exit code.
281
296
  * @returns {string}
282
297
  */
283
298
  #formatResultTail() {
284
299
  if (!this.result) return "";
285
300
  const duration = formatDuration(this.result.durationMs);
286
301
  const cost = Number(this.result.totalCostUsd).toFixed(4);
302
+ const headline = this.orchestratorSummary?.verdict ?? this.result.result;
287
303
  return (
288
304
  "\n" +
289
- `--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
305
+ `--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
290
306
  );
291
307
  }
292
308
  }