npm - @forwardimpact/libeval - Versions diffs - 0.1.27 → 0.1.30 - Mend

@forwardimpact/libeval 0.1.27 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +6 -1
package/bin/fit-eval.js +24 -10
package/bin/fit-trace.js +14 -10
package/package.json +13 -11
package/src/agent-runner.js +1 -0
package/src/commands/run.js +17 -0
package/src/commands/supervise.js +16 -0
package/src/facilitator.js +14 -6
package/src/message-bus.js +1 -0
package/src/orchestration-toolkit.js +10 -5
package/src/orchestrator-helpers.js +1 -0
package/src/render/tool-hints.js +21 -53
package/src/render/turn-renderer.js +12 -7
package/src/sequence-counter.js +4 -0
package/src/supervisor.js +31 -13
package/src/tee-writer.js +7 -1
package/src/trace-collector.js +19 -2

package/README.md CHANGED Viewed

@@ -1,6 +1,11 @@
 # libeval
-Process Claude Code stream-json output into structured traces.
+<!-- BEGIN:description — Do not edit. Generated from package.json. -->
+Agent evaluation framework — prove whether agent changes improved outcomes with
+reproducible evidence.
+<!-- END:description -->
 ## Getting Started

package/bin/fit-eval.js CHANGED Viewed

@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
 import { runSuperviseCommand } from "../src/commands/supervise.js";
 import { runFacilitateCommand } from "../src/commands/facilitate.js";
-const { version: VERSION } = JSON.parse(
-  readFileSync(new URL("../package.json", import.meta.url), "utf8"),
-);
+// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
+// the readFileSync branch in the compiled binary (which would ENOENT against
+// the bunfs virtual mount). Source execution falls through to package.json.
+const VERSION =
+  process.env.FIT_EVAL_VERSION ||
+  JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
+    .version;
 const definition = {
   name: "fit-eval",
@@ -55,6 +59,11 @@ const definition = {
           type: "string",
           description: "Comma-separated tool allowlist",
         },
+        "mcp-server": {
+          type: "string",
+          description:
+            "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
+        },
       },
     },
     {
@@ -102,6 +111,11 @@ const definition = {
           type: "string",
           description: "Supervisor tool allowlist",
         },
+        "mcp-server": {
+          type: "string",
+          description:
+            "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
+        },
       },
     },
     {
@@ -177,20 +191,20 @@ const definition = {
   ],
   documentation: [
     {
-      title: "Agent Evaluations",
-      url: "https://www.forwardimpact.team/docs/libraries/agent-evaluations/index.md",
+      title: "Run an Eval",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
       description:
         "Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
     },
     {
-      title: "Agent Collaboration",
-      url: "https://www.forwardimpact.team/docs/libraries/agent-collaboration/index.md",
+      title: "Prove Agent Changes",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
       description:
-        "Author a facilitator and participant profiles, run a multi-agent session, and read the message flow.",
+        "End-to-end workflow from dataset generation through evaluation to trace analysis, including multi-agent collaboration sessions.",
     },
     {
-      title: "Trace Analysis",
-      url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
+      title: "Analyze Traces",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
       description:
         "Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
     },

package/bin/fit-trace.js CHANGED Viewed

@@ -26,9 +26,13 @@ import {
   runSplitCommand,
 } from "../src/commands/trace.js";
-const { version: VERSION } = JSON.parse(
-  readFileSync(new URL("../package.json", import.meta.url), "utf8"),
-);
+// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
+// the readFileSync branch in the compiled binary (which would ENOENT against
+// the bunfs virtual mount). Source execution falls through to package.json.
+const VERSION =
+  process.env.FIT_TRACE_VERSION ||
+  JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
+    .version;
 const definition = {
   name: "fit-trace",
@@ -214,22 +218,22 @@ const definition = {
   ],
   documentation: [
     {
-      title: "Trace Analysis",
-      url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
+      title: "Analyze Traces",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
       description:
         "The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
     },
     {
-      title: "Agent Evaluations",
-      url: "https://www.forwardimpact.team/docs/libraries/agent-evaluations/index.md",
+      title: "Run an Eval",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
       description:
         "How `fit-eval supervise` produces the traces this skill analyzes.",
     },
     {
-      title: "Agent Collaboration",
-      url: "https://www.forwardimpact.team/docs/libraries/agent-collaboration/index.md",
+      title: "Prove Agent Changes",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
       description:
-        "How `fit-eval facilitate` produces multi-agent traces; `split` is the bridge into per-source files.",
+        "End-to-end workflow including multi-agent collaboration; `split` is the bridge into per-source trace files.",
     },
   ],
 };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.27",
-  "description": "Agent evaluation: collect Claude Code traces, run agent loops, supervise multi-step workflows.",
+  "version": "0.1.30",
+  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",
     "agent",
@@ -17,14 +17,16 @@
   },
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",
-  "forwardimpact": {
-    "capability": "agent-self-improvement",
-    "needs": [
-      "Parse and query Claude Code trace NDJSON files",
-      "Drive an LLM agent through a scripted run and capture its trace",
-      "Supervise a multi-step or multi-agent workflow"
-    ]
-  },
+  "jobs": [
+    {
+      "user": "Platform Builders",
+      "goal": "Prove Agent Changes",
+      "trigger": "An eval passes locally but fails in CI and the only output is 'assertion failed.'",
+      "bigHire": "prove whether agent changes improved outcomes with reproducible evidence.",
+      "littleHire": "run an eval and get a trace that shows exactly what the agent did.",
+      "competesWith": "manual before/after comparison; trusting gut feeling over evidence; skipping evaluation entirely"
+    }
+  ],
   "type": "module",
   "main": "./src/index.js",
   "exports": {
@@ -49,7 +51,7 @@
     "@forwardimpact/libcli": "^0.1.0",
     "@forwardimpact/libconfig": "^0.1.0",
     "@forwardimpact/libtelemetry": "^0.1.22",
-    "zod": "^4.4.1"
+    "zod": "^4.4.3"
   },
   "devDependencies": {
     "@forwardimpact/libharness": "^0.1.14"

package/src/agent-runner.js CHANGED Viewed

@@ -32,6 +32,7 @@ function applyDefaults(deps) {
   };
 }
+/** Run a single Claude Agent SDK session and emit raw NDJSON events to an output stream. */
 export class AgentRunner {
   /**
    * @param {object} deps

package/src/commands/run.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
 import { composeProfilePrompt } from "../profile-prompt.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { SequenceCounter } from "../sequence-counter.js";
+import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse and validate run command options from parsed values.
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
       values["allowed-tools"] ??
       "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
     ).split(","),
+    mcpServer: values["mcp-server"] ?? undefined,
   };
 }
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
     outputPath,
     agentProfile,
     allowedTools,
+    mcpServer,
   } = parseRunOptions(values);
   // When --output is specified, stream text to stdout while writing NDJSON to file.
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
     );
   };
+  let mcpServers = null;
+  if (mcpServer) {
+    const mcpConfig = await createServiceConfig("mcp");
+    mcpServers = {
+      [mcpServer]: {
+        type: "http",
+        url: mcpConfig.url,
+        headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
+      },
+    };
+    allowedTools.push(`mcp__${mcpServer}__*`);
+  }
   if (agentProfile) {
     process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
   }
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
     settingSources: ["project"],
     systemPrompt,
     taskAmend,
+    mcpServers,
   });
   const result = await runner.run(taskContent);

package/src/commands/supervise.js CHANGED Viewed

@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
 import { tmpdir } from "node:os";
 import { createSupervisor } from "../supervisor.js";
 import { createTeeWriter } from "../tee-writer.js";
+import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse all supervise flags from parsed values into an options object.
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
     supervisorAllowedTools: supervisorAllowedToolsRaw
       ? supervisorAllowedToolsRaw.split(",")
       : undefined,
+    mcpServer: values["mcp-server"] ?? undefined,
   };
 }
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
       })
     : process.stdout;
+  let agentMcpServers = null;
+  if (opts.mcpServer) {
+    const mcpConfig = await createServiceConfig("mcp");
+    agentMcpServers = {
+      [opts.mcpServer]: {
+        type: "http",
+        url: mcpConfig.url,
+        headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
+      },
+    };
+    opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
+  }
   if (opts.agentProfile) {
     process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
   }
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
     supervisorProfile: opts.supervisorProfile,
     agentProfile: opts.agentProfile,
     taskAmend: opts.taskAmend,
+    agentMcpServers,
   });
   const result = await supervisor.run(opts.taskContent);

package/src/facilitator.js CHANGED Viewed

@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
   "Announce sends a message with no reply obligation. " +
   "Redirect interrupts a participant with replacement instructions. " +
   "RollCall lists participants. " +
-  "Conclude ends the session with a summary.";
+  "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
+  "the verdict reflects whether the session met the criteria stated in the task.";
 /** System prompt appended for facilitated agent runners. */
 export const FACILITATED_AGENT_SYSTEM_PROMPT =
@@ -36,6 +37,7 @@ export const FACILITATED_AGENT_SYSTEM_PROMPT =
   "Announce broadcasts a message. " +
   "RollCall lists participants.";
+/** Orchestrate N agent sessions coordinated by a single facilitator LLM session. */
 export class Facilitator {
   /**
    * @param {object} deps
@@ -105,12 +107,14 @@ export class Facilitator {
       // messages and started processing concurrently.
       this.concludeResolve();
       await Promise.allSettled(agentPromises);
+      const success = this.ctx.verdict === "success";
       this.emitSummary({
-        success: true,
+        success,
+        verdict: this.ctx.verdict,
         turns: this.facilitatorTurns,
         summary: this.ctx.summary,
       });
-      return { success: true, turns: this.facilitatorTurns };
+      return { success, turns: this.facilitatorTurns };
     }
     // Abort agents promptly when Conclude is called during the event loop
@@ -133,12 +137,14 @@ export class Facilitator {
       throw err;
     }
+    const success = this.ctx.concluded && this.ctx.verdict === "success";
     const result = {
-      success: this.ctx.concluded,
+      success,
       turns: this.facilitatorTurns,
     };
     this.emitSummary({
-      success: result.success,
+      success,
+      verdict: this.ctx.verdict,
       turns: result.turns,
       summary: this.ctx.summary,
     });
@@ -296,6 +302,7 @@ export class Facilitator {
     }
   }
+  /** Return the last assistant text block from a runner's buffer, or the fallback if none exists. */
   extractLastText(runner, fallback) {
     const lines = runner.buffer;
     for (let i = lines.length - 1; i >= 0; i--) {
@@ -342,7 +349,7 @@ export class Facilitator {
   }
   /**
-   * @param {{success: boolean, turns: number, summary?: string}} result
+   * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
    */
   emitSummary(result) {
     this.output.write(
@@ -352,6 +359,7 @@ export class Facilitator {
         event: {
           type: "summary",
           success: result.success,
+          ...(result.verdict && { verdict: result.verdict }),
           turns: result.turns,
           ...(result.summary && { summary: result.summary }),
         },

package/src/message-bus.js CHANGED Viewed

@@ -12,6 +12,7 @@
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
+/** In-memory per-participant message queues for facilitated and supervised orchestration modes. */
 export class MessageBus {
   /**
    * @param {object} deps

package/src/orchestration-toolkit.js CHANGED Viewed

@@ -22,6 +22,7 @@ import { z } from "zod";
 export function createOrchestrationContext() {
   return {
     concluded: false,
+    verdict: null,
     summary: null,
     redirect: null,
     participants: [],
@@ -37,14 +38,17 @@ export function createOrchestrationContext() {
 // --- Handler factories ---
+/** Create a handler that marks the session as concluded and records the verdict and summary. */
 export function createConcludeHandler(ctx) {
-  return async ({ summary }) => {
+  return async ({ verdict, summary }) => {
     ctx.concluded = true;
+    ctx.verdict = verdict;
     ctx.summary = summary;
     return { content: [{ type: "text", text: "Session concluded." }] };
   };
 }
+/** Create a handler that queues a redirect to interrupt a participant with replacement instructions. */
 export function createRedirectHandler(ctx) {
   return async ({ message, to }) => {
     ctx.redirect = { message, to: to ?? null };
@@ -52,6 +56,7 @@ export function createRedirectHandler(ctx) {
   };
 }
+/** Create a handler that returns the list of all session participants and their roles. */
 export function createRollCallHandler(ctx) {
   return async () => {
     return {
@@ -217,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
       ),
       tool(
         "Conclude",
-        "End the session with a summary.",
-        { summary: z.string() },
+        "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
+        { verdict: z.enum(["success", "failure"]), summary: z.string() },
         createConcludeHandler(ctx),
       ),
       tool(
@@ -304,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
       ),
       tool(
         "Conclude",
-        "End the session with a summary.",
-        { summary: z.string() },
+        "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
+        { verdict: z.enum(["success", "failure"]), summary: z.string() },
         createConcludeHandler(ctx),
       ),
       tool(

package/src/orchestrator-helpers.js CHANGED Viewed

@@ -5,6 +5,7 @@
  * - `formatMessages`    — render a drained message batch as tagged lines.
  */
+/** Create a promise-based async queue for serializing event delivery to the facilitator loop. */
 export function createAsyncQueue() {
   const items = [];
   let waiter = null;

package/src/render/tool-hints.js CHANGED Viewed

@@ -6,6 +6,11 @@
  * tool (file path, command, pattern, …) sanitized to strip JSON punctuation
  * (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
  *
+ * MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
+ * the full input rendered as compact single-line JSON, so `{` and `"` do
+ * appear on those lines. Readers of GitHub workflow logs need the full MCP
+ * payload to know what was actually sent across the protocol.
+ *
  * `previewForResult(content, isError)` collapses a tool result to a single
  * line ≤ 80 chars and flags errors so the renderer can apply the reserved
  * error color and the `Error:` label.
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
   return parts.slice(2).join("__");
 }
-/**
- * MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
- * handler path. The method name itself is surfaced via `simplifyToolName`,
- * so this only adds the `to/from` decorators for orchestration calls.
- * Returns null if the name does not match any MCP prefix.
- * @param {string} name
- * @param {object} input
- * @returns {string|null}
- */
-function hintForMcp(name, input) {
-  if (name.startsWith("mcp__orchestration__")) {
-    const parts = [];
-    if (input.to) parts.push(`to ${sanitize(input.to)}`);
-    if (input.from) parts.push(`from ${sanitize(input.from)}`);
-    return truncate(parts.join(" "));
-  }
-  if (name.startsWith("mcp__")) {
-    return "";
-  }
-  return null;
-}
 /**
  * Map a tool name and input to a one-line human hint.
  *
- * Unknown tools return an empty hint — the caller still shows the tool
- * name, just without extra detail. Sanitization is uniform: every branch
- * ends with `sanitize`, so the output is guaranteed free of `{`, `}`, `"`
- * from the input object (success criterion #2).
+ * Three branches, in priority order:
+ *  - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
+ *    `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
+ *  - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
+ *    single-line JSON; `{` and `"` intentionally appear so readers see
+ *    the actual MCP payload.
+ *  - Anything else → "" (the caller still shows the bare tool name).
  *
  * @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
  * @param {object|null|undefined} input - Raw tool input object from the trace
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
   const handler = HINT_HANDLERS[name];
   if (handler) return handler(safeInput);
-  const mcp = hintForMcp(name, safeInput);
-  if (mcp !== null) return mcp;
+  if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
   return "";
 }
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
       : typeof content === "string"
         ? content
         : JSON.stringify(content);
-  const lines = normalized.split(/\r?\n/);
-  let firstNonBlank = "";
-  for (const line of lines) {
-    if (line.trim().length > 0) {
-      firstNonBlank = line.trim();
-      break;
-    }
-  }
-  if (isError) {
-    const body = firstNonBlank || "(no output)";
-    return {
-      text:
-        body.length <= MAX_HINT_CHARS
-          ? body
-          : body.slice(0, MAX_HINT_CHARS - 3) + "...",
-      isError: true,
-    };
-  }
+  const firstNonBlank =
+    normalized
+      .split(/\r?\n/)
+      .map((l) => l.trim())
+      .find((l) => l.length > 0) ?? "";
-  if (!firstNonBlank) return { text: "(ok)", isError: false };
+  const fallback = isError ? "(no output)" : "(ok)";
   return {
-    text:
-      firstNonBlank.length <= MAX_HINT_CHARS
-        ? firstNonBlank
-        : firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
-    isError: false,
+    text: truncate(firstNonBlank || fallback),
+    isError,
   };
 }

package/src/render/turn-renderer.js CHANGED Viewed

@@ -25,12 +25,7 @@ import {
  * @returns {string[]} Array of rendered line strings
  */
 export function renderTurnLines(turn, withPrefix) {
-  if (turn.role === "assistant") return renderAssistantTurn(turn, withPrefix);
-  if (turn.role === "tool_result")
-    return renderToolResultTurn(turn, withPrefix);
-  if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
-  if (turn.role === "user") return renderUserTurn(turn, withPrefix);
-  return [];
+  return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
 }
 /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
 /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
 function renderToolResultTurn(turn, withPrefix) {
+  // Successful tool results emit no preview line — the trace document keeps
+  // the structured turn, but readers of the streamed log only see errors.
+  if (!turn.isError) return [];
   return [
     renderToolResultLine({
       source: turn.source,
-      preview: previewForResult(turn.content, turn.isError),
+      preview: previewForResult(turn.content, true),
       withPrefix,
     }),
   ];
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
   }
   return lines;
 }
+const TURN_RENDERERS = {
+  assistant: renderAssistantTurn,
+  tool_result: renderToolResultTurn,
+  system: renderSystemTurn,
+  user: renderUserTurn,
+};

package/src/sequence-counter.js CHANGED Viewed

@@ -2,16 +2,20 @@
  * SequenceCounter — global monotonic counter shared across all participants
  * in a session. Single-threaded JS means no synchronization needed.
  */
+/** Monotonic counter that assigns globally ordered sequence numbers within a session. */
 export class SequenceCounter {
+  /** Initialize the counter at zero. */
   constructor() {
     this.value = 0;
   }
+  /** Return the current value and advance the counter by one. */
   next() {
     return this.value++;
   }
 }
+/** Create a new SequenceCounter starting at zero. */
 export function createSequenceCounter() {
   return new SequenceCounter();
 }

package/src/supervisor.js CHANGED Viewed

@@ -4,8 +4,9 @@
  * introduces itself, and delegates work to the agent. The loop then alternates:
  * agent → supervisor → agent.
  *
- * Signaling uses orchestration tools (Ask / Answer / Announce / Redirect /
- * Conclude) via in-process MCP servers. The Ask/Answer contract is enforced
+ * Signaling uses orchestration tools (Ask / Announce / Redirect / Conclude)
+ * via in-process MCP servers; the supervisor has no Answer tool — agent replies
+ * are routed back through the relay loop. The Ask/Answer contract is enforced
  * at turn boundaries: an unanswered Ask triggers one synthetic reminder and
  * then a `protocol_violation` trace event plus a null-answer injection so the
  * session advances without silent deadlock.
@@ -35,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
   "Answer replies to an ask the agent addressed to you. " +
   "Announce sends a message with no reply obligation. " +
   "Redirect interrupts the agent with replacement instructions. " +
-  "Conclude ends the session with a summary.";
+  "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
+  "the verdict reflects whether the agent's work meets the criteria stated in the task.";
 /** System prompt appended for the agent runner in supervise mode. */
 export const AGENT_SYSTEM_PROMPT =
@@ -52,6 +54,7 @@ export const AGENT_SYSTEM_PROMPT =
  */
 const MAX_INTERVENTIONS_PER_TURN = 5;
+/** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
 export class Supervisor {
   /**
    * @param {object} deps
@@ -108,8 +111,14 @@ export class Supervisor {
     }
     if (this.ctx.concluded) {
-      this.emitSummary({ success: true, turns: 0, summary: this.ctx.summary });
-      return { success: true, turns: 0 };
+      const success = this.ctx.verdict === "success";
+      this.emitSummary({
+        success,
+        verdict: this.ctx.verdict,
+        turns: 0,
+        summary: this.ctx.summary,
+      });
+      return { success, turns: 0 };
     }
     let pendingRelay = null;
@@ -212,12 +221,14 @@ export class Supervisor {
     }
     if (this.ctx.concluded) {
+      const success = this.ctx.verdict === "success";
       this.emitSummary({
-        success: true,
+        success,
+        verdict: this.ctx.verdict,
         turns: turn,
         summary: this.ctx.summary,
       });
-      return { type: "exit", exit: { success: true, turns: turn } };
+      return { type: "exit", exit: { success, turns: turn } };
     }
     if (agentResult.aborted && this.ctx.redirect) {
@@ -306,12 +317,14 @@ export class Supervisor {
     }
     if (this.ctx.concluded) {
+      const success = this.ctx.verdict === "success";
       this.emitSummary({
-        success: true,
+        success,
+        verdict: this.ctx.verdict,
         turns: turn,
         summary: this.ctx.summary,
       });
-      return { exit: { success: true, turns: turn } };
+      return { exit: { success, turns: turn } };
     }
     if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
@@ -321,12 +334,14 @@ export class Supervisor {
           formatMessages(reminders),
         );
         if (this.ctx.concluded) {
+          const success = this.ctx.verdict === "success";
           this.emitSummary({
-            success: true,
+            success,
+            verdict: this.ctx.verdict,
             turns: turn,
             summary: this.ctx.summary,
           });
-          return { exit: { success: true, turns: turn } };
+          return { exit: { success, turns: turn } };
         }
         this.#checkAsk("supervisor");
       }
@@ -424,7 +439,7 @@ export class Supervisor {
   /**
    * Emit a final orchestrator summary line, wrapped in the universal envelope.
-   * @param {{success: boolean, turns: number, summary?: string}} result
+   * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
    */
   emitSummary(result) {
     this.output.write(
@@ -434,6 +449,7 @@ export class Supervisor {
         event: {
           type: "summary",
           success: result.success,
+          ...(result.verdict && { verdict: result.verdict }),
           turns: result.turns,
           ...(result.summary && { summary: result.summary }),
         },
@@ -464,6 +480,7 @@ const devNull = new Writable({
  * @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
+ * @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
  * @returns {Supervisor}
  */
 export function createSupervisor({
@@ -480,6 +497,7 @@ export function createSupervisor({
   agentProfile,
   profilesDir,
   taskAmend,
+  agentMcpServers,
 }) {
   const resolvedProfilesDir =
     profilesDir ?? resolve(supervisorCwd, ".claude/agents");
@@ -519,7 +537,7 @@ export function createSupervisor({
     onLine,
     settingSources: ["project"],
     systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
-    mcpServers: { orchestration: agentServer },
+    mcpServers: { orchestration: agentServer, ...agentMcpServers },
   });
   const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];

package/src/tee-writer.js CHANGED Viewed

@@ -20,6 +20,7 @@ import { TraceCollector } from "./trace-collector.js";
 import { renderTurnLines } from "./render/turn-renderer.js";
 import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
+/** Writable stream that saves raw NDJSON to a file while streaming human-readable text to a display stream. */
 export class TeeWriter extends Writable {
   /**
    * @param {object} deps
@@ -99,6 +100,12 @@ export class TeeWriter extends Writable {
     // Universal envelope: { source, seq, event }
     if (parsed.event) {
+      // Always forward to the collector so it can capture orchestrator
+      // metadata (e.g. the summary verdict for the result footer); the
+      // collector adds no turn for suppressed events, so flushTurns stays
+      // a no-op when we skip it below.
+      this.collector.addLine(line);
       // Orchestrator lifecycle events are suppressed from the text stream
       // entirely (spec 540). They still reached fileStream above.
       if (
@@ -107,7 +114,6 @@ export class TeeWriter extends Writable {
       ) {
         return;
       }
-      this.collector.addLine(line);
       this.flushTurns();
       return;
     }

package/src/trace-collector.js CHANGED Viewed

@@ -12,6 +12,7 @@
 import { renderTurnLines } from "./render/turn-renderer.js";
 import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
+/** Accumulate Claude Code NDJSON stream events into structured traces for analysis or text replay. */
 export class TraceCollector {
   /**
    * @param {object} [deps]
@@ -26,6 +27,8 @@ export class TraceCollector {
     this.turns = [];
     /** @type {object|null} */
     this.result = null;
+    /** @type {{verdict?: string, summary?: string, turns?: number}|null} */
+    this.orchestratorSummary = null;
     /** @type {number} */
     this.turnIndex = 0;
     /** @type {object|null} */
@@ -61,6 +64,16 @@ export class TraceCollector {
     // Orchestrator lifecycle events carry no content and are suppressed
     // from turns entirely — the NDJSON artifact keeps them separately.
     if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
+      // The summary event carries the supervisor/facilitator verdict —
+      // capture it before dropping the event, so the result footer can
+      // surface verdict="failure" instead of the SDK's per-runner status.
+      if (event.type === "summary") {
+        this.orchestratorSummary = {
+          ...(event.verdict && { verdict: event.verdict }),
+          ...(typeof event.summary === "string" && { summary: event.summary }),
+          ...(typeof event.turns === "number" && { turns: event.turns }),
+        };
+      }
       return;
     }
@@ -276,16 +289,20 @@ export class TraceCollector {
   }
   /**
-   * Format the trailing result summary line (spec 540).
+   * Format the trailing result summary line (spec 540). When an orchestrator
+   * summary is present (supervised / facilitated mode), the headline word is
+   * the supervisor's verdict ("success" / "failure") rather than the SDK's
+   * per-runner subtype, so the footer aligns with the CI exit code.
    * @returns {string}
    */
   #formatResultTail() {
     if (!this.result) return "";
     const duration = formatDuration(this.result.durationMs);
     const cost = Number(this.result.totalCostUsd).toFixed(4);
+    const headline = this.orchestratorSummary?.verdict ?? this.result.result;
     return (
       "\n" +
-      `--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
+      `--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
     );
   }
 }