@forwardimpact/libeval 0.1.27 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,11 @@
1
1
  # libeval
2
2
 
3
- Process Claude Code stream-json output into structured traces.
3
+ <!-- BEGIN:description Do not edit. Generated from package.json. -->
4
+
5
+ Agent evaluation framework — prove whether agent changes improved outcomes with
6
+ reproducible evidence.
7
+
8
+ <!-- END:description -->
4
9
 
5
10
  ## Getting Started
6
11
 
package/bin/fit-eval.js CHANGED
@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
10
10
  import { runSuperviseCommand } from "../src/commands/supervise.js";
11
11
  import { runFacilitateCommand } from "../src/commands/facilitate.js";
12
12
 
13
- const { version: VERSION } = JSON.parse(
14
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
15
- );
13
+ // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
14
+ // the readFileSync branch in the compiled binary (which would ENOENT against
15
+ // the bunfs virtual mount). Source execution falls through to package.json.
16
+ const VERSION =
17
+ process.env.FIT_EVAL_VERSION ||
18
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
19
+ .version;
16
20
 
17
21
  const definition = {
18
22
  name: "fit-eval",
@@ -55,6 +59,11 @@ const definition = {
55
59
  type: "string",
56
60
  description: "Comma-separated tool allowlist",
57
61
  },
62
+ "mcp-server": {
63
+ type: "string",
64
+ description:
65
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
66
+ },
58
67
  },
59
68
  },
60
69
  {
@@ -102,6 +111,11 @@ const definition = {
102
111
  type: "string",
103
112
  description: "Supervisor tool allowlist",
104
113
  },
114
+ "mcp-server": {
115
+ type: "string",
116
+ description:
117
+ "Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
118
+ },
105
119
  },
106
120
  },
107
121
  {
@@ -177,20 +191,20 @@ const definition = {
177
191
  ],
178
192
  documentation: [
179
193
  {
180
- title: "Agent Evaluations",
181
- url: "https://www.forwardimpact.team/docs/libraries/agent-evaluations/index.md",
194
+ title: "Run an Eval",
195
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
182
196
  description:
183
197
  "Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
184
198
  },
185
199
  {
186
- title: "Agent Collaboration",
187
- url: "https://www.forwardimpact.team/docs/libraries/agent-collaboration/index.md",
200
+ title: "Prove Agent Changes",
201
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
188
202
  description:
189
- "Author a facilitator and participant profiles, run a multi-agent session, and read the message flow.",
203
+ "End-to-end workflow from dataset generation through evaluation to trace analysis, including multi-agent collaboration sessions.",
190
204
  },
191
205
  {
192
- title: "Trace Analysis",
193
- url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
206
+ title: "Analyze Traces",
207
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
194
208
  description:
195
209
  "Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
196
210
  },
package/bin/fit-trace.js CHANGED
@@ -26,9 +26,13 @@ import {
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
28
 
29
- const { version: VERSION } = JSON.parse(
30
- readFileSync(new URL("../package.json", import.meta.url), "utf8"),
31
- );
29
+ // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
30
+ // the readFileSync branch in the compiled binary (which would ENOENT against
31
+ // the bunfs virtual mount). Source execution falls through to package.json.
32
+ const VERSION =
33
+ process.env.FIT_TRACE_VERSION ||
34
+ JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
35
+ .version;
32
36
 
33
37
  const definition = {
34
38
  name: "fit-trace",
@@ -214,22 +218,22 @@ const definition = {
214
218
  ],
215
219
  documentation: [
216
220
  {
217
- title: "Trace Analysis",
218
- url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
221
+ title: "Analyze Traces",
222
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
219
223
  description:
220
224
  "The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
221
225
  },
222
226
  {
223
- title: "Agent Evaluations",
224
- url: "https://www.forwardimpact.team/docs/libraries/agent-evaluations/index.md",
227
+ title: "Run an Eval",
228
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
225
229
  description:
226
230
  "How `fit-eval supervise` produces the traces this skill analyzes.",
227
231
  },
228
232
  {
229
- title: "Agent Collaboration",
230
- url: "https://www.forwardimpact.team/docs/libraries/agent-collaboration/index.md",
233
+ title: "Prove Agent Changes",
234
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
231
235
  description:
232
- "How `fit-eval facilitate` produces multi-agent traces; `split` is the bridge into per-source files.",
236
+ "End-to-end workflow including multi-agent collaboration; `split` is the bridge into per-source trace files.",
233
237
  },
234
238
  ],
235
239
  };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.27",
4
- "description": "Agent evaluation: collect Claude Code traces, run agent loops, supervise multi-step workflows.",
3
+ "version": "0.1.30",
4
+ "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
7
7
  "agent",
@@ -17,14 +17,16 @@
17
17
  },
18
18
  "license": "Apache-2.0",
19
19
  "author": "D. Olsson <hi@senzilla.io>",
20
- "forwardimpact": {
21
- "capability": "agent-self-improvement",
22
- "needs": [
23
- "Parse and query Claude Code trace NDJSON files",
24
- "Drive an LLM agent through a scripted run and capture its trace",
25
- "Supervise a multi-step or multi-agent workflow"
26
- ]
27
- },
20
+ "jobs": [
21
+ {
22
+ "user": "Platform Builders",
23
+ "goal": "Prove Agent Changes",
24
+ "trigger": "An eval passes locally but fails in CI and the only output is 'assertion failed.'",
25
+ "bigHire": "prove whether agent changes improved outcomes with reproducible evidence.",
26
+ "littleHire": "run an eval and get a trace that shows exactly what the agent did.",
27
+ "competesWith": "manual before/after comparison; trusting gut feeling over evidence; skipping evaluation entirely"
28
+ }
29
+ ],
28
30
  "type": "module",
29
31
  "main": "./src/index.js",
30
32
  "exports": {
@@ -49,7 +51,7 @@
49
51
  "@forwardimpact/libcli": "^0.1.0",
50
52
  "@forwardimpact/libconfig": "^0.1.0",
51
53
  "@forwardimpact/libtelemetry": "^0.1.22",
52
- "zod": "^4.4.1"
54
+ "zod": "^4.4.3"
53
55
  },
54
56
  "devDependencies": {
55
57
  "@forwardimpact/libharness": "^0.1.14"
@@ -32,6 +32,7 @@ function applyDefaults(deps) {
32
32
  };
33
33
  }
34
34
 
35
+ /** Run a single Claude Agent SDK session and emit raw NDJSON events to an output stream. */
35
36
  export class AgentRunner {
36
37
  /**
37
38
  * @param {object} deps
@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
5
5
  import { composeProfilePrompt } from "../profile-prompt.js";
6
6
  import { createTeeWriter } from "../tee-writer.js";
7
7
  import { SequenceCounter } from "../sequence-counter.js";
8
+ import { createServiceConfig } from "@forwardimpact/libconfig";
8
9
 
9
10
  /**
10
11
  * Parse and validate run command options from parsed values.
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
35
36
  values["allowed-tools"] ??
36
37
  "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
37
38
  ).split(","),
39
+ mcpServer: values["mcp-server"] ?? undefined,
38
40
  };
39
41
  }
40
42
 
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
56
58
  outputPath,
57
59
  agentProfile,
58
60
  allowedTools,
61
+ mcpServer,
59
62
  } = parseRunOptions(values);
60
63
 
61
64
  // When --output is specified, stream text to stdout while writing NDJSON to file.
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
78
81
  );
79
82
  };
80
83
 
84
+ let mcpServers = null;
85
+ if (mcpServer) {
86
+ const mcpConfig = await createServiceConfig("mcp");
87
+ mcpServers = {
88
+ [mcpServer]: {
89
+ type: "http",
90
+ url: mcpConfig.url,
91
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
92
+ },
93
+ };
94
+ allowedTools.push(`mcp__${mcpServer}__*`);
95
+ }
96
+
81
97
  if (agentProfile) {
82
98
  process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
83
99
  }
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
100
116
  settingSources: ["project"],
101
117
  systemPrompt,
102
118
  taskAmend,
119
+ mcpServers,
103
120
  });
104
121
 
105
122
  const result = await runner.run(taskContent);
@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
3
3
  import { tmpdir } from "node:os";
4
4
  import { createSupervisor } from "../supervisor.js";
5
5
  import { createTeeWriter } from "../tee-writer.js";
6
+ import { createServiceConfig } from "@forwardimpact/libconfig";
6
7
 
7
8
  /**
8
9
  * Parse all supervise flags from parsed values into an options object.
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
44
45
  supervisorAllowedTools: supervisorAllowedToolsRaw
45
46
  ? supervisorAllowedToolsRaw.split(",")
46
47
  : undefined,
48
+ mcpServer: values["mcp-server"] ?? undefined,
47
49
  };
48
50
  }
49
51
 
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
71
73
  })
72
74
  : process.stdout;
73
75
 
76
+ let agentMcpServers = null;
77
+ if (opts.mcpServer) {
78
+ const mcpConfig = await createServiceConfig("mcp");
79
+ agentMcpServers = {
80
+ [opts.mcpServer]: {
81
+ type: "http",
82
+ url: mcpConfig.url,
83
+ headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
84
+ },
85
+ };
86
+ opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
87
+ }
88
+
74
89
  if (opts.agentProfile) {
75
90
  process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
76
91
  }
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
88
103
  supervisorProfile: opts.supervisorProfile,
89
104
  agentProfile: opts.agentProfile,
90
105
  taskAmend: opts.taskAmend,
106
+ agentMcpServers,
91
107
  });
92
108
 
93
109
  const result = await supervisor.run(opts.taskContent);
@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
26
26
  "Announce sends a message with no reply obligation. " +
27
27
  "Redirect interrupts a participant with replacement instructions. " +
28
28
  "RollCall lists participants. " +
29
- "Conclude ends the session with a summary.";
29
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
30
+ "the verdict reflects whether the session met the criteria stated in the task.";
30
31
 
31
32
  /** System prompt appended for facilitated agent runners. */
32
33
  export const FACILITATED_AGENT_SYSTEM_PROMPT =
@@ -36,6 +37,7 @@ export const FACILITATED_AGENT_SYSTEM_PROMPT =
36
37
  "Announce broadcasts a message. " +
37
38
  "RollCall lists participants.";
38
39
 
40
+ /** Orchestrate N agent sessions coordinated by a single facilitator LLM session. */
39
41
  export class Facilitator {
40
42
  /**
41
43
  * @param {object} deps
@@ -105,12 +107,14 @@ export class Facilitator {
105
107
  // messages and started processing concurrently.
106
108
  this.concludeResolve();
107
109
  await Promise.allSettled(agentPromises);
110
+ const success = this.ctx.verdict === "success";
108
111
  this.emitSummary({
109
- success: true,
112
+ success,
113
+ verdict: this.ctx.verdict,
110
114
  turns: this.facilitatorTurns,
111
115
  summary: this.ctx.summary,
112
116
  });
113
- return { success: true, turns: this.facilitatorTurns };
117
+ return { success, turns: this.facilitatorTurns };
114
118
  }
115
119
 
116
120
  // Abort agents promptly when Conclude is called during the event loop
@@ -133,12 +137,14 @@ export class Facilitator {
133
137
  throw err;
134
138
  }
135
139
 
140
+ const success = this.ctx.concluded && this.ctx.verdict === "success";
136
141
  const result = {
137
- success: this.ctx.concluded,
142
+ success,
138
143
  turns: this.facilitatorTurns,
139
144
  };
140
145
  this.emitSummary({
141
- success: result.success,
146
+ success,
147
+ verdict: this.ctx.verdict,
142
148
  turns: result.turns,
143
149
  summary: this.ctx.summary,
144
150
  });
@@ -296,6 +302,7 @@ export class Facilitator {
296
302
  }
297
303
  }
298
304
 
305
+ /** Return the last assistant text block from a runner's buffer, or the fallback if none exists. */
299
306
  extractLastText(runner, fallback) {
300
307
  const lines = runner.buffer;
301
308
  for (let i = lines.length - 1; i >= 0; i--) {
@@ -342,7 +349,7 @@ export class Facilitator {
342
349
  }
343
350
 
344
351
  /**
345
- * @param {{success: boolean, turns: number, summary?: string}} result
352
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
346
353
  */
347
354
  emitSummary(result) {
348
355
  this.output.write(
@@ -352,6 +359,7 @@ export class Facilitator {
352
359
  event: {
353
360
  type: "summary",
354
361
  success: result.success,
362
+ ...(result.verdict && { verdict: result.verdict }),
355
363
  turns: result.turns,
356
364
  ...(result.summary && { summary: result.summary }),
357
365
  },
@@ -12,6 +12,7 @@
12
12
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
13
13
  */
14
14
 
15
+ /** In-memory per-participant message queues for facilitated and supervised orchestration modes. */
15
16
  export class MessageBus {
16
17
  /**
17
18
  * @param {object} deps
@@ -22,6 +22,7 @@ import { z } from "zod";
22
22
  export function createOrchestrationContext() {
23
23
  return {
24
24
  concluded: false,
25
+ verdict: null,
25
26
  summary: null,
26
27
  redirect: null,
27
28
  participants: [],
@@ -37,14 +38,17 @@ export function createOrchestrationContext() {
37
38
 
38
39
  // --- Handler factories ---
39
40
 
41
+ /** Create a handler that marks the session as concluded and records the verdict and summary. */
40
42
  export function createConcludeHandler(ctx) {
41
- return async ({ summary }) => {
43
+ return async ({ verdict, summary }) => {
42
44
  ctx.concluded = true;
45
+ ctx.verdict = verdict;
43
46
  ctx.summary = summary;
44
47
  return { content: [{ type: "text", text: "Session concluded." }] };
45
48
  };
46
49
  }
47
50
 
51
+ /** Create a handler that queues a redirect to interrupt a participant with replacement instructions. */
48
52
  export function createRedirectHandler(ctx) {
49
53
  return async ({ message, to }) => {
50
54
  ctx.redirect = { message, to: to ?? null };
@@ -52,6 +56,7 @@ export function createRedirectHandler(ctx) {
52
56
  };
53
57
  }
54
58
 
59
+ /** Create a handler that returns the list of all session participants and their roles. */
55
60
  export function createRollCallHandler(ctx) {
56
61
  return async () => {
57
62
  return {
@@ -217,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
217
222
  ),
218
223
  tool(
219
224
  "Conclude",
220
- "End the session with a summary.",
221
- { summary: z.string() },
225
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
226
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
222
227
  createConcludeHandler(ctx),
223
228
  ),
224
229
  tool(
@@ -304,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
304
309
  ),
305
310
  tool(
306
311
  "Conclude",
307
- "End the session with a summary.",
308
- { summary: z.string() },
312
+ "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
313
+ { verdict: z.enum(["success", "failure"]), summary: z.string() },
309
314
  createConcludeHandler(ctx),
310
315
  ),
311
316
  tool(
@@ -5,6 +5,7 @@
5
5
  * - `formatMessages` — render a drained message batch as tagged lines.
6
6
  */
7
7
 
8
+ /** Create a promise-based async queue for serializing event delivery to the facilitator loop. */
8
9
  export function createAsyncQueue() {
9
10
  const items = [];
10
11
  let waiter = null;
@@ -6,6 +6,11 @@
6
6
  * tool (file path, command, pattern, …) sanitized to strip JSON punctuation
7
7
  * (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
8
8
  *
9
+ * MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
10
+ * the full input rendered as compact single-line JSON, so `{` and `"` do
11
+ * appear on those lines. Readers of GitHub workflow logs need the full MCP
12
+ * payload to know what was actually sent across the protocol.
13
+ *
9
14
  * `previewForResult(content, isError)` collapses a tool result to a single
10
15
  * line ≤ 80 chars and flags errors so the renderer can apply the reserved
11
16
  * error color and the `Error:` label.
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
91
96
  return parts.slice(2).join("__");
92
97
  }
93
98
 
94
- /**
95
- * MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
96
- * handler path. The method name itself is surfaced via `simplifyToolName`,
97
- * so this only adds the `to/from` decorators for orchestration calls.
98
- * Returns null if the name does not match any MCP prefix.
99
- * @param {string} name
100
- * @param {object} input
101
- * @returns {string|null}
102
- */
103
- function hintForMcp(name, input) {
104
- if (name.startsWith("mcp__orchestration__")) {
105
- const parts = [];
106
- if (input.to) parts.push(`to ${sanitize(input.to)}`);
107
- if (input.from) parts.push(`from ${sanitize(input.from)}`);
108
- return truncate(parts.join(" "));
109
- }
110
- if (name.startsWith("mcp__")) {
111
- return "";
112
- }
113
- return null;
114
- }
115
-
116
99
  /**
117
100
  * Map a tool name and input to a one-line human hint.
118
101
  *
119
- * Unknown tools return an empty hint — the caller still shows the tool
120
- * name, just without extra detail. Sanitization is uniform: every branch
121
- * ends with `sanitize`, so the output is guaranteed free of `{`, `}`, `"`
122
- * from the input object (success criterion #2).
102
+ * Three branches, in priority order:
103
+ * - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
104
+ * `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
105
+ * - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
106
+ * single-line JSON; `{` and `"` intentionally appear so readers see
107
+ * the actual MCP payload.
108
+ * - Anything else → "" (the caller still shows the bare tool name).
123
109
  *
124
110
  * @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
125
111
  * @param {object|null|undefined} input - Raw tool input object from the trace
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
132
118
  const handler = HINT_HANDLERS[name];
133
119
  if (handler) return handler(safeInput);
134
120
 
135
- const mcp = hintForMcp(name, safeInput);
136
- if (mcp !== null) return mcp;
121
+ if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
137
122
 
138
123
  return "";
139
124
  }
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
154
139
  : typeof content === "string"
155
140
  ? content
156
141
  : JSON.stringify(content);
157
- const lines = normalized.split(/\r?\n/);
158
- let firstNonBlank = "";
159
- for (const line of lines) {
160
- if (line.trim().length > 0) {
161
- firstNonBlank = line.trim();
162
- break;
163
- }
164
- }
165
-
166
- if (isError) {
167
- const body = firstNonBlank || "(no output)";
168
- return {
169
- text:
170
- body.length <= MAX_HINT_CHARS
171
- ? body
172
- : body.slice(0, MAX_HINT_CHARS - 3) + "...",
173
- isError: true,
174
- };
175
- }
142
+ const firstNonBlank =
143
+ normalized
144
+ .split(/\r?\n/)
145
+ .map((l) => l.trim())
146
+ .find((l) => l.length > 0) ?? "";
176
147
 
177
- if (!firstNonBlank) return { text: "(ok)", isError: false };
148
+ const fallback = isError ? "(no output)" : "(ok)";
178
149
  return {
179
- text:
180
- firstNonBlank.length <= MAX_HINT_CHARS
181
- ? firstNonBlank
182
- : firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
183
- isError: false,
150
+ text: truncate(firstNonBlank || fallback),
151
+ isError,
184
152
  };
185
153
  }
@@ -25,12 +25,7 @@ import {
25
25
  * @returns {string[]} Array of rendered line strings
26
26
  */
27
27
  export function renderTurnLines(turn, withPrefix) {
28
- if (turn.role === "assistant") return renderAssistantTurn(turn, withPrefix);
29
- if (turn.role === "tool_result")
30
- return renderToolResultTurn(turn, withPrefix);
31
- if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
32
- if (turn.role === "user") return renderUserTurn(turn, withPrefix);
33
- return [];
28
+ return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
34
29
  }
35
30
 
36
31
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
57
52
 
58
53
  /** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
59
54
  function renderToolResultTurn(turn, withPrefix) {
55
+ // Successful tool results emit no preview line — the trace document keeps
56
+ // the structured turn, but readers of the streamed log only see errors.
57
+ if (!turn.isError) return [];
60
58
  return [
61
59
  renderToolResultLine({
62
60
  source: turn.source,
63
- preview: previewForResult(turn.content, turn.isError),
61
+ preview: previewForResult(turn.content, true),
64
62
  withPrefix,
65
63
  }),
66
64
  ];
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
90
88
  }
91
89
  return lines;
92
90
  }
91
+
92
+ const TURN_RENDERERS = {
93
+ assistant: renderAssistantTurn,
94
+ tool_result: renderToolResultTurn,
95
+ system: renderSystemTurn,
96
+ user: renderUserTurn,
97
+ };
@@ -2,16 +2,20 @@
2
2
  * SequenceCounter — global monotonic counter shared across all participants
3
3
  * in a session. Single-threaded JS means no synchronization needed.
4
4
  */
5
+ /** Monotonic counter that assigns globally ordered sequence numbers within a session. */
5
6
  export class SequenceCounter {
7
+ /** Initialize the counter at zero. */
6
8
  constructor() {
7
9
  this.value = 0;
8
10
  }
9
11
 
12
+ /** Return the current value and advance the counter by one. */
10
13
  next() {
11
14
  return this.value++;
12
15
  }
13
16
  }
14
17
 
18
+ /** Create a new SequenceCounter starting at zero. */
15
19
  export function createSequenceCounter() {
16
20
  return new SequenceCounter();
17
21
  }
package/src/supervisor.js CHANGED
@@ -4,8 +4,9 @@
4
4
  * introduces itself, and delegates work to the agent. The loop then alternates:
5
5
  * agent → supervisor → agent.
6
6
  *
7
- * Signaling uses orchestration tools (Ask / Answer / Announce / Redirect /
8
- * Conclude) via in-process MCP servers. The Ask/Answer contract is enforced
7
+ * Signaling uses orchestration tools (Ask / Announce / Redirect / Conclude)
8
+ * via in-process MCP servers; the supervisor has no Answer tool — agent replies
9
+ * are routed back through the relay loop. The Ask/Answer contract is enforced
9
10
  * at turn boundaries: an unanswered Ask triggers one synthetic reminder and
10
11
  * then a `protocol_violation` trace event plus a null-answer injection so the
11
12
  * session advances without silent deadlock.
@@ -35,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
35
36
  "Answer replies to an ask the agent addressed to you. " +
36
37
  "Announce sends a message with no reply obligation. " +
37
38
  "Redirect interrupts the agent with replacement instructions. " +
38
- "Conclude ends the session with a summary.";
39
+ "Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
40
+ "the verdict reflects whether the agent's work meets the criteria stated in the task.";
39
41
 
40
42
  /** System prompt appended for the agent runner in supervise mode. */
41
43
  export const AGENT_SYSTEM_PROMPT =
@@ -52,6 +54,7 @@ export const AGENT_SYSTEM_PROMPT =
52
54
  */
53
55
  const MAX_INTERVENTIONS_PER_TURN = 5;
54
56
 
57
+ /** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
55
58
  export class Supervisor {
56
59
  /**
57
60
  * @param {object} deps
@@ -108,8 +111,14 @@ export class Supervisor {
108
111
  }
109
112
 
110
113
  if (this.ctx.concluded) {
111
- this.emitSummary({ success: true, turns: 0, summary: this.ctx.summary });
112
- return { success: true, turns: 0 };
114
+ const success = this.ctx.verdict === "success";
115
+ this.emitSummary({
116
+ success,
117
+ verdict: this.ctx.verdict,
118
+ turns: 0,
119
+ summary: this.ctx.summary,
120
+ });
121
+ return { success, turns: 0 };
113
122
  }
114
123
 
115
124
  let pendingRelay = null;
@@ -212,12 +221,14 @@ export class Supervisor {
212
221
  }
213
222
 
214
223
  if (this.ctx.concluded) {
224
+ const success = this.ctx.verdict === "success";
215
225
  this.emitSummary({
216
- success: true,
226
+ success,
227
+ verdict: this.ctx.verdict,
217
228
  turns: turn,
218
229
  summary: this.ctx.summary,
219
230
  });
220
- return { type: "exit", exit: { success: true, turns: turn } };
231
+ return { type: "exit", exit: { success, turns: turn } };
221
232
  }
222
233
 
223
234
  if (agentResult.aborted && this.ctx.redirect) {
@@ -306,12 +317,14 @@ export class Supervisor {
306
317
  }
307
318
 
308
319
  if (this.ctx.concluded) {
320
+ const success = this.ctx.verdict === "success";
309
321
  this.emitSummary({
310
- success: true,
322
+ success,
323
+ verdict: this.ctx.verdict,
311
324
  turns: turn,
312
325
  summary: this.ctx.summary,
313
326
  });
314
- return { exit: { success: true, turns: turn } };
327
+ return { exit: { success, turns: turn } };
315
328
  }
316
329
 
317
330
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
@@ -321,12 +334,14 @@ export class Supervisor {
321
334
  formatMessages(reminders),
322
335
  );
323
336
  if (this.ctx.concluded) {
337
+ const success = this.ctx.verdict === "success";
324
338
  this.emitSummary({
325
- success: true,
339
+ success,
340
+ verdict: this.ctx.verdict,
326
341
  turns: turn,
327
342
  summary: this.ctx.summary,
328
343
  });
329
- return { exit: { success: true, turns: turn } };
344
+ return { exit: { success, turns: turn } };
330
345
  }
331
346
  this.#checkAsk("supervisor");
332
347
  }
@@ -424,7 +439,7 @@ export class Supervisor {
424
439
 
425
440
  /**
426
441
  * Emit a final orchestrator summary line, wrapped in the universal envelope.
427
- * @param {{success: boolean, turns: number, summary?: string}} result
442
+ * @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
428
443
  */
429
444
  emitSummary(result) {
430
445
  this.output.write(
@@ -434,6 +449,7 @@ export class Supervisor {
434
449
  event: {
435
450
  type: "summary",
436
451
  success: result.success,
452
+ ...(result.verdict && { verdict: result.verdict }),
437
453
  turns: result.turns,
438
454
  ...(result.summary && { summary: result.summary }),
439
455
  },
@@ -464,6 +480,7 @@ const devNull = new Writable({
464
480
  * @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
465
481
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
466
482
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
483
+ * @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
467
484
  * @returns {Supervisor}
468
485
  */
469
486
  export function createSupervisor({
@@ -480,6 +497,7 @@ export function createSupervisor({
480
497
  agentProfile,
481
498
  profilesDir,
482
499
  taskAmend,
500
+ agentMcpServers,
483
501
  }) {
484
502
  const resolvedProfilesDir =
485
503
  profilesDir ?? resolve(supervisorCwd, ".claude/agents");
@@ -519,7 +537,7 @@ export function createSupervisor({
519
537
  onLine,
520
538
  settingSources: ["project"],
521
539
  systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
522
- mcpServers: { orchestration: agentServer },
540
+ mcpServers: { orchestration: agentServer, ...agentMcpServers },
523
541
  });
524
542
 
525
543
  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
package/src/tee-writer.js CHANGED
@@ -20,6 +20,7 @@ import { TraceCollector } from "./trace-collector.js";
20
20
  import { renderTurnLines } from "./render/turn-renderer.js";
21
21
  import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
22
22
 
23
+ /** Writable stream that saves raw NDJSON to a file while streaming human-readable text to a display stream. */
23
24
  export class TeeWriter extends Writable {
24
25
  /**
25
26
  * @param {object} deps
@@ -99,6 +100,12 @@ export class TeeWriter extends Writable {
99
100
 
100
101
  // Universal envelope: { source, seq, event }
101
102
  if (parsed.event) {
103
+ // Always forward to the collector so it can capture orchestrator
104
+ // metadata (e.g. the summary verdict for the result footer); the
105
+ // collector adds no turn for suppressed events, so flushTurns stays
106
+ // a no-op when we skip it below.
107
+ this.collector.addLine(line);
108
+
102
109
  // Orchestrator lifecycle events are suppressed from the text stream
103
110
  // entirely (spec 540). They still reached fileStream above.
104
111
  if (
@@ -107,7 +114,6 @@ export class TeeWriter extends Writable {
107
114
  ) {
108
115
  return;
109
116
  }
110
- this.collector.addLine(line);
111
117
  this.flushTurns();
112
118
  return;
113
119
  }
@@ -12,6 +12,7 @@
12
12
  import { renderTurnLines } from "./render/turn-renderer.js";
13
13
  import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
14
14
 
15
+ /** Accumulate Claude Code NDJSON stream events into structured traces for analysis or text replay. */
15
16
  export class TraceCollector {
16
17
  /**
17
18
  * @param {object} [deps]
@@ -26,6 +27,8 @@ export class TraceCollector {
26
27
  this.turns = [];
27
28
  /** @type {object|null} */
28
29
  this.result = null;
30
+ /** @type {{verdict?: string, summary?: string, turns?: number}|null} */
31
+ this.orchestratorSummary = null;
29
32
  /** @type {number} */
30
33
  this.turnIndex = 0;
31
34
  /** @type {object|null} */
@@ -61,6 +64,16 @@ export class TraceCollector {
61
64
  // Orchestrator lifecycle events carry no content and are suppressed
62
65
  // from turns entirely — the NDJSON artifact keeps them separately.
63
66
  if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
67
+ // The summary event carries the supervisor/facilitator verdict —
68
+ // capture it before dropping the event, so the result footer can
69
+ // surface verdict="failure" instead of the SDK's per-runner status.
70
+ if (event.type === "summary") {
71
+ this.orchestratorSummary = {
72
+ ...(event.verdict && { verdict: event.verdict }),
73
+ ...(typeof event.summary === "string" && { summary: event.summary }),
74
+ ...(typeof event.turns === "number" && { turns: event.turns }),
75
+ };
76
+ }
64
77
  return;
65
78
  }
66
79
 
@@ -276,16 +289,20 @@ export class TraceCollector {
276
289
  }
277
290
 
278
291
  /**
279
- * Format the trailing result summary line (spec 540).
292
+ * Format the trailing result summary line (spec 540). When an orchestrator
293
+ * summary is present (supervised / facilitated mode), the headline word is
294
+ * the supervisor's verdict ("success" / "failure") rather than the SDK's
295
+ * per-runner subtype, so the footer aligns with the CI exit code.
280
296
  * @returns {string}
281
297
  */
282
298
  #formatResultTail() {
283
299
  if (!this.result) return "";
284
300
  const duration = formatDuration(this.result.durationMs);
285
301
  const cost = Number(this.result.totalCostUsd).toFixed(4);
302
+ const headline = this.orchestratorSummary?.verdict ?? this.result.result;
286
303
  return (
287
304
  "\n" +
288
- `--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
305
+ `--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
289
306
  );
290
307
  }
291
308
  }