@forwardimpact/libeval 0.1.36 → 0.1.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,15 +34,26 @@ export const definition = {
34
34
  },
35
35
  output: {
36
36
  type: "string",
37
- description: "Run-output directory (created if missing)",
37
+ description:
38
+ "Run-output directory (created if missing, default: benchmark-runs)",
38
39
  },
39
40
  runs: {
40
41
  type: "string",
41
- description: "Runs per task (integer ≥ 1, default 1)",
42
+ description: "Runs per task (integer ≥ 1, default: 5)",
43
+ },
44
+ "agent-model": {
45
+ type: "string",
46
+ description:
47
+ "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
48
+ },
49
+ "supervisor-model": {
50
+ type: "string",
51
+ description:
52
+ "Claude model for the supervisor (default: claude-opus-4-7)",
42
53
  },
43
- model: {
54
+ "judge-model": {
44
55
  type: "string",
45
- description: "Claude model id (default: claude-opus-4-7[1m])",
56
+ description: "Claude model for the judge (default: claude-opus-4-7)",
46
57
  },
47
58
  "agent-profile": {
48
59
  type: "string",
@@ -57,6 +68,11 @@ export const definition = {
57
68
  description:
58
69
  "Agent-under-test turn budget (default: 50, 0 = unlimited)",
59
70
  },
71
+ "allowed-tools": {
72
+ type: "string",
73
+ description:
74
+ "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
75
+ },
60
76
  },
61
77
  },
62
78
  {
@@ -92,7 +108,8 @@ export const definition = {
92
108
  options: {
93
109
  input: {
94
110
  type: "string",
95
- description: "Run-output directory containing results.jsonl",
111
+ description:
112
+ "Run-output directory containing results.jsonl (default: benchmark-runs)",
96
113
  },
97
114
  k: {
98
115
  type: "string",
@@ -111,8 +128,10 @@ export const definition = {
111
128
  json: { type: "boolean", description: "Output help as JSON" },
112
129
  },
113
130
  examples: [
114
- "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
115
- "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
131
+ "fit-benchmark run --family=./families/coding",
132
+ "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
133
+ "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
134
+ "fit-benchmark report --format=text",
116
135
  "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
117
136
  ],
118
137
  documentation: [
@@ -122,6 +141,12 @@ export const definition = {
122
141
  description:
123
142
  "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
124
143
  },
144
+ {
145
+ title: "Automate with GitHub Actions",
146
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
147
+ description:
148
+ "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
149
+ },
125
150
  ],
126
151
  };
127
152
 
package/bin/fit-eval.js CHANGED
@@ -41,7 +41,11 @@ const definition = {
41
41
  type: "string",
42
42
  description: "Additional text appended to the task",
43
43
  },
44
- model: { type: "string", description: "Claude model (default: opus)" },
44
+ "agent-model": {
45
+ type: "string",
46
+ description:
47
+ "Claude model for the agent (default: claude-opus-4-7[1m])",
48
+ },
45
49
  "max-turns": {
46
50
  type: "string",
47
51
  description: "Max agentic turns (default: 50, 0 = unlimited)",
@@ -84,7 +88,16 @@ const definition = {
84
88
  type: "string",
85
89
  description: "Additional text appended to the task",
86
90
  },
87
- model: { type: "string", description: "Claude model (default: opus)" },
91
+ "agent-model": {
92
+ type: "string",
93
+ description:
94
+ "Claude model for the agent (default: claude-opus-4-7[1m])",
95
+ },
96
+ "supervisor-model": {
97
+ type: "string",
98
+ description:
99
+ "Claude model for the supervisor (default: claude-opus-4-7[1m])",
100
+ },
88
101
  "max-turns": {
89
102
  type: "string",
90
103
  description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -136,7 +149,15 @@ const definition = {
136
149
  type: "string",
137
150
  description: "Additional text appended to the task",
138
151
  },
139
- model: { type: "string", description: "Claude model (default: opus)" },
152
+ "agent-model": {
153
+ type: "string",
154
+ description: "Claude model for agents (default: claude-opus-4-7[1m])",
155
+ },
156
+ "facilitator-model": {
157
+ type: "string",
158
+ description:
159
+ "Claude model for the facilitator (default: claude-opus-4-7[1m])",
160
+ },
140
161
  "max-turns": {
141
162
  type: "string",
142
163
  description: "Max agentic turns (default: 20, 0 = unlimited)",
package/bin/fit-trace.js CHANGED
@@ -25,6 +25,7 @@ import {
25
25
  runFilterCommand,
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
+ import { runAssertCommand } from "../src/commands/assert.js";
28
29
 
29
30
  // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
30
31
  // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -199,6 +200,41 @@ const definition = {
199
200
  },
200
201
  },
201
202
  },
203
+ {
204
+ name: "assert",
205
+ args: "<test-name> <file>",
206
+ description:
207
+ "Shell-friendly assertion — outputs structured JSON for scoring hooks",
208
+ options: {
209
+ grep: {
210
+ type: "string",
211
+ description:
212
+ "Pass if extended regex matches file content (case-insensitive)",
213
+ },
214
+ query: {
215
+ type: "string",
216
+ description:
217
+ "Pass if JMESPath expression against JSON/NDJSON yields a truthy result",
218
+ },
219
+ exists: {
220
+ type: "boolean",
221
+ description: "Pass if file exists",
222
+ },
223
+ "cites-job": {
224
+ type: "string",
225
+ description:
226
+ "Pass if <file> contains the canonical citation from a <job> tag in the given JTBD file",
227
+ },
228
+ not: {
229
+ type: "boolean",
230
+ description: "Invert the assertion",
231
+ },
232
+ message: {
233
+ type: "string",
234
+ description: "Custom failure message",
235
+ },
236
+ },
237
+ },
202
238
  ],
203
239
  globalOptions: {
204
240
  help: { type: "boolean", short: "h", description: "Show this help" },
@@ -220,6 +256,11 @@ const definition = {
220
256
  "fit-trace search structured.json 'error|fail' --context 1",
221
257
  "fit-trace filter structured.json --tool Bash --error",
222
258
  "fit-trace turn structured.json 3",
259
+ "fit-trace assert has-heading --grep '^## Problem' spec.md",
260
+ "fit-trace assert no-leak --not --grep 'password' output.log",
261
+ "fit-trace assert file-present --exists path/to/spec.md",
262
+ "fit-trace assert cites-jtbd --cites-job jtbd-excerpt.md spec.md",
263
+ "fit-trace assert used-edit --query \"[?type=='assistant'].message.content[] | [?name=='Edit']\" trace.ndjson",
223
264
  ],
224
265
  documentation: [
225
266
  {
@@ -265,6 +306,7 @@ const COMMANDS = {
265
306
  turn: runTurnCommand,
266
307
  filter: runFilterCommand,
267
308
  split: runSplitCommand,
309
+ assert: runAssertCommand,
268
310
  };
269
311
 
270
312
  async function main() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.36",
3
+ "version": "0.1.39",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -53,6 +53,7 @@
53
53
  "@forwardimpact/libcli": "^0.1.0",
54
54
  "@forwardimpact/libconfig": "^0.1.0",
55
55
  "@forwardimpact/libtelemetry": "^0.1.22",
56
+ "jmespath": "^0.16.0",
56
57
  "zod": "^4.4.3"
57
58
  },
58
59
  "devDependencies": {
@@ -1,14 +1,13 @@
1
1
  /**
2
- * ApmInstaller — materialises the family's pre-staged `.claude/` tree into a
3
- * single staging directory, computes the manifest fingerprint, and is invoked
4
- * once per family install. Per-task copy happens later in WorkdirManager.
5
- *
6
- * v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
7
- * verbatim, not interpreted.
2
+ * ApmInstaller — runs `apm install --target claude` in the family root to
3
+ * materialise skills and agents, copies the resulting `.claude/` into a
4
+ * staging directory, and computes the manifest fingerprint from the lockfile.
5
+ * Per-task copy happens later in WorkdirManager.
8
6
  */
9
7
 
8
+ import { spawn } from "node:child_process";
10
9
  import { createHash } from "node:crypto";
11
- import { access, cp, rm } from "node:fs/promises";
10
+ import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
12
11
  import { join } from "node:path";
13
12
 
14
13
  /**
@@ -20,20 +19,83 @@ export async function installApm(family, outputDir) {
20
19
  const stagingDir = join(outputDir, ".apm-staging");
21
20
  const stagedClaude = join(stagingDir, ".claude");
22
21
  const sourceClaude = join(family.rootPath, ".claude");
22
+ const apmYml = join(family.rootPath, "apm.yml");
23
23
 
24
+ const hasApm = await access(apmYml)
25
+ .then(() => true)
26
+ .catch(() => false);
27
+
28
+ if (hasApm) {
29
+ await runApmInstall(family.rootPath);
30
+ try {
31
+ await access(sourceClaude);
32
+ } catch {
33
+ throw new Error(
34
+ `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
35
+ );
36
+ }
37
+ }
38
+
39
+ await rm(stagingDir, { recursive: true, force: true });
40
+ const hasClaudeDir = await access(sourceClaude)
41
+ .then(() => true)
42
+ .catch(() => false);
43
+ if (hasClaudeDir) {
44
+ await cp(sourceClaude, stagedClaude, { recursive: true });
45
+ } else {
46
+ await mkdir(stagedClaude, { recursive: true });
47
+ }
48
+
49
+ // Stage the family-local judge profile outside .claude/ so it is available
50
+ // to the judge but never copied into the agent-under-test's CWD.
51
+ const judgeSource = join(family.rootPath, "judge.md");
52
+ const judgeProfilesDir = join(stagingDir, "judge-profiles");
24
53
  try {
25
- await access(sourceClaude);
54
+ await access(judgeSource);
55
+ await mkdir(judgeProfilesDir, { recursive: true });
56
+ await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
57
+ } catch {}
58
+
59
+ const lockPath = join(family.rootPath, "apm.lock.yaml");
60
+ let skillSetHash = "";
61
+ try {
62
+ const lockBytes = await readFile(lockPath);
63
+ skillSetHash =
64
+ "sha256:" +
65
+ createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
26
66
  } catch {
27
- throw new Error(
28
- `task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
29
- );
67
+ // No lockfile — family doesn't use skill packs.
30
68
  }
31
69
 
32
- await rm(stagingDir, { recursive: true, force: true });
33
- await cp(sourceClaude, stagedClaude, { recursive: true });
70
+ return { stagingDir, skillSetHash, judgeProfilesDir };
71
+ }
34
72
 
35
- const skillSetHash =
36
- "sha256:" + createHash("sha256").update(family.apmLockBytes).digest("hex");
73
/**
 * Return a copy of `buf` with every CRLF pair collapsed to a bare LF.
 *
 * Only a carriage return (0x0d) that is immediately followed by a line feed
 * (0x0a) is dropped; a lone CR — including one at the very end of the input —
 * is kept as-is.
 *
 * @param {Buffer|Uint8Array} buf - Raw bytes to normalise.
 * @returns {Buffer} A new buffer holding the normalised bytes.
 */
function normalizeLf(buf) {
  const CR = 0x0d;
  const LF = 0x0a;
  const kept = [];
  let index = 0;
  while (index < buf.length) {
    const byte = buf[index];
    const isCrBeforeLf =
      byte === CR && index + 1 < buf.length && buf[index + 1] === LF;
    if (!isCrBeforeLf) kept.push(byte);
    index += 1;
  }
  return Buffer.from(kept);
}
37
81
 
38
- return { stagingDir, skillSetHash };
82
/**
 * Run `apm install --target claude` in `cwd` and wait for it to finish.
 *
 * stdin is ignored; stdout is read and discarded; stderr is accumulated so it
 * can be included in the failure message.
 *
 * @param {string} cwd - Directory in which to run the command.
 * @returns {Promise<void>} Resolves when the process closes with exit code 0.
 *   Rejects with an Error if the process cannot be spawned or closes with any
 *   other exit code.
 */
function runApmInstall(cwd) {
  const args = ["install", "--target", "claude"];
  const options = { cwd, stdio: ["ignore", "pipe", "pipe"] };

  return new Promise((resolve, reject) => {
    const proc = spawn("apm", args, options);
    let errOutput = "";

    // Attach a no-op listener so stdout is consumed rather than left unread.
    proc.stdout.on("data", () => {});
    proc.stderr.on("data", (chunk) => {
      errOutput += chunk.toString();
    });

    proc.on("error", (err) => {
      reject(new Error(`failed to spawn apm: ${err.message}`));
    });
    proc.on("close", (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`apm install exited ${exitCode}: ${errOutput}`));
        return;
      }
      resolve();
    });
  });
}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Env-loader — auto-discover `.env` / `.env.local` files in a task family
3
+ * and its tasks, load them into `process.env`, and render the merged result
4
+ * into each agent CWD.
5
+ *
6
+ * Discovery paths (loaded in this order, first value per key wins):
7
+ * 1. process.env (CI secrets, shell env — never overwritten)
8
+ * 2. <family>/.env.local
9
+ * 3. <family>/.env
10
+ * 4. tasks/<id>/.env.local
11
+ * 5. tasks/<id>/.env
12
+ *
13
+ * Every discovered env file — family or task — is loaded into process.env
14
+ * AND rendered (with resolved values) into the agent working directory.
15
+ */
16
+
17
+ import { readFile, writeFile } from "node:fs/promises";
18
+ import { join } from "node:path";
19
+
20
+ const ENV_FILES = [".env.local", ".env"];
21
+
22
/**
 * Parse the text of a `.env` file into an ordered list of {key, value} pairs.
 *
 * Recognises `KEY=VALUE` lines. Blank lines, lines starting with `#`, lines
 * without `=`, and lines whose key is empty are skipped. Keys and values are
 * trimmed of surrounding whitespace. A value wrapped in a matching pair of
 * single or double quotes has that one pair removed; escape sequences and
 * inline comments are not interpreted. Duplicate keys are returned as
 * separate entries, in file order.
 *
 * @param {string} content - Raw file contents.
 * @returns {Array<{key: string, value: string}>}
 */
export function parseEnvFile(content) {
  const entries = [];
  for (const raw of content.split("\n")) {
    // trim() also removes the trailing "\r" left behind by CRLF line endings.
    const line = raw.trim();
    if (!line || line.startsWith("#")) continue;
    const eq = line.indexOf("=");
    if (eq === -1) continue;
    const key = line.slice(0, eq).trim();
    if (!key) continue;
    let value = line.slice(eq + 1).trim();
    // Require at least two characters: a lone `"` or `'` both starts and ends
    // with itself, and must be kept literally instead of collapsing to "".
    const quote = value[0];
    const isQuoted =
      value.length >= 2 &&
      (quote === '"' || quote === "'") &&
      value.endsWith(quote);
    if (isQuoted) {
      value = value.slice(1, -1);
    }
    entries.push({ key, value });
  }
  return entries;
}
48
+
49
/**
 * Load an env file from disk and parse it with `parseEnvFile`.
 *
 * A missing file (ENOENT) is treated as "no entries"; any other read error
 * is rethrown to the caller.
 *
 * @param {string} filePath - Path of the env file to read.
 * @returns {Promise<Array<{key: string, value: string}>>}
 */
async function readEnvFile(filePath) {
  let content;
  try {
    content = await readFile(filePath, "utf8");
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    return [];
  }
  return parseEnvFile(content);
}
63
+
64
/**
 * Copy parsed entries into `process.env` without clobbering anything that is
 * already set there.
 *
 * Every entry's key is reported back, whether or not its value was actually
 * written, and repeated keys appear once per occurrence.
 *
 * @param {Array<{key: string, value: string}>} entries
 * @returns {string[]} Keys of all entries, in input order.
 */
function applyToProcessEnv(entries) {
  const seen = [];
  for (const entry of entries) {
    seen.push(entry.key);
    const alreadySet = process.env[entry.key] !== undefined;
    if (alreadySet) continue;
    process.env[entry.key] = entry.value;
  }
  return seen;
}
79
+
80
/**
 * Process a single env file found at `dir/file`.
 *
 * Its entries are applied to `process.env` (existing values win), every key
 * is added to `names`, and the keys are recorded under `file` in `merged`.
 * A file that is missing or yields no entries leaves both collections
 * untouched.
 *
 * @param {string} dir - Directory to look in.
 * @param {string} file - Env filename; also the key used in `merged`.
 * @param {Set<string>} names - Accumulator of every var name seen so far.
 * @param {Map<string, Map<string, true>>} merged - Per-filename key manifest.
 * @returns {Promise<void>}
 */
async function loadOneEnvFile(dir, file, names, merged) {
  const parsed = await readEnvFile(join(dir, file));
  if (parsed.length === 0) return;

  applyToProcessEnv(parsed).forEach((name) => names.add(name));

  let keysForFile = merged.get(file);
  if (keysForFile === undefined) {
    keysForFile = new Map();
    merged.set(file, keysForFile);
  }
  for (const entry of parsed) {
    if (keysForFile.has(entry.key)) continue;
    keysForFile.set(entry.key, true);
  }
}
97
+
98
/**
 * Walk `dirs` in order and, within each directory, every filename in
 * `ENV_FILES` in order, loading each file one at a time.
 *
 * @param {string[]} dirs - Directories to scan.
 * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
 *   `names` holds every var name encountered; `merged` maps each env filename
 *   to the keys declared under that filename across all directories.
 */
async function collectEnvEntries(dirs) {
  const collected = { names: new Set(), merged: new Map() };
  const targets = dirs.flatMap((dir) => ENV_FILES.map((file) => [dir, file]));
  // Awaited sequentially: load order decides which value reaches process.env.
  for (const [dir, file] of targets) {
    await loadOneEnvFile(dir, file, collected.names, collected.merged);
  }
  return collected;
}
114
+
115
/**
 * Materialise one env file per entry of `merged` inside `agentCwd`.
 *
 * Each file lists its recorded keys as `KEY=value` lines, where the value is
 * whatever `process.env` holds at render time (empty string if unset). After
 * writing a file, a warning naming any keys whose value is empty or unset is
 * printed to stderr.
 *
 * @param {Map<string, Map<string, true>>} merged - Per-filename key manifest.
 * @param {string} agentCwd - Directory the files are written into.
 * @returns {Promise<void>}
 */
async function renderEnvFiles(merged, agentCwd) {
  for (const [file, keyMap] of merged) {
    const keys = Array.from(keyMap.keys());
    const body = keys.map((k) => `${k}=${process.env[k] ?? ""}`).join("\n");
    await writeFile(join(agentCwd, file), `${body}\n`);

    const unset = keys.filter((k) => !process.env[k]);
    if (unset.length === 0) continue;
    process.stderr.write(
      `libeval: env warning: ${file} declares vars with no value: ${unset.join(", ")}\n`,
    );
  }
}
133
+
134
/**
 * Entry point of the env loader: scan `dirs` for `.env` / `.env.local`,
 * load what is found into `process.env`, then write the resolved files into
 * the agent working directory.
 *
 * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
 * @param {string} agentCwd - Agent working directory to render into.
 * @returns {Promise<string[]>} All var names discovered (for redaction).
 */
export async function loadEnv(dirs, agentCwd) {
  const collected = await collectEnvEntries(dirs);
  await renderEnvFiles(collected.merged, agentCwd);
  return Array.from(collected.names);
}
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Template variables available in `judge.task.md`:
8
8
  *
9
- * {{AGENT_INSTRUCTIONS}} — contents of instructions.md
9
+ * {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
10
10
  * {{AGENT_PROFILE}} — agent profile body (empty string if none)
11
11
  * {{AGENT_TRACE_PATH}} — path to agent.ndjson
12
12
  * {{SCORING_RESULT}} — JSON scoring object
@@ -37,7 +37,7 @@ import { createRedactor } from "../redaction.js";
37
37
 
38
38
  /**
39
39
  * @typedef {object} JudgeContext
40
- * @property {string} agentInstructions - Contents of instructions.md.
40
+ * @property {string} agentInstructions - Contents of agent.task.md.
41
41
  * @property {string} agentProfile - Agent profile body (empty string if none).
42
42
  * @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
43
43
  */
@@ -47,7 +47,7 @@ import { createRedactor } from "../redaction.js";
47
47
  * @param {import("./task-family.js").Task} task
48
48
  * @param {import("./workdir.js").Workdir} workdir
49
49
  * @param {import("./scorer.js").ScoringResult} scoring
50
- * @param {{query: Function, model: string, judgeProfile?: string}} deps
50
+ * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
51
51
  * @param {JudgeContext} [context]
52
52
  * @returns {Promise<JudgeVerdict>}
53
53
  */
@@ -71,6 +71,7 @@ export async function runJudge(task, workdir, scoring, deps, context) {
71
71
  output,
72
72
  model: deps.model,
73
73
  judgeProfile: deps.judgeProfile,
74
+ profilesDir: deps.profilesDir,
74
75
  maxTurns: 25,
75
76
  redactor: createRedactor(),
76
77
  });
@@ -178,24 +178,46 @@ function renderFullReport(report, kValues) {
178
178
  function renderSummary(report) {
179
179
  const { totals } = report;
180
180
  const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
181
+ const icon = statusIcon(passing === totals.tasks);
181
182
  const lines = [
182
183
  "# Benchmark Report",
183
184
  "",
184
- `**Result: ${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
185
+ `${icon} **${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
185
186
  ];
187
+
188
+ const headers = [];
189
+ const values = [];
190
+ if (totals.costUsd != null) {
191
+ headers.push("Cost");
192
+ values.push(formatCost(totals.costUsd));
193
+ }
194
+ if (totals.medianDurationMs != null) {
195
+ headers.push("Median Duration");
196
+ values.push(formatDuration(totals.medianDurationMs));
197
+ }
198
+ if (totals.medianTurns != null) {
199
+ headers.push("Median Turns");
200
+ values.push(String(totals.medianTurns));
201
+ }
202
+ if (headers.length) {
203
+ lines.push("");
204
+ lines.push(`| ${headers.join(" | ")} |`);
205
+ lines.push(`| ${headers.map(() => "---").join(" | ")} |`);
206
+ lines.push(`| ${values.join(" | ")} |`);
207
+ }
208
+
186
209
  const meta = [];
187
- if (totals.model) meta.push(`Model: \`${totals.model}\``);
210
+ if (totals.model) {
211
+ meta.push(`Agent: \`${totals.model.agent}\``);
212
+ meta.push(`Supervisor: \`${totals.model.supervisor}\``);
213
+ meta.push(`Judge: \`${totals.model.judge}\``);
214
+ }
188
215
  if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
189
216
  if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
190
- if (meta.length) lines.push(meta.join(" | "));
191
-
192
- const stats = [];
193
- if (totals.costUsd != null) stats.push(`Cost: ${formatCost(totals.costUsd)}`);
194
- if (totals.medianDurationMs != null)
195
- stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
196
- if (totals.medianTurns != null)
197
- stats.push(`Median turns: ${totals.medianTurns}`);
198
- if (stats.length) lines.push(stats.join(" | "));
217
+ if (meta.length) {
218
+ lines.push("");
219
+ lines.push(meta.join(" | "));
220
+ }
199
221
 
200
222
  lines.push("");
201
223
  return lines.join("\n");
@@ -229,13 +251,13 @@ function renderTotalsLine(report) {
229
251
 
230
252
  function renderTaskDetail(task) {
231
253
  const runs = task.runs ?? [];
232
- const status = task.c === task.n ? "PASS" : "FAIL";
254
+ const icon = statusIcon(task.c === task.n);
233
255
  const singleRun = runs.length === 1;
234
256
 
235
257
  const lines = [
236
258
  `### ${task.taskId}`,
237
259
  "",
238
- `**${status} — ${task.c}/${task.n} runs passed**`,
260
+ `${icon} **${task.c}/${task.n} runs passed**`,
239
261
  ];
240
262
 
241
263
  lines.push("", renderRunsTable(runs));
@@ -267,16 +289,16 @@ function renderRunsTable(runs) {
267
289
  const scoringCell = r.preflightError
268
290
  ? "preflight error"
269
291
  : r.scoring
270
- ? r.scoring.verdict
292
+ ? statusIcon(r.scoring.verdict === "pass")
271
293
  : "—";
272
294
  const judgeCell = r.preflightError
273
295
  ? "—"
274
296
  : r.judgeVerdict
275
- ? r.judgeVerdict.verdict
297
+ ? statusIcon(r.judgeVerdict.verdict === "pass")
276
298
  : "—";
277
299
  rows.push([
278
300
  String(r.runIndex),
279
- r.verdict.toUpperCase(),
301
+ statusIcon(r.verdict === "pass"),
280
302
  scoringCell,
281
303
  judgeCell,
282
304
  formatCost(r.costUsd),
@@ -317,7 +339,7 @@ function collectScoringRows(runs) {
317
339
  rows.push({
318
340
  run: r.runIndex,
319
341
  check: escapeCell(String(d.test ?? "(unnamed)")),
320
- result: d.pass ? "PASS" : "FAIL",
342
+ result: statusIcon(d.pass),
321
343
  message: escapeCell(String(d.message ?? "")),
322
344
  });
323
345
  }
@@ -365,6 +387,10 @@ function renderErrors(runs) {
365
387
  // Formatting helpers
366
388
  // ---------------------------------------------------------------------------
367
389
 
390
/**
 * Map a pass/fail flag to the emoji used in rendered reports.
 *
 * @param {*} pass - Any value; evaluated for truthiness.
 * @returns {string} "✅" when `pass` is truthy, otherwise "❌".
 */
function statusIcon(pass) {
  if (pass) {
    return "✅";
  }
  return "❌";
}
393
+
368
394
  function formatPassAt(v) {
369
395
  if (v == null) return "—";
370
396
  if (typeof v === "object" && "error" in v) return v.error;
@@ -29,7 +29,7 @@ const JUDGE_VERDICT_SHAPE = z.object({
29
29
 
30
30
  const PROFILES_SHAPE = z.object({
31
31
  agent: z.union([z.string(), z.null()]),
32
- supervisor: z.null(),
32
+ supervisor: z.union([z.string(), z.null()]),
33
33
  judge: z.union([z.string(), z.null()]),
34
34
  });
35
35
 
@@ -46,7 +46,11 @@ const COMMON_FIELDS = {
46
46
  costUsd: z.number(),
47
47
  turns: z.number().int().min(0),
48
48
  profiles: PROFILES_SHAPE,
49
- model: z.string(),
49
+ model: z.object({
50
+ agent: z.string(),
51
+ supervisor: z.string().optional(),
52
+ judge: z.string().optional(),
53
+ }),
50
54
  skillSetHash: z.string(),
51
55
  familyRevision: z.string(),
52
56
  durationMs: z.number().int().min(0),
@@ -61,8 +65,9 @@ const HAPPY_RECORD = z.object({
61
65
  ...COMMON_FIELDS,
62
66
  scoring: SCORING_SHAPE,
63
67
  submission: z.string(),
64
- judgeVerdict: JUDGE_VERDICT_SHAPE,
68
+ judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
65
69
  agentTracePath: z.string(),
70
+ supervisorTracePath: z.string(),
66
71
  judgeTracePath: z.string(),
67
72
  agentError: AGENT_ERROR_SHAPE.optional(),
68
73
  preflightError: z.undefined().optional(),
@@ -76,6 +81,7 @@ const PREFLIGHT_RECORD = z.object({
76
81
  // them in WorkdirManager.start) so the record is uniform across branches
77
82
  // and downstream consumers can reference them without conditional fields.
78
83
  agentTracePath: z.string(),
84
+ supervisorTracePath: z.string(),
79
85
  judgeTracePath: z.string(),
80
86
  scoring: z.undefined().optional(),
81
87
  submission: z.undefined().optional(),