@forwardimpact/libeval 0.1.31 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,3 +12,23 @@ reproducible evidence.
12
12
  ```js
13
13
  import { createTraceCollector, createTraceQuery, createAgentRunner } from '@forwardimpact/libeval';
14
14
  ```
15
+
16
+ ## Trace redaction
17
+
18
+ `fit-eval run`, `fit-eval supervise`, and `fit-eval facilitate` redact
19
+ secrets in trace artifacts before they reach disk. Two layers compose:
20
+
21
+ - **Env-var allowlist**, defaulting to `ANTHROPIC_API_KEY`, `GH_TOKEN`,
22
+ `GITHUB_TOKEN`. The runtime values of these vars are replaced with
23
+ `[REDACTED:env:NAME]` wherever they appear in tool inputs, tool
24
+ outputs, assistant text, or orchestrator summaries. Override the list
25
+ with `LIBEVAL_REDACTION_ENV_VARS=NAME1,NAME2,…` (replaces, not extends).
26
+ - **Credential-shape patterns**, covering Anthropic API keys (`sk-ant-`),
27
+ GitHub PATs (`ghp_`), installation tokens (`ghs_`), OAuth tokens
28
+ (`gho_`), and fine-grained PATs (`github_pat_`). Pattern hits become
29
+ `[REDACTED:pattern:KIND]`.
30
+
31
+ Redaction is on by default. To disable, set `LIBEVAL_REDACTION_DISABLED=1`
32
+ — a stderr warning fires once per run. Never set this in CI on a public
33
+ repository: workflow artifacts there remain downloadable for the entire
34
+ retention window.
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env node
2
+
3
import { readFileSync, realpathSync } from "node:fs";
import { pathToFileURL } from "node:url";
import { createCli } from "@forwardimpact/libcli";
import { createLogger } from "@forwardimpact/libtelemetry";

import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
10
+
11
// Version resolution. `bun build --compile` injects FIT_BENCHMARK_VERSION via
// --define, which makes the left operand a constant and eliminates the
// readFileSync branch in the compiled binary (it would ENOENT against the
// bunfs virtual mount). When run from source the env var is normally unset,
// so the version is read from package.json instead.
const VERSION =
  process.env.FIT_BENCHMARK_VERSION ||
  JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
    .version;

/** Build a string-valued option descriptor. A fresh object is returned per call. */
const stringOption = (description) => ({ type: "string", description });

/** Build a boolean flag descriptor. A fresh object is returned per call. */
const booleanOption = (description) => ({ type: "boolean", description });

/**
 * Declarative description of the `fit-benchmark` CLI: its sub-commands, their
 * options, the global flags, usage examples, and documentation links. Passed
 * to `createCli` below and exported so other modules can import it without
 * running the entry point.
 */
export const definition = {
  name: "fit-benchmark",
  version: VERSION,
  description:
    "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
  commands: [
    {
      name: "run",
      args: "",
      description:
        "Run every task in a family for N runs and emit one result record per (task, runIndex).",
      options: {
        family: stringOption("Path or git URL to a task family"),
        output: stringOption("Run-output directory (created if missing)"),
        runs: stringOption("Runs per task (integer ≥ 1, default 1)"),
        model: stringOption("Claude model id (default: claude-opus-4-7[1m])"),
        "agent-profile": stringOption("Agent-under-test profile name"),
        "judge-profile": stringOption("Judge profile name"),
        "max-turns": stringOption(
          "Agent-under-test turn budget (default: 50, 0 = unlimited)",
        ),
      },
    },
    {
      name: "score",
      args: "",
      description:
        "Score a single task against a post-run workdir without invoking an agent.",
      options: {
        family: stringOption("Path or git URL to a task family"),
        task: stringOption("METR-style task id (task_family_name/task_name)"),
        workdir: stringOption(
          "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
        ),
        output: stringOption("Output file (defaults to stdout; one JSONL line)"),
      },
    },
    {
      name: "report",
      args: "",
      description:
        "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
      options: {
        input: stringOption("Run-output directory containing results.jsonl"),
        k: stringOption("Comma-separated k values (default: 1,3,5)"),
        format: stringOption("Output format (json|text, default: json)"),
      },
    },
  ],
  globalOptions: {
    help: { type: "boolean", short: "h", description: "Show this help" },
    version: booleanOption("Show version"),
    json: booleanOption("Output help as JSON"),
  },
  examples: [
    "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
    "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
    "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
  ],
  documentation: [
    {
      title: "Run a Benchmark",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
      description:
        "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
    },
  ],
};

// Parser/help renderer built from the definition, plus the logger used when
// the entry point fails.
const cli = createCli(definition);
const logger = createLogger("benchmark");

// Sub-command name → handler. Each handler is invoked as
// `handler(values, args)` by `main`.
const COMMANDS = {
  run: runBenchmarkRunCommand,
  score: runBenchmarkScoreCommand,
  report: runBenchmarkReportCommand,
};
136
+
137
/**
 * CLI entry point: parse argv, resolve the sub-command, and dispatch to it.
 *
 * Exit codes set here:
 * - 0 when `cli.parse` returns a falsy value (nothing left to run);
 * - 2, after a usage error, when no command is given or the command is not
 *   one of the registered handlers.
 *
 * @returns {Promise<void>} Settles when the selected handler settles.
 */
async function main() {
  const parsed = cli.parse(process.argv.slice(2));
  if (!parsed) process.exit(0);

  const { values, positionals } = parsed;

  if (positionals.length === 0) {
    cli.usageError("no command specified");
    process.exit(2);
  }

  const [command, ...args] = positionals;

  // Own-property lookup only. A plain `COMMANDS[command]` also resolves names
  // inherited from Object.prototype ("constructor", "toString", ...), which
  // would let those words pass the unknown-command check and be invoked as if
  // they were handlers.
  const handler = Object.hasOwn(COMMANDS, command)
    ? COMMANDS[command]
    : undefined;

  if (typeof handler !== "function") {
    cli.usageError(`unknown command "${command}"`);
    process.exit(2);
  }

  await handler(values, args);
}
158
+
159
/**
 * Report whether this module is the script the runtime was asked to execute.
 *
 * The plain `file://${argv[1]}` comparison is tried first so every launch
 * that worked before keeps working. It misses two common cases, which the
 * fallback covers:
 * - the script was started through a symlink (npm links `bin` entries into
 *   `node_modules/.bin`), so `process.argv[1]` names the link while
 *   `import.meta.url` names the resolved file;
 * - the path contains characters that are percent-encoded in a file URL
 *   (spaces, `#`, ...).
 *
 * @param {string} moduleUrl - `import.meta.url` of the module being checked.
 * @param {string | undefined} scriptPath - Normally `process.argv[1]`.
 * @returns {boolean} True when `scriptPath` refers to this module.
 */
function isCliEntry(moduleUrl, scriptPath) {
  if (!scriptPath) return false;
  if (moduleUrl === `file://${scriptPath}`) return true;
  try {
    return moduleUrl === pathToFileURL(realpathSync(scriptPath)).href;
  } catch {
    // The path does not exist on the real filesystem (e.g. a virtual mount),
    // so there is nothing more to compare against.
    return false;
  }
}

// Run main only when invoked as a CLI. Importing for tests (e.g. parity)
// should not execute the entry point.
if (isCliEntry(import.meta.url, process.argv[1])) {
  main().catch((error) => {
    logger.exception("main", error);
    cli.error(error.message);
    process.exit(1);
  });
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.31",
3
+ "version": "0.1.33",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -32,11 +32,13 @@
32
32
  "exports": {
33
33
  ".": "./src/index.js",
34
34
  "./bin/fit-eval.js": "./bin/fit-eval.js",
35
- "./bin/fit-trace.js": "./bin/fit-trace.js"
35
+ "./bin/fit-trace.js": "./bin/fit-trace.js",
36
+ "./bin/fit-benchmark.js": "./bin/fit-benchmark.js"
36
37
  },
37
38
  "bin": {
38
39
  "fit-eval": "./bin/fit-eval.js",
39
- "fit-trace": "./bin/fit-trace.js"
40
+ "fit-trace": "./bin/fit-trace.js",
41
+ "fit-benchmark": "./bin/fit-benchmark.js"
40
42
  },
41
43
  "files": [
42
44
  "src/**/*.js",
@@ -54,7 +54,9 @@ export class AgentRunner {
54
54
  if (!deps.cwd) throw new Error("cwd is required");
55
55
  if (!deps.query) throw new Error("query is required");
56
56
  if (!deps.output) throw new Error("output is required");
57
+ if (!deps.redactor) throw new Error("redactor is required");
57
58
  Object.assign(this, applyDefaults(deps));
59
+ this.redactor = deps.redactor;
58
60
  this.sessionId = null;
59
61
  this.buffer = [];
60
62
  /** @type {AbortController|null} */
@@ -203,12 +205,16 @@ export class AgentRunner {
203
205
  * @param {{pendingBatch: string[], assistantTextCount: number}} state
204
206
  */
205
207
  #recordLine(message, state) {
206
- const line = JSON.stringify(message);
208
+ const redacted = this.redactor.redactValue(message);
209
+ const line = JSON.stringify(redacted);
207
210
  this.output.write(line + "\n");
208
211
  this.buffer.push(line);
209
212
  if (this.onLine) this.onLine(line);
210
213
  if (this.onBatch) state.pendingBatch.push(line);
211
214
 
215
+ // Session-id / text-block tracking reads the ORIGINAL message —
216
+ // these fields are not secret carriers, and the trackers rely on
217
+ // shape, not string contents.
212
218
  if (message.type === "system" && message.subtype === "init") {
213
219
  this.sessionId = message.session_id;
214
220
  }
@@ -0,0 +1,39 @@
1
+ /**
2
+ * ApmInstaller — materialises the family's pre-staged `.claude/` tree into a
3
+ * single staging directory, computes the manifest fingerprint, and is invoked
4
+ * once per family install. Per-task copy happens later in WorkdirManager.
5
+ *
6
+ * v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
7
+ * verbatim, not interpreted.
8
+ */
9
+
10
+ import { createHash } from "node:crypto";
11
+ import { access, cp, rm } from "node:fs/promises";
12
+ import { join } from "node:path";
13
+
14
/**
 * Copy the family's checked-in `.claude/` tree into a fresh staging directory
 * under the run's output directory and fingerprint the family's lockfile.
 *
 * Any existing `<outputDir>/.apm-staging` is removed first, so the staged tree
 * never mixes files from an earlier install.
 *
 * @param {import("./task-family.js").TaskFamily} family - Reads `rootPath`
 *   (directory containing `.claude/`) and `apmLockBytes` (hashed verbatim).
 * @param {string} outputDir - The benchmark run's output directory.
 * @returns {Promise<{stagingDir: string, skillSetHash: string}>} The staging
 *   directory and `"sha256:" + hex digest` of `family.apmLockBytes`.
 * @throws {Error} When `<family.rootPath>/.claude` is not accessible; the
 *   underlying filesystem error is attached as `cause`.
 */
export async function installApm(family, outputDir) {
  const stagingDir = join(outputDir, ".apm-staging");
  const stagedClaude = join(stagingDir, ".claude");
  const sourceClaude = join(family.rootPath, ".claude");

  try {
    await access(sourceClaude);
  } catch (cause) {
    // Keep the original error (ENOENT, EACCES, ...) reachable so a permission
    // problem is not indistinguishable from a genuinely missing directory.
    throw new Error(
      `task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
      { cause },
    );
  }

  await rm(stagingDir, { recursive: true, force: true });
  await cp(sourceClaude, stagedClaude, { recursive: true });

  const skillSetHash =
    "sha256:" + createHash("sha256").update(family.apmLockBytes).digest("hex");

  return { stagingDir, skillSetHash };
}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Benchmark adapter for the libeval `Judge`. Templates the family's
3
+ * `judge.task.md` ({{SCORING}} / {{AGENT_TRACE_PATH}} substitution), runs the
4
+ * judge against the post-run agent CWD, and returns the verdict in the
5
+ * benchmark's `pass`/`fail` vocabulary (mapped from libeval's
6
+ * `success`/`failure`).
7
+ *
8
+ * The judge verdict is captured from the orchestration context's
9
+ * `concluded` flag directly — no trace parsing on the happy path.
10
+ * `parseConcludeFromTrace` is preserved for offline analysis and as a
11
+ * fallback when the runtime ctx isn't available (e.g. re-grading a
12
+ * historical run from its judge.ndjson file).
13
+ */
14
+
15
+ import { createReadStream, createWriteStream } from "node:fs";
16
+ import { readFile } from "node:fs/promises";
17
+ import { createInterface } from "node:readline";
18
+ import { createJudge } from "../judge.js";
19
+ import { createRedactor } from "../redaction.js";
20
+
21
+ /**
22
+ * @typedef {object} JudgeVerdict
23
+ * @property {"pass" | "fail"} verdict
24
+ * @property {string} summary
25
+ */
26
+
27
/**
 * Run the judge over a completed task run.
 *
 * Reads the task's judge template, fills in the `{{SCORING}}` (pretty-printed
 * JSON of `scoring`) and `{{AGENT_TRACE_PATH}}` placeholders, runs a judge
 * created by `createJudge` in the workdir's `cwd` with its trace written to
 * `workdir.judgeTracePath`, and maps the outcome to `pass`/`fail`.
 *
 * @param {import("./task-family.js").Task} task - Reads `paths.judge`.
 * @param {import("./workdir.js").Workdir} workdir - Reads `cwd`,
 *   `agentTracePath`, and `judgeTracePath`.
 * @param {import("./scorer.js").ScoringResult} scoring
 * @param {{query: Function, model: string, judgeProfile?: string}} deps
 * @returns {Promise<JudgeVerdict>} `fail` with summary "judge did not
 *   conclude" when the outcome's verdict is `null`; otherwise `pass` only for
 *   a `success` verdict.
 */
export async function runJudge(task, workdir, scoring, deps) {
  const template = await readFile(task.paths.judge, "utf8");

  // Substitute in one pass with a function replacer:
  // - a string replacement would have `$&`, `$$`, "$`" and `$'` inside the
  //   scoring JSON or trace path expanded as replacement patterns, corrupting
  //   the prompt;
  // - a single pass never re-scans injected text, so a placeholder that
  //   happens to appear inside the scoring output is left untouched.
  const substitutions = {
    SCORING: JSON.stringify(scoring, null, 2),
    AGENT_TRACE_PATH: workdir.agentTracePath,
  };
  const taskText = template.replace(
    /\{\{(SCORING|AGENT_TRACE_PATH)\}\}/g,
    (_placeholder, key) => substitutions[key],
  );

  const output = createWriteStream(workdir.judgeTracePath);
  const judge = createJudge({
    cwd: workdir.cwd,
    query: deps.query,
    output,
    model: deps.model,
    judgeProfile: deps.judgeProfile,
    maxTurns: 5,
    redactor: createRedactor(),
  });

  let outcome;
  try {
    outcome = await judge.run(taskText);
  } finally {
    // Always end the trace stream, even when the judge throws, and wait for
    // the end callback before returning or propagating.
    await new Promise((r) => output.end(r));
  }

  if (outcome.verdict === null) {
    return { verdict: "fail", summary: "judge did not conclude" };
  }
  return {
    verdict: outcome.verdict === "success" ? "pass" : "fail",
    summary: outcome.summary ?? "",
  };
}
67
+
68
/**
 * Scan an NDJSON trace and return the verdict carried by its final
 * `Conclude` tool call, as recognised by `extractConcludeInput`. The verdict
 * is translated to the benchmark vocabulary: `success` becomes `pass`, any
 * other value becomes `fail`. Kept for offline analysis; the runtime happy
 * path does not use it.
 *
 * @param {string} tracePath - Path of the NDJSON trace file to read.
 * @returns {Promise<JudgeVerdict | null>} Null when no line yields a
 *   `Conclude` input.
 */
export async function parseConcludeFromTrace(tracePath) {
  const lines = createInterface({
    input: createReadStream(tracePath),
    crlfDelay: Infinity,
  });

  // Later hits overwrite earlier ones so the final Conclude call wins.
  let latest = null;
  for await (const rawLine of lines) {
    latest = extractConcludeInput(rawLine) || latest;
  }

  if (!latest) return null;
  const verdict = latest.verdict === "success" ? "pass" : "fail";
  return { verdict, summary: latest.summary ?? "" };
}
90
+
91
/**
 * Return the `Conclude` tool input carried by one trace line, or null.
 *
 * A line qualifies when it is a JSON object that is either a bare event or an
 * envelope `{source, event}` whose source is `judge` or `supervisor`, the
 * event is an `assistant` message, and its content array holds a `tool_use`
 * block whose name is accepted by `isConcludeToolName` and whose `input` is
 * truthy. When several blocks qualify, the last one wins.
 *
 * Lines that are blank, not valid JSON, or valid JSON that is not an object
 * (for example a literal `null`) yield null rather than throwing, so a single
 * odd line cannot abort a whole trace parse.
 *
 * @param {string} line
 * @returns {{verdict: string, summary?: string} | null}
 */
function extractConcludeInput(line) {
  const trimmed = line.trim();
  if (!trimmed) return null;

  let event;
  try {
    event = JSON.parse(trimmed);
  } catch {
    return null;
  }
  // `JSON.parse("null")` succeeds; guard before touching properties.
  if (event === null || typeof event !== "object") return null;

  // Envelope form carries the originating source next to the wrapped event;
  // bare events have no source and are accepted as-is.
  const isEnvelope = event.event && typeof event.source === "string";
  const source = isEnvelope ? event.source : null;
  const inner = isEnvelope ? event.event : event;

  if (source !== null && source !== "judge" && source !== "supervisor") {
    return null;
  }
  if (inner.type !== "assistant") return null;

  const content = inner.message?.content ?? inner.content;
  if (!Array.isArray(content)) return null;

  let found = null;
  for (const block of content) {
    // `block?.` tolerates null/undefined entries in the content array.
    if (
      block?.type === "tool_use" &&
      isConcludeToolName(block.name) &&
      block.input
    ) {
      found = block.input;
    }
  }
  return found;
}
133
+
134
/**
 * Decide whether a tool name refers to the orchestration `Conclude` tool.
 *
 * The Claude Agent SDK reports MCP tools as `mcp__<server>__<tool>`, so the
 * call normally arrives as `mcp__orchestration__Conclude`, while pre-baked
 * supervisor traces and libeval-internal envelopes sometimes carry the bare
 * `Conclude`. Both spellings are accepted so the parser works regardless of
 * where the trace came from.
 *
 * @param {unknown} name
 * @returns {boolean} True for the exact name `Conclude` or any string ending
 *   in `__Conclude`; false for everything else, including non-strings.
 */
function isConcludeToolName(name) {
  return (
    typeof name === "string" &&
    (name === "Conclude" || name.endsWith("__Conclude"))
  );
}
@@ -0,0 +1,161 @@
1
+ /**
2
+ * ReportAggregator — read a run-output directory's `results.jsonl`, group
3
+ * records by `taskId`, and compute pass@k via the OpenAI HumanEval
4
+ * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
5
+ *
6
+ * Records that fail schema validation are skipped with a stderr warning
7
+ * (counted under `totals.skipped`) so a corrupt line cannot abort the
8
+ * whole report.
9
+ */
10
+
11
+ import { createReadStream } from "node:fs";
12
+ import { join } from "node:path";
13
+ import { createInterface } from "node:readline";
14
+
15
+ import { validateResultRecord } from "./result.js";
16
+
17
+ /**
18
+ * @typedef {object} TaskReport
19
+ * @property {string} taskId
20
+ * @property {number} n - Total runs.
21
+ * @property {number} c - Passing runs.
22
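+ * @property {Record<string|number, number|{error: string}>} passAtK - pass@k per requested k, or an `{error}` marker when it cannot be computed (e.g. k > n).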
23
+ */
24
+
25
/**
 * Build the pass@k report for one run-output directory.
 *
 * Loads the directory's result records, buckets them per task, and for each
 * task counts the runs (`n`), the passing runs (`c`), and pass@k for every
 * requested k. Tasks are returned ordered by `taskId` using plain `<`/`>`
 * string comparison.
 *
 * @param {{inputDir: string, kValues: number[]}} opts
 * @returns {Promise<{tasks: TaskReport[], totals: {tasks: number, runs: number, skipped: number}}>}
 */
export async function aggregate({ inputDir, kValues }) {
  const { records, skipped } = await loadRecords(inputDir);

  const tasks = [];
  let runs = 0;
  groupByTask(records).forEach((group, taskId) => {
    const n = group.length;
    const c = group.reduce(
      (passes, record) => (record.verdict === "pass" ? passes + 1 : passes),
      0,
    );
    runs += n;
    const passAtK = Object.fromEntries(
      kValues.map((k) => [k, passAtKValue(n, c, k)]),
    );
    tasks.push({ taskId, n, c, passAtK });
  });

  const byTaskId = (left, right) => {
    if (left.taskId < right.taskId) return -1;
    if (left.taskId > right.taskId) return 1;
    return 0;
  };
  tasks.sort(byTaskId);

  return {
    tasks,
    totals: { tasks: tasks.length, runs, skipped },
  };
}
50
+
51
/**
 * Format an aggregate report as a Markdown table followed by a blank line and
 * a totals line. The table has the columns `taskId`, `n`, `c`, then one
 * `pass@k` column per entry of `kValues`, in the order given.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @param {number[]} kValues
 * @returns {string} Lines joined with "\n", without a trailing newline.
 */
export function renderTextReport(report, kValues) {
  const toRow = (cells) => `| ${cells.join(" | ")} |`;

  const header = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
  const separator = header.map(() => "---");
  const body = report.tasks.map((task) => [
    task.taskId,
    String(task.n),
    String(task.c),
    ...kValues.map((k) => formatPassAt(task.passAtK[k])),
  ]);

  const { totals } = report;
  return [
    ...[header, separator, ...body].map(toRow),
    "",
    `Totals — tasks: ${totals.tasks}, runs: ${totals.runs}, skipped: ${totals.skipped}`,
  ].join("\n");
}
76
+
77
/**
 * Render one pass@k cell for the text report.
 *
 * @param {number | {error: string} | null | undefined} v
 * @returns {string} An em dash for a missing value, the `error` text for an
 *   error marker, otherwise the value fixed to four decimals.
 */
function formatPassAt(v) {
  if (v === null || v === undefined) return "—";
  const isErrorMarker = typeof v === "object" && "error" in v;
  return isErrorMarker ? v.error : Number(v).toFixed(4);
}
82
+
83
/**
 * Read `<inputDir>/results.jsonl` line by line and collect the records that
 * parse as JSON and pass `validateResultRecord`.
 *
 * Blank lines are ignored. A line that is not valid JSON, or whose record
 * fails validation, is reported on stderr and counted as skipped rather than
 * aborting the load.
 *
 * @param {string} inputDir - Run-output directory.
 * @returns {Promise<{records: object[], skipped: number}>}
 */
async function loadRecords(inputDir) {
  const lines = createInterface({
    input: createReadStream(join(inputDir, "results.jsonl")),
    crlfDelay: Infinity,
  });

  // Returns `{ record }` for an accepted line, or null after warning about a
  // rejected one. The wrapper keeps any parsed value distinguishable from
  // "rejected".
  const readRecord = (text) => {
    let record;
    try {
      record = JSON.parse(text);
    } catch (e) {
      process.stderr.write(
        `benchmark report: skipped malformed JSON line — ${e.message}\n`,
      );
      return null;
    }
    try {
      validateResultRecord(record);
    } catch (e) {
      process.stderr.write(
        `benchmark report: skipped record failing schema — ${describeError(e)}\n`,
      );
      return null;
    }
    return { record };
  };

  const records = [];
  let skipped = 0;
  for await (const rawLine of lines) {
    const text = rawLine.trim();
    if (text === "") continue;
    const accepted = readRecord(text);
    if (accepted === null) {
      skipped += 1;
    } else {
      records.push(accepted.record);
    }
  }
  return { records, skipped };
}
115
+
116
/**
 * Turn a thrown value into a one-line description for the skip warning.
 *
 * Errors exposing an `issues` array are rendered as `path: message` pairs
 * joined with "; ". Anything else falls back to its `message`, or to
 * `String(e)` when there is none, including when the thrown value is `null`
 * or `undefined`.
 *
 * @param {unknown} e - The caught value.
 * @returns {string}
 */
function describeError(e) {
  if (Array.isArray(e?.issues)) {
    return e.issues.map((i) => `${i.path.join(".")}: ${i.message}`).join("; ");
  }
  // Optional chaining: `throw null` / `throw undefined` must not turn the
  // warning itself into a TypeError.
  return e?.message ?? String(e);
}
122
+
123
/**
 * Bucket records by `taskId`.
 *
 * @param {Iterable<{taskId: string}>} records
 * @returns {Map<string, object[]>} One entry per distinct `taskId`, in order
 *   of first appearance; each bucket keeps its records in input order.
 */
function groupByTask(records) {
  const byTask = new Map();
  for (const record of records) {
    const bucket = byTask.get(record.taskId);
    if (bucket === undefined) {
      byTask.set(record.taskId, [record]);
    } else {
      bucket.push(record);
    }
  }
  return byTask;
}
131
+
132
/**
 * pass@k = 1 - C(n - c, k) / C(n, k), the OpenAI HumanEval unbiased
 * estimator. The binomials are computed exactly with BigInt.
 *
 * @param {number} n - Total runs.
 * @param {number} c - Passing runs.
 * @param {number} k
 * @returns {number | {error: string}} The estimate, or an error marker:
 *   `"k > n"` when k exceeds the number of runs, `"invalid k"` when k is not
 *   a non-negative integer.
 */
function passAtKValue(n, c, k) {
  if (k > n) return { error: "k > n" };
  // BigInt() throws on non-integers, and a negative k makes both binomials
  // zero (0 / 0 = NaN). Report those as an error marker instead.
  if (!Number.isInteger(k) || k < 0) return { error: "invalid k" };
  if (n - c < k) return 1;

  const total = binomial(BigInt(n), BigInt(k));
  const fail = binomial(BigInt(n - c), BigInt(k));
  // Compute the ratio as a single division so we avoid `1 - x` which
  // accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
  const passing = total - fail;

  const totalAsNumber = Number(total);
  if (Number.isFinite(totalAsNumber)) {
    return Number(passing) / totalAsNumber;
  }

  // C(n, k) no longer fits in a double (Number() yields Infinity, and the
  // plain division would give NaN). Divide in BigInt with 64 fractional
  // bits, then scale back down; dividing by a power of two is exact.
  return Number((passing << 64n) / total) / 2 ** 64;
}

/**
 * Exact binomial coefficient C(n, k) over BigInt.
 *
 * @param {bigint} n
 * @param {bigint} k
 * @returns {bigint} 0n when k is negative or greater than n.
 */
function binomial(n, k) {
  if (k < 0n || k > n) return 0n;
  if (k === 0n || k === n) return 1n;
  // Use the smaller of k and n - k to shorten the loop (C(n, k) = C(n, n - k)).
  let kk = k;
  if (kk > n - kk) kk = n - kk;
  let result = 1n;
  for (let i = 0n; i < kk; i++) {
    // Multiply before dividing: after this step the running value is
    // C(n, i + 1), so the integer division is exact.
    result = (result * (n - i)) / (i + 1n);
  }
  return result;
}
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Result-record schemas and runtime validators.
3
+ *
4
+ * Two schemas live here:
5
+ * - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
6
+ * benchmark run. Has a happy branch (scoring + judge present) and a
7
+ * pre-flight-failure branch (scoring/judgeVerdict/submission absent).
8
+ * - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
9
+ * ad-hoc grading without a full lifecycle.
10
+ *
11
+ * Validation is throw-on-mismatch so the runner can wrap every JSONL append
12
+ * in a guard and reject schema drift at write time.
13
+ */
14
+
15
+ import { z } from "zod";
16
+
17
// ---------------------------------------------------------------------------
// Small schema factories. Each call builds a fresh schema instance, so every
// field below gets its own schema object.
// ---------------------------------------------------------------------------

/** Any integer. */
const integer = () => z.number().int();

/** An integer that is zero or greater. */
const nonNegativeInteger = () => z.number().int().min(0);

/** A string, or an explicit null. */
const stringOrNull = () => z.union([z.string(), z.null()]);

/** A field that may be omitted, and if present must be undefined. */
const absent = () => z.undefined().optional();

// ---------------------------------------------------------------------------
// Nested shapes.
// ---------------------------------------------------------------------------

const VERDICT_ENUM = z.enum(["pass", "fail"]);

const SCORING_SHAPE = z.object({
  verdict: VERDICT_ENUM,
  details: z.array(z.unknown()),
  exitCode: integer(),
});

const JUDGE_VERDICT_SHAPE = z.object({
  verdict: VERDICT_ENUM,
  summary: z.string(),
});

// `supervisor` is constrained to null; `agent` and `judge` take a profile
// name or null.
const PROFILES_SHAPE = z.object({
  agent: stringOrNull(),
  supervisor: z.null(),
  judge: stringOrNull(),
});

const PREFLIGHT_ERROR_SHAPE = z.object({
  phase: z.string(),
  message: z.string(),
  exitCode: integer(),
});

const AGENT_ERROR_SHAPE = z.object({
  message: z.string(),
  aborted: z.boolean(),
});

// Fields present on every result record, whichever branch it belongs to.
const COMMON_FIELDS = {
  taskId: z.string().min(1),
  runIndex: nonNegativeInteger(),
  verdict: VERDICT_ENUM,
  costUsd: z.number(),
  turns: nonNegativeInteger(),
  profiles: PROFILES_SHAPE,
  model: z.string(),
  skillSetHash: z.string(),
  familyRevision: z.string(),
  durationMs: nonNegativeInteger(),
};

// ---------------------------------------------------------------------------
// Record branches.
// ---------------------------------------------------------------------------

// Happy branch: scoring, submission and judge verdict are all present;
// `agentError` is optional and `preflightError` must be absent.
const HAPPY_RECORD = z.object({
  ...COMMON_FIELDS,
  scoring: SCORING_SHAPE,
  submission: z.string(),
  judgeVerdict: JUDGE_VERDICT_SHAPE,
  agentTracePath: z.string(),
  judgeTracePath: z.string(),
  agentError: AGENT_ERROR_SHAPE.optional(),
  preflightError: absent(),
});

// Pre-flight-failure branch: `costUsd` is pinned to 0 (overriding the common
// field) and the scoring/submission/judge/agent-error fields must be absent.
const PREFLIGHT_RECORD = z.object({
  ...COMMON_FIELDS,
  costUsd: z.literal(0),
  preflightError: PREFLIGHT_ERROR_SHAPE,
  // Trace paths are populated even on preflight failure (the runner allocates
  // them in WorkdirManager.start) so the record is uniform across branches
  // and downstream consumers can reference them without conditional fields.
  agentTracePath: z.string(),
  judgeTracePath: z.string(),
  scoring: absent(),
  submission: absent(),
  judgeVerdict: absent(),
  agentError: absent(),
});

/** One record per (task, runIndex): either the happy or the pre-flight branch. */
export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);

/** Narrower record emitted by ad-hoc scoring, without a full run lifecycle. */
export const SCORING_RECORD_SCHEMA = z.object({
  taskId: z.string().min(1),
  scoring: SCORING_SHAPE,
  exitCode: integer(),
});
93
+
94
/**
 * Check a record against `RESULT_RECORD_SCHEMA`.
 *
 * Returns nothing on success; on a mismatch the error raised by the schema's
 * `parse` propagates to the caller.
 *
 * @param {object} record
 * @returns {void}
 */
export function validateResultRecord(record) {
  // `parse` is called only for its throw-on-mismatch effect; the parsed
  // value is deliberately discarded.
  void RESULT_RECORD_SCHEMA.parse(record);
}
101
+
102
/**
 * Check a record against `SCORING_RECORD_SCHEMA`.
 *
 * Returns nothing on success; on a mismatch the error raised by the schema's
 * `parse` propagates to the caller.
 *
 * @param {object} record
 * @returns {void}
 */
export function validateScoringRecord(record) {
  // `parse` is called only for its throw-on-mismatch effect; the parsed
  // value is deliberately discarded.
  void SCORING_RECORD_SCHEMA.parse(record);
}