@forwardimpact/libeval 0.1.31 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/bin/fit-benchmark.js +167 -0
- package/package.json +5 -3
- package/src/agent-runner.js +7 -1
- package/src/benchmark/apm-installer.js +39 -0
- package/src/benchmark/judge.js +146 -0
- package/src/benchmark/report.js +161 -0
- package/src/benchmark/result.js +108 -0
- package/src/benchmark/runner.js +396 -0
- package/src/benchmark/scorer.js +138 -0
- package/src/benchmark/task-family.js +259 -0
- package/src/benchmark/workdir.js +248 -0
- package/src/commands/benchmark-report.js +39 -0
- package/src/commands/benchmark-run.js +53 -0
- package/src/commands/benchmark-score.js +68 -0
- package/src/commands/facilitate.js +7 -0
- package/src/commands/run.js +9 -3
- package/src/commands/supervise.js +7 -0
- package/src/facilitator.js +35 -21
- package/src/index.js +9 -0
- package/src/judge.js +211 -0
- package/src/orchestration-toolkit.js +25 -0
- package/src/redaction.js +163 -0
- package/src/supervisor.js +29 -17
package/README.md
CHANGED
|
@@ -12,3 +12,23 @@ reproducible evidence.
|
|
|
12
12
|
```js
|
|
13
13
|
import { createTraceCollector, createTraceQuery, createAgentRunner } from '@forwardimpact/libeval';
|
|
14
14
|
```
|
|
15
|
+
|
|
16
|
+
## Trace redaction
|
|
17
|
+
|
|
18
|
+
`fit-eval run`, `fit-eval supervise`, and `fit-eval facilitate` redact
|
|
19
|
+
secrets in trace artifacts before they reach disk. Two layers compose:
|
|
20
|
+
|
|
21
|
+
- **Env-var allowlist**, defaulting to `ANTHROPIC_API_KEY`, `GH_TOKEN`,
|
|
22
|
+
`GITHUB_TOKEN`. The runtime values of these vars are replaced with
|
|
23
|
+
`[REDACTED:env:NAME]` wherever they appear in tool inputs, tool
|
|
24
|
+
outputs, assistant text, or orchestrator summaries. Override the list
|
|
25
|
+
with `LIBEVAL_REDACTION_ENV_VARS=NAME1,NAME2,…` (replaces, not extends).
|
|
26
|
+
- **Credential-shape patterns**, covering Anthropic API keys (`sk-ant-`),
|
|
27
|
+
GitHub PATs (`ghp_`), installation tokens (`ghs_`), OAuth tokens
|
|
28
|
+
(`gho_`), and fine-grained PATs (`github_pat_`). Pattern hits become
|
|
29
|
+
`[REDACTED:pattern:KIND]`.
|
|
30
|
+
|
|
31
|
+
Redaction is on by default. To disable, set `LIBEVAL_REDACTION_DISABLED=1`
|
|
32
|
+
— a stderr warning fires once per run. Never set this in CI on a public
|
|
33
|
+
repository: workflow artifacts there are downloadable through the
|
|
34
|
+
retention window.
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFileSync, realpathSync } from "node:fs";
import { fileURLToPath } from "node:url";
import { createCli } from "@forwardimpact/libcli";
import { createLogger } from "@forwardimpact/libtelemetry";

import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
|
|
10
|
+
|
|
11
|
+
// Version resolution. `bun build --compile` injects FIT_BENCHMARK_VERSION via
// --define, so the package.json fallback is never taken in the compiled binary
// (reading it there would ENOENT against the bunfs virtual mount). When run
// from source the variable is unset and the version comes from package.json.
const readPackageVersion = () => {
  const manifestUrl = new URL("../package.json", import.meta.url);
  return JSON.parse(readFileSync(manifestUrl, "utf8")).version;
};

// `||` keeps the fallback lazy: the file is only read when the env value is
// missing or empty.
const VERSION = process.env.FIT_BENCHMARK_VERSION || readPackageVersion();
|
|
18
|
+
|
|
19
|
+
/** Build a string-typed option descriptor carrying the given help text. */
const stringOption = (description) => ({ type: "string", description });

/**
 * Declarative description of the `fit-benchmark` CLI: its sub-commands and
 * their options, the global flags, usage examples, and documentation links.
 * It is handed to `createCli` and exported so other modules can import it
 * without running the entry point.
 */
export const definition = {
  name: "fit-benchmark",
  version: VERSION,
  description:
    "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
  commands: [
    {
      name: "run",
      args: "",
      description:
        "Run every task in a family for N runs and emit one result record per (task, runIndex).",
      options: {
        family: stringOption("Path or git URL to a task family"),
        output: stringOption("Run-output directory (created if missing)"),
        runs: stringOption("Runs per task (integer ≥ 1, default 1)"),
        model: stringOption("Claude model id (default: claude-opus-4-7[1m])"),
        "agent-profile": stringOption("Agent-under-test profile name"),
        "judge-profile": stringOption("Judge profile name"),
        "max-turns": stringOption(
          "Agent-under-test turn budget (default: 50, 0 = unlimited)",
        ),
      },
    },
    {
      name: "score",
      args: "",
      description:
        "Score a single task against a post-run workdir without invoking an agent.",
      options: {
        family: stringOption("Path or git URL to a task family"),
        task: stringOption("METR-style task id (task_family_name/task_name)"),
        workdir: stringOption(
          "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
        ),
        output: stringOption("Output file (defaults to stdout; one JSONL line)"),
      },
    },
    {
      name: "report",
      args: "",
      description:
        "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
      options: {
        input: stringOption("Run-output directory containing results.jsonl"),
        k: stringOption("Comma-separated k values (default: 1,3,5)"),
        format: stringOption("Output format (json|text, default: json)"),
      },
    },
  ],
  globalOptions: {
    help: { type: "boolean", short: "h", description: "Show this help" },
    version: { type: "boolean", description: "Show version" },
    json: { type: "boolean", description: "Output help as JSON" },
  },
  examples: [
    "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
    "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
    "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
  ],
  documentation: [
    {
      title: "Run a Benchmark",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
      description:
        "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
    },
  ],
};
|
|
127
|
+
|
|
128
|
+
// CLI front-end built from the declarative `definition` above; `main` uses it
// for argument parsing and for usage/error reporting.
const cli = createCli(definition);

// Logger created under the "benchmark" name; used to record a fatal error
// escaping `main`.
const logger = createLogger("benchmark");

// Dispatch table: sub-command name → handler. Each handler is invoked as
// `handler(values, args)` by `main`.
const COMMANDS = {
  run: runBenchmarkRunCommand,
  score: runBenchmarkScoreCommand,
  report: runBenchmarkReportCommand,
};
|
|
136
|
+
|
|
137
|
+
/**
 * CLI entry point: parse argv, resolve the requested sub-command, and await
 * its handler.
 *
 * Exit codes used here: 0 when `cli.parse` returns a falsy result (presumably
 * help/version was already handled — confirm against libcli), 2 for usage
 * errors (no command, unknown command). Errors thrown by a handler propagate
 * to the caller.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const parsed = cli.parse(process.argv.slice(2));
  if (!parsed) process.exit(0);

  const { values, positionals } = parsed;

  if (positionals.length === 0) {
    cli.usageError("no command specified");
    process.exit(2);
  }

  const [command, ...args] = positionals;

  // Own-property lookup only. A bare `COMMANDS[command]` also resolves names
  // inherited from Object.prototype ("toString", "constructor", …) to a truthy
  // function, which would then be run as if it were a handler instead of
  // being reported as an unknown command.
  const handler = Object.hasOwn(COMMANDS, command)
    ? COMMANDS[command]
    : undefined;

  if (!handler) {
    cli.usageError(`unknown command "${command}"`);
    process.exit(2);
  }

  await handler(values, args);
}
|
|
158
|
+
|
|
159
|
+
// Run main only when invoked as a CLI. Importing for tests (e.g. parity)
// should not execute the entry point.

/**
 * Decide whether this module is the script the process was started with.
 * @returns {boolean}
 */
function isCliEntryPoint() {
  const entry = process.argv[1];
  if (!entry) return false;

  // Fast path: the literal comparison this file has always used.
  if (import.meta.url === `file://${entry}`) return true;

  // The literal comparison misses two cases: argv[1] reached through a
  // symlink (e.g. an installed package bin link) and paths containing
  // characters that are percent-encoded in import.meta.url (spaces, etc.).
  // Comparing resolved filesystem paths covers both.
  try {
    return (
      realpathSync(entry) === realpathSync(fileURLToPath(import.meta.url))
    );
  } catch {
    // Deliberate: a non-file URL or an unresolvable path means we cannot
    // prove this is the entry script, so behave like an import.
    return false;
  }
}

if (isCliEntryPoint()) {
  main().catch((error) => {
    logger.exception("main", error);
    // A rejection value is not guaranteed to be an Error instance.
    cli.error(error?.message ?? String(error));
    process.exit(1);
  });
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.33",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -32,11 +32,13 @@
|
|
|
32
32
|
"exports": {
|
|
33
33
|
".": "./src/index.js",
|
|
34
34
|
"./bin/fit-eval.js": "./bin/fit-eval.js",
|
|
35
|
-
"./bin/fit-trace.js": "./bin/fit-trace.js"
|
|
35
|
+
"./bin/fit-trace.js": "./bin/fit-trace.js",
|
|
36
|
+
"./bin/fit-benchmark.js": "./bin/fit-benchmark.js"
|
|
36
37
|
},
|
|
37
38
|
"bin": {
|
|
38
39
|
"fit-eval": "./bin/fit-eval.js",
|
|
39
|
-
"fit-trace": "./bin/fit-trace.js"
|
|
40
|
+
"fit-trace": "./bin/fit-trace.js",
|
|
41
|
+
"fit-benchmark": "./bin/fit-benchmark.js"
|
|
40
42
|
},
|
|
41
43
|
"files": [
|
|
42
44
|
"src/**/*.js",
|
package/src/agent-runner.js
CHANGED
|
@@ -54,7 +54,9 @@ export class AgentRunner {
|
|
|
54
54
|
if (!deps.cwd) throw new Error("cwd is required");
|
|
55
55
|
if (!deps.query) throw new Error("query is required");
|
|
56
56
|
if (!deps.output) throw new Error("output is required");
|
|
57
|
+
if (!deps.redactor) throw new Error("redactor is required");
|
|
57
58
|
Object.assign(this, applyDefaults(deps));
|
|
59
|
+
this.redactor = deps.redactor;
|
|
58
60
|
this.sessionId = null;
|
|
59
61
|
this.buffer = [];
|
|
60
62
|
/** @type {AbortController|null} */
|
|
@@ -203,12 +205,16 @@ export class AgentRunner {
|
|
|
203
205
|
* @param {{pendingBatch: string[], assistantTextCount: number}} state
|
|
204
206
|
*/
|
|
205
207
|
#recordLine(message, state) {
|
|
206
|
-
const
|
|
208
|
+
const redacted = this.redactor.redactValue(message);
|
|
209
|
+
const line = JSON.stringify(redacted);
|
|
207
210
|
this.output.write(line + "\n");
|
|
208
211
|
this.buffer.push(line);
|
|
209
212
|
if (this.onLine) this.onLine(line);
|
|
210
213
|
if (this.onBatch) state.pendingBatch.push(line);
|
|
211
214
|
|
|
215
|
+
// Session-id / text-block tracking reads the ORIGINAL message —
|
|
216
|
+
// these fields are not secret carriers, and the trackers rely on
|
|
217
|
+
// shape, not string contents.
|
|
212
218
|
if (message.type === "system" && message.subtype === "init") {
|
|
213
219
|
this.sessionId = message.session_id;
|
|
214
220
|
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ApmInstaller — materialises the family's pre-staged `.claude/` tree into a
|
|
3
|
+
* single staging directory, computes the manifest fingerprint, and is invoked
|
|
4
|
+
* once per family install. Per-task copy happens later in WorkdirManager.
|
|
5
|
+
*
|
|
6
|
+
* v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
|
|
7
|
+
* verbatim, not interpreted.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { createHash } from "node:crypto";
|
|
11
|
+
import { access, cp, rm } from "node:fs/promises";
|
|
12
|
+
import { join } from "node:path";
|
|
13
|
+
|
|
14
|
+
/**
 * Stage a task family's checked-in `.claude/` tree under the run's output
 * directory and fingerprint the family's lockfile bytes.
 *
 * Steps, in order: verify `<family.rootPath>/.claude` is accessible, delete
 * any existing `<outputDir>/.apm-staging`, copy the tree to
 * `<outputDir>/.apm-staging/.claude`, then hash `family.apmLockBytes`.
 *
 * @param {import("./task-family.js").TaskFamily} family - Reads `rootPath` and `apmLockBytes`.
 * @param {string} outputDir - The benchmark run's output directory.
 * @returns {Promise<{stagingDir: string, skillSetHash: string}>} The staging
 *   directory and `"sha256:" + hex digest` of the lockfile bytes.
 * @throws {Error} When the family has no accessible `.claude/` directory.
 */
export async function installApm(family, outputDir) {
  const sourceClaude = join(family.rootPath, ".claude");
  const stagingDir = join(outputDir, ".apm-staging");
  const stagedClaude = join(stagingDir, ".claude");

  const sourcePresent = await access(sourceClaude).then(
    () => true,
    () => false,
  );
  if (!sourcePresent) {
    throw new Error(
      `task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
    );
  }

  // Start from a clean staging directory so files from an earlier install
  // cannot survive into this one.
  await rm(stagingDir, { recursive: true, force: true });
  await cp(sourceClaude, stagedClaude, { recursive: true });

  const digest = createHash("sha256").update(family.apmLockBytes).digest("hex");
  return { stagingDir, skillSetHash: "sha256:" + digest };
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark adapter for the libeval `Judge`. Templates the family's
|
|
3
|
+
* `judge.task.md` ({{SCORING}} / {{AGENT_TRACE_PATH}} substitution), runs the
|
|
4
|
+
* judge against the post-run agent CWD, and returns the verdict in the
|
|
5
|
+
* benchmark's `pass`/`fail` vocabulary (mapped from libeval's
|
|
6
|
+
* `success`/`failure`).
|
|
7
|
+
*
|
|
8
|
+
* The judge verdict is captured from the orchestration context's
|
|
9
|
+
* `concluded` flag directly — no trace parsing on the happy path.
|
|
10
|
+
* `parseConcludeFromTrace` is preserved for offline analysis and as a
|
|
11
|
+
* fallback when the runtime ctx isn't available (e.g. re-grading a
|
|
12
|
+
* historical run from its judge.ndjson file).
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { createReadStream, createWriteStream } from "node:fs";
|
|
16
|
+
import { readFile } from "node:fs/promises";
|
|
17
|
+
import { createInterface } from "node:readline";
|
|
18
|
+
import { createJudge } from "../judge.js";
|
|
19
|
+
import { createRedactor } from "../redaction.js";
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* @typedef {object} JudgeVerdict
|
|
23
|
+
* @property {"pass" | "fail"} verdict
|
|
24
|
+
* @property {string} summary
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
/**
 * Run the judge over a completed task run.
 *
 * Reads the template at `task.paths.judge`, substitutes `{{SCORING}}` (the
 * scoring result as pretty-printed JSON) and `{{AGENT_TRACE_PATH}}`, runs a
 * judge in the post-run agent CWD with a turn budget of 5, and writes the
 * judge trace to `workdir.judgeTracePath`. The judge outcome's
 * `success`/other verdict is mapped to `pass`/`fail`; a `null` verdict is
 * reported as a failure with the summary "judge did not conclude".
 *
 * @param {import("./task-family.js").Task} task
 * @param {import("./workdir.js").Workdir} workdir
 * @param {import("./scorer.js").ScoringResult} scoring
 * @param {{query: Function, model: string, judgeProfile?: string}} deps
 * @returns {Promise<JudgeVerdict>}
 */
export async function runJudge(task, workdir, scoring, deps) {
  const template = await readFile(task.paths.judge, "utf8");
  const scoringJson = JSON.stringify(scoring, null, 2);

  // Replacer functions, not replacement strings: a string replacement treats
  // `$&`, `$'`, `$$`, … as substitution patterns, so scoring details or paths
  // containing `$` would be silently rewritten in the judge prompt.
  const taskText = template
    .replaceAll("{{SCORING}}", () => scoringJson)
    .replaceAll("{{AGENT_TRACE_PATH}}", () => workdir.agentTracePath);

  const output = createWriteStream(workdir.judgeTracePath);

  let outcome;
  try {
    // Built inside the try so the trace stream is still closed if judge or
    // redactor construction throws.
    const judge = createJudge({
      cwd: workdir.cwd,
      query: deps.query,
      output,
      model: deps.model,
      judgeProfile: deps.judgeProfile,
      maxTurns: 5,
      redactor: createRedactor(),
    });
    outcome = await judge.run(taskText);
  } finally {
    // Wait for the trace to be flushed before reporting a verdict.
    await new Promise((r) => output.end(r));
  }

  if (outcome.verdict === null) {
    return { verdict: "fail", summary: "judge did not conclude" };
  }
  return {
    verdict: outcome.verdict === "success" ? "pass" : "fail",
    summary: outcome.summary ?? "",
  };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Scan an NDJSON trace line by line and return the verdict of the LAST
 * `Conclude` tool call recognised by `extractConcludeInput`, translated to
 * the benchmark vocabulary (`success` → `pass`, anything else → `fail`).
 * Kept for offline analysis; not used on the runtime happy path.
 *
 * @param {string} tracePath - Path to the NDJSON trace file.
 * @returns {Promise<JudgeVerdict | null>} `null` when no line carries a
 *   `Conclude` call.
 */
export async function parseConcludeFromTrace(tracePath) {
  const lines = createInterface({
    input: createReadStream(tracePath),
    crlfDelay: Infinity,
  });

  // Later Conclude calls override earlier ones.
  let latest = null;
  for await (const rawLine of lines) {
    latest = extractConcludeInput(rawLine) || latest;
  }

  if (latest === null) return null;

  const { verdict, summary } = latest;
  return {
    verdict: verdict === "success" ? "pass" : "fail",
    summary: summary ?? "",
  };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Return the `Conclude` tool input carried by one trace line, or null.
 *
 * A line qualifies when it parses as a JSON object that is an assistant
 * message, either bare or wrapped in a `{source, event}` envelope whose
 * source is "judge" or "supervisor", and whose content array contains a
 * `tool_use` block named `Conclude` (bare or `…__Conclude`) with a truthy
 * `input`. When several blocks qualify, the last one wins. Blank lines,
 * invalid JSON, and JSON that is not an object all yield null.
 *
 * @param {string} line
 * @returns {{verdict: string, summary?: string} | null}
 */
function extractConcludeInput(line) {
  const trimmed = line.trim();
  if (!trimmed) return null;

  let event;
  try {
    event = JSON.parse(trimmed);
  } catch {
    return null;
  }
  // Valid JSON is not necessarily an object: a line containing `null` would
  // otherwise throw on the property reads below.
  if (event === null || typeof event !== "object") return null;

  const isEnvelope = Boolean(event.event) && typeof event.source === "string";
  const source = isEnvelope ? event.source : null;
  const inner = isEnvelope ? event.event : event;

  // Enveloped events from any other source are ignored; bare events have no
  // source to check.
  if (source !== null && source !== "judge" && source !== "supervisor") {
    return null;
  }
  if (inner.type !== "assistant") return null;

  const content = inner.message?.content ?? inner.content;
  if (!Array.isArray(content)) return null;

  let found = null;
  for (const block of content) {
    // `block?.` tolerates null entries in the content array.
    if (
      block?.type === "tool_use" &&
      isConcludeToolName(block.name) &&
      block.input
    ) {
      found = block.input;
    }
  }
  return found;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Whether a tool name refers to the orchestration `Conclude` tool.
 *
 * The Claude Agent SDK reports MCP tool names as `mcp__<server>__<tool>`, so
 * `Conclude` arrives as `mcp__orchestration__Conclude`, while pre-baked
 * supervisor traces and libeval-internal envelopes sometimes carry the bare
 * `Conclude` name. Both forms are accepted: the exact name, or any string
 * ending in `__Conclude`.
 *
 * @param {unknown} name
 * @returns {boolean} false for non-string input.
 */
function isConcludeToolName(name) {
  return (
    typeof name === "string" &&
    (name === "Conclude" || name.endsWith("__Conclude"))
  );
}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ReportAggregator — read a run-output directory's `results.jsonl`, group
|
|
3
|
+
* records by `taskId`, and compute pass@k via the OpenAI HumanEval
|
|
4
|
+
* unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
|
|
5
|
+
*
|
|
6
|
+
* Records that fail schema validation are skipped with a stderr warning
|
|
7
|
+
* (counted under `totals.skipped`) so a corrupt line cannot abort the
|
|
8
|
+
* whole report.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { createReadStream } from "node:fs";
|
|
12
|
+
import { join } from "node:path";
|
|
13
|
+
import { createInterface } from "node:readline";
|
|
14
|
+
|
|
15
|
+
import { validateResultRecord } from "./result.js";
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* @typedef {object} TaskReport
|
|
19
|
+
* @property {string} taskId
|
|
20
|
+
* @property {number} n - Total runs.
|
|
21
|
+
* @property {number} c - Passing runs.
|
|
22
|
+
* @property {Record<string|number, number|{error: string}|null>} passAtK - Per-k estimate, or `{error}` when k exceeds n.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/**
 * Build the pass@k report for a run-output directory.
 *
 * Loads the valid records from `<inputDir>/results.jsonl`, groups them by
 * `taskId`, and for each task counts total runs (`n`) and runs whose verdict
 * is "pass" (`c`), then evaluates pass@k for every requested k. Tasks are
 * returned sorted by `taskId` using plain `<`/`>` string comparison.
 *
 * @param {{inputDir: string, kValues: number[]}} opts
 * @returns {Promise<{tasks: TaskReport[], totals: {tasks: number, runs: number, skipped: number}}>}
 *   `totals.runs` counts only valid records; `totals.skipped` counts lines
 *   rejected while loading.
 */
export async function aggregate({ inputDir, kValues }) {
  const { records, skipped } = await loadRecords(inputDir);

  const tasks = Array.from(groupByTask(records), ([taskId, group]) => {
    const n = group.length;
    const c = group.reduce(
      (passes, r) => (r.verdict === "pass" ? passes + 1 : passes),
      0,
    );
    const passAtK = {};
    for (const k of kValues) passAtK[k] = passAtKValue(n, c, k);
    return { taskId, n, c, passAtK };
  });

  const byTaskId = (a, b) => {
    if (a.taskId < b.taskId) return -1;
    if (a.taskId > b.taskId) return 1;
    return 0;
  };
  tasks.sort(byTaskId);

  const runs = tasks.reduce((sum, t) => sum + t.n, 0);
  return {
    tasks,
    totals: { tasks: tasks.length, runs, skipped },
  };
}
|
|
50
|
+
|
|
51
|
+
/**
 * Render an aggregate report as a Markdown table followed by a blank line
 * and a one-line totals summary.
 *
 * Columns are `taskId | n | c | pass@k…`, with one pass@k column per entry of
 * `kValues`, in the order given. Cell values for pass@k come from
 * `formatPassAt`.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @param {number[]} kValues
 * @returns {string} Lines joined with "\n" (no trailing newline).
 */
export function renderTextReport(report, kValues) {
  const toRow = (cells) => `| ${cells.join(" | ")} |`;

  const headerCells = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
  const output = [toRow(headerCells), toRow(headerCells.map(() => "---"))];

  for (const task of report.tasks) {
    const passCells = kValues.map((k) => formatPassAt(task.passAtK[k]));
    output.push(
      toRow([task.taskId, String(task.n), String(task.c), ...passCells]),
    );
  }

  const { totals } = report;
  output.push(
    "",
    `Totals — tasks: ${totals.tasks}, runs: ${totals.runs}, skipped: ${totals.skipped}`,
  );
  return output.join("\n");
}
|
|
76
|
+
|
|
77
|
+
/**
 * Format one pass@k cell for the text report.
 * @param {number | {error: string} | null | undefined} v
 * @returns {string} An em dash for null/undefined, the `error` text for an
 *   error marker object, otherwise the value with four decimal places.
 */
function formatPassAt(v) {
  if (v === null || v === undefined) return "—";
  const isErrorMarker = typeof v === "object" && "error" in v;
  return isErrorMarker ? v.error : Number(v).toFixed(4);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Read `<inputDir>/results.jsonl` and return the records that parse as JSON
 * and pass `validateResultRecord`.
 *
 * Blank lines are ignored. A line that is not valid JSON, or a record that
 * fails validation, is reported on stderr and counted in `skipped`; it never
 * aborts the load.
 *
 * @param {string} inputDir
 * @returns {Promise<{records: object[], skipped: number}>}
 */
async function loadRecords(inputDir) {
  const resultsPath = join(inputDir, "results.jsonl");
  const reader = createInterface({
    input: createReadStream(resultsPath),
    crlfDelay: Infinity,
  });

  const records = [];
  let skipped = 0;
  // Emit the warning first, then count the line as skipped.
  const warnAndSkip = (message) => {
    process.stderr.write(message);
    skipped += 1;
  };

  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) continue;

    let candidate;
    try {
      candidate = JSON.parse(text);
    } catch (e) {
      warnAndSkip(
        `benchmark report: skipped malformed JSON line — ${e.message}\n`,
      );
      continue;
    }

    try {
      validateResultRecord(candidate);
    } catch (e) {
      warnAndSkip(
        `benchmark report: skipped record failing schema — ${describeError(e)}\n`,
      );
      continue;
    }

    records.push(candidate);
  }

  return { records, skipped };
}
|
|
115
|
+
|
|
116
|
+
/**
 * Produce a one-line description of a thrown value for a stderr warning.
 *
 * Values carrying an `issues` array (the shape the schema validator throws)
 * are rendered as `path.to.field: message` entries joined by "; ". Anything
 * else falls back to its `message`, or to `String(value)` when it has none.
 * This runs inside a catch handler, so it must not throw itself — including
 * for `throw null` / `throw undefined`.
 *
 * @param {unknown} e
 * @returns {string}
 */
function describeError(e) {
  if (Array.isArray(e?.issues)) {
    return e.issues
      .map((issue) => `${(issue.path ?? []).join(".")}: ${issue.message}`)
      .join("; ");
  }
  return e?.message ?? String(e);
}
|
|
122
|
+
|
|
123
|
+
/**
 * Bucket result records by their `taskId`.
 * @param {Iterable<{taskId: string}>} records
 * @returns {Map<string, object[]>} Keys appear in first-seen order; each
 *   bucket keeps its records in input order.
 */
function groupByTask(records) {
  const byTask = new Map();
  for (const record of records) {
    const bucket = byTask.get(record.taskId);
    if (bucket === undefined) {
      byTask.set(record.taskId, [record]);
    } else {
      bucket.push(record);
    }
  }
  return byTask;
}
|
|
131
|
+
|
|
132
|
+
/**
 * pass@k = 1 - C(n - c, k) / C(n, k), the unbiased estimator, evaluated as
 * (C(n, k) - C(n - c, k)) / C(n, k) with exact BigInt binomials.
 *
 * The final ratio is converted to a double. For moderate n both binomials
 * fit in a double and are divided directly. For large n (e.g. C(1200, 600))
 * the binomials exceed the double range, where a direct conversion would
 * yield Infinity and the ratio NaN or 0; in that case the division is done
 * in BigInt on a scaled numerator first.
 *
 * @param {number} n - Total runs.
 * @param {number} c - Passing runs.
 * @param {number} k
 * @returns {number | {error: string}} `{error: "k > n"}` when k exceeds n.
 */
function passAtKValue(n, c, k) {
  if (k > n) return { error: "k > n" };
  // Fewer than k failing runs: every k-subset contains a pass.
  if (n - c < k) return 1;

  const total = binomial(BigInt(n), BigInt(k));
  const fail = binomial(BigInt(n - c), BigInt(k));
  // Compute the ratio as a single division so we avoid `1 - x`, which
  // accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
  const passing = total - fail;

  const totalAsNumber = Number(total);
  if (Number.isFinite(totalAsNumber)) {
    return Number(passing) / totalAsNumber;
  }

  // `total` overflows a double. passing <= total, so the scaled quotient is
  // at most 2^1000, which is representable; dividing by the power of two
  // afterwards is exact.
  const scaleBits = 1000n;
  return Number((passing << scaleBits) / total) / Number(1n << scaleBits);
}

/**
 * Exact binomial coefficient C(n, k) on BigInt operands.
 * @param {bigint} n
 * @param {bigint} k
 * @returns {bigint} 0n when k is negative or greater than n.
 */
function binomial(n, k) {
  if (k < 0n || k > n) return 0n;
  if (k === 0n || k === n) return 1n;

  // C(n, k) === C(n, n - k): iterate over the smaller of the two.
  const steps = k > n - k ? n - k : k;
  let result = 1n;
  for (let i = 0n; i < steps; i++) {
    // After this step result equals C(n, i + 1), so the division is exact.
    result = (result * (n - i)) / (i + 1n);
  }
  return result;
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Result-record schemas and runtime validators.
|
|
3
|
+
*
|
|
4
|
+
* Two schemas live here:
|
|
5
|
+
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
|
+
* benchmark run. Has a happy branch (scoring + judge present) and a
|
|
7
|
+
* pre-flight-failure branch (scoring/judgeVerdict/submission absent).
|
|
8
|
+
* - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
|
|
9
|
+
* ad-hoc grading without a full lifecycle.
|
|
10
|
+
*
|
|
11
|
+
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
|
+
* in a guard and reject schema drift at write time.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
|
|
17
|
+
// Verdict vocabulary shared by every schema in this module.
const VERDICT_ENUM = z.enum(["pass", "fail"]);

// A field that must be absent (or explicitly undefined) in a given branch.
const absentField = () => z.undefined().optional();

// A value that is either a string or null.
const stringOrNull = () => z.union([z.string(), z.null()]);

// Outcome of scoring a task: verdict, free-form detail entries, and an
// integer exit code.
const SCORING_SHAPE = z.object({
  verdict: VERDICT_ENUM,
  details: z.array(z.unknown()),
  exitCode: z.number().int(),
});

// Judge outcome in the benchmark's pass/fail vocabulary.
const JUDGE_VERDICT_SHAPE = z.object({
  verdict: VERDICT_ENUM,
  summary: z.string(),
});

// Profile names used for the run. `supervisor` must be null.
const PROFILES_SHAPE = z.object({
  agent: stringOrNull(),
  supervisor: z.null(),
  judge: stringOrNull(),
});

// Describes a failure recorded on the pre-flight branch.
const PREFLIGHT_ERROR_SHAPE = z.object({
  phase: z.string(),
  message: z.string(),
  exitCode: z.number().int(),
});

// Fields present on both branches of a result record.
const COMMON_FIELDS = {
  taskId: z.string().min(1),
  runIndex: z.number().int().min(0),
  verdict: VERDICT_ENUM,
  costUsd: z.number(),
  turns: z.number().int().min(0),
  profiles: PROFILES_SHAPE,
  model: z.string(),
  skillSetHash: z.string(),
  familyRevision: z.string(),
  durationMs: z.number().int().min(0),
};

// Optional error detail attached to a happy-branch record.
const AGENT_ERROR_SHAPE = z.object({
  message: z.string(),
  aborted: z.boolean(),
});

// Happy branch: scoring, submission and judge verdict are all present, and
// there is no preflightError.
const HAPPY_RECORD = z.object({
  ...COMMON_FIELDS,
  scoring: SCORING_SHAPE,
  submission: z.string(),
  judgeVerdict: JUDGE_VERDICT_SHAPE,
  agentTracePath: z.string(),
  judgeTracePath: z.string(),
  agentError: AGENT_ERROR_SHAPE.optional(),
  preflightError: absentField(),
});

// Pre-flight-failure branch: cost is pinned to 0 and scoring, submission,
// judge verdict and agent error must be absent.
const PREFLIGHT_RECORD = z.object({
  ...COMMON_FIELDS,
  costUsd: z.literal(0),
  preflightError: PREFLIGHT_ERROR_SHAPE,
  // Trace paths are populated even on preflight failure (the runner allocates
  // them in WorkdirManager.start) so the record is uniform across branches
  // and downstream consumers can reference them without conditional fields.
  agentTracePath: z.string(),
  judgeTracePath: z.string(),
  scoring: absentField(),
  submission: absentField(),
  judgeVerdict: absentField(),
  agentError: absentField(),
});

/** One record per (task, runIndex): either the happy or the pre-flight branch. */
export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);

/** Narrower record emitted by ad-hoc scoring: task id, scoring result, exit code. */
export const SCORING_RECORD_SCHEMA = z.object({
  taskId: z.string().min(1),
  scoring: SCORING_SHAPE,
  exitCode: z.number().int(),
});
|
|
93
|
+
|
|
94
|
+
/**
 * Check a full benchmark result record against RESULT_RECORD_SCHEMA.
 * Returns nothing; the parsed value is discarded.
 * @param {object} record
 * @throws When the record matches neither branch of the schema.
 */
export function validateResultRecord(record) {
  RESULT_RECORD_SCHEMA.parse(record);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Check an ad-hoc scoring record against SCORING_RECORD_SCHEMA.
 * Returns nothing; the parsed value is discarded.
 * @param {object} record
 * @throws When the record does not match the schema.
 */
export function validateScoringRecord(record) {
  SCORING_RECORD_SCHEMA.parse(record);
}
|