@forwardimpact/libeval 0.1.61 → 0.1.62
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +6 -151
- package/package.json +1 -1
- package/src/commands/benchmark-definition.js +147 -0
package/bin/fit-benchmark.js
CHANGED
|
@@ -2,152 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import { realpathSync } from "node:fs";
|
|
6
5
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
6
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
7
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
9
8
|
|
|
10
|
-
import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
|
|
11
|
-
import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
|
|
12
|
-
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
|
|
13
|
-
import {
|
|
14
|
-
BENCHMARK_AGENT_MODEL,
|
|
15
|
-
LEAD_MODEL,
|
|
16
|
-
} from "@forwardimpact/libutil/models";
|
|
17
|
-
|
|
18
|
-
export const definition = {
|
|
19
|
-
name: "fit-benchmark",
|
|
20
|
-
description:
|
|
21
|
-
"Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
|
|
22
|
-
commands: [
|
|
23
|
-
{
|
|
24
|
-
name: "run",
|
|
25
|
-
args: [],
|
|
26
|
-
handler: runBenchmarkRunCommand,
|
|
27
|
-
description:
|
|
28
|
-
"Run every task in a family for N runs and emit one result record per (task, runIndex).",
|
|
29
|
-
options: {
|
|
30
|
-
family: {
|
|
31
|
-
type: "string",
|
|
32
|
-
description: "Path or git URL to a task family",
|
|
33
|
-
},
|
|
34
|
-
output: {
|
|
35
|
-
type: "string",
|
|
36
|
-
description:
|
|
37
|
-
"Run-output directory (created if missing, default: benchmark-runs)",
|
|
38
|
-
},
|
|
39
|
-
runs: {
|
|
40
|
-
type: "string",
|
|
41
|
-
description: "Runs per task (integer ≥ 1, default: 5)",
|
|
42
|
-
},
|
|
43
|
-
"agent-model": {
|
|
44
|
-
type: "string",
|
|
45
|
-
description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
|
|
46
|
-
},
|
|
47
|
-
"lead-model": {
|
|
48
|
-
type: "string",
|
|
49
|
-
description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
|
|
50
|
-
},
|
|
51
|
-
"judge-model": {
|
|
52
|
-
type: "string",
|
|
53
|
-
description: `Claude model for the judge (default: ${LEAD_MODEL})`,
|
|
54
|
-
},
|
|
55
|
-
"agent-profile": {
|
|
56
|
-
type: "string",
|
|
57
|
-
description: "Agent-under-test profile name",
|
|
58
|
-
},
|
|
59
|
-
"judge-profile": {
|
|
60
|
-
type: "string",
|
|
61
|
-
description: "Judge profile name",
|
|
62
|
-
},
|
|
63
|
-
"max-turns": {
|
|
64
|
-
type: "string",
|
|
65
|
-
description:
|
|
66
|
-
"Agent-under-test turn budget (default: 50, 0 = unlimited)",
|
|
67
|
-
},
|
|
68
|
-
"allowed-tools": {
|
|
69
|
-
type: "string",
|
|
70
|
-
description:
|
|
71
|
-
"Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
|
|
72
|
-
},
|
|
73
|
-
},
|
|
74
|
-
},
|
|
75
|
-
{
|
|
76
|
-
name: "invariants",
|
|
77
|
-
args: [],
|
|
78
|
-
handler: runBenchmarkInvariantsCommand,
|
|
79
|
-
description:
|
|
80
|
-
"Check a single task's invariants against a post-run workdir without invoking an agent.",
|
|
81
|
-
options: {
|
|
82
|
-
family: {
|
|
83
|
-
type: "string",
|
|
84
|
-
description: "Path or git URL to a task family",
|
|
85
|
-
},
|
|
86
|
-
task: {
|
|
87
|
-
type: "string",
|
|
88
|
-
description: "Task id (directory name under tasks/)",
|
|
89
|
-
},
|
|
90
|
-
workdir: {
|
|
91
|
-
type: "string",
|
|
92
|
-
description:
|
|
93
|
-
"Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
|
|
94
|
-
},
|
|
95
|
-
output: {
|
|
96
|
-
type: "string",
|
|
97
|
-
description: "Output file (defaults to stdout; one JSONL line)",
|
|
98
|
-
},
|
|
99
|
-
},
|
|
100
|
-
},
|
|
101
|
-
{
|
|
102
|
-
name: "report",
|
|
103
|
-
args: [],
|
|
104
|
-
handler: runBenchmarkReportCommand,
|
|
105
|
-
description:
|
|
106
|
-
"Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
|
|
107
|
-
options: {
|
|
108
|
-
input: {
|
|
109
|
-
type: "string",
|
|
110
|
-
description:
|
|
111
|
-
"Run-output directory containing results.jsonl (default: benchmark-runs)",
|
|
112
|
-
},
|
|
113
|
-
k: {
|
|
114
|
-
type: "string",
|
|
115
|
-
description: "Comma-separated k values (default: 1,3,5)",
|
|
116
|
-
},
|
|
117
|
-
format: {
|
|
118
|
-
type: "string",
|
|
119
|
-
description: "Output format (json|text, default: json)",
|
|
120
|
-
},
|
|
121
|
-
},
|
|
122
|
-
},
|
|
123
|
-
],
|
|
124
|
-
globalOptions: {
|
|
125
|
-
help: { type: "boolean", short: "h", description: "Show this help" },
|
|
126
|
-
version: { type: "boolean", description: "Show version" },
|
|
127
|
-
json: { type: "boolean", description: "Output help as JSON" },
|
|
128
|
-
},
|
|
129
|
-
examples: [
|
|
130
|
-
"fit-benchmark run --family=./families/coding",
|
|
131
|
-
`fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
|
|
132
|
-
"fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
133
|
-
"fit-benchmark report --format=text",
|
|
134
|
-
"fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
|
|
135
|
-
],
|
|
136
|
-
documentation: [
|
|
137
|
-
{
|
|
138
|
-
title: "Run a Benchmark",
|
|
139
|
-
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
|
|
140
|
-
description:
|
|
141
|
-
"Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
|
|
142
|
-
},
|
|
143
|
-
{
|
|
144
|
-
title: "Automate with GitHub Actions",
|
|
145
|
-
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
|
|
146
|
-
description:
|
|
147
|
-
"Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
|
|
148
|
-
},
|
|
149
|
-
],
|
|
150
|
-
};
|
|
9
|
+
import { definition } from "../src/commands/benchmark-definition.js";
|
|
151
10
|
|
|
152
11
|
const runtime = createDefaultRuntime();
|
|
153
12
|
const logger = createLogger("benchmark", runtime);
|
|
@@ -178,12 +37,8 @@ async function main() {
|
|
|
178
37
|
runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
|
|
179
38
|
}
|
|
180
39
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
createCli(definition, { runtime }).error(error.message);
|
|
187
|
-
process.exit(1);
|
|
188
|
-
});
|
|
189
|
-
}
|
|
40
|
+
main().catch((error) => {
|
|
41
|
+
logger.exception("main", error);
|
|
42
|
+
createCli(definition, { runtime }).error(error.message);
|
|
43
|
+
process.exit(1);
|
|
44
|
+
});
|
package/package.json
CHANGED
package/src/commands/benchmark-definition.js
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark` CLI definition. Lives in `src/` so the bin stays an
|
|
3
|
+
* execute-on-import entry point — launcher packages import the bin to run
|
|
4
|
+
* it — while tests import the definition without running the CLI.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { runBenchmarkRunCommand } from "./benchmark-run.js";
|
|
8
|
+
import { runBenchmarkInvariantsCommand } from "./benchmark-invariants.js";
|
|
9
|
+
import { runBenchmarkReportCommand } from "./benchmark-report.js";
|
|
10
|
+
import {
|
|
11
|
+
BENCHMARK_AGENT_MODEL,
|
|
12
|
+
LEAD_MODEL,
|
|
13
|
+
} from "@forwardimpact/libutil/models";
|
|
14
|
+
|
|
15
|
+
export const definition = {
|
|
16
|
+
name: "fit-benchmark",
|
|
17
|
+
description:
|
|
18
|
+
"Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
|
|
19
|
+
commands: [
|
|
20
|
+
{
|
|
21
|
+
name: "run",
|
|
22
|
+
args: [],
|
|
23
|
+
handler: runBenchmarkRunCommand,
|
|
24
|
+
description:
|
|
25
|
+
"Run every task in a family for N runs and emit one result record per (task, runIndex).",
|
|
26
|
+
options: {
|
|
27
|
+
family: {
|
|
28
|
+
type: "string",
|
|
29
|
+
description: "Path or git URL to a task family",
|
|
30
|
+
},
|
|
31
|
+
output: {
|
|
32
|
+
type: "string",
|
|
33
|
+
description:
|
|
34
|
+
"Run-output directory (created if missing, default: benchmark-runs)",
|
|
35
|
+
},
|
|
36
|
+
runs: {
|
|
37
|
+
type: "string",
|
|
38
|
+
description: "Runs per task (integer ≥ 1, default: 5)",
|
|
39
|
+
},
|
|
40
|
+
"agent-model": {
|
|
41
|
+
type: "string",
|
|
42
|
+
description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
|
|
43
|
+
},
|
|
44
|
+
"lead-model": {
|
|
45
|
+
type: "string",
|
|
46
|
+
description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
|
|
47
|
+
},
|
|
48
|
+
"judge-model": {
|
|
49
|
+
type: "string",
|
|
50
|
+
description: `Claude model for the judge (default: ${LEAD_MODEL})`,
|
|
51
|
+
},
|
|
52
|
+
"agent-profile": {
|
|
53
|
+
type: "string",
|
|
54
|
+
description: "Agent-under-test profile name",
|
|
55
|
+
},
|
|
56
|
+
"judge-profile": {
|
|
57
|
+
type: "string",
|
|
58
|
+
description: "Judge profile name",
|
|
59
|
+
},
|
|
60
|
+
"max-turns": {
|
|
61
|
+
type: "string",
|
|
62
|
+
description:
|
|
63
|
+
"Agent-under-test turn budget (default: 50, 0 = unlimited)",
|
|
64
|
+
},
|
|
65
|
+
"allowed-tools": {
|
|
66
|
+
type: "string",
|
|
67
|
+
description:
|
|
68
|
+
"Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
|
|
69
|
+
},
|
|
70
|
+
},
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
name: "invariants",
|
|
74
|
+
args: [],
|
|
75
|
+
handler: runBenchmarkInvariantsCommand,
|
|
76
|
+
description:
|
|
77
|
+
"Check a single task's invariants against a post-run workdir without invoking an agent.",
|
|
78
|
+
options: {
|
|
79
|
+
family: {
|
|
80
|
+
type: "string",
|
|
81
|
+
description: "Path or git URL to a task family",
|
|
82
|
+
},
|
|
83
|
+
task: {
|
|
84
|
+
type: "string",
|
|
85
|
+
description: "Task id (directory name under tasks/)",
|
|
86
|
+
},
|
|
87
|
+
workdir: {
|
|
88
|
+
type: "string",
|
|
89
|
+
description:
|
|
90
|
+
"Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
|
|
91
|
+
},
|
|
92
|
+
output: {
|
|
93
|
+
type: "string",
|
|
94
|
+
description: "Output file (defaults to stdout; one JSONL line)",
|
|
95
|
+
},
|
|
96
|
+
},
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
name: "report",
|
|
100
|
+
args: [],
|
|
101
|
+
handler: runBenchmarkReportCommand,
|
|
102
|
+
description:
|
|
103
|
+
"Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
|
|
104
|
+
options: {
|
|
105
|
+
input: {
|
|
106
|
+
type: "string",
|
|
107
|
+
description:
|
|
108
|
+
"Run-output directory containing results.jsonl (default: benchmark-runs)",
|
|
109
|
+
},
|
|
110
|
+
k: {
|
|
111
|
+
type: "string",
|
|
112
|
+
description: "Comma-separated k values (default: 1,3,5)",
|
|
113
|
+
},
|
|
114
|
+
format: {
|
|
115
|
+
type: "string",
|
|
116
|
+
description: "Output format (json|text, default: json)",
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
},
|
|
120
|
+
],
|
|
121
|
+
globalOptions: {
|
|
122
|
+
help: { type: "boolean", short: "h", description: "Show this help" },
|
|
123
|
+
version: { type: "boolean", description: "Show version" },
|
|
124
|
+
json: { type: "boolean", description: "Output help as JSON" },
|
|
125
|
+
},
|
|
126
|
+
examples: [
|
|
127
|
+
"fit-benchmark run --family=./families/coding",
|
|
128
|
+
`fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
|
|
129
|
+
"fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
130
|
+
"fit-benchmark report --format=text",
|
|
131
|
+
"fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
|
|
132
|
+
],
|
|
133
|
+
documentation: [
|
|
134
|
+
{
|
|
135
|
+
title: "Run a Benchmark",
|
|
136
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
|
|
137
|
+
description:
|
|
138
|
+
"Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
|
|
139
|
+
},
|
|
140
|
+
{
|
|
141
|
+
title: "Automate with GitHub Actions",
|
|
142
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
|
|
143
|
+
description:
|
|
144
|
+
"Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
|
|
145
|
+
},
|
|
146
|
+
],
|
|
147
|
+
};
|