@forwardimpact/libeval 0.1.34 → 0.1.35
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/bin/fit-benchmark.js
CHANGED
|
@@ -71,7 +71,7 @@ export const definition = {
|
|
|
71
71
|
},
|
|
72
72
|
task: {
|
|
73
73
|
type: "string",
|
|
74
|
-
description: "
|
|
74
|
+
description: "Task id (directory name under tasks/)",
|
|
75
75
|
},
|
|
76
76
|
workdir: {
|
|
77
77
|
type: "string",
|
|
@@ -112,7 +112,7 @@ export const definition = {
|
|
|
112
112
|
},
|
|
113
113
|
examples: [
|
|
114
114
|
"fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
|
|
115
|
-
"fit-benchmark score --family=./families/coding --task=
|
|
115
|
+
"fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
|
|
116
116
|
"fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
|
|
117
117
|
],
|
|
118
118
|
documentation: [
|
package/package.json
CHANGED
package/src/benchmark/judge.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* <root>/
|
|
4
4
|
* apm.lock.yaml
|
|
5
5
|
* .claude/ # pre-staged skills + agents (P1)
|
|
6
|
-
* tasks/<
|
|
6
|
+
* tasks/<task_name>/
|
|
7
7
|
* instructions.md
|
|
8
8
|
* supervisor.task.md # preserved for v2; not read in v1
|
|
9
9
|
* judge.task.md
|
|
@@ -122,32 +122,27 @@ function normalizeLf(buf) {
|
|
|
122
122
|
async function discoverTasks(rootPath) {
|
|
123
123
|
const tasksRoot = join(rootPath, "tasks");
|
|
124
124
|
const tasks = [];
|
|
125
|
-
let
|
|
125
|
+
let entries;
|
|
126
126
|
try {
|
|
127
|
-
|
|
127
|
+
entries = await readdir(tasksRoot, { withFileTypes: true });
|
|
128
128
|
} catch (e) {
|
|
129
129
|
if (e.code === "ENOENT") return tasks;
|
|
130
130
|
throw e;
|
|
131
131
|
}
|
|
132
|
-
for (const
|
|
133
|
-
if (!
|
|
134
|
-
const
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
workdir: join(taskDir, "workdir"),
|
|
147
|
-
scoring: join(taskDir, "scoring"),
|
|
148
|
-
},
|
|
149
|
-
});
|
|
150
|
-
}
|
|
132
|
+
for (const entry of entries) {
|
|
133
|
+
if (!entry.isDirectory()) continue;
|
|
134
|
+
const taskDir = join(tasksRoot, entry.name);
|
|
135
|
+
tasks.push({
|
|
136
|
+
id: entry.name,
|
|
137
|
+
paths: {
|
|
138
|
+
instructions: join(taskDir, "instructions.md"),
|
|
139
|
+
supervisor: join(taskDir, "supervisor.task.md"),
|
|
140
|
+
judge: join(taskDir, "judge.task.md"),
|
|
141
|
+
specs: join(taskDir, "specs"),
|
|
142
|
+
workdir: join(taskDir, "workdir"),
|
|
143
|
+
scoring: join(taskDir, "scoring"),
|
|
144
|
+
},
|
|
145
|
+
});
|
|
151
146
|
}
|
|
152
147
|
tasks.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
|
|
153
148
|
return tasks;
|
|
@@ -246,7 +241,7 @@ function run(cmd, args) {
|
|
|
246
241
|
|
|
247
242
|
/**
|
|
248
243
|
* @typedef {object} Task
|
|
249
|
-
* @property {string} id -
|
|
244
|
+
* @property {string} id - Task name (directory name under tasks/)
|
|
250
245
|
* @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
|
|
251
246
|
*/
|
|
252
247
|
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
import { resolve } from "node:path";
|
|
8
8
|
|
|
9
|
+
import { createConfig } from "@forwardimpact/libconfig";
|
|
9
10
|
import { createBenchmarkRunner } from "../benchmark/runner.js";
|
|
10
11
|
|
|
11
12
|
/**
|
|
@@ -14,6 +15,8 @@ import { createBenchmarkRunner } from "../benchmark/runner.js";
|
|
|
14
15
|
*/
|
|
15
16
|
export async function runBenchmarkRunCommand(values, _args) {
|
|
16
17
|
const opts = parseRunOptions(values);
|
|
18
|
+
const config = await createConfig("script", "benchmark");
|
|
19
|
+
process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
|
|
17
20
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
18
21
|
const runner = createBenchmarkRunner({ ...opts, query });
|
|
19
22
|
|