@forwardimpact/libeval 0.1.34 → 0.1.35

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -71,7 +71,7 @@ export const definition = {
71
71
  },
72
72
  task: {
73
73
  type: "string",
74
- description: "METR-style task id (task_family_name/task_name)",
74
+ description: "Task id (directory name under tasks/)",
75
75
  },
76
76
  workdir: {
77
77
  type: "string",
@@ -112,7 +112,7 @@ export const definition = {
112
112
  },
113
113
  examples: [
114
114
  "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
115
- "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
115
+ "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
116
116
  "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
117
117
  ],
118
118
  documentation: [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.34",
3
+ "version": "0.1.35",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -45,7 +45,7 @@ export async function runJudge(task, workdir, scoring, deps) {
45
45
  output,
46
46
  model: deps.model,
47
47
  judgeProfile: deps.judgeProfile,
48
- maxTurns: 5,
48
+ maxTurns: 25,
49
49
  redactor: createRedactor(),
50
50
  });
51
51
 
@@ -3,7 +3,7 @@
3
3
  * <root>/
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
- * tasks/<task_family_name>/<task_name>/
6
+ * tasks/<task_name>/
7
7
  * instructions.md
8
8
  * supervisor.task.md # preserved for v2; not read in v1
9
9
  * judge.task.md
@@ -122,32 +122,27 @@ function normalizeLf(buf) {
122
122
  async function discoverTasks(rootPath) {
123
123
  const tasksRoot = join(rootPath, "tasks");
124
124
  const tasks = [];
125
- let families;
125
+ let entries;
126
126
  try {
127
- families = await readdir(tasksRoot, { withFileTypes: true });
127
+ entries = await readdir(tasksRoot, { withFileTypes: true });
128
128
  } catch (e) {
129
129
  if (e.code === "ENOENT") return tasks;
130
130
  throw e;
131
131
  }
132
- for (const family of families) {
133
- if (!family.isDirectory()) continue;
134
- const familyDir = join(tasksRoot, family.name);
135
- const entries = await readdir(familyDir, { withFileTypes: true });
136
- for (const entry of entries) {
137
- if (!entry.isDirectory()) continue;
138
- const taskDir = join(familyDir, entry.name);
139
- tasks.push({
140
- id: `${family.name}/${entry.name}`,
141
- paths: {
142
- instructions: join(taskDir, "instructions.md"),
143
- supervisor: join(taskDir, "supervisor.task.md"),
144
- judge: join(taskDir, "judge.task.md"),
145
- specs: join(taskDir, "specs"),
146
- workdir: join(taskDir, "workdir"),
147
- scoring: join(taskDir, "scoring"),
148
- },
149
- });
150
- }
132
+ for (const entry of entries) {
133
+ if (!entry.isDirectory()) continue;
134
+ const taskDir = join(tasksRoot, entry.name);
135
+ tasks.push({
136
+ id: entry.name,
137
+ paths: {
138
+ instructions: join(taskDir, "instructions.md"),
139
+ supervisor: join(taskDir, "supervisor.task.md"),
140
+ judge: join(taskDir, "judge.task.md"),
141
+ specs: join(taskDir, "specs"),
142
+ workdir: join(taskDir, "workdir"),
143
+ scoring: join(taskDir, "scoring"),
144
+ },
145
+ });
151
146
  }
152
147
  tasks.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
153
148
  return tasks;
@@ -246,7 +241,7 @@ function run(cmd, args) {
246
241
 
247
242
  /**
248
243
  * @typedef {object} Task
249
- * @property {string} id - METR-style "task_family_name/task_name"
244
+ * @property {string} id - Task name (directory name under tasks/)
250
245
  * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
251
246
  */
252
247
 
@@ -6,6 +6,7 @@
6
6
 
7
7
  import { resolve } from "node:path";
8
8
 
9
+ import { createConfig } from "@forwardimpact/libconfig";
9
10
  import { createBenchmarkRunner } from "../benchmark/runner.js";
10
11
 
11
12
  /**
@@ -14,6 +15,8 @@ import { createBenchmarkRunner } from "../benchmark/runner.js";
14
15
  */
15
16
  export async function runBenchmarkRunCommand(values, _args) {
16
17
  const opts = parseRunOptions(values);
18
+ const config = await createConfig("script", "benchmark");
19
+ process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
17
20
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
18
21
  const runner = createBenchmarkRunner({ ...opts, query });
19
22