@forwardimpact/libeval 0.1.33 → 0.1.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
 
3
- import { readFileSync } from "node:fs";
3
+ import { readFileSync, realpathSync } from "node:fs";
4
4
  import { createCli } from "@forwardimpact/libcli";
5
5
  import { createLogger } from "@forwardimpact/libtelemetry";
6
6
 
@@ -71,7 +71,7 @@ export const definition = {
71
71
  },
72
72
  task: {
73
73
  type: "string",
74
- description: "METR-style task id (task_family_name/task_name)",
74
+ description: "Task id (directory name under tasks/)",
75
75
  },
76
76
  workdir: {
77
77
  type: "string",
@@ -112,7 +112,7 @@ export const definition = {
112
112
  },
113
113
  examples: [
114
114
  "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
115
- "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
115
+ "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
116
116
  "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
117
117
  ],
118
118
  documentation: [
@@ -158,7 +158,7 @@ async function main() {
158
158
 
159
159
  // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
160
160
  // should not execute the entry point.
161
- if (import.meta.url === `file://${process.argv[1]}`) {
161
+ if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
162
162
  main().catch((error) => {
163
163
  logger.exception("main", error);
164
164
  cli.error(error.message);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.33",
3
+ "version": "0.1.35",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -45,7 +45,7 @@ export async function runJudge(task, workdir, scoring, deps) {
45
45
  output,
46
46
  model: deps.model,
47
47
  judgeProfile: deps.judgeProfile,
48
- maxTurns: 5,
48
+ maxTurns: 25,
49
49
  redactor: createRedactor(),
50
50
  });
51
51
 
@@ -3,7 +3,7 @@
3
3
  * <root>/
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
- * tasks/<task_family_name>/<task_name>/
6
+ * tasks/<task_name>/
7
7
  * instructions.md
8
8
  * supervisor.task.md # preserved for v2; not read in v1
9
9
  * judge.task.md
@@ -122,32 +122,27 @@ function normalizeLf(buf) {
122
122
  async function discoverTasks(rootPath) {
123
123
  const tasksRoot = join(rootPath, "tasks");
124
124
  const tasks = [];
125
- let families;
125
+ let entries;
126
126
  try {
127
- families = await readdir(tasksRoot, { withFileTypes: true });
127
+ entries = await readdir(tasksRoot, { withFileTypes: true });
128
128
  } catch (e) {
129
129
  if (e.code === "ENOENT") return tasks;
130
130
  throw e;
131
131
  }
132
- for (const family of families) {
133
- if (!family.isDirectory()) continue;
134
- const familyDir = join(tasksRoot, family.name);
135
- const entries = await readdir(familyDir, { withFileTypes: true });
136
- for (const entry of entries) {
137
- if (!entry.isDirectory()) continue;
138
- const taskDir = join(familyDir, entry.name);
139
- tasks.push({
140
- id: `${family.name}/${entry.name}`,
141
- paths: {
142
- instructions: join(taskDir, "instructions.md"),
143
- supervisor: join(taskDir, "supervisor.task.md"),
144
- judge: join(taskDir, "judge.task.md"),
145
- specs: join(taskDir, "specs"),
146
- workdir: join(taskDir, "workdir"),
147
- scoring: join(taskDir, "scoring"),
148
- },
149
- });
150
- }
132
+ for (const entry of entries) {
133
+ if (!entry.isDirectory()) continue;
134
+ const taskDir = join(tasksRoot, entry.name);
135
+ tasks.push({
136
+ id: entry.name,
137
+ paths: {
138
+ instructions: join(taskDir, "instructions.md"),
139
+ supervisor: join(taskDir, "supervisor.task.md"),
140
+ judge: join(taskDir, "judge.task.md"),
141
+ specs: join(taskDir, "specs"),
142
+ workdir: join(taskDir, "workdir"),
143
+ scoring: join(taskDir, "scoring"),
144
+ },
145
+ });
151
146
  }
152
147
  tasks.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
153
148
  return tasks;
@@ -246,7 +241,7 @@ function run(cmd, args) {
246
241
 
247
242
  /**
248
243
  * @typedef {object} Task
249
- * @property {string} id - METR-style "task_family_name/task_name"
244
+ * @property {string} id - Task name (directory name under tasks/)
250
245
  * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
251
246
  */
252
247
 
@@ -6,6 +6,7 @@
6
6
 
7
7
  import { resolve } from "node:path";
8
8
 
9
+ import { createConfig } from "@forwardimpact/libconfig";
9
10
  import { createBenchmarkRunner } from "../benchmark/runner.js";
10
11
 
11
12
  /**
@@ -14,6 +15,8 @@ import { createBenchmarkRunner } from "../benchmark/runner.js";
14
15
  */
15
16
  export async function runBenchmarkRunCommand(values, _args) {
16
17
  const opts = parseRunOptions(values);
18
+ const config = await createConfig("script", "benchmark");
19
+ process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
17
20
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
18
21
  const runner = createBenchmarkRunner({ ...opts, query });
19
22