npm - @forwardimpact/libeval - Versions diffs - 0.1.34 → 0.1.35 - Mend

@forwardimpact/libeval 0.1.34 → 0.1.35

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (5)

package/bin/fit-benchmark.js +2 -2
package/package.json +1 -1
package/src/benchmark/judge.js +1 -1
package/src/benchmark/task-family.js +18 -23
package/src/commands/benchmark-run.js +3 -0

package/bin/fit-benchmark.js CHANGED Viewed

@@ -71,7 +71,7 @@ export const definition = {
         },
         task: {
           type: "string",
-          description: "METR-style task id (task_family_name/task_name)",
+          description: "Task id (directory name under tasks/)",
         },
         workdir: {
           type: "string",
@@ -112,7 +112,7 @@ export const definition = {
   },
   examples: [
     "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
-    "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
+    "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
     "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
   ],
   documentation: [

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.34",
+  "version": "0.1.35",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/benchmark/judge.js CHANGED Viewed

@@ -45,7 +45,7 @@ export async function runJudge(task, workdir, scoring, deps) {
     output,
     model: deps.model,
     judgeProfile: deps.judgeProfile,
-    maxTurns: 5,
+    maxTurns: 25,
     redactor: createRedactor(),
   });

package/src/benchmark/task-family.js CHANGED Viewed

@@ -3,7 +3,7 @@
  *   <root>/
  *     apm.lock.yaml
  *     .claude/                # pre-staged skills + agents (P1)
- *     tasks/<task_family_name>/<task_name>/
+ *     tasks/<task_name>/
  *       instructions.md
  *       supervisor.task.md    # preserved for v2; not read in v1
  *       judge.task.md
@@ -122,32 +122,27 @@ function normalizeLf(buf) {
 async function discoverTasks(rootPath) {
   const tasksRoot = join(rootPath, "tasks");
   const tasks = [];
-  let families;
+  let entries;
   try {
-    families = await readdir(tasksRoot, { withFileTypes: true });
+    entries = await readdir(tasksRoot, { withFileTypes: true });
   } catch (e) {
     if (e.code === "ENOENT") return tasks;
     throw e;
   }
-  for (const family of families) {
-    if (!family.isDirectory()) continue;
-    const familyDir = join(tasksRoot, family.name);
-    const entries = await readdir(familyDir, { withFileTypes: true });
-    for (const entry of entries) {
-      if (!entry.isDirectory()) continue;
-      const taskDir = join(familyDir, entry.name);
-      tasks.push({
-        id: `${family.name}/${entry.name}`,
-        paths: {
-          instructions: join(taskDir, "instructions.md"),
-          supervisor: join(taskDir, "supervisor.task.md"),
-          judge: join(taskDir, "judge.task.md"),
-          specs: join(taskDir, "specs"),
-          workdir: join(taskDir, "workdir"),
-          scoring: join(taskDir, "scoring"),
-        },
-      });
-    }
+  for (const entry of entries) {
+    if (!entry.isDirectory()) continue;
+    const taskDir = join(tasksRoot, entry.name);
+    tasks.push({
+      id: entry.name,
+      paths: {
+        instructions: join(taskDir, "instructions.md"),
+        supervisor: join(taskDir, "supervisor.task.md"),
+        judge: join(taskDir, "judge.task.md"),
+        specs: join(taskDir, "specs"),
+        workdir: join(taskDir, "workdir"),
+        scoring: join(taskDir, "scoring"),
+      },
+    });
   }
   tasks.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
   return tasks;
@@ -246,7 +241,7 @@ function run(cmd, args) {
 /**
  * @typedef {object} Task
- * @property {string} id - METR-style "task_family_name/task_name"
+ * @property {string} id - Task name (directory name under tasks/)
  * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
  */

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -6,6 +6,7 @@
 import { resolve } from "node:path";
+import { createConfig } from "@forwardimpact/libconfig";
 import { createBenchmarkRunner } from "../benchmark/runner.js";
 /**
@@ -14,6 +15,8 @@ import { createBenchmarkRunner } from "../benchmark/runner.js";
  */
 export async function runBenchmarkRunCommand(values, _args) {
   const opts = parseRunOptions(values);
+  const config = await createConfig("script", "benchmark");
+  process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
   const runner = createBenchmarkRunner({ ...opts, query });