npm - @forwardimpact/libeval - Versions diffs - 0.1.56 → 0.1.58 - Mend

@forwardimpact/libeval 0.1.56 → 0.1.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/bin/fit-benchmark.js +8 -6
package/bin/fit-eval.js +6 -8
package/package.json +2 -2
package/src/agent-runner.js +3 -1
package/src/benchmark/hook-env.js +37 -0
package/src/benchmark/invariants.js +11 -4
package/src/benchmark/runner.js +1 -0
package/src/benchmark/workdir.js +28 -3
package/src/commands/benchmark-run.js +7 -3
package/src/commands/discuss.js +3 -2
package/src/commands/facilitate.js +3 -2
package/src/commands/run.js +2 -1
package/src/commands/supervise.js +3 -2
package/src/discusser.js +3 -2

package/bin/fit-benchmark.js CHANGED Viewed

@@ -10,6 +10,10 @@ import { createLogger } from "@forwardimpact/libtelemetry";
 import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
 import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
 import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
+import {
+  BENCHMARK_AGENT_MODEL,
+  LEAD_MODEL,
+} from "@forwardimpact/libutil/models";
 export const definition = {
   name: "fit-benchmark",
@@ -38,17 +42,15 @@ export const definition = {
         },
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
+          description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
         },
         "lead-model": {
           type: "string",
-          description:
-            "Claude model for the lead role (default: claude-opus-4-7)",
+          description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
         },
         "judge-model": {
           type: "string",
-          description: "Claude model for the judge (default: claude-opus-4-7)",
+          description: `Claude model for the judge (default: ${LEAD_MODEL})`,
         },
         "agent-profile": {
           type: "string",
@@ -126,7 +128,7 @@ export const definition = {
   },
   examples: [
     "fit-benchmark run --family=./families/coding",
-    "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
+    `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
     "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
     "fit-benchmark report --format=text",
     "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",

package/bin/fit-eval.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { runSuperviseCommand } from "../src/commands/supervise.js";
 import { runFacilitateCommand } from "../src/commands/facilitate.js";
 import { runDiscussCommand } from "../src/commands/discuss.js";
 import { runCallbackCommand } from "../src/commands/callback.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 const LEAD_OPTIONS = {
   "lead-profile": {
@@ -21,8 +22,7 @@ const LEAD_OPTIONS = {
   },
   "lead-model": {
     type: "string",
-    description:
-      "Claude model for the lead role (default: claude-opus-4-7[1m])",
+    description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
   },
 };
@@ -64,8 +64,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent (default: claude-opus-4-7[1m])",
+          description: `Claude model for the agent (default: ${AGENT_MODEL})`,
         },
         "max-turns": {
           type: "string",
@@ -102,8 +101,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description:
-            "Claude model for the agent (default: claude-opus-4-7[1m])",
+          description: `Claude model for the agent (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {
@@ -147,7 +145,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+          description: `Claude model for agents (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {
@@ -184,7 +182,7 @@ const definition = {
         ...TASK_INPUT_OPTIONS,
         "agent-model": {
           type: "string",
-          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+          description: `Claude model for agents (default: ${AGENT_MODEL})`,
         },
         ...LEAD_OPTIONS,
         "max-turns": {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.56",
+  "version": "0.1.58",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",
@@ -51,7 +51,7 @@
     "test": "bun test test/*.test.js"
   },
   "dependencies": {
-    "@anthropic-ai/claude-agent-sdk": "0.2.112",
+    "@anthropic-ai/claude-agent-sdk": "0.3.170",
     "@forwardimpact/libcli": "^0.1.0",
     "@forwardimpact/libconfig": "^0.1.0",
     "@forwardimpact/libpreflight": "^0.1.0",

package/src/agent-runner.js CHANGED Viewed

@@ -6,6 +6,8 @@
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
+import { AGENT_MODEL } from "@forwardimpact/libutil/models";
 const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
 // fit-eval and kata-action run headless in CI/CD with no human to answer
@@ -43,7 +45,7 @@ export class AgentRunner {
     this.query = deps.query;
     this.output = deps.output;
     this.redactor = deps.redactor;
-    this.model = deps.model ?? "claude-opus-4-7[1m]";
+    this.model = deps.model ?? AGENT_MODEL;
     this.maxTurns = deps.maxTurns ?? 50;
     this.allowedTools = deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS;
     this.onLine = deps.onLine ?? null;

package/src/benchmark/hook-env.js ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * Shared environment builder for the benchmark hook scripts (`preflight.sh` and
+ * `invariants.sh`). Keeping both spawns on one helper guarantees they expose the
+ * same base variable set, so hook authors never have to wonder which vars a
+ * given hook receives (the invariants spawn additionally sets `RESULTS_FD`).
+ *
+ * Path vars (TASK_DIR, FAMILY_DIR, HOOKS_DIR) let hooks reference real
+ * locations instead of reconstructing them from `$0`. They are paths, not
+ * secrets, so they need no redaction allowlist entry.
+ */
+/**
+ * @param {Record<string, string>} baseEnv - Inherited env (`runtime.proc.env`).
+ * @param {object} vars
+ * @param {string} vars.cwd - Agent CWD → `$WORKDIR`.
+ * @param {number} vars.port - Allocated TCP port → `$PORT`.
+ * @param {string} vars.taskId - Task id → `$TASK_ID`.
+ * @param {string} vars.taskDir - Task directory on host → `$TASK_DIR`.
+ * @param {string} vars.hooksDir - Task `hooks/` dir on host → `$HOOKS_DIR`.
+ * @param {string|null|undefined} vars.familyDir - Family root on host →
+ *   `$FAMILY_DIR` (null or undefined when the family root is unknown, e.g. a
+ *   standalone task; exported as an empty string in that case).
+ * @returns {Record<string, string>}
+ */
+export function buildHookEnv(
+  baseEnv,
+  { cwd, port, taskId, taskDir, hooksDir, familyDir },
+) {
+  return {
+    ...baseEnv, // inherited env is spread first, so the hook vars below win on key collisions
+    WORKDIR: cwd,
+    PORT: String(port), // stringified to match the Record<string, string> return type
+    TASK_ID: taskId,
+    TASK_DIR: taskDir,
+    HOOKS_DIR: hooksDir,
+    FAMILY_DIR: familyDir ?? "", // a null/undefined family root is exported as an empty string
+  };
+}

package/src/benchmark/invariants.js CHANGED Viewed

@@ -10,6 +10,8 @@
 import { join } from "node:path";
+import { buildHookEnv } from "./hook-env.js";
 /**
  * @typedef {object} InvariantsResult
  * @property {"pass" | "fail"} verdict
@@ -20,7 +22,7 @@ import { join } from "node:path";
 /**
  * Run the task's invariants script.
  * @param {import("./task-family.js").Task} task
- * @param {{cwd: string, port: number, runDir: string}} ctx
+ * @param {{cwd: string, port: number, runDir: string, familyDir?: string|null}} ctx
  * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @returns {Promise<InvariantsResult>}
  */
@@ -44,9 +46,14 @@ export async function runInvariants(task, ctx, runtime) {
   try {
     child = runtime.subprocess.spawn(script, [], {
       env: {
-        ...runtime.proc.env,
-        WORKDIR: ctx.cwd,
-        PORT: String(ctx.port),
+        ...buildHookEnv(runtime.proc.env, {
+          cwd: ctx.cwd,
+          port: ctx.port,
+          taskId: task.id,
+          taskDir: task.paths.taskDir,
+          hooksDir: task.paths.hooks,
+          familyDir: ctx.familyDir,
+        }),
         RESULTS_FD: "3",
       },
       stdio: ["inherit", "pipe", "pipe", fd3File],

package/src/benchmark/runner.js CHANGED Viewed

@@ -201,6 +201,7 @@ export class BenchmarkRunner {
           cwd: workdir.cwd,
           port: workdir.port,
           runDir: workdir.runDir,
+          familyDir: family.rootPath,
         },
         this.runtime,
       );

package/src/benchmark/workdir.js CHANGED Viewed

@@ -17,6 +17,7 @@ import { connect } from "node:net";
 import { join } from "node:path";
 import { loadEnv } from "./env-loader.js";
+import { buildHookEnv } from "./hook-env.js";
 const DEFAULT_TERM_GRACE_MS = 5_000;
@@ -73,6 +74,24 @@ export class WorkdirManager {
     const cwd = join(runDir, "cwd");
     await fs.mkdir(cwd, { recursive: true });
+    // Family-level shared fixtures: convention-over-configuration, copied if
+    // present. They form the shared base; the per-task workdir/specs below
+    // overlay on top (fs.cp defaults to force:true, so a per-task file wins).
+    if (this.familyRootPath) {
+      await fs
+        .cp(join(this.familyRootPath, "workdir"), cwd, { recursive: true })
+        .catch((e) => {
+          if (e.code !== "ENOENT") throw e;
+        });
+      await fs
+        .cp(join(this.familyRootPath, "specs"), join(cwd, "specs"), {
+          recursive: true,
+        })
+        .catch((e) => {
+          if (e.code !== "ENOENT") throw e;
+        });
+    }
     await fs.cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
       if (e.code !== "ENOENT") throw e;
     });
@@ -107,7 +126,12 @@ export class WorkdirManager {
     const judgeTracePath = join(runDir, "judge.ndjson");
     const preflight = task.paths.preflight
-      ? await runPreflight(this.runtime, task.paths.preflight, cwd, port)
+      ? await runPreflight(this.runtime, task.paths.preflight, cwd, port, {
+          taskId: task.id,
+          taskDir: task.paths.taskDir,
+          hooksDir: task.paths.hooks,
+          familyDir: this.familyRootPath,
+        })
       : { pgid: 0 };
     return {
@@ -163,12 +187,13 @@ export class WorkdirManager {
  * @param {string} script
  * @param {string} cwd - Agent CWD passed via $WORKDIR.
  * @param {number} port - Free TCP port passed via $PORT.
+ * @param {{taskId: string, taskDir: string, hooksDir: string, familyDir: string|null}} vars - Extra hook env vars.
  * @returns {Promise<{pgid: number, error?: {phase: string, message: string, exitCode: number}}>}
  */
-async function runPreflight(runtime, script, cwd, port) {
+async function runPreflight(runtime, script, cwd, port, vars) {
   const child = runtime.subprocess.spawn(script, [], {
     cwd,
-    env: { ...runtime.proc.env, WORKDIR: cwd, PORT: String(port) },
+    env: buildHookEnv(runtime.proc.env, { cwd, port, ...vars }),
     detached: true,
     stdio: ["ignore", "pipe", "pipe"],
   });

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -8,6 +8,10 @@ import { resolve } from "node:path";
 import { createConfig } from "@forwardimpact/libconfig";
 import { createBenchmarkRunner } from "../benchmark/runner.js";
+import {
+  BENCHMARK_AGENT_MODEL,
+  LEAD_MODEL,
+} from "@forwardimpact/libutil/models";
 /**
  * @param {import("@forwardimpact/libcli").InvocationContext} ctx
@@ -54,9 +58,9 @@ function parseRunOptions(values) {
     family,
     runs,
     output: resolve(output),
-    agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
-    supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
-    judgeModel: values["judge-model"] ?? "claude-opus-4-7",
+    agentModel: values["agent-model"] ?? BENCHMARK_AGENT_MODEL,
+    supervisorModel: values["lead-model"] ?? LEAD_MODEL,
+    judgeModel: values["judge-model"] ?? LEAD_MODEL,
     profiles: {
       agent: values["agent-profile"] ?? null,
       judge: values["judge-profile"] ?? null,

package/src/commands/discuss.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createDiscusser } from "../discusser.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 function parseAgentProfiles(raw, cwd, maxTurns) {
   if (!raw) return [];
@@ -52,8 +53,8 @@ export function parseDiscussOptions(values, runtime) {
     taskAmend,
     agentConfigs,
     leadProfile: values["lead-profile"] ?? undefined,
-    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    leadModel: values["lead-model"] ?? LEAD_MODEL,
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
     maxTurns,
     maxLeadTurns,
     outputPath: values.output,

package/src/commands/facilitate.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { createFacilitator } from "../facilitator.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse comma-separated agent profile names into structured configs.
@@ -50,8 +51,8 @@ export function parseFacilitateOptions(values, runtime) {
     taskAmend,
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
+    facilitatorModel: values["lead-model"] ?? LEAD_MODEL,
     maxTurns,
     outputPath: values.output,
     facilitatorProfile: values["lead-profile"] ?? undefined,

package/src/commands/run.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { createTeeWriter } from "../tee-writer.js";
 import { SequenceCounter } from "../sequence-counter.js";
 import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
+import { AGENT_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse and validate run command options from parsed values.
@@ -26,7 +27,7 @@ function parseRunOptions(values, runtime) {
     taskContent,
     taskAmend,
     cwd: resolve(values.cwd ?? "."),
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     agentProfile: values["agent-profile"] ?? undefined,

package/src/commands/supervise.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /**
  * Parse all supervise flags from parsed values into an options object.
@@ -30,8 +31,8 @@ export async function parseSuperviseOptions(values, runtime) {
     taskAmend,
     supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
     agentCwd,
-    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? AGENT_MODEL,
+    supervisorModel: values["lead-model"] ?? LEAD_MODEL,
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "200";
       return raw === "0" ? 0 : parseInt(raw, 10);

package/src/discusser.js CHANGED Viewed

@@ -29,6 +29,7 @@ import {
   DISCUSS_AGENT_SYSTEM_PROMPT,
 } from "./discuss-tools.js";
 import { OrchestrationLoop } from "./orchestration-loop.js";
+import { AGENT_MODEL, LEAD_MODEL } from "@forwardimpact/libutil/models";
 /** System prompt for the discuss-mode lead. L0 mechanics only per COALIGNED. */
 export const DISCUSS_SYSTEM_PROMPT =
@@ -314,7 +315,7 @@ export function createDiscusser({
       cwd: config.cwd ?? resolvedLeadCwd,
       query,
       output: devNull,
-      model: agentModel ?? "claude-opus-4-7[1m]",
+      model: agentModel ?? AGENT_MODEL,
       maxTurns: config.maxTurns ?? 50,
       allowedTools: config.allowedTools,
       onLine: (line) => discusser.loop.emitLine(config.name, line),
@@ -347,7 +348,7 @@ export function createDiscusser({
     cwd: resolvedLeadCwd,
     query,
     output: devNull,
-    model: leadModel ?? "claude-opus-4-7[1m]",
+    model: leadModel ?? LEAD_MODEL,
     maxTurns: maxTurns ?? 80,
     allowedTools: ["Read", "Glob", "Grep"],
     disallowedTools: defaultDisallowed,