npm - @forwardimpact/libeval - Versions diffs - 0.1.36 → 0.1.38 - Mend

@forwardimpact/libeval 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/bin/fit-benchmark.js +27 -7
package/bin/fit-eval.js +24 -3
package/bin/fit-trace.js +42 -0
package/package.json +2 -1
package/src/benchmark/apm-installer.js +56 -10
package/src/benchmark/judge.js +4 -3
package/src/benchmark/report.js +43 -17
package/src/benchmark/result.js +7 -1
package/src/benchmark/runner.js +120 -75
package/src/benchmark/scorer.js +2 -5
package/src/benchmark/task-family.js +14 -47
package/src/benchmark/workdir.js +7 -6
package/src/commands/assert.js +145 -0
package/src/commands/benchmark-report.js +1 -2
package/src/commands/benchmark-run.js +5 -4
package/src/commands/facilitate.js +4 -2
package/src/commands/run.js +3 -3
package/src/commands/supervise.js +5 -2
package/src/facilitator.js +7 -3
package/src/supervisor.js +42 -12

package/bin/fit-benchmark.js CHANGED Viewed

@@ -34,15 +34,26 @@ export const definition = {
         },
         output: {
           type: "string",
-          description: "Run-output directory (created if missing)",
+          description:
+            "Run-output directory (created if missing, default: benchmark-runs)",
         },
         runs: {
           type: "string",
-          description: "Runs per task (integer ≥ 1, default 1)",
+          description: "Runs per task (integer ≥ 1, default: 5)",
+        },
+        "agent-model": {
+          type: "string",
+          description:
+            "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
         },
-        model: {
+        "supervisor-model": {
           type: "string",
-          description: "Claude model id (default: claude-opus-4-7[1m])",
+          description:
+            "Claude model for the supervisor (default: claude-opus-4-7)",
+        },
+        "judge-model": {
+          type: "string",
+          description: "Claude model for the judge (default: claude-opus-4-7)",
         },
         "agent-profile": {
           type: "string",
@@ -92,7 +103,8 @@ export const definition = {
       options: {
         input: {
           type: "string",
-          description: "Run-output directory containing results.jsonl",
+          description:
+            "Run-output directory containing results.jsonl (default: benchmark-runs)",
         },
         k: {
           type: "string",
@@ -111,8 +123,10 @@ export const definition = {
     json: { type: "boolean", description: "Output help as JSON" },
   },
   examples: [
-    "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
-    "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
+    "fit-benchmark run --family=./families/coding",
+    "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
+    "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
+    "fit-benchmark report --format=text",
     "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
   ],
   documentation: [
@@ -122,6 +136,12 @@ export const definition = {
       description:
         "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
     },
+    {
+      title: "Automate with GitHub Actions",
+      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
+      description:
+        "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
+    },
   ],
 };

package/bin/fit-eval.js CHANGED Viewed

@@ -41,7 +41,11 @@ const definition = {
           type: "string",
           description: "Additional text appended to the task",
         },
-        model: { type: "string", description: "Claude model (default: opus)" },
+        "agent-model": {
+          type: "string",
+          description:
+            "Claude model for the agent (default: claude-opus-4-7[1m])",
+        },
         "max-turns": {
           type: "string",
           description: "Max agentic turns (default: 50, 0 = unlimited)",
@@ -84,7 +88,16 @@ const definition = {
           type: "string",
           description: "Additional text appended to the task",
         },
-        model: { type: "string", description: "Claude model (default: opus)" },
+        "agent-model": {
+          type: "string",
+          description:
+            "Claude model for the agent (default: claude-opus-4-7[1m])",
+        },
+        "supervisor-model": {
+          type: "string",
+          description:
+            "Claude model for the supervisor (default: claude-opus-4-7[1m])",
+        },
         "max-turns": {
           type: "string",
           description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -136,7 +149,15 @@ const definition = {
           type: "string",
           description: "Additional text appended to the task",
         },
-        model: { type: "string", description: "Claude model (default: opus)" },
+        "agent-model": {
+          type: "string",
+          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+        },
+        "facilitator-model": {
+          type: "string",
+          description:
+            "Claude model for the facilitator (default: claude-opus-4-7[1m])",
+        },
         "max-turns": {
           type: "string",
           description: "Max agentic turns (default: 20, 0 = unlimited)",

package/bin/fit-trace.js CHANGED Viewed

@@ -25,6 +25,7 @@ import {
   runFilterCommand,
   runSplitCommand,
 } from "../src/commands/trace.js";
+import { runAssertCommand } from "../src/commands/assert.js";
 // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
 // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -199,6 +200,41 @@ const definition = {
         },
       },
     },
+    {
+      name: "assert",
+      args: "<test-name> <file>",
+      description:
+        "Shell-friendly assertion — outputs structured JSON for scoring hooks",
+      options: {
+        grep: {
+          type: "string",
+          description:
+            "Pass if extended regex matches file content (case-insensitive)",
+        },
+        query: {
+          type: "string",
+          description:
+            "Pass if JMESPath expression against JSON/NDJSON yields a truthy result",
+        },
+        exists: {
+          type: "boolean",
+          description: "Pass if file exists",
+        },
+        "cites-job": {
+          type: "string",
+          description:
+            "Pass if <file> contains the canonical citation from a <job> tag in the given JTBD file",
+        },
+        not: {
+          type: "boolean",
+          description: "Invert the assertion",
+        },
+        message: {
+          type: "string",
+          description: "Custom failure message",
+        },
+      },
+    },
   ],
   globalOptions: {
     help: { type: "boolean", short: "h", description: "Show this help" },
@@ -220,6 +256,11 @@ const definition = {
     "fit-trace search structured.json 'error|fail' --context 1",
     "fit-trace filter structured.json --tool Bash --error",
     "fit-trace turn structured.json 3",
+    "fit-trace assert has-heading --grep '^## Problem' spec.md",
+    "fit-trace assert no-leak --not --grep 'password' output.log",
+    "fit-trace assert file-present --exists path/to/spec.md",
+    "fit-trace assert cites-jtbd --cites-job jtbd-excerpt.md spec.md",
+    "fit-trace assert used-edit --query \"[?type=='assistant'].message.content[] | [?name=='Edit']\" trace.ndjson",
   ],
   documentation: [
     {
@@ -265,6 +306,7 @@ const COMMANDS = {
   turn: runTurnCommand,
   filter: runFilterCommand,
   split: runSplitCommand,
+  assert: runAssertCommand,
 };
 async function main() {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.36",
+  "version": "0.1.38",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",
@@ -53,6 +53,7 @@
     "@forwardimpact/libcli": "^0.1.0",
     "@forwardimpact/libconfig": "^0.1.0",
     "@forwardimpact/libtelemetry": "^0.1.22",
+    "jmespath": "^0.16.0",
     "zod": "^4.4.3"
   },
   "devDependencies": {

package/src/benchmark/apm-installer.js CHANGED Viewed

@@ -1,14 +1,13 @@
 /**
- * ApmInstaller — materialises the family's pre-staged `.claude/` tree into a
- * single staging directory, computes the manifest fingerprint, and is invoked
- * once per family install. Per-task copy happens later in WorkdirManager.
- *
- * v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
- * verbatim, not interpreted.
+ * ApmInstaller — runs `apm install --target claude` in the family root to
+ * materialise skills and agents, copies the resulting `.claude/` into a
+ * staging directory, and computes the manifest fingerprint from the lockfile.
+ * Per-task copy happens later in WorkdirManager.
  */
+import { spawn } from "node:child_process";
 import { createHash } from "node:crypto";
-import { access, cp, rm } from "node:fs/promises";
+import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
 import { join } from "node:path";
 /**
@@ -21,19 +20,66 @@ export async function installApm(family, outputDir) {
   const stagedClaude = join(stagingDir, ".claude");
   const sourceClaude = join(family.rootPath, ".claude");
+  await runApmInstall(family.rootPath);
   try {
     await access(sourceClaude);
   } catch {
     throw new Error(
-      `task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
+      `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
     );
   }
   await rm(stagingDir, { recursive: true, force: true });
   await cp(sourceClaude, stagedClaude, { recursive: true });
+  // Stage the family-local judge profile outside .claude/ so it is available
+  // to the judge but never copied into the agent-under-test's CWD.
+  const judgeSource = join(family.rootPath, "judge.md");
+  const judgeProfilesDir = join(stagingDir, "judge-profiles");
+  try {
+    await access(judgeSource);
+    await mkdir(judgeProfilesDir, { recursive: true });
+    await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
+  } catch {}
+  const lockPath = join(family.rootPath, "apm.lock.yaml");
+  const lockBytes = await readFile(lockPath).catch(() => {
+    throw new Error(`apm install did not produce apm.lock.yaml at ${lockPath}`);
+  });
   const skillSetHash =
-    "sha256:" + createHash("sha256").update(family.apmLockBytes).digest("hex");
+    "sha256:" +
+    createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
+  return { stagingDir, skillSetHash, judgeProfilesDir };
+}
+function normalizeLf(buf) {
+  const out = [];
+  for (let i = 0; i < buf.length; i++) {
+    if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
+    out.push(buf[i]);
+  }
+  return Buffer.from(out);
+}
-  return { stagingDir, skillSetHash };
+function runApmInstall(cwd) {
+  return new Promise((res, rej) => {
+    const child = spawn("apm", ["install", "--target", "claude"], {
+      cwd,
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+    let stderr = "";
+    child.stdout.on("data", () => {});
+    child.stderr.on("data", (d) => {
+      stderr += d.toString();
+    });
+    child.on("error", (e) => {
+      rej(new Error(`failed to spawn apm: ${e.message}`));
+    });
+    child.on("close", (code) => {
+      if (code === 0) res();
+      else rej(new Error(`apm install exited ${code}: ${stderr}`));
+    });
+  });
 }

package/src/benchmark/judge.js CHANGED Viewed

@@ -6,7 +6,7 @@
  *
  * Template variables available in `judge.task.md`:
  *
- *   {{AGENT_INSTRUCTIONS}}  — contents of instructions.md
+ *   {{AGENT_INSTRUCTIONS}}  — contents of agent.task.md
  *   {{AGENT_PROFILE}}       — agent profile body (empty string if none)
  *   {{AGENT_TRACE_PATH}}    — path to agent.ndjson
  *   {{SCORING_RESULT}}      — JSON scoring object
@@ -37,7 +37,7 @@ import { createRedactor } from "../redaction.js";
 /**
  * @typedef {object} JudgeContext
- * @property {string} agentInstructions - Contents of instructions.md.
+ * @property {string} agentInstructions - Contents of agent.task.md.
  * @property {string} agentProfile - Agent profile body (empty string if none).
  * @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
  */
@@ -47,7 +47,7 @@ import { createRedactor } from "../redaction.js";
  * @param {import("./task-family.js").Task} task
  * @param {import("./workdir.js").Workdir} workdir
  * @param {import("./scorer.js").ScoringResult} scoring
- * @param {{query: Function, model: string, judgeProfile?: string}} deps
+ * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
  * @param {JudgeContext} [context]
  * @returns {Promise<JudgeVerdict>}
  */
@@ -71,6 +71,7 @@ export async function runJudge(task, workdir, scoring, deps, context) {
     output,
     model: deps.model,
     judgeProfile: deps.judgeProfile,
+    profilesDir: deps.profilesDir,
     maxTurns: 25,
     redactor: createRedactor(),
   });

package/src/benchmark/report.js CHANGED Viewed

@@ -178,24 +178,46 @@ function renderFullReport(report, kValues) {
 function renderSummary(report) {
   const { totals } = report;
   const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
+  const icon = statusIcon(passing === totals.tasks);
   const lines = [
     "# Benchmark Report",
     "",
-    `**Result: ${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
+    `${icon} **${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
   ];
+  const headers = [];
+  const values = [];
+  if (totals.costUsd != null) {
+    headers.push("Cost");
+    values.push(formatCost(totals.costUsd));
+  }
+  if (totals.medianDurationMs != null) {
+    headers.push("Median Duration");
+    values.push(formatDuration(totals.medianDurationMs));
+  }
+  if (totals.medianTurns != null) {
+    headers.push("Median Turns");
+    values.push(String(totals.medianTurns));
+  }
+  if (headers.length) {
+    lines.push("");
+    lines.push(`| ${headers.join(" | ")} |`);
+    lines.push(`| ${headers.map(() => "---").join(" | ")} |`);
+    lines.push(`| ${values.join(" | ")} |`);
+  }
   const meta = [];
-  if (totals.model) meta.push(`Model: \`${totals.model}\``);
+  if (totals.model) {
+    meta.push(`Agent: \`${totals.model.agent}\``);
+    meta.push(`Supervisor: \`${totals.model.supervisor}\``);
+    meta.push(`Judge: \`${totals.model.judge}\``);
+  }
   if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
   if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
-  if (meta.length) lines.push(meta.join(" | "));
-  const stats = [];
-  if (totals.costUsd != null) stats.push(`Cost: ${formatCost(totals.costUsd)}`);
-  if (totals.medianDurationMs != null)
-    stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
-  if (totals.medianTurns != null)
-    stats.push(`Median turns: ${totals.medianTurns}`);
-  if (stats.length) lines.push(stats.join(" | "));
+  if (meta.length) {
+    lines.push("");
+    lines.push(meta.join(" | "));
+  }
   lines.push("");
   return lines.join("\n");
@@ -229,13 +251,13 @@ function renderTotalsLine(report) {
 function renderTaskDetail(task) {
   const runs = task.runs ?? [];
-  const status = task.c === task.n ? "PASS" : "FAIL";
+  const icon = statusIcon(task.c === task.n);
   const singleRun = runs.length === 1;
   const lines = [
     `### ${task.taskId}`,
     "",
-    `**${status} — ${task.c}/${task.n} runs passed**`,
+    `${icon} **${task.c}/${task.n} runs passed**`,
   ];
   lines.push("", renderRunsTable(runs));
@@ -267,16 +289,16 @@ function renderRunsTable(runs) {
     const scoringCell = r.preflightError
       ? "preflight error"
       : r.scoring
-        ? r.scoring.verdict
+        ? statusIcon(r.scoring.verdict === "pass")
         : "—";
     const judgeCell = r.preflightError
       ? "—"
       : r.judgeVerdict
-        ? r.judgeVerdict.verdict
+        ? statusIcon(r.judgeVerdict.verdict === "pass")
         : "—";
     rows.push([
       String(r.runIndex),
-      r.verdict.toUpperCase(),
+      statusIcon(r.verdict === "pass"),
       scoringCell,
       judgeCell,
       formatCost(r.costUsd),
@@ -317,7 +339,7 @@ function collectScoringRows(runs) {
       rows.push({
         run: r.runIndex,
         check: escapeCell(String(d.test ?? "(unnamed)")),
-        result: d.pass ? "PASS" : "FAIL",
+        result: statusIcon(d.pass),
         message: escapeCell(String(d.message ?? "")),
       });
     }
@@ -365,6 +387,10 @@ function renderErrors(runs) {
 // Formatting helpers
 // ---------------------------------------------------------------------------
+function statusIcon(pass) {
+  return pass ? "✅" : "❌";
+}
 function formatPassAt(v) {
   if (v == null) return "—";
   if (typeof v === "object" && "error" in v) return v.error;

package/src/benchmark/result.js CHANGED Viewed

@@ -46,7 +46,11 @@ const COMMON_FIELDS = {
   costUsd: z.number(),
   turns: z.number().int().min(0),
   profiles: PROFILES_SHAPE,
-  model: z.string(),
+  model: z.object({
+    agent: z.string(),
+    supervisor: z.string(),
+    judge: z.string(),
+  }),
   skillSetHash: z.string(),
   familyRevision: z.string(),
   durationMs: z.number().int().min(0),
@@ -63,6 +67,7 @@ const HAPPY_RECORD = z.object({
   submission: z.string(),
   judgeVerdict: JUDGE_VERDICT_SHAPE,
   agentTracePath: z.string(),
+  supervisorTracePath: z.string(),
   judgeTracePath: z.string(),
   agentError: AGENT_ERROR_SHAPE.optional(),
   preflightError: z.undefined().optional(),
@@ -76,6 +81,7 @@ const PREFLIGHT_RECORD = z.object({
   // them in WorkdirManager.start) so the record is uniform across branches
   // and downstream consumers can reference them without conditional fields.
   agentTracePath: z.string(),
+  supervisorTracePath: z.string(),
   judgeTracePath: z.string(),
   scoring: z.undefined().optional(),
   submission: z.undefined().optional(),