npm - @forwardimpact/libeval - Versions diffs - 0.1.35 → 0.1.36 - Mend

@forwardimpact/libeval 0.1.35 → 0.1.36

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

package/package.json +1 -1
package/src/benchmark/judge.js +33 -7
package/src/benchmark/report.js +338 -17
package/src/benchmark/runner.js +30 -5
package/src/commands/benchmark-report.js +5 -1
package/src/supervisor.js +5 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.35",
+  "version": "0.1.36",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/benchmark/judge.js CHANGED Viewed

@@ -1,9 +1,20 @@
 /**
  * Benchmark adapter for the libeval `Judge`. Templates the family's
- * `judge.task.md` ({{SCORING}} / {{AGENT_TRACE_PATH}} substitution), runs the
- * judge against the post-run agent CWD, and returns the verdict in the
- * benchmark's `pass`/`fail` vocabulary (mapped from libeval's
- * `success`/`failure`).
+ * `judge.task.md` with structured context variables, runs the judge against
+ * the post-run agent CWD, and returns the verdict in the benchmark's
+ * `pass`/`fail` vocabulary (mapped from libeval's `success`/`failure`).
+ *
+ * Template variables available in `judge.task.md`:
+ *
+ *   {{AGENT_INSTRUCTIONS}}  — contents of instructions.md
+ *   {{AGENT_PROFILE}}       — agent profile body (empty string if none)
+ *   {{AGENT_TRACE_PATH}}    — path to agent.ndjson
+ *   {{SCORING_RESULT}}      — JSON scoring object
+ *   {{SKILL_SET_HASH}}      — SHA-256 from apm.lock.yaml
+ *   {{TASK_ID}}             — task name (directory under tasks/)
+ *   {{TASK_DIR}}            — agent working directory path
+ *
+ * Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
  *
  * The judge verdict is captured from the orchestration context's
  * `concluded` flag directly — no trace parsing on the happy path.
@@ -24,19 +35,34 @@ import { createRedactor } from "../redaction.js";
  * @property {string} summary
  */
+/**
+ * @typedef {object} JudgeContext
+ * @property {string} agentInstructions - Contents of instructions.md.
+ * @property {string} agentProfile - Agent profile body (empty string if none).
+ * @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
+ */
 /**
  * Run the judge over a completed task run.
  * @param {import("./task-family.js").Task} task
  * @param {import("./workdir.js").Workdir} workdir
  * @param {import("./scorer.js").ScoringResult} scoring
  * @param {{query: Function, model: string, judgeProfile?: string}} deps
+ * @param {JudgeContext} [context]
  * @returns {Promise<JudgeVerdict>}
  */
-export async function runJudge(task, workdir, scoring, deps) {
+export async function runJudge(task, workdir, scoring, deps, context) {
   const template = await readFile(task.paths.judge, "utf8");
+  const scoringJson = JSON.stringify(scoring, null, 2);
   const taskText = template
-    .replaceAll("{{SCORING}}", JSON.stringify(scoring, null, 2))
-    .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath);
+    .replaceAll("{{SCORING_RESULT}}", scoringJson)
+    .replaceAll("{{SCORING}}", scoringJson)
+    .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
+    .replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
+    .replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
+    .replaceAll("{{SKILL_SET_HASH}}", context?.skillSetHash ?? "")
+    .replaceAll("{{TASK_ID}}", task.id)
+    .replaceAll("{{TASK_DIR}}", workdir.cwd);
   const output = createWriteStream(workdir.judgeTracePath);
   const judge = createJudge({

package/src/benchmark/report.js CHANGED Viewed

@@ -3,6 +3,10 @@
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
  *
+ * When `includeRuns` is true, each task carries per-run detail (scoring
+ * checks, judge commentary, cost, duration) and the text renderer produces
+ * a full markdown report instead of just the pass@k table.
+ *
  * Records that fail schema validation are skipped with a stderr warning
  * (counted under `totals.skipped`) so a corrupt line cannot abort the
  * whole report.
@@ -14,48 +18,194 @@ import { createInterface } from "node:readline";
 import { validateResultRecord } from "./result.js";
+/**
+ * @typedef {object} RunDetail
+ * @property {number} runIndex
+ * @property {"pass"|"fail"} verdict
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
+ * @property {{verdict: string, summary: string}} [judgeVerdict]
+ * @property {number} costUsd
+ * @property {number} turns
+ * @property {number} durationMs
+ * @property {{message: string, aborted: boolean}} [agentError]
+ * @property {{phase: string, message: string, exitCode: number}} [preflightError]
+ */
 /**
  * @typedef {object} TaskReport
  * @property {string} taskId
  * @property {number} n - Total runs.
  * @property {number} c - Passing runs.
  * @property {Record<string|number, number|null>} passAtK
+ * @property {RunDetail[]} [runs] - Per-run detail (only when includeRuns).
  */
 /**
- * @param {{inputDir: string, kValues: number[]}} opts
- * @returns {Promise<{tasks: TaskReport[], totals: {tasks: number, runs: number, skipped: number}}>}
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
+ * @returns {Promise<{tasks: TaskReport[], totals: object}>}
  */
-export async function aggregate({ inputDir, kValues }) {
+export async function aggregate({ inputDir, kValues, includeRuns = false }) {
   const records = await loadRecords(inputDir);
   const grouped = groupByTask(records.records);
   const tasks = [];
-  let runs = 0;
+  let totalRuns = 0;
+  let totalCost = 0;
+  const allDurations = [];
+  const allTurns = [];
+  let firstRecord = null;
   for (const [taskId, group] of grouped) {
     const n = group.length;
     const c = group.filter((r) => r.verdict === "pass").length;
-    runs += n;
+    totalRuns += n;
     const passAtK = {};
     for (const k of kValues) passAtK[k] = passAtKValue(n, c, k);
-    tasks.push({ taskId, n, c, passAtK });
+    const task = { taskId, n, c, passAtK };
+    if (includeRuns) {
+      if (!firstRecord) firstRecord = group[0];
+      const accumulators = { allDurations, allTurns };
+      task.runs = group
+        .map((r) => {
+          totalCost += r.costUsd ?? 0;
+          return buildRunDetail(r, accumulators);
+        })
+        .sort((a, b) => a.runIndex - b.runIndex);
+    }
+    tasks.push(task);
   }
   tasks.sort((a, b) =>
     a.taskId < b.taskId ? -1 : a.taskId > b.taskId ? 1 : 0,
   );
+  const totals = {
+    tasks: tasks.length,
+    runs: totalRuns,
+    skipped: records.skipped,
+  };
+  if (includeRuns) {
+    totals.costUsd = totalCost;
+    totals.medianDurationMs = median(allDurations);
+    totals.medianTurns = median(allTurns);
+    totals.model = firstRecord?.model ?? "";
+    totals.skillSetHash = firstRecord?.skillSetHash ?? "";
+    totals.familyRevision = firstRecord?.familyRevision ?? "";
+  }
+  return { tasks, totals };
+}
+/**
+ * Build a normalized per-run detail object and accumulate duration/turn
+ * samples for median calculation. Extracted from `aggregate` to keep its
+ * cognitive complexity below the lint ceiling.
+ * @param {object} r - Raw record.
+ * @param {{allDurations: number[], allTurns: number[]}} acc
+ * @returns {RunDetail}
+ */
+function buildRunDetail(r, acc) {
+  if (r.durationMs != null) acc.allDurations.push(r.durationMs);
+  if (r.turns != null) acc.allTurns.push(r.turns);
   return {
-    tasks,
-    totals: { tasks: tasks.length, runs, skipped: records.skipped },
+    runIndex: r.runIndex,
+    verdict: r.verdict,
+    ...(r.scoring && { scoring: r.scoring }),
+    ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
+    costUsd: r.costUsd ?? 0,
+    turns: r.turns ?? 0,
+    durationMs: r.durationMs ?? 0,
+    ...(r.agentError && { agentError: r.agentError }),
+    ...(r.preflightError && { preflightError: r.preflightError }),
   };
 }
 /**
- * Render an aggregate report as a Markdown table. Columns: taskId | n | c |
- * pass@k1 | pass@k2 ... — one column per kValues entry, in the same order.
+ * Render an aggregate report as markdown. When the report contains per-run
+ * detail (from `includeRuns: true`), renders a full report with summary,
+ * pass@k table, and per-task detail sections. Otherwise falls back to the
+ * compact pass@k table.
  * @param {Awaited<ReturnType<typeof aggregate>>} report
  * @param {number[]} kValues
  * @returns {string}
  */
 export function renderTextReport(report, kValues) {
+  if (report.tasks[0]?.runs) {
+    return renderFullReport(report, kValues);
+  }
+  return renderCompactReport(report, kValues);
+}
+// ---------------------------------------------------------------------------
+// Compact report (legacy path)
+// ---------------------------------------------------------------------------
+function renderCompactReport(report, kValues) {
+  const lines = [
+    renderPassAtKTable(report, kValues),
+    "",
+    renderTotalsLine(report),
+  ];
+  return lines.join("\n");
+}
+// ---------------------------------------------------------------------------
+// Full report
+// ---------------------------------------------------------------------------
+function renderFullReport(report, kValues) {
+  const sections = [
+    renderSummary(report),
+    "## Pass@k",
+    "",
+    renderPassAtKTable(report, kValues),
+    "",
+    renderTotalsLine(report),
+    "",
+    "## Task Details",
+  ];
+  for (const task of report.tasks) {
+    sections.push("");
+    sections.push(renderTaskDetail(task));
+  }
+  return sections.join("\n");
+}
+function renderSummary(report) {
+  const { totals } = report;
+  const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
+  const lines = [
+    "# Benchmark Report",
+    "",
+    `**Result: ${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
+  ];
+  const meta = [];
+  if (totals.model) meta.push(`Model: \`${totals.model}\``);
+  if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
+  if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
+  if (meta.length) lines.push(meta.join(" | "));
+  const stats = [];
+  if (totals.costUsd != null) stats.push(`Cost: ${formatCost(totals.costUsd)}`);
+  if (totals.medianDurationMs != null)
+    stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
+  if (totals.medianTurns != null)
+    stats.push(`Median turns: ${totals.medianTurns}`);
+  if (stats.length) lines.push(stats.join(" | "));
+  lines.push("");
+  return lines.join("\n");
+}
+// ---------------------------------------------------------------------------
+// Pass@k table (shared between compact and full)
+// ---------------------------------------------------------------------------
+function renderPassAtKTable(report, kValues) {
   const header = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
   const rows = [header, header.map(() => "---")];
   for (const t of report.tasks) {
@@ -66,20 +216,193 @@ export function renderTextReport(report, kValues) {
       ...kValues.map((k) => formatPassAt(t.passAtK[k])),
     ]);
   }
-  const lines = rows.map((r) => `| ${r.join(" | ")} |`);
-  lines.push("");
-  lines.push(
-    `Totals — tasks: ${report.totals.tasks}, runs: ${report.totals.runs}, skipped: ${report.totals.skipped}`,
-  );
+  return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
+}
+function renderTotalsLine(report) {
+  return `Totals — tasks: ${report.totals.tasks}, runs: ${report.totals.runs}, skipped: ${report.totals.skipped}`;
+}
+// ---------------------------------------------------------------------------
+// Per-task detail
+// ---------------------------------------------------------------------------
+function renderTaskDetail(task) {
+  const runs = task.runs ?? [];
+  const status = task.c === task.n ? "PASS" : "FAIL";
+  const singleRun = runs.length === 1;
+  const lines = [
+    `### ${task.taskId}`,
+    "",
+    `**${status} — ${task.c}/${task.n} runs passed**`,
+  ];
+  lines.push("", renderRunsTable(runs));
+  const checks = renderScoringChecks(runs, singleRun);
+  if (checks) lines.push("", checks);
+  const commentary = renderJudgeCommentary(runs, singleRun);
+  if (commentary) lines.push("", commentary);
+  const errors = renderErrors(runs);
+  if (errors) lines.push("", errors);
   return lines.join("\n");
 }
+function renderRunsTable(runs) {
+  const header = [
+    "Run",
+    "Verdict",
+    "Scoring",
+    "Judge",
+    "Cost",
+    "Turns",
+    "Duration",
+  ];
+  const rows = [header, header.map(() => "---")];
+  for (const r of runs) {
+    const scoringCell = r.preflightError
+      ? "preflight error"
+      : r.scoring
+        ? r.scoring.verdict
+        : "—";
+    const judgeCell = r.preflightError
+      ? "—"
+      : r.judgeVerdict
+        ? r.judgeVerdict.verdict
+        : "—";
+    rows.push([
+      String(r.runIndex),
+      r.verdict.toUpperCase(),
+      scoringCell,
+      judgeCell,
+      formatCost(r.costUsd),
+      String(r.turns),
+      formatDuration(r.durationMs),
+    ]);
+  }
+  return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
+}
+function renderScoringChecks(runs, singleRun) {
+  const rows = collectScoringRows(runs);
+  if (!rows.length) return null;
+  const header = singleRun
+    ? ["Check", "Result", "Message"]
+    : ["Run", "Check", "Result", "Message"];
+  const lines = [
+    "#### Scoring Checks",
+    "",
+    `| ${header.join(" | ")} |`,
+    `| ${header.map(() => "---").join(" | ")} |`,
+  ];
+  for (const row of rows) {
+    const cells = singleRun
+      ? [row.check, row.result, row.message]
+      : [String(row.run), row.check, row.result, row.message];
+    lines.push(`| ${cells.join(" | ")} |`);
+  }
+  return lines.join("\n");
+}
+function collectScoringRows(runs) {
+  const rows = [];
+  for (const r of runs) {
+    if (!r.scoring?.details?.length) continue;
+    for (const d of r.scoring.details) {
+      rows.push({
+        run: r.runIndex,
+        check: escapeCell(String(d.test ?? "(unnamed)")),
+        result: d.pass ? "PASS" : "FAIL",
+        message: escapeCell(String(d.message ?? "")),
+      });
+    }
+  }
+  return rows;
+}
+function renderJudgeCommentary(runs, singleRun) {
+  const entries = runs.filter((r) => r.judgeVerdict?.summary);
+  if (!entries.length) return null;
+  const lines = ["#### Judge Commentary", ""];
+  for (let i = 0; i < entries.length; i++) {
+    const r = entries[i];
+    const summary = r.judgeVerdict.summary.replace(/\n/g, "\n> ");
+    if (singleRun) {
+      lines.push(`> ${summary}`);
+    } else {
+      lines.push(`> **Run ${r.runIndex}:** ${summary}`);
+    }
+    if (i < entries.length - 1) lines.push(">");
+  }
+  return lines.join("\n");
+}
+function renderErrors(runs) {
+  const lines = [];
+  for (const r of runs) {
+    if (r.agentError) {
+      lines.push(
+        `- **Run ${r.runIndex}:** Agent error — "${escapeCell(r.agentError.message)}" (aborted: ${r.agentError.aborted})`,
+      );
+    }
+    if (r.preflightError) {
+      lines.push(
+        `- **Run ${r.runIndex}:** Preflight error — "${escapeCell(r.preflightError.message)}" (exit ${r.preflightError.exitCode})`,
+      );
+    }
+  }
+  if (!lines.length) return null;
+  return ["#### Errors", "", ...lines].join("\n");
+}
+// ---------------------------------------------------------------------------
+// Formatting helpers
+// ---------------------------------------------------------------------------
 function formatPassAt(v) {
   if (v == null) return "—";
   if (typeof v === "object" && "error" in v) return v.error;
   return Number(v).toFixed(4);
 }
+function formatDuration(ms) {
+  if (ms == null || ms === 0) return "0s";
+  const totalSeconds = Math.round(ms / 1000);
+  if (totalSeconds < 60) return `${totalSeconds}s`;
+  const minutes = Math.floor(totalSeconds / 60);
+  const seconds = totalSeconds % 60;
+  return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
+}
+function formatCost(usd) {
+  if (usd == null) return "$0.00";
+  return `$${usd.toFixed(2)}`;
+}
+function escapeCell(str) {
+  return str.replace(/\|/g, "\\|");
+}
+function median(arr) {
+  if (!arr.length) return 0;
+  const sorted = [...arr].sort((a, b) => a - b);
+  const mid = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 0) {
+    return Math.round((sorted[mid - 1] + sorted[mid]) / 2);
+  }
+  return sorted[mid];
+}
+// ---------------------------------------------------------------------------
+// Record loading
+// ---------------------------------------------------------------------------
 async function loadRecords(inputDir) {
   const path = join(inputDir, "results.jsonl");
   const stream = createReadStream(path);
@@ -142,8 +465,6 @@ function passAtKValue(n, c, k) {
   if (n - c < k) return 1;
   const total = binomial(BigInt(n), BigInt(k));
   const fail = binomial(BigInt(n - c), BigInt(k));
-  // Compute the ratio as a single division so we avoid `1 - x` which
-  // accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
   const passing = total - fail;
   return Number(passing) / Number(total);
 }

package/src/benchmark/runner.js CHANGED Viewed

@@ -165,11 +165,22 @@ export class BenchmarkRunner {
         port: workdir.port,
         runDir: workdir.runDir,
       });
-      const judgeVerdict = await this._runJudgeHook(task, workdir, scoring, {
-        query: this.query,
-        model: this.model,
-        judgeProfile: this.profiles.judge ?? undefined,
-      });
+      const judgeContext = await this.#buildJudgeContext(
+        task,
+        workdir,
+        skillSetHash,
+      );
+      const judgeVerdict = await this._runJudgeHook(
+        task,
+        workdir,
+        scoring,
+        {
+          query: this.query,
+          model: this.model,
+          judgeProfile: this.profiles.judge ?? undefined,
+        },
+        judgeContext,
+      );
       const record = {
         taskId: task.id,
         runIndex,
@@ -276,6 +287,20 @@ export class BenchmarkRunner {
     return { ...summary, agentError };
   }
+  async #buildJudgeContext(task, workdir, skillSetHash) {
+    const agentInstructions = await readFile(task.paths.instructions, "utf8");
+    let agentProfile = "";
+    if (this.profiles.agent) {
+      const profilePath = resolvePath(
+        workdir.cwd,
+        ".claude/agents",
+        `${this.profiles.agent}.md`,
+      );
+      agentProfile = await readFile(profilePath, "utf8").catch(() => "");
+    }
+    return { agentInstructions, agentProfile, skillSetHash };
+  }
   #buildPreflightFailureRecord({
     task,
     runIndex,

package/src/commands/benchmark-report.js CHANGED Viewed

@@ -30,7 +30,11 @@ export async function runBenchmarkReportCommand(values, _args) {
     throw new Error("--format must be 'json' or 'text'");
   }
-  const report = await aggregate({ inputDir: resolve(inputDir), kValues });
+  const report = await aggregate({
+    inputDir: resolve(inputDir),
+    kValues,
+    includeRuns: format === "text",
+  });
   if (format === "text") {
     process.stdout.write(renderTextReport(report, kValues) + "\n");
   } else {

package/src/supervisor.js CHANGED Viewed

@@ -536,12 +536,15 @@ export function createSupervisor({
   const onLine = (line) => supervisor.emitLine(line);
+  const perInvocationTurns =
+    maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
   const agentRunner = createAgentRunner({
     cwd: agentCwd,
     query,
     output: devNull,
     model,
-    maxTurns: 50,
+    maxTurns: perInvocationTurns,
     allowedTools,
     onLine,
     settingSources: ["project"],
@@ -560,7 +563,7 @@ export function createSupervisor({
     query,
     output: devNull,
     model,
-    maxTurns: 20,
+    maxTurns: perInvocationTurns,
     allowedTools: supervisorAllowedTools ?? [
       "Bash",
       "Read",