npm - @checklabs/core - Versions diffs - 0.2.1 - Mend

@checklabs/core 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/package.json +31 -0
package/src/adapters/index.ts +136 -0
package/src/assertions/expect.ts +218 -0
package/src/config.ts +89 -0
package/src/discovery.ts +57 -0
package/src/env.ts +35 -0
package/src/generate/index.ts +103 -0
package/src/generate/templates.ts +225 -0
package/src/index.ts +93 -0
package/src/judge/index.ts +158 -0
package/src/pricing.ts +56 -0
package/src/registry.ts +23 -0
package/src/reporters/colors.ts +36 -0
package/src/reporters/console.ts +154 -0
package/src/reporters/html.ts +189 -0
package/src/reporters/index.ts +4 -0
package/src/reporters/json.ts +11 -0
package/src/runner/compare.ts +84 -0
package/src/runner/runner.ts +144 -0
package/src/types.ts +197 -0

package/src/reporters/console.ts ADDED Viewed

@@ -0,0 +1,154 @@
+import { basename } from "node:path";
+import type { ComparisonResult, RunReport, TestResult } from "../types";
+import { c, fmtCost, fmtMs, fmtPct } from "./colors";
+/**
+ * Bucket test results by the base name of the file they came from.
+ * Buckets appear in first-seen order and keep the input order of their results.
+ */
+function groupByFile(results: TestResult[]): Map<string, TestResult[]> {
+  const buckets = new Map<string, TestResult[]>();
+  results.forEach((result) => {
+    const fileName = basename(result.file);
+    const bucket = buckets.get(fileName);
+    if (bucket) {
+      bucket.push(result);
+    } else {
+      buckets.set(fileName, [result]);
+    }
+  });
+  return buckets;
+}
+/** Arithmetic mean of a result's recorded latencies; 0 when none were recorded. */
+function avgLatency(r: TestResult): number {
+  const samples = r.latencies;
+  if (samples.length === 0) return 0;
+  let total = 0;
+  for (const sample of samples) total += sample;
+  return total / samples.length;
+}
+/**
+ * Print a Jest-style run report to stdout.
+ *
+ * Layout: a header naming the agent, one section per test file (by base name)
+ * with a line per test, then an "Overall" block with pass rate, failures,
+ * errors (only when non-zero), average latency, tokens and estimated cost.
+ *
+ * @param report Run report to print.
+ * @returns `summary.failed + summary.errored`, i.e. the number of not-passed tests.
+ */
+export function printRunReport(report: RunReport): number {
+  const { results, summary, agent } = report;
+  const notPassed = summary.failed + summary.errored;
+  console.log("");
+  console.log(c.bold(c.cyan("CheckAI")));
+  console.log(c.dim(`agent: ${agent.name}  ·  model: ${agent.model || "?"}  ·  backend: ${agent.backend}`));
+  console.log("");
+  groupByFile(results).forEach((group, file) => {
+    console.log(c.underline(file));
+    group.forEach((r) => {
+      switch (r.status) {
+        case "pass": {
+          // Show the score of the first assertion that carries one, if any.
+          const judged = r.assertions.find((a) => a.score !== undefined);
+          const scoreTag = judged ? c.dim(` score ${judged.score!.toFixed(2)}`) : "";
+          console.log(`  ${c.green("✓")} ${r.name} ${c.dim(`(${fmtMs(avgLatency(r))})`)}${scoreTag}`);
+          break;
+        }
+        case "error": {
+          console.log(`  ${c.yellow("⚠")} ${r.name} ${c.yellow("(error)")}`);
+          // Only the first line of the message is shown.
+          if (r.errorMessage) console.log(`      ${c.dim(r.errorMessage.split("\n")[0])}`);
+          break;
+        }
+        default: {
+          console.log(`  ${c.red("✗")} ${r.name}`);
+          const failure = r.failure;
+          if (failure) {
+            console.log(`      ${c.dim("matcher:")}  ${failure.matcher}`);
+            console.log(`      ${c.dim("Expected:")} ${failure.expected}`);
+            console.log(`      ${c.dim("Actual:")}   ${failure.actual}`);
+          }
+        }
+      }
+    });
+    console.log("");
+  });
+  // Gray for an empty suite, green when nothing failed or errored, yellow otherwise.
+  const rate = `${summary.passed}/${summary.total}`;
+  let rateColored: string;
+  if (summary.total === 0) {
+    rateColored = c.gray(rate);
+  } else if (notPassed === 0) {
+    rateColored = c.green(rate);
+  } else {
+    rateColored = c.yellow(rate);
+  }
+  const failedColored = summary.failed === 0 ? c.green("0") : c.red(String(summary.failed));
+  console.log(c.bold("Overall"));
+  console.log(`  Pass Rate:       ${rateColored}`);
+  console.log(`  Failed:          ${failedColored}`);
+  if (summary.errored > 0) console.log(`  Errored:         ${c.yellow(String(summary.errored))}`);
+  console.log(`  Avg Latency:     ${fmtMs(summary.avgLatencyMs)}`);
+  console.log(`  Tokens:          ${summary.totalTokens.toLocaleString("en-US")}`);
+  console.log(`  Estimated Cost:  ${fmtCost(summary.totalCostUsd)}`);
+  console.log("");
+  return notPassed;
+}
+/** Colored status glyph: green ✓ for "pass", red ✗ for "fail", yellow ⚠ for "error". */
+const glyph = (s: "pass" | "fail" | "error") => {
+  if (s === "pass") return c.green("✓");
+  if (s === "fail") return c.red("✗");
+  return c.yellow("⚠");
+};
+/**
+ * Print a side-by-side comparison report to stdout.
+ *
+ * Sections, in order: header with the V1..Vn legend, a per-file status grid,
+ * per-agent totals, the diff against the baseline, then the regression,
+ * improvement and error listings (each only when non-empty).
+ *
+ * @param cmp Comparison to print.
+ * @returns `cmp.regressions.length`.
+ */
+export function printComparison(cmp: ComparisonResult): number {
+  printComparisonHeader(cmp);
+  printComparisonGrid(cmp);
+  printComparisonTotals(cmp);
+  printComparisonDiff(cmp);
+  printComparisonRegressions(cmp);
+  printComparisonImprovements(cmp);
+  printComparisonErrors(cmp);
+  if (cmp.regressions.length === 0) {
+    console.log(c.green("No regressions detected. ✓"));
+    console.log("");
+  }
+  return cmp.regressions.length;
+}
+/** Print the title line and the V1..Vn agent legend. */
+function printComparisonHeader(cmp: ComparisonResult): void {
+  console.log("");
+  console.log(c.bold(c.cyan("CheckAI · Model Comparison")));
+  console.log(
+    c.dim(cmp.agents.map((a, i) => `V${i + 1}=${a.name} [${a.model || "?"}, ${a.backend}]`).join("   "))
+  );
+  console.log("");
+}
+/** Print one status grid per test file (by base name), one column per agent. */
+function printComparisonGrid(cmp: ComparisonResult): void {
+  const NAME_W = 46;
+  const byFile = new Map<string, ComparisonResult["rows"]>();
+  for (const row of cmp.rows) {
+    const key = basename(row.file);
+    const bucket = byFile.get(key);
+    if (bucket) bucket.push(row);
+    else byFile.set(key, [row]);
+  }
+  // Column headings are the same for every file, so build them once.
+  const columnHeads = cmp.agents.map((_, i) => `V${i + 1}`.padEnd(8)).join("");
+  for (const [file, group] of byFile) {
+    console.log(c.underline(file));
+    console.log(c.dim("  " + "".padEnd(NAME_W) + columnHeads));
+    for (const row of group) {
+      // Truncate long names with an ellipsis so the status columns stay aligned.
+      const label =
+        row.name.length > NAME_W - 2 ? row.name.slice(0, NAME_W - 3) + "…" : row.name;
+      // Glyph plus seven spaces matches the 8-character column headings.
+      const cells = row.statuses.map((s) => glyph(s) + "       ").join("");
+      console.log(`  ${label.padEnd(NAME_W)}${cells}`);
+    }
+    console.log("");
+  }
+}
+/** Print each agent's pass rate, estimated cost and average latency. */
+function printComparisonTotals(cmp: ComparisonResult): void {
+  console.log(c.bold("Overall"));
+  cmp.agents.forEach((a, i) => {
+    const s = cmp.summaries[i];
+    // No summary for this agent: skip the line rather than crash mid-report.
+    if (!s) return;
+    const rate = `${s.passed}/${s.total}`;
+    // An empty suite is neutral (gray), as in printRunReport, not a green success.
+    const colored =
+      s.total === 0 ? c.gray(rate) : s.passed === s.total ? c.green(rate) : c.yellow(rate);
+    console.log(`  V${i + 1} ${`${a.name} (${a.model || "?"})`.padEnd(34)} Pass ${colored}  ${c.dim(`${fmtCost(s.totalCostUsd)} · ${fmtMs(s.avgLatencyMs)}`)}`);
+  });
+  console.log("");
+}
+/** Print regression/improvement/unchanged counts and the cost and latency deltas. */
+function printComparisonDiff(cmp: ComparisonResult): void {
+  const regressionCount = cmp.regressions.length;
+  const improvementCount = cmp.improvements.length;
+  console.log(c.bold("Diff vs baseline " + c.dim(`(${cmp.baseline})`)));
+  console.log(`  Regressions:     ${regressionCount === 0 ? c.green("0") : c.red(String(regressionCount))}`);
+  console.log(`  Improvements:    ${improvementCount > 0 ? c.green(String(improvementCount)) : "0"}`);
+  console.log(`  Unchanged:       ${cmp.unchanged}`);
+  // With more than two agents, flag that the deltas are labelled as V2 vs baseline.
+  const candLabel = cmp.agents.length > 2 ? ` ${c.dim("(V2 vs baseline)")}` : "";
+  console.log(`  Cost Difference: ${deltaColor(cmp.costDeltaPct)}${candLabel}`);
+  console.log(`  Latency Diff:    ${deltaColor(cmp.latencyDeltaPct)}${candLabel}`);
+  console.log("");
+}
+/** List regressed tests with the first failure found in a non-baseline column. */
+function printComparisonRegressions(cmp: ComparisonResult): void {
+  if (cmp.regressions.length === 0) return;
+  console.log(c.red(c.bold(`Regressions (${cmp.regressions.length})`)));
+  for (const r of cmp.regressions) {
+    console.log(`  ${c.red("✗")} ${r.name}  ${c.dim(`[${basename(r.file)}]`)}`);
+    // Index 0 is skipped; only later columns are searched for a failure.
+    const f = r.failures.find((x, i) => i > 0 && x);
+    if (f) {
+      console.log(`      ${c.dim("matcher:")}  ${f.matcher}`);
+      console.log(`      ${c.dim("Expected:")} ${f.expected}`);
+      console.log(`      ${c.dim("Actual:")}   ${f.actual}`);
+    }
+  }
+  console.log("");
+}
+/** List the tests classified as improvements. */
+function printComparisonImprovements(cmp: ComparisonResult): void {
+  if (cmp.improvements.length === 0) return;
+  console.log(c.green(c.bold(`Improvements (${cmp.improvements.length})`)));
+  for (const r of cmp.improvements) {
+    console.log(`  ${c.green("✓")} ${r.name}  ${c.dim(`[${basename(r.file)}]`)}`);
+  }
+  console.log("");
+}
+/** List rows whose delta is "error"; these are reported but not counted as regressions. */
+function printComparisonErrors(cmp: ComparisonResult): void {
+  const errored = cmp.rows.filter((r) => r.delta === "error");
+  if (errored.length === 0) return;
+  console.log(c.yellow(c.bold(`Errors (not counted as regressions): ${errored.length}`)));
+  for (const r of errored) console.log(`  ${c.yellow("⚠")} ${r.name}  ${c.dim(`[${basename(r.file)}]`)}`);
+  console.log("");
+}
+/**
+ * Format a percentage delta with `fmtPct` and color it: red above +1, green
+ * below -1, uncolored otherwise (including NaN, which fails both comparisons).
+ */
+function deltaColor(pct: number): string {
+  const text = fmtPct(pct);
+  return pct > 1 ? c.red(text) : pct < -1 ? c.green(text) : text;
+}

package/src/reporters/html.ts ADDED Viewed

@@ -0,0 +1,189 @@
+import { writeFileSync } from "node:fs";
+import { basename } from "node:path";
+import type { ComparisonResult, RunReport, TestResult } from "../types";
+import { fmtCost, fmtMs, fmtPct } from "./colors";
+/** Characters that must not appear raw in HTML text or quoted attribute values. */
+const HTML_ENTITIES: Readonly<Record<string, string>> = {
+  "&": "&amp;",
+  "<": "&lt;",
+  ">": "&gt;",
+  '"': "&quot;",
+  "'": "&#39;",
+};
+/**
+ * HTML-escape any value for use in element content or a quoted attribute.
+ *
+ * The value is stringified with `String()` first, so `undefined` becomes the
+ * text "undefined". All five significant characters are replaced in a single
+ * pass; the single quote is included so the output is also safe inside
+ * single-quoted attributes.
+ *
+ * @param s Value to escape.
+ * @returns The escaped string.
+ */
+const esc = (s: unknown): string =>
+  String(s).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch] ?? ch);
+/**
+ * Stylesheet inlined into every generated page via a <style> element.
+ * Colors are defined once as CSS custom properties on :root; the .green, .red,
+ * .yellow, .muted and .cyan utility classes are the ones passed around as
+ * `cls` arguments by the render functions.
+ */
+const STYLE = `
+:root { --bg:#0d1117; --panel:#161b22; --line:#30363d; --txt:#e6edf3; --muted:#8b949e;
+  --green:#3fb950; --red:#f85149; --yellow:#d29922; --cyan:#58a6ff; }
+* { box-sizing: border-box; }
+body { margin:0; background:var(--bg); color:var(--txt); font:14px/1.5 -apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif; }
+.wrap { max-width: 1000px; margin: 0 auto; padding: 32px 20px 80px; }
+h1 { font-size: 22px; margin: 0 0 4px; } h1 .tag { color: var(--cyan); }
+.sub { color: var(--muted); margin-bottom: 24px; }
+.cards { display:flex; flex-wrap:wrap; gap:12px; margin-bottom:28px; }
+.card { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:14px 16px; min-width:130px; }
+.card .k { color:var(--muted); font-size:12px; text-transform:uppercase; letter-spacing:.04em; }
+.card .v { font-size:22px; font-weight:600; margin-top:4px; }
+.green{color:var(--green)} .red{color:var(--red)} .yellow{color:var(--yellow)} .muted{color:var(--muted)} .cyan{color:var(--cyan)}
+.file { margin: 20px 0 8px; font-weight:600; border-bottom:1px solid var(--line); padding-bottom:6px; }
+.test { background:var(--panel); border:1px solid var(--line); border-radius:8px; margin:8px 0; padding:0; }
+.test > summary { list-style:none; cursor:pointer; padding:10px 14px; display:flex; align-items:center; gap:10px; }
+.test > summary::-webkit-details-marker { display:none; }
+.test .name { flex:1; } .test .meta { color:var(--muted); font-size:12px; }
+.badge { font-weight:700; width:16px; text-align:center; }
+.detail { padding: 4px 14px 14px 40px; color:var(--muted); }
+.detail code { color:var(--txt); background:#0d1117; padding:1px 4px; border-radius:4px; }
+.kv { margin:2px 0; } .kv b { color:var(--muted); font-weight:500; display:inline-block; min-width:84px; }
+table { width:100%; border-collapse:collapse; margin:10px 0 24px; background:var(--panel); border:1px solid var(--line); border-radius:8px; overflow:hidden; }
+th,td { text-align:left; padding:8px 12px; border-bottom:1px solid var(--line); }
+th { color:var(--muted); font-weight:500; font-size:12px; text-transform:uppercase; }
+td.c { text-align:center; width:60px; }
+.section-title { font-size:16px; font-weight:600; margin:24px 0 6px; }
+`;
+/**
+ * Render one summary card.
+ *
+ * @param k   Label; HTML-escaped here.
+ * @param v   Value markup; inserted as-is, so callers must escape untrusted text.
+ * @param cls Optional CSS class added after "v" on the value element; inserted as-is.
+ * @returns The card's HTML.
+ */
+function card(k: string, v: string, cls = ""): string {
+  const label = `<div class="k">${esc(k)}</div>`;
+  const value = `<div class="v ${cls}">${v}</div>`;
+  return `<div class="card">${label}${value}</div>`;
+}
+/**
+ * Badge markup for a status: green ✓ for "pass", yellow ⚠ for "error",
+ * red ✗ for anything else.
+ */
+function statusBadge(s: TestResult["status"] | "pass" | "fail" | "error"): string {
+  switch (s) {
+    case "pass":
+      return `<span class="badge green">✓</span>`;
+    case "error":
+      return `<span class="badge yellow">⚠</span>`;
+    default:
+      return `<span class="badge red">✗</span>`;
+  }
+}
+/** Arithmetic mean of a result's recorded latencies; 0 when none were recorded. */
+function avgLatency(r: TestResult): number {
+  const { latencies } = r;
+  const count = latencies.length;
+  if (!count) return 0;
+  const sum = latencies.reduce((acc, x) => acc + x, 0);
+  return sum / count;
+}
+/**
+ * Build the HTML page for a single-agent run: summary cards followed by one
+ * collapsible entry per test, grouped under the base name of its file.
+ * Entries for tests that did not pass are rendered open.
+ */
+function renderRun(report: RunReport): string {
+  const { results, summary: s, agent } = report;
+  // Muted for an empty suite, green when nothing failed or errored, yellow otherwise.
+  let rateCls = "yellow";
+  if (s.total === 0) rateCls = "muted";
+  else if (s.failed + s.errored === 0) rateCls = "green";
+  const cardList = [
+    card("Pass Rate", `${s.passed}/${s.total}`, rateCls),
+    card("Failed", String(s.failed), s.failed ? "red" : "green"),
+  ];
+  // The "Errored" card only appears when at least one test errored.
+  if (s.errored) cardList.push(card("Errored", String(s.errored), "yellow"));
+  cardList.push(
+    card("Avg Latency", fmtMs(s.avgLatencyMs)),
+    card("Tokens", s.totalTokens.toLocaleString("en-US")),
+    card("Est. Cost", fmtCost(s.totalCostUsd))
+  );
+  const cards = cardList.join("");
+  const byFile = new Map<string, TestResult[]>();
+  for (const r of results) {
+    const key = basename(r.file);
+    const bucket = byFile.get(key);
+    if (bucket) bucket.push(r);
+    else byFile.set(key, [r]);
+  }
+  // Right-hand summary text: latency (and score, if any) for passes, else the status.
+  const metaFor = (r: TestResult): string => {
+    if (r.status !== "pass") return r.status.toUpperCase();
+    const judged = r.assertions.find((a) => a.score !== undefined);
+    return `${fmtMs(avgLatency(r))}${judged ? ` · score ${judged.score!.toFixed(2)}` : ""}`;
+  };
+  // Expanded body: the failure, the first line of the error, or the assertion list.
+  const detailFor = (r: TestResult): string => {
+    if (r.status === "fail" && r.failure) {
+      return `<div class="kv"><b>matcher</b> <code>${esc(r.failure.matcher)}</code></div>
+          <div class="kv"><b>expected</b> ${esc(r.failure.expected)}</div>
+          <div class="kv"><b>actual</b> ${esc(r.failure.actual)}</div>`;
+    }
+    if (r.status === "error") {
+      return `<div class="kv"><b>error</b> ${esc((r.errorMessage ?? "").split("\n")[0])}</div>`;
+    }
+    return r.assertions
+      .map((a) => `<div class="kv">${statusBadge(a.pass ? "pass" : "fail")} <code>${esc(a.matcher)}</code> — ${esc(a.expected)}</div>`)
+      .join("");
+  };
+  const parts: string[] = [];
+  byFile.forEach((group, file) => {
+    parts.push(`<div class="file">${esc(file)}</div>`);
+    group.forEach((r) => {
+      const meta = metaFor(r);
+      const detail = detailFor(r);
+      parts.push(`<details class="test"${r.status !== "pass" ? " open" : ""}>
+        <summary>${statusBadge(r.status)}<span class="name">${esc(r.name)}</span><span class="meta">${esc(meta)}</span></summary>
+        <div class="detail">${detail}</div></details>`);
+    });
+  });
+  const body = parts.join("");
+  return page(
+    `CheckAI Report`,
+    `agent: ${esc(agent.name)} · model: ${esc(agent.model || "?")} · backend: ${esc(agent.backend)} · ${esc(report.finishedAt)}`,
+    `<div class="cards">${cards}</div>${body}`
+  );
+}
+/**
+ * Build the HTML page for a multi-agent comparison: summary cards, a table with
+ * one row per test and one status column per agent, then collapsible sections
+ * for regressions (open), improvements and errored rows. All dynamic text goes
+ * through `esc` before it is inserted into markup.
+ */
+function renderComparison(cmp: ComparisonResult): string {
+  // With more than two agents, the cost/latency card labels get a " (V2)" suffix.
+  const q = cmp.agents.length > 2 ? " (V2)" : "";
+  const cards = [
+    card("Baseline", esc(cmp.baseline), "cyan"),
+    card("Regressions", String(cmp.regressions.length), cmp.regressions.length ? "red" : "green"),
+    card("Improvements", String(cmp.improvements.length), cmp.improvements.length ? "green" : "muted"),
+    card("Unchanged", String(cmp.unchanged), "muted"),
+    card("Cost Diff" + q, fmtPct(cmp.costDeltaPct), cmp.costDeltaPct > 1 ? "red" : cmp.costDeltaPct < -1 ? "green" : "muted"),
+    card("Latency Diff" + q, fmtPct(cmp.latencyDeltaPct), cmp.latencyDeltaPct > 1 ? "red" : cmp.latencyDeltaPct < -1 ? "green" : "muted"),
+  ].join("");
+  // Table header: fixed Test/File columns, then one column per agent.
+  const head = `<tr><th>Test</th><th>File</th>${cmp.agents
+    .map((a, i) => `<th class="c">V${i + 1} ${esc(a.name)}</th>`)
+    .join("")}</tr>`;
+  const rows = cmp.rows
+    .map(
+      (row) =>
+        `<tr><td>${esc(row.name)}</td><td class="muted">${esc(basename(row.file))}</td>${row.statuses
+          .map((s) => `<td class="c">${statusBadge(s)}</td>`)
+          .join("")}</tr>`
+    )
+    .join("");
+  let regr = "";
+  if (cmp.regressions.length) {
+    regr =
+      `<div class="section-title red">Regressions (${cmp.regressions.length})</div>` +
+      cmp.regressions
+        .map((r) => {
+          // First defined failure outside column 0; the detail stays empty if none exists.
+          const f = r.failures.find((x, i) => i > 0 && x);
+          return `<details class="test" open><summary>${statusBadge("fail")}<span class="name">${esc(r.name)}</span><span class="meta">${esc(basename(r.file))}</span></summary>
+            <div class="detail">${
+              f
+                ? `<div class="kv"><b>matcher</b> <code>${esc(f.matcher)}</code></div><div class="kv"><b>expected</b> ${esc(f.expected)}</div><div class="kv"><b>actual</b> ${esc(f.actual)}</div>`
+                : ""
+            }</div></details>`;
+        })
+        .join("");
+  }
+  let impr = "";
+  if (cmp.improvements.length) {
+    impr =
+      `<div class="section-title green">Improvements (${cmp.improvements.length})</div>` +
+      cmp.improvements
+        .map((r) => `<details class="test"><summary>${statusBadge("pass")}<span class="name">${esc(r.name)}</span><span class="meta">${esc(basename(r.file))}</span></summary><div class="detail muted">passes on candidate</div></details>`)
+        .join("");
+  }
+  let errs = "";
+  // Rows whose delta is "error" get their own section, separate from regressions.
+  const erroredRows = cmp.rows.filter((r) => r.delta === "error");
+  if (erroredRows.length) {
+    errs =
+      `<div class="section-title yellow">Errors — not counted as regressions (${erroredRows.length})</div>` +
+      erroredRows
+        .map(
+          (r) =>
+            `<details class="test"><summary>${statusBadge("error")}<span class="name">${esc(r.name)}</span><span class="meta">${esc(basename(r.file))}</span></summary><div class="detail muted">errored on a candidate</div></details>`
+        )
+        .join("");
+  }
+  return page(
+    `CheckAI Comparison`,
+    `${esc(cmp.agents.map((a, i) => `V${i + 1}=${a.name} [${a.model || "?"}]`).join("  ·  "))} · ${esc(cmp.finishedAt)}`,
+    `<div class="cards">${cards}</div><table>${head}${rows}</table>${regr}${impr}${errs}`
+  );
+}
+/**
+ * Wrap rendered content in the full HTML document shell with STYLE inlined.
+ *
+ * @param title Used for <title> and the <h1>; escaped here. The first
+ *              "CheckAI " is dropped from the heading text because the heading
+ *              already starts with a styled "CheckAI" tag.
+ * @param sub   Subtitle markup; inserted as-is, so callers must escape it.
+ * @param body  Page body markup; inserted as-is, so callers must escape it.
+ * @returns The complete HTML document as a string.
+ */
+function page(title: string, sub: string, body: string): string {
+  return `<!doctype html><html lang="en"><head><meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>${esc(title)}</title><style>${STYLE}</style></head>
+<body><div class="wrap"><h1><span class="tag">CheckAI</span> ${esc(title.replace("CheckAI ", ""))}</h1>
+<div class="sub">${sub}</div>${body}</div></body></html>`;
+}
+/**
+ * Render a report as a complete HTML document. A `kind` of "run" produces the
+ * run page; any other value is treated as a comparison.
+ */
+export function renderHtml(data: RunReport | ComparisonResult): string {
+  if (data.kind === "run") {
+    return renderRun(data);
+  }
+  return renderComparison(data);
+}
+/**
+ * Render `data` to HTML and write it synchronously to `path` as UTF-8.
+ * The page carries its stylesheet inline.
+ */
+export function writeHtmlReport(path: string, data: RunReport | ComparisonResult): void {
+  const html = renderHtml(data);
+  writeFileSync(path, html, "utf8");
+}

package/src/reporters/index.ts ADDED Viewed

@@ -0,0 +1,4 @@
+/**
+ * Barrel for the reporters: console printers, JSON and HTML renderers/writers,
+ * and the color/formatting helpers they share.
+ */
+export { printRunReport, printComparison } from "./console";
+export { renderJson, writeJsonReport } from "./json";
+export { renderHtml, writeHtmlReport } from "./html";
+export { c, fmtMs, fmtCost, fmtPct } from "./colors";

package/src/reporters/json.ts ADDED Viewed

@@ -0,0 +1,11 @@
+import { writeFileSync } from "node:fs";
+import type { ComparisonResult, RunReport } from "../types";
+/** Serialize a report as pretty-printed JSON with two-space indentation. */
+export function renderJson(data: RunReport | ComparisonResult): string {
+  const indent = 2;
+  return JSON.stringify(data, null, indent);
+}
+/**
+ * Write a machine-readable report to disk (checkai-report.json by default).
+ * The JSON is rendered first, then written synchronously as UTF-8.
+ */
+export function writeJsonReport(path: string, data: RunReport | ComparisonResult): void {
+  const json = renderJson(data);
+  writeFileSync(path, json, "utf8");
+}

package/src/runner/compare.ts ADDED Viewed

@@ -0,0 +1,84 @@
+import type {
+  ComparisonResult,
+  ComparisonRow,
+  SuiteSummary,
+  TestResult,
+  TestStatus,
+} from "../types";
+import { summarize } from "./runner";
+/** One agent's results over a suite, as consumed by `buildComparison`. */
+export interface AgentRun {
+  /** Identity of the agent; copied into the comparison's `agents` list. */
+  agent: { name: string; model: string; backend: string };
+  /** Results of that agent's run; matched across runs by file and test name. */
+  results: TestResult[];
+}
+/** Key used to match a test across runs: its file and name joined by "::". */
+const keyOf = (r: { name: string; file: string }) => [r.file, r.name].join("::");
+/**
+ * Percentage change from `base` to `candidate`.
+ * With a zero base the ratio is undefined, so the result is 0 when the
+ * candidate is also zero and a flat 100 otherwise.
+ */
+function pctDelta(base: number, candidate: number): number {
+  if (base !== 0) {
+    return ((candidate - base) / base) * 100;
+  }
+  return candidate === 0 ? 0 : 100;
+}
+/**
+ * Compare a baseline agent (first run) against one or more candidates over the
+ * same suite.
+ *
+ * Rows are built from the baseline's results only and matched to the other
+ * runs by `file::name`; a test missing from a run counts as "error" for that
+ * run. A regression is a test the baseline passed but some candidate failed;
+ * an improvement is the reverse. Rows that are neither, but have an error in
+ * any run, are classified "error" and never counted as regressions.
+ * The cost and latency deltas compare the baseline with the first candidate
+ * only (or with itself when no candidate was given).
+ *
+ * @param runs       Baseline first, then candidates; must not be empty.
+ * @param startedAt  Copied into the result unchanged.
+ * @param finishedAt Copied into the result unchanged.
+ * @returns The assembled comparison.
+ * @throws TypeError when `runs` is empty.
+ */
+export function buildComparison(
+  runs: AgentRun[],
+  startedAt: string,
+  finishedAt: string
+): ComparisonResult {
+  const baseline = runs[0];
+  if (!baseline) {
+    // Fail up front with a clear message instead of an opaque property-access error.
+    throw new TypeError("buildComparison requires at least one run (the baseline)");
+  }
+  const baseSum = summarize(baseline.results);
+  const summaries: SuiteSummary[] = [
+    baseSum,
+    ...runs.slice(1).map((r) => summarize(r.results)),
+  ];
+  // One lookup table per run so each baseline test can be found in every run.
+  const maps = runs.map((r) => {
+    const m = new Map<string, TestResult>();
+    for (const res of r.results) m.set(keyOf(res), res);
+    return m;
+  });
+  const rows: ComparisonRow[] = baseline.results.map((b) => {
+    const key = keyOf(b);
+    const statuses: TestStatus[] = maps.map((m) => m.get(key)?.status ?? "error");
+    const failures = maps.map((m) => m.get(key)?.failure);
+    const base = statuses[0];
+    const candidates = statuses.slice(1);
+    // Check regression/improvement BEFORE error, so an error in one candidate
+    // can't mask a genuine pass→fail regression in another (multi-agent compare).
+    let delta: ComparisonRow["delta"];
+    if (base === "pass" && candidates.includes("fail")) {
+      delta = "regression";
+    } else if (base === "fail" && candidates.includes("pass")) {
+      delta = "improvement";
+    } else if (statuses.includes("error")) {
+      delta = "error";
+    } else {
+      delta = "unchanged";
+    }
+    return { name: b.name, file: b.file, statuses, failures, delta };
+  });
+  const regressions = rows.filter((r) => r.delta === "regression");
+  const improvements = rows.filter((r) => r.delta === "improvement");
+  const unchanged = rows.filter((r) => r.delta === "unchanged").length;
+  // Deltas use the first candidate; with a lone baseline they compare it to itself.
+  const candSum = summaries[1] ?? baseSum;
+  return {
+    kind: "comparison",
+    agents: runs.map((r) => r.agent),
+    baseline: baseline.agent.name,
+    rows,
+    summaries,
+    regressions,
+    improvements,
+    unchanged,
+    costDeltaPct: pctDelta(baseSum.totalCostUsd, candSum.totalCostUsd),
+    latencyDeltaPct: pctDelta(baseSum.avgLatencyMs, candSum.avgLatencyMs),
+    startedAt,
+    finishedAt,
+  };
+}

package/src/runner/runner.ts ADDED Viewed

@@ -0,0 +1,144 @@
+import type {
+  AgentAdapter,
+  AssertionResult,
+  RunReport,
+  SuiteSummary,
+  TestCase,
+  TestResult,
+  TokenUsage,
+} from "../types";
+import { CheckAIAssertionError, setAssertionSink, setPendingSink } from "../assertions/expect";
+import { ZERO_USAGE, addUsage } from "../pricing";
+/**
+ * Keep the tests whose name or file contains `filter`, ignoring case.
+ * An absent or empty filter returns the input list itself, unfiltered.
+ */
+export function filterTests(tests: TestCase[], filter?: string): TestCase[] {
+  if (!filter) {
+    return tests;
+  }
+  const needle = filter.toLowerCase();
+  const matches = (text: string): boolean => text.toLowerCase().includes(needle);
+  return tests.filter((t) => matches(t.name) || matches(t.file));
+}
+/**
+ * Current timestamp used for measuring test duration.
+ * Only differences between two readings are used by the runner.
+ */
+function nowMs(): number {
+  // performance.now() is monotonic, double-precision, in ms — no bigint rounding.
+  return performance.now();
+}
+/**
+ * Run a set of tests against a single adapter. Tests run sequentially.
+ *
+ * Each test receives a wrapper around `agent` that records latency, tools
+ * used, token usage and cost for every `run` call. A thrown
+ * `CheckAIAssertionError` marks the test "fail"; any other thrown value marks
+ * it "error". A test that returns normally is still marked "fail" if any
+ * recorded assertion did not pass.
+ *
+ * @param tests Test cases to execute, in order.
+ * @param agent Adapter under test.
+ * @returns One result per test, in input order.
+ */
+export async function runSuite(tests: TestCase[], agent: AgentAdapter): Promise<TestResult[]> {
+  const results: TestResult[] = [];
+  for (const tc of tests) {
+    // Per-test accumulators, filled in by the instrumented adapter below.
+    const latencies: number[] = [];
+    const toolsUsed = new Set<string>();
+    let usage: TokenUsage = ZERO_USAGE;
+    let costUsd = 0;
+    // Wrapper exposing name, model and run; run delegates to the real adapter
+    // and records what each call reported.
+    const instrumented: AgentAdapter = {
+      name: agent.name,
+      get model() {
+        return agent.model;
+      },
+      run: async (input: string) => {
+        const res = await agent.run(input);
+        latencies.push(res.latencyMs);
+        res.toolsUsed.forEach((t) => toolsUsed.add(t));
+        if (res.usage) usage = addUsage(usage, res.usage);
+        costUsd += res.costUsd ?? 0;
+        return res;
+      },
+    };
+    const assertions: AssertionResult[] = [];
+    const pending: Promise<unknown>[] = [];
+    // Register this test's arrays as the sinks; presumably the matchers append
+    // their results and in-flight promises to them — verify in assertions/expect.
+    setAssertionSink(assertions);
+    setPendingSink(pending);
+    const start = nowMs();
+    let status: TestResult["status"] = "pass";
+    let failure: AssertionResult | undefined;
+    let errorMessage: string | undefined;
+    try {
+      await tc.fn({ agent: instrumented });
+    } catch (err) {
+      if (err instanceof CheckAIAssertionError) {
+        status = "fail";
+        failure = err.result;
+      } else {
+        status = "error";
+        // Prefer the stack when available; non-Error throwables are stringified.
+        errorMessage = err instanceof Error ? err.stack ?? err.message : String(err);
+      }
+    } finally {
+      // Wait for any in-flight judge assertions (e.g. an un-awaited
+      // toSatisfyBehavior) so they still record while the sink is set.
+      await Promise.allSettled(pending);
+      setAssertionSink(null);
+      setPendingSink(null);
+    }
+    // Catch a failing assertion that was recorded by an un-awaited async matcher.
+    if (status === "pass") {
+      const firstFail = assertions.find((a) => !a.pass);
+      if (firstFail) {
+        status = "fail";
+        failure = firstFail;
+      }
+    }
+    results.push({
+      name: tc.name,
+      file: tc.file,
+      status,
+      assertions,
+      failure,
+      errorMessage,
+      latencies,
+      toolsUsed: [...toolsUsed],
+      usage,
+      costUsd,
+      // Measured from just after the sinks were set, so it includes the wait
+      // on pending assertions in the finally block.
+      durationMs: nowMs() - start,
+    });
+  }
+  return results;
+}
+/**
+ * Aggregate stats over a result set.
+ *
+ * Average latency is taken over every recorded latency across all results,
+ * not over per-test averages; it and the pass rate are 0 for empty input.
+ */
+export function summarize(results: TestResult[]): SuiteSummary {
+  let passed = 0;
+  let failed = 0;
+  let errored = 0;
+  let latencySum = 0;
+  let latencyCount = 0;
+  let totalTokens = 0;
+  let totalCostUsd = 0;
+  for (const r of results) {
+    if (r.status === "pass") passed += 1;
+    else if (r.status === "fail") failed += 1;
+    else if (r.status === "error") errored += 1;
+    for (const latency of r.latencies) {
+      latencySum += latency;
+      latencyCount += 1;
+    }
+    totalTokens += r.usage.totalTokens;
+    totalCostUsd += r.costUsd;
+  }
+  const total = results.length;
+  return {
+    total,
+    passed,
+    failed,
+    errored,
+    passRate: total > 0 ? passed / total : 0,
+    avgLatencyMs: latencyCount > 0 ? latencySum / latencyCount : 0,
+    totalTokens,
+    totalCostUsd,
+  };
+}
+/**
+ * Assemble the report for a single-agent run: the given agent, results and
+ * timestamps, plus a summary computed from the results.
+ */
+export function buildRunReport(
+  agent: { name: string; model: string; backend: string },
+  results: TestResult[],
+  startedAt: string,
+  finishedAt: string
+): RunReport {
+  const summary = summarize(results);
+  const report: RunReport = { kind: "run", agent, results, summary, startedAt, finishedAt };
+  return report;
+}