npm - @forwardimpact/libeval - Versions diffs - 0.1.49 → 0.1.51 - Mend

@forwardimpact/libeval 0.1.49 → 0.1.51

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/README.md +11 -8
package/bin/fit-benchmark.js +26 -27
package/bin/fit-eval.js +76 -78
package/bin/fit-trace.js +83 -57
package/package.json +2 -2
package/src/agent-runner.js +23 -13
package/src/benchmark/env-loader.js +35 -23
package/src/benchmark/{scorer.js → invariants.js} +14 -12
package/src/benchmark/judge.js +5 -8
package/src/benchmark/npm-installer.js +87 -0
package/src/benchmark/report.js +15 -15
package/src/benchmark/result.js +11 -11
package/src/benchmark/runner.js +17 -11
package/src/benchmark/task-family.js +6 -4
package/src/benchmark/workdir.js +23 -3
package/src/commands/assert.js +30 -22
package/src/commands/benchmark-invariants.js +74 -0
package/src/commands/benchmark-report.js +23 -15
package/src/commands/benchmark-run.js +22 -7
package/src/commands/by-discussion.js +29 -18
package/src/commands/callback.js +20 -11
package/src/commands/discuss.js +30 -21
package/src/commands/facilitate.js +20 -21
package/src/commands/output.js +11 -12
package/src/commands/run.js +24 -21
package/src/commands/supervise.js +27 -27
package/src/commands/task-input.js +54 -0
package/src/commands/trace.js +174 -97
package/src/discuss-tools.js +48 -2
package/src/discusser.js +49 -2
package/src/events/github.js +155 -0
package/src/inbox-poller.js +84 -0
package/src/index.js +10 -0
package/src/judge.js +1 -1
package/src/message-bus.js +6 -0
package/src/orchestration-loop.js +19 -5
package/src/orchestration-toolkit.js +14 -0
package/src/redaction.js +31 -9
package/src/reply-emitter.js +47 -0
package/src/commands/benchmark-score.js +0 -68

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.49",
+  "version": "0.1.51",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",
@@ -62,7 +62,7 @@
     "zod": "^4.4.3"
   },
   "devDependencies": {
-    "@forwardimpact/libharness": "^0.1.14"
+    "@forwardimpact/libmock": "^0.1.0"
   },
   "engines": {
     "bun": ">=1.2.0",

package/src/agent-runner.js CHANGED Viewed

@@ -29,12 +29,16 @@ export class AgentRunner {
    * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
    * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
    * @param {object} deps.redactor
+   * @param {import("@forwardimpact/libutil/runtime").Runtime} [deps.runtime] -
+   *   Ambient collaborators. Only `proc.env` is read (to record Skill
+   *   invocations into `LIBEVAL_SKILL`); when absent the write is skipped.
    */
   constructor(deps) {
     if (!deps.cwd) throw new Error("cwd is required");
     if (!deps.query) throw new Error("query is required");
     if (!deps.output) throw new Error("output is required");
     if (!deps.redactor) throw new Error("redactor is required");
+    this.runtime = deps.runtime ?? null;
     this.cwd = deps.cwd;
     this.query = deps.query;
     this.output = deps.output;
@@ -62,7 +66,9 @@ export class AgentRunner {
     const abortController = new AbortController();
     this.currentAbortController = abortController;
     const effectiveTask = this.taskAmend
-      ? `${task}\n\n${this.taskAmend}`
+      ? task
+        ? `${task}\n\n${this.taskAmend}`
+        : this.taskAmend
       : task;
     try {
       const iterator = this.query({
@@ -177,20 +183,24 @@ export class AgentRunner {
     if (message.type === "system" && message.subtype === "init") {
       this.sessionId = message.session_id;
     }
-    if (message.type === "assistant") trackSkillInvocation(message);
+    if (message.type === "assistant") this.#trackSkillInvocation(message);
   }
-}
-function trackSkillInvocation(message) {
-  const content = message.message?.content ?? message.content;
-  if (!Array.isArray(content)) return;
-  for (const block of content) {
-    if (
-      block.type === "tool_use" &&
-      block.name === "Skill" &&
-      block.input?.skill
-    ) {
-      process.env.LIBEVAL_SKILL = block.input.skill;
+  #trackSkillInvocation(message) {
+    const content = message.message?.content ?? message.content;
+    if (!Array.isArray(content)) return;
+    // Skill metric is recorded into the env map; without a runtime there is
+    // no env surface to write to, so the side-effect is simply skipped.
+    const env = this.runtime?.proc?.env ?? null;
+    if (!env) return;
+    for (const block of content) {
+      if (
+        block.type === "tool_use" &&
+        block.name === "Skill" &&
+        block.input?.skill
+      ) {
+        env.LIBEVAL_SKILL = block.input.skill;
+      }
     }
   }
 }

package/src/benchmark/env-loader.js CHANGED Viewed

@@ -14,7 +14,6 @@
  * AND rendered (with resolved values) into the agent working directory.
  */
-import { readFile, writeFile } from "node:fs/promises";
 import { join } from "node:path";
 const ENV_FILES = [".env.local", ".env"];
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
 /**
  * Read and parse an env file, returning [] if the file does not exist.
+ * @param {object} fs - Async filesystem surface (`runtime.fs`).
  * @param {string} filePath
  * @returns {Promise<Array<{key: string, value: string}>>}
  */
-async function readEnvFile(filePath) {
+async function readEnvFile(fs, filePath) {
   try {
-    const content = await readFile(filePath, "utf8");
+    const content = await fs.readFile(filePath, "utf8");
     return parseEnvFile(content);
   } catch (e) {
     if (e.code === "ENOENT") return [];
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
 }
 /**
- * Load entries into process.env. Existing keys are never overwritten.
+ * Load entries into the process env map. Existing keys are never overwritten.
+ * @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
  * @param {Array<{key: string, value: string}>} entries
  * @returns {string[]} var names that were loaded
  */
-function applyToProcessEnv(entries) {
+function applyToProcessEnv(env, entries) {
   const names = [];
   for (const { key, value } of entries) {
     names.push(key);
-    if (process.env[key] === undefined) {
-      process.env[key] = value;
+    if (env[key] === undefined) {
+      env[key] = value;
     }
   }
   return names;
 }
 /**
- * Load one env file: apply to process.env, record keys in the merged map.
+ * Load one env file: apply to the env map, record keys in the merged map.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {string} dir
  * @param {string} file
  * @param {Set<string>} names
  * @param {Map<string, Map<string, true>>} merged
  */
-async function loadOneEnvFile(dir, file, names, merged) {
-  const entries = await readEnvFile(join(dir, file));
+async function loadOneEnvFile(runtime, dir, file, names, merged) {
+  const entries = await readEnvFile(runtime.fs, join(dir, file));
   if (entries.length === 0) return;
-  for (const name of applyToProcessEnv(entries)) names.add(name);
+  for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
+    names.add(name);
+  }
   if (!merged.has(file)) merged.set(file, new Map());
   const fileMap = merged.get(file);
   for (const { key } of entries) {
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
 }
 /**
- * Scan directories for env files, load into process.env, and collect
+ * Scan directories for env files, load into the env map, and collect
  * a merged key manifest per filename.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {string[]} dirs
  * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
  */
-async function collectEnvEntries(dirs) {
+async function collectEnvEntries(runtime, dirs) {
   const names = new Set();
   const merged = new Map();
   for (const dir of dirs) {
     for (const file of ENV_FILES) {
-      await loadOneEnvFile(dir, file, names, merged);
+      await loadOneEnvFile(runtime, dir, file, names, merged);
     }
   }
   return { names, merged };
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
 /**
  * Write resolved env files into the agent CWD and warn about empty values.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {Map<string, Map<string, true>>} merged
  * @param {string} agentCwd
  */
-async function renderEnvFiles(merged, agentCwd) {
+async function renderEnvFiles(runtime, merged, agentCwd) {
+  const env = runtime.proc.env;
   for (const [file, keyMap] of merged) {
     const keys = [...keyMap.keys()];
-    const resolved = keys.map((key) => `${key}=${process.env[key] ?? ""}`);
-    await writeFile(join(agentCwd, file), resolved.join("\n") + "\n");
-    const empty = keys.filter((key) => !process.env[key]);
+    const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
+    await runtime.fs.writeFile(
+      join(agentCwd, file),
+      resolved.join("\n") + "\n",
+    );
+    const empty = keys.filter((key) => !env[key]);
     if (empty.length > 0) {
-      process.stderr.write(
+      runtime.proc.stderr.write(
         `libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
       );
     }
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
 /**
  * Discover `.env` / `.env.local` in one or more directories, load them
- * into process.env, and render the resolved values into the agent CWD.
+ * into the process env map, and render the resolved values into the agent CWD.
  *
  * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
  * @param {string} agentCwd - Agent working directory to render into.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
+ *   collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
  * @returns {Promise<string[]>} All var names discovered (for redaction).
  */
-export async function loadEnv(dirs, agentCwd) {
-  const { names, merged } = await collectEnvEntries(dirs);
-  await renderEnvFiles(merged, agentCwd);
+export async function loadEnv(dirs, agentCwd, runtime) {
+  const { names, merged } = await collectEnvEntries(runtime, dirs);
+  await renderEnvFiles(runtime, merged, agentCwd);
   return [...names];
 }

package/src/benchmark/{scorer.js → invariants.js} RENAMED Viewed

@@ -1,7 +1,7 @@
 /**
- * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
- * the post-run agent CWD. The exit code is authoritative for the verdict;
- * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
+ * Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
+ * against the post-run agent CWD. The exit code is authoritative for the
+ * verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
  */
 import { spawn } from "node:child_process";
@@ -15,31 +15,33 @@ import {
 import { join } from "node:path";
 /**
- * @typedef {object} ScoringResult
+ * @typedef {object} InvariantsResult
  * @property {"pass" | "fail"} verdict
  * @property {Array<object>} details
  * @property {number} exitCode
  */
 /**
- * Run the task's scoring script.
+ * Run the task's invariants script.
  * @param {import("./task-family.js").Task} task
  * @param {{cwd: string, port: number, runDir: string}} ctx
- * @returns {Promise<ScoringResult>}
+ * @returns {Promise<InvariantsResult>}
  */
-export function runScoring(task, ctx) {
-  if (!task.paths.score) {
+export function runInvariants(task, ctx) {
+  if (!task.paths.invariants) {
     return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
   }
   return new Promise((res, rej) => {
-    const script = task.paths.score;
-    const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
+    const script = task.paths.invariants;
+    const stderrLog = createWriteStream(
+      join(ctx.runDir, "invariants.stderr.log"),
+    );
     // Bun's child_process pipe setup for fd >= 3 is racy under load (it
     // creates a unix socket pair and the connect() can return ENOENT). Use
     // a temp file as the fd-3 backing store instead — the script still
     // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
-    const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
+    const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
     let fd3File;
     try {
       fd3File = openSync(fd3Path, "w+");
@@ -63,7 +65,7 @@ export function runScoring(task, ctx) {
       } catch {
         // already closed
       }
-      rej(new Error(`failed to spawn scoring script: ${script}`));
+      rej(new Error(`failed to spawn invariants script: ${script}`));
       return;
     }

package/src/benchmark/judge.js CHANGED Viewed

@@ -9,13 +9,11 @@
  *   {{AGENT_INSTRUCTIONS}}  — contents of agent.task.md
  *   {{AGENT_PROFILE}}       — agent profile body (empty string if none)
  *   {{AGENT_TRACE_PATH}}    — path to agent.ndjson
- *   {{SCORING_RESULT}}      — JSON scoring object
+ *   {{INVARIANTS_RESULT}}   — JSON invariants object
  *   {{SKILL_SET_HASH}}      — SHA-256 from apm.lock.yaml
  *   {{TASK_ID}}             — task name (directory under tasks/)
  *   {{TASK_DIR}}            — agent working directory path
  *
- * Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
- *
  * The judge verdict is captured from the orchestration context's
  * `concluded` flag directly — no trace parsing on the happy path.
  * `parseConcludeFromTrace` is preserved for offline analysis and as a
@@ -46,17 +44,16 @@ import { createRedactor } from "../redaction.js";
  * Run the judge over a completed task run.
  * @param {import("./task-family.js").Task} task
  * @param {import("./workdir.js").Workdir} workdir
- * @param {import("./scorer.js").ScoringResult} scoring
+ * @param {import("./invariants.js").InvariantsResult} invariants
  * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
  * @param {JudgeContext} [context]
  * @returns {Promise<JudgeVerdict>}
  */
-export async function runJudge(task, workdir, scoring, deps, context) {
+export async function runJudge(task, workdir, invariants, deps, context) {
   const template = await readFile(task.paths.judge, "utf8");
-  const scoringJson = JSON.stringify(scoring, null, 2);
+  const invariantsJson = JSON.stringify(invariants, null, 2);
   const taskText = template
-    .replaceAll("{{SCORING_RESULT}}", scoringJson)
-    .replaceAll("{{SCORING}}", scoringJson)
+    .replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
     .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
     .replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
     .replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")

package/src/benchmark/npm-installer.js ADDED Viewed

@@ -0,0 +1,87 @@
+/**
+ * NpmInstaller — runs `bun install` in the family root when a package.json
+ * is present, then copies the resulting `node_modules/` into the staging
+ * directory so WorkdirManager can seed each per-task CWD.
+ *
+ * Symmetric to ApmInstaller: constructor injection of `spawn` for testability,
+ * factory function, and a free-function shorthand.
+ */
+import { spawn as nodeSpawn } from "node:child_process";
+import { access, cp } from "node:fs/promises";
+import { join } from "node:path";
+/** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
+export class NpmInstaller {
+  /**
+   * @param {object} [deps]
+   * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
+   *   `node:child_process` spawn). Tests inject a fake to avoid shelling out.
+   */
+  constructor({ spawn } = {}) {
+    this.spawn = spawn ?? nodeSpawn;
+  }
+  /**
+   * @param {import("./task-family.js").TaskFamily} family
+   * @param {string} stagingDir - The staging directory (created by ApmInstaller).
+   * @returns {Promise<void>}
+   */
+  async install(family, stagingDir) {
+    const pkgJson = join(family.rootPath, "package.json");
+    const hasPkg = await access(pkgJson)
+      .then(() => true)
+      .catch(() => false);
+    if (!hasPkg) return;
+    await this.#runBunInstall(family.rootPath);
+    const sourceModules = join(family.rootPath, "node_modules");
+    try {
+      await access(sourceModules);
+    } catch {
+      throw new Error(
+        `bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
+      );
+    }
+    await cp(sourceModules, join(stagingDir, "node_modules"), {
+      recursive: true,
+    });
+  }
+  #runBunInstall(cwd) {
+    return new Promise((res, rej) => {
+      const child = this.spawn("bun", ["install"], {
+        cwd,
+        stdio: ["ignore", "pipe", "pipe"],
+      });
+      let stderr = "";
+      child.stdout.on("data", () => {});
+      child.stderr.on("data", (d) => {
+        stderr += d.toString();
+      });
+      child.on("error", (e) => {
+        rej(new Error(`failed to spawn bun: ${e.message}`));
+      });
+      child.on("close", (code) => {
+        if (code === 0) res();
+        else rej(new Error(`bun install exited ${code}: ${stderr}`));
+      });
+    });
+  }
+}
+/** Factory function — wires real dependencies. */
+export function createNpmInstaller(deps) {
+  return new NpmInstaller(deps);
+}
+/**
+ * Free-function shorthand for callers that don't need to inject a spawn seam.
+ * @param {import("./task-family.js").TaskFamily} family
+ * @param {string} stagingDir
+ */
+export function installNpm(family, stagingDir) {
+  return new NpmInstaller().install(family, stagingDir);
+}

package/src/benchmark/report.js CHANGED Viewed

@@ -3,7 +3,7 @@
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
  *
- * When `includeRuns` is true, each task carries per-run detail (scoring
+ * When `includeRuns` is true, each task carries per-run detail (invariant
  * checks, judge commentary, cost, duration) and the text renderer produces
  * a full markdown report instead of just the pass@k table.
  *
@@ -22,7 +22,7 @@ import { validateResultRecord } from "./result.js";
  * @typedef {object} RunDetail
  * @property {number} runIndex
  * @property {"pass"|"fail"} verdict
- * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
  * @property {{verdict: string, summary: string}} [judgeVerdict]
  * @property {number} costUsd
  * @property {number} turns
@@ -112,7 +112,7 @@ function buildRunDetail(r, acc) {
   return {
     runIndex: r.runIndex,
     verdict: r.verdict,
-    ...(r.scoring && { scoring: r.scoring }),
+    ...(r.invariants && { invariants: r.invariants }),
     ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
     costUsd: r.costUsd ?? 0,
     turns: r.turns ?? 0,
@@ -262,7 +262,7 @@ function renderTaskDetail(task) {
   lines.push("", renderRunsTable(runs));
-  const checks = renderScoringChecks(runs, singleRun);
+  const checks = renderInvariantChecks(runs, singleRun);
   if (checks) lines.push("", checks);
   const commentary = renderJudgeCommentary(runs, singleRun);
@@ -278,7 +278,7 @@ function renderRunsTable(runs) {
   const header = [
     "Run",
     "Verdict",
-    "Scoring",
+    "Invariants",
     "Judge",
     "Cost",
     "Turns",
@@ -286,10 +286,10 @@ function renderRunsTable(runs) {
   ];
   const rows = [header, header.map(() => "---")];
   for (const r of runs) {
-    const scoringCell = r.preflightError
+    const invariantsCell = r.preflightError
       ? "preflight error"
-      : r.scoring
-        ? statusIcon(r.scoring.verdict === "pass")
+      : r.invariants
+        ? statusIcon(r.invariants.verdict === "pass")
         : "—";
     const judgeCell = r.preflightError
       ? "—"
@@ -299,7 +299,7 @@ function renderRunsTable(runs) {
     rows.push([
       String(r.runIndex),
       statusIcon(r.verdict === "pass"),
-      scoringCell,
+      invariantsCell,
       judgeCell,
       formatCost(r.costUsd),
       String(r.turns),
@@ -309,15 +309,15 @@ function renderRunsTable(runs) {
   return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
 }
-function renderScoringChecks(runs, singleRun) {
-  const rows = collectScoringRows(runs);
+function renderInvariantChecks(runs, singleRun) {
+  const rows = collectInvariantRows(runs);
   if (!rows.length) return null;
   const header = singleRun
     ? ["Check", "Result", "Message"]
     : ["Run", "Check", "Result", "Message"];
   const lines = [
-    "#### Scoring Checks",
+    "#### Invariant Checks",
     "",
     `| ${header.join(" | ")} |`,
     `| ${header.map(() => "---").join(" | ")} |`,
@@ -331,11 +331,11 @@ function renderScoringChecks(runs, singleRun) {
   return lines.join("\n");
 }
-function collectScoringRows(runs) {
+function collectInvariantRows(runs) {
   const rows = [];
   for (const r of runs) {
-    if (!r.scoring?.details?.length) continue;
-    for (const d of r.scoring.details) {
+    if (!r.invariants?.details?.length) continue;
+    for (const d of r.invariants.details) {
       rows.push({
         run: r.runIndex,
         check: escapeCell(String(d.test ?? "(unnamed)")),

package/src/benchmark/result.js CHANGED Viewed

@@ -3,10 +3,10 @@
  *
  * Two schemas live here:
  *   - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
- *     benchmark run. Has a happy branch (scoring + judge present) and a
- *     pre-flight-failure branch (scoring/judgeVerdict/submission absent).
- *   - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
- *     ad-hoc grading without a full lifecycle.
+ *     benchmark run. Has a happy branch (invariants + judge present) and a
+ *     pre-flight-failure branch (invariants/judgeVerdict/submission absent).
+ *   - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
+ *     (P7): ad-hoc grading without a full lifecycle.
  *
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
  * in a guard and reject schema drift at write time.
@@ -16,7 +16,7 @@ import { z } from "zod";
 const VERDICT_ENUM = z.enum(["pass", "fail"]);
-const SCORING_SHAPE = z.object({
+const INVARIANTS_SHAPE = z.object({
   verdict: VERDICT_ENUM,
   details: z.array(z.unknown()),
   exitCode: z.number().int(),
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
 const HAPPY_RECORD = z.object({
   ...COMMON_FIELDS,
-  scoring: SCORING_SHAPE,
+  invariants: INVARIANTS_SHAPE,
   submission: z.string(),
   judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
   agentTracePath: z.string(),
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
   agentTracePath: z.string(),
   supervisorTracePath: z.string(),
   judgeTracePath: z.string(),
-  scoring: z.undefined().optional(),
+  invariants: z.undefined().optional(),
   submission: z.undefined().optional(),
   judgeVerdict: z.undefined().optional(),
   agentError: z.undefined().optional(),
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
 export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
-export const SCORING_RECORD_SCHEMA = z.object({
+export const INVARIANTS_RECORD_SCHEMA = z.object({
   taskId: z.string().min(1),
-  scoring: SCORING_SHAPE,
+  invariants: INVARIANTS_SHAPE,
   exitCode: z.number().int(),
 });
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
  * Throw on schema mismatch.
  * @param {object} record
  */
-export function validateScoringRecord(record) {
-  SCORING_RECORD_SCHEMA.parse(record);
+export function validateInvariantsRecord(record) {
+  INVARIANTS_RECORD_SCHEMA.parse(record);
 }